In [7]:
# imports
from google.colab import drive
import pickle
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [8]:
drive.mount('/content/drive')
DATA_PATH = "/content/drive/My Drive"
infile = open(DATA_PATH+'/combined_df.pkl','rb')
df = pickle.load(infile)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [9]:
df

Unnamed: 0,Email Text,Email Type
0,"re : 6 . 1100 , disc : uniformitarianism , re ...",Safe Email
1,the other side of * galicismos * * galicismo *...,Safe Email
2,re : equistar deal tickets are you still avail...,Safe Email
3,\nHello I am your hot lil horny toy.\n I am...,Malicious Email
4,software at incredibly low prices ( 86 % lower...,Malicious Email
...,...,...
23816,Subject: put the 10 on the ft\r\nthe transport...,Safe Email
23817,Subject: 3 / 4 / 2000 and following noms\r\nhp...,Safe Email
23818,Subject: calpine daily gas nomination\r\n>\r\n...,Safe Email
23819,Subject: industrial worksheets for august 2000...,Safe Email


In [10]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [11]:
unique_types = df['Email Text'].apply(type).unique()
print(unique_types)

# Find rows where 'email_text' is not a string
non_string_entries = df[df['Email Text'].apply(lambda x: not isinstance(x, str))]

# Display these rows to understand what they contain
print(non_string_entries)

[<class 'str'> <class 'float'>]
      Email Text       Email Type
31           NaN  Malicious Email
387          NaN  Malicious Email
1883         NaN  Malicious Email
2049         NaN  Malicious Email
2451         NaN  Malicious Email
2972         NaN  Malicious Email
3627         NaN  Malicious Email
3806         NaN  Malicious Email
5763         NaN  Malicious Email
6299         NaN  Malicious Email
6821         NaN  Malicious Email
8594         NaN  Malicious Email
9999         NaN  Malicious Email
11069        NaN  Malicious Email
11320        NaN  Malicious Email
13843        NaN  Malicious Email


In [12]:
# remove NaNs
df = df.dropna(subset=['Email Text'])

In [13]:
def preprocess_text(text):
    # Cleaning: remove special characters and extra spaces
    # text = re.sub(r'[\t\n\r]+', '', text)
    text = re.sub(r'[^a-zA-Z0-9\s.,;\'\":-]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()

    # Tokenization
    tokens = nltk.word_tokenize(text)

    # Stop Words Removal
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token.lower() not in stop_words]

    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token.lower()) for token in tokens]

    # Rejoin tokens into a single string
    return ' '.join(tokens)

df.loc[:, 'email_text_cleaned'] = df['Email Text'].apply(preprocess_text)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, 'email_text_cleaned'] = df['Email Text'].apply(preprocess_text)


In [14]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.pipeline import Pipeline
from sklearn.neural_network import MLPClassifier

In [15]:
# Split the dataset into training and test sets
X_train, X_test, y_train, y_test = train_test_split(df['email_text_cleaned'], df['Email Type'], test_size=0.2, random_state=42)

# Create a pipeline that combines TF-IDF vectorization with a Logistic Regression model
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words='english')),
    ('mlp', MLPClassifier(hidden_layer_sizes=(100, 100, 100,), activation='relu', random_state=42)),
])

In [16]:
# Train the model on the training data
pipeline.fit(X_train, y_train)

In [26]:
# Predict on the test set
y_pred = pipeline.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print(report)

Accuracy: 0.9800462087796681
                 precision    recall  f1-score   support

Malicious Email       0.96      0.99      0.97      1803
     Safe Email       0.99      0.98      0.98      2958

       accuracy                           0.98      4761
      macro avg       0.98      0.98      0.98      4761
   weighted avg       0.98      0.98      0.98      4761



In [27]:
from joblib import dump, load

# Save the model to a file
model_filename = DATA_PATH + '/email_classifier_model_mlp.joblib'
dump(pipeline, model_filename)

print(f"Model saved to {model_filename}")

Model saved to /content/drive/My Drive/email_classifier_model_mlp.joblib


In [28]:
model = load(DATA_PATH + '/email_classifier_model_mlp.joblib')

In [34]:
test_text = 'Personal Assistant Needed\nYou are invited to participate in a Part-time work offer for current\nstaff/students. For more information About the task CLICK HERE\nApplication will be received and you will get a response between\n24- 48 hours.'

clean_text = preprocess_text(test_text)

predictions = pipeline.predict([clean_text])
print(predictions[0])

probabilities = pipeline.predict_proba([clean_text])
print(f"Confidence: {max(probabilities[0])*100:.2f}%")

Malicious Email
Confidence: 100.00%


In [35]:
test_text = """Hello Feng Du,
Thank you for shopping with us. We wanted to let you know that LaGOnlinestore has shipped your item(s) on 11/15 and that this completes your order. If you need to return an item or manage other orders, please visit Your Orders on Amazon.co.jp. You can print out a receipt here."""

clean_text = preprocess_text(test_text)

predictions = pipeline.predict([clean_text])
print(predictions[0])

probabilities = pipeline.predict_proba([clean_text])
print(f"Confidence: {max(probabilities[0])*100:.2f}%")

Safe Email
Confidence: 99.78%


In [36]:
test_text = """Verify and sign in
Hi there,

Verify yourself below to sign in to your Booking.com account for fengdu88@gmail.com.

The link can only be used once and expires in 10 minutes if you don’t use it."""

clean_text = preprocess_text(test_text)

predictions = pipeline.predict([clean_text])
print(predictions[0])

probabilities = pipeline.predict_proba([clean_text])
print(f"Confidence: {max(probabilities[0])*100:.2f}%")

Malicious Email
Confidence: 99.72%


In [37]:
test_text = """Hi Feng, you have a chat pending request from 马雯煊


Introduction from 马雯煊:
I'm very interested in you, can you talk with me

To confirm or delete this request, or block this user, follow the link below:

https://www.conversationexchange.com/members/friendRequestPending.php?lg=en

The Conversation Exchange Team"""

clean_text = preprocess_text(test_text)

predictions = pipeline.predict([clean_text])
print(predictions[0])

probabilities = pipeline.predict_proba([clean_text])
print(f"Confidence: {max(probabilities[0])*100:.2f}%")

Safe Email
Confidence: 68.13%


In [38]:
test_text = """Volley6'ers,

Only 72hrs remaining in our Captain's Pre-sale!!

Our pre-sale is now available to current captains and Public Registration (incl. individual player registration) will open up on Monday, March 4th at 10am. Your team space can be reserved with a deposit of $150.

 Please keep in mind that the deposit must be paid during the captain's pre-sale to secure your spot. """

clean_text = preprocess_text(test_text)

predictions = pipeline.predict([clean_text])
print(predictions[0])

probabilities = pipeline.predict_proba([clean_text])
print(f"Confidence: {max(probabilities[0])*100:.2f}%")

Malicious Email
Confidence: 92.51%
