In [8]:
# imports
from google.colab import drive, files
import pickle
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

In [16]:
drive.mount('/content/drive')
DATA_PATH = "/content/drive/My Drive"
infile = open(DATA_PATH+'glove.6B.100d.txt','r')
df = pickle.load(infile)

Mounted at /content/drive


In [4]:
uploaded = files.upload()

Saving glove.6B.100d.txt to glove.6B.100d.txt


In [17]:
for fn in uploaded.keys():
  print('User uploaded file "{name}" with length {length} bytes'.format(
      name=fn, length=len(uploaded[fn])))

User uploaded file "glove.6B.100d.txt" with length 347117594 bytes


In [18]:
def load_embeddings(embedding_file):
    embeddings = {}
    with open(embedding_file, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            vector = np.asarray(values[1:], dtype='float32')
            embeddings[word] = vector
    return embeddings

embedding_file = open(DATA_PATH+'/combined_df.pkl','rb')
embeddings_index = load_embeddings(embedding_file)

In [37]:
from sklearn.base import BaseEstimator, TransformerMixin

class EmbeddingVectorizer(BaseEstimator, TransformerMixin):
    def __init__(self, embeddings):
        self.embeddings = embeddings
        self.dim = len(next(iter(embeddings.values())))  # Dimensionality of embeddings

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X_transformed = np.array([
            np.mean([self.embeddings.get(word, np.zeros(self.dim))
                    for word in text.split() if word in self.embeddings] or [np.zeros(self.dim)], axis=0)
            for text in X
        ])
        return X_transformed

In [38]:
df

Unnamed: 0,Email Text,Email Type,email_text_cleaned
0,"re : 6 . 1100 , disc : uniformitarianism , re ...",Safe Email,": 6 . 1100 , disc : uniformitarianism , : 1086..."
1,the other side of * galicismos * * galicismo *...,Safe Email,side galicismos galicismo spanish term name im...
2,re : equistar deal tickets are you still avail...,Safe Email,: equistar deal ticket still available assist ...
3,\nHello I am your hot lil horny toy.\n I am...,Malicious Email,"hello hot lil horny toy . one dream , open min..."
4,software at incredibly low prices ( 86 % lower...,Malicious Email,software incredibly low price 86 lower . drape...
...,...,...,...
23816,Subject: put the 10 on the ft\r\nthe transport...,Safe Email,subject : put 10 ft transport volume decreased...
23817,Subject: 3 / 4 / 2000 and following noms\r\nhp...,Safe Email,subject : 3 4 2000 following noms hpl ' take e...
23818,Subject: calpine daily gas nomination\r\n>\r\n...,Safe Email,"subject : calpine daily gas nomination julie ,..."
23819,Subject: industrial worksheets for august 2000...,Safe Email,subject : industrial worksheet august 2000 act...


In [39]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [40]:
unique_types = df['Email Text'].apply(type).unique()
print(unique_types)

# Find rows where 'email_text' is not a string
non_string_entries = df[df['Email Text'].apply(lambda x: not isinstance(x, str))]

# Display these rows to understand what they contain
print(non_string_entries)

[<class 'str'>]
Empty DataFrame
Columns: [Email Text, Email Type, email_text_cleaned]
Index: []


In [41]:
# remove NaNs
df = df.dropna(subset=['Email Text'])

In [42]:
def preprocess_text(text):
    # Cleaning: remove special characters and extra spaces
    # text = re.sub(r'[\t\n\r]+', '', text)
    text = re.sub(r'[^a-zA-Z0-9\s.,;\'\":-]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()

    # Tokenization
    tokens = nltk.word_tokenize(text)

    # Stop Words Removal
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token.lower() not in stop_words]

    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token.lower()) for token in tokens]

    # Rejoin tokens into a single string
    return ' '.join(tokens)

df.loc[:, 'email_text_cleaned'] = df['Email Text'].apply(preprocess_text)

In [None]:
df

In [43]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.pipeline import Pipeline

In [44]:
# # Split the dataset into training and test sets
# X_train, X_test, y_train, y_test = train_test_split(df['email_text_cleaned'], df['Email Type'], test_size=0.3, random_state=42)

# # Create a pipeline that combines TF-IDF vectorization with a Logistic Regression model
# pipeline = Pipeline([
#     ('tfidf', TfidfVectorizer(stop_words='english')),
#     ('clf', LogisticRegression(random_state=42)),
# ])

In [45]:
# Split the dataset into training and test sets
X_train, X_test, y_train, y_test = train_test_split(df['email_text_cleaned'], df['Email Type'], test_size=0.3, random_state=42)

pipeline = Pipeline([
    ('embedding_vectorizer', EmbeddingVectorizer(embeddings=embeddings_index)),
    ('classifier', LogisticRegression(random_state=42)),
])

In [None]:
# Train the model on the training data
pipeline.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
# Predict on the test set
y_pred = pipeline.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print(report)

In [None]:
from joblib import dump, load

# Save the model to a file
model_filename = DATA_PATH + '/email_classifier_model.joblib'
dump(pipeline, model_filename)

print(f"Model saved to {model_filename}")

In [None]:
model = load(DATA_PATH + '/email_classifier_model.joblib')

In [None]:
test_text = 'Personal Assistant Needed\nYou are invited to participate in a Part-time work offer for current\nstaff/students. For more information About the task CLICK HERE\nApplication will be received and you will get a response between\n24- 48 hours.'

predictions = pipeline.predict([test_text])
print(predictions[0])

probabilities = pipeline.predict_proba([test_text])
print(f"Confidence: {max(probabilities[0])*100:.2f}%")

In [None]:
test_text = """Hello Feng Du,
Thank you for shopping with us. We wanted to let you know that LaGOnlinestore has shipped your item(s) on 11/15 and that this completes your order. If you need to return an item or manage other orders, please visit Your Orders on Amazon.co.jp. You can print out a receipt here."""

predictions = pipeline.predict([test_text])
print(predictions[0])

probabilities = pipeline.predict_proba([test_text])
print(f"Confidence: {max(probabilities[0])*100:.2f}%")

In [None]:
test_text = """Verify and sign in
Hi there,

Verify yourself below to sign in to your Booking.com account for fengdu88@gmail.com.

The link can only be used once and expires in 10 minutes if you don’t use it."""
predictions = pipeline.predict([test_text])
print(predictions[0])

probabilities = pipeline.predict_proba([test_text])
print(f"Confidence: {max(probabilities[0])*100:.2f}%")

In [None]:
test_text = """Hi Feng, you have a chat pending request from 马雯煊


Introduction from 马雯煊:
I'm very interested in you, can you talk with me

To confirm or delete this request, or block this user, follow the link below:

https://www.conversationexchange.com/members/friendRequestPending.php?lg=en

The Conversation Exchange Team"""
predictions = pipeline.predict([test_text])
print(predictions[0])

probabilities = pipeline.predict_proba([test_text])
print(f"Confidence: {max(probabilities[0])*100:.2f}%")

In [None]:
test_text = """Volley6'ers,

Only 72hrs remaining in our Captain's Pre-sale!!

Our pre-sale is now available to current captains and Public Registration (incl. individual player registration) will open up on Monday, March 4th at 10am. Your team space can be reserved with a deposit of $150.

 Please keep in mind that the deposit must be paid during the captain's pre-sale to secure your spot. """
predictions = pipeline.predict([test_text])
print(predictions[0])

probabilities = pipeline.predict_proba([test_text])
print(f"Confidence: {max(probabilities[0])*100:.2f}%")