#### Imports

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from scipy.sparse import hstack

from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE

from sklearn.linear_model import LogisticRegression
from sklearn.utils import compute_sample_weight
from sklearn.naive_bayes import BernoulliNB

import joblib

from sklearn.metrics import accuracy_score, classification_report

#### Load dataset

In [2]:
# Load the cleaned email dataset; the path is relative to the notebook's working directory.
DATA_PATH = 'cleaned_email_dataset.csv'
df = pd.read_csv(DATA_PATH)

#### Explore/check cleaned data

##### Check for nulls, duplicated datasets, etc.

In [3]:
# Preview the first five rows to sanity-check the loaded columns.
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,Message,Sender,Subject,Body,Sentence_count,Word_count,urls_found,Sentiment,Category
0,0,From: support@legitcompany.com\nSubject: Regar...,support@legitcompany.com,regard recent inquiry,thank reach regard [your inquiry]. review requ...,3,46,0,2,0
1,1,From: noreply@softwareupdates.com\nSubject: We...,noreply@softwareupdates.com,weekly newsletter late update,please find attached invoice service render ju...,5,57,0,2,0
2,2,From: noreply@softwareupdates.com\nSubject: Im...,noreply@softwareupdates.com,important: software update notification,thank order #6789. item ship within 2 business...,4,51,0,2,0
3,3,From: info@customerservice.co\nSubject: Team S...,info@customerservice.co,team stand-up 10,please find attached invoice service render ju...,5,57,0,2,0
4,4,From: info@customerservice.co\nSubject: Team S...,info@customerservice.co,team stand-up 10,here's weekly dose news update community. week...,4,57,0,2,0


In [4]:
# Count missing values per column (isna is the canonical spelling of isnull).
df.isna().sum()

Unnamed: 0        0
Message           0
Sender            0
Subject           0
Body              3
Sentence_count    0
Word_count        0
urls_found        0
Sentiment         0
Category          0
dtype: int64

In [5]:
# 'Unnamed: 0' is not blank: in the preview above its values simply repeat the row
# number (0, 1, 2, ...), so it is presumably an index column that was written into
# the CSV. It carries no signal for the model, so drop it.
# errors='ignore' keeps this cell re-runnable: `df` is reassigned here, so without it
# a second execution would raise KeyError on the already-removed column.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')
df.head(1)

Unnamed: 0,Message,Sender,Subject,Body,Sentence_count,Word_count,urls_found,Sentiment,Category
0,From: support@legitcompany.com\nSubject: Regar...,support@legitcompany.com,regard recent inquiry,thank reach regard [your inquiry]. review requ...,3,46,0,2,0


In [6]:
# The null check above reported missing values only in 'Body';
# substitute a fixed placeholder string for them.
df = df.assign(Body=df['Body'].fillna('Missing Email Body'))

In [7]:
# Re-check missing values: every column should now report 0.
df.isna().sum()

Message           0
Sender            0
Subject           0
Body              0
Sentence_count    0
Word_count        0
urls_found        0
Sentiment         0
Category          0
dtype: int64

#### Prepare the data for the model

##### Declare the X and target (y) variables

In [8]:
# Target: the 'Category' label column.
y = df['Category']

# Numeric features: every numeric column except the label itself.
feature_frame = df.drop(columns=['Category'])
X_num = feature_frame.select_dtypes(include=np.number)

# Text feature: sender, subject and body joined into one tagged string per email,
# so a single TF-IDF vectorizer can cover all three fields.
X_text = (
    "Sender: " + df['Sender'].astype(str)
    + " Subject: " + df['Subject'].astype(str)
    + " Email Body: " + df['Body'].astype(str)
)

##### Do train-test-split

In [9]:
# Split text features, numeric features and labels in one call so that all three
# share the same 80/20 row partition; the fixed seed makes the split reproducible.
(
    X_text_train, X_text_test,
    X_num_train, X_num_test,
    y_train, y_test,
) = train_test_split(
    X_text,
    X_num,
    y,
    test_size=0.2,
    random_state=42,
)

##### TF-IDF vectorize

In [10]:
# Unigram + bigram TF-IDF over the combined text, capped at 10,000 features,
# with English stop words removed.
tfidf = TfidfVectorizer(
    stop_words='english',
    max_features=10000,
    ngram_range=(1,2)
)

# Fit on the training text only; the test text is transformed with the
# vocabulary and IDF weights learned from the training split.
X_train_tfidf = tfidf.fit_transform(X_text_train)
X_test_tfidf = tfidf.transform(X_text_test)

# Persist the fitted vectorizer. The models saved later in this notebook were
# trained on this exact feature space, so they cannot score new emails unless
# the same fitted vectorizer is available at inference time.
joblib.dump(tfidf, 'tfidf_vectorizer.joblib')

##### Scale numeric data

In [11]:
# Standardize the numeric columns. The scaler is fitted on the training split
# only and then applied unchanged to the test split.
scaler = StandardScaler()

X_num_train_sc = scaler.fit_transform(X_num_train)
X_num_test_sc = scaler.transform(X_num_test)

# Persist the fitted scaler. The saved models expect numeric features scaled
# with these training-set statistics, so inference code needs this object too.
joblib.dump(scaler, 'numeric_scaler.joblib')

##### Combine into X_train and X_test

In [12]:
# Append the scaled numeric columns to the right of the TF-IDF matrix.
# CSR is requested explicitly: without `format`, scipy chooses the result format
# itself (typically COO for mixed sparse/dense inputs), which cannot be row-indexed
# and would have to be converted again by each resampler and estimator below.
# The stored values are identical either way.
X_train = hstack([X_train_tfidf, X_num_train_sc], format='csr')
X_test = hstack([X_test_tfidf, X_num_test_sc], format='csr')

##### Define and fit SMOTE, the undersampler, and their datasets

In [13]:
# Both resamplers use a fixed seed so the balanced training sets are reproducible.
rus = RandomUnderSampler(random_state=42)
sm = SMOTE(random_state=42)

In [14]:
# Oversample with SMOTE using the training split only; the test split is never resampled.
X_train_sm, y_train_sm = sm.fit_resample(X=X_train, y=y_train)

In [15]:
# Randomly undersample using the training split only; the test split is never resampled.
X_train_rus, y_train_rus = rus.fit_resample(X=X_train, y=y_train)

In [16]:
# Map each balancing strategy to the (features, labels) pair it is trained on.
# 'class-weighting' reuses the unmodified training split on purpose: for that
# strategy the rebalancing happens when the model is fitted, not in the data.
tfidf_datasets = dict([
    ('no-balancing', (X_train, y_train)),
    ('smote', (X_train_sm, y_train_sm)),
    ('undersample', (X_train_rus, y_train_rus)),
    ('class-weighting', (X_train, y_train)),
])

#### Do Logistic Regression and Naive Bayes, and compare

##### Logistic Regression

In [17]:
# Train one LogisticRegression per balancing strategy, save it to disk, and
# score it on the shared test set. Results are keyed by strategy name.
log_reg_results = {}

# The loop variables are deliberately NOT called X_train / y_train: using those
# names as loop targets rebinds the notebook-level training split on every
# iteration, so any cell executed afterwards would silently pick up whichever
# dataset the loop visited last.
for name, (X_fit, y_fit) in tfidf_datasets.items():
    # Only the 'class-weighting' strategy rebalances inside the estimator;
    # class_weight=None is the estimator's default (no reweighting).
    class_weight = 'balanced' if name == 'class-weighting' else None
    model = LogisticRegression(max_iter=1000, class_weight=class_weight)

    # fit the model
    model.fit(X_fit, y_fit)

    # save the model under a strategy-specific file name
    log_reg_name = f"log_reg_{name}_model.joblib"
    joblib.dump(model, log_reg_name)

    y_pred = model.predict(X_test)

    # output_dict=True returns the report as a nested dict instead of text,
    # which is turned into a DataFrame in the comparison section.
    log_reg_results[name] = classification_report(y_test, y_pred, output_dict=True)

##### Naive Bayes

In [25]:
# Train one BernoulliNB per balancing strategy, save it to disk, and score it
# on the shared test set. Results are keyed by strategy name.
nb_results = {}

# The loop variables are deliberately NOT called X_train / y_train: using those
# names as loop targets rebinds the notebook-level training split on every
# iteration, so any cell executed afterwards would silently pick up whichever
# dataset the loop visited last.
for name, (X_fit, y_fit) in tfidf_datasets.items():
    # BernoulliNB takes no class_weight argument, so the 'class-weighting'
    # strategy is expressed as balanced per-sample weights passed to fit().
    if name == 'class-weighting':
        fit_kwargs = {
            'sample_weight': compute_sample_weight(class_weight='balanced', y=y_fit)
        }
    else:
        fit_kwargs = {}

    model = BernoulliNB()

    # fit the model
    model.fit(X_fit, y_fit, **fit_kwargs)

    # save the model under a strategy-specific file name
    nb_name = f"nb_{name}_model.joblib"
    joblib.dump(model, nb_name)

    y_pred = model.predict(X_test)

    # output_dict=True returns the report as a nested dict instead of text,
    # which is turned into a DataFrame in the comparison section.
    nb_results[name] = classification_report(y_test, y_pred, output_dict=True)

#### Compare models

In [21]:
# Turn each classification-report dict into a DataFrame, transposed so that
# rows are classes/aggregates and columns are the metrics.
df_log_reg_reports = {
    name: pd.DataFrame(result).T
    for name, result in log_reg_results.items()
}

In [23]:
# Print every Logistic Regression report under its strategy name.
for strategy, report_df in df_log_reg_reports.items():
    print(f"{strategy} :\n", report_df)

no-balancing :
               precision    recall  f1-score      support
0              0.980502  1.000000  0.990155  2615.000000
1              1.000000  0.896000  0.945148   500.000000
accuracy       0.983307  0.983307  0.983307     0.983307
macro avg      0.990251  0.948000  0.967651  3115.000000
weighted avg   0.983632  0.983307  0.982931  3115.000000
smote :
               precision    recall  f1-score      support
0              0.992770  0.997706  0.995232  2615.000000
1              0.987680  0.962000  0.974671   500.000000
accuracy       0.991974  0.991974  0.991974     0.991974
macro avg      0.990225  0.979853  0.984951  3115.000000
weighted avg   0.991953  0.991974  0.991931  3115.000000
undersample :
               precision    recall  f1-score      support
0              0.991593  0.992352  0.991972  2615.000000
1              0.959839  0.956000  0.957916   500.000000
accuracy       0.986517  0.986517  0.986517     0.986517
macro avg      0.975716  0.974176  0.974944  311

In [26]:
# Turn each Naive Bayes classification-report dict into a DataFrame, transposed
# so that rows are classes/aggregates and columns are the metrics.
df_nb_reports = {
    name: pd.DataFrame(result).T
    for name, result in nb_results.items()
}

# Print every Naive Bayes report under its strategy name.
for strategy, report_df in df_nb_reports.items():
    print(f"{strategy} :\n", report_df)

no-balancing :
               precision    recall  f1-score      support
0              0.994257  0.993117  0.993687  2615.000000
1              0.964215  0.970000  0.967099   500.000000
accuracy       0.989406  0.989406  0.989406     0.989406
macro avg      0.979236  0.981558  0.980393  3115.000000
weighted avg   0.989435  0.989406  0.989419  3115.000000
smote :
               precision    recall  f1-score      support
0              0.993529  0.998088  0.995803  2615.000000
1              0.989754  0.966000  0.977733   500.000000
accuracy       0.992937  0.992937  0.992937     0.992937
macro avg      0.991641  0.982044  0.986768  3115.000000
weighted avg   0.992923  0.992937  0.992903  3115.000000
undersample :
               precision    recall  f1-score      support
0              0.996810  0.956023  0.975991  2615.000000
1              0.810544  0.984000  0.888889   500.000000
accuracy       0.960514  0.960514  0.960514     0.960514
macro avg      0.903677  0.970011  0.932440  311

##### SMOTE performed the best with both models. Although Naive Bayes with SMOTE performed the best out of all the models, and Naive Bayes also won without balancing, Logistic Regression pulls ahead with undersampling and class-weighting. Therefore, the top models to be used are log_reg_smote_model, nb_smote_model, and log_reg_class-weighting_model.