In [268]:
# External imports
import math
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.naive_bayes import GaussianNB

# Read the spam-classification CSV (path is relative to the working directory)
dataset_path = "shuffle_email_spam_classification.csv"
data = pd.read_csv(dataset_path)

# Show the loaded frame as the cell output
data

Unnamed: 0,Email No.,the,to,ect,and,for,of,a,you,hou,...,connevey,jay,valued,lay,infrastructure,military,allowing,ff,dry,Prediction
0,Email 1,0,0,1,0,0,0,2,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Email 2,8,13,24,6,6,2,102,1,27,...,0,0,0,0,0,0,0,1,0,0
2,Email 3,0,0,1,0,0,0,8,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Email 4,0,5,22,0,5,1,51,2,10,...,0,0,0,0,0,0,0,0,0,0
4,Email 5,7,6,17,1,5,2,57,0,9,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5167,Email 5168,2,2,2,3,0,0,32,0,0,...,0,0,0,0,0,0,0,0,0,0
5168,Email 5169,35,27,11,2,6,5,151,4,3,...,0,0,0,0,0,0,0,1,0,0
5169,Email 5170,0,0,1,1,0,0,11,0,0,...,0,0,0,0,0,0,0,0,0,1
5170,Email 5171,2,7,1,0,2,1,28,2,0,...,0,0,0,0,0,0,0,1,0,1


In [269]:
# Print a summary of the frame: index range, number of columns, dtype counts and memory usage
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5172 entries, 0 to 5171
Columns: 3002 entries, Email No. to Prediction
dtypes: int64(3001), object(1)
memory usage: 118.5+ MB


In [270]:
# Preprocessing

# Inspect the dtype of every column to spot any non-numeric ones
data.dtypes

Email No.     object
the            int64
to             int64
ect            int64
and            int64
               ...  
military       int64
allowing       int64
ff             int64
dry            int64
Prediction     int64
Length: 3002, dtype: object

In [271]:
# Remove the 'Email No.' label column ('Email 1', 'Email 2', ...); it is the
# only non-numeric column and is not used as a feature.
# Rebinding `data` (instead of inplace=True) together with errors='ignore'
# keeps this cell idempotent: re-running it after the column is already gone
# no longer raises KeyError.
data = data.drop(columns=['Email No.'], errors='ignore')

# Visualize the updated data
data

Unnamed: 0,the,to,ect,and,for,of,a,you,hou,in,...,connevey,jay,valued,lay,infrastructure,military,allowing,ff,dry,Prediction
0,0,0,1,0,0,0,2,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,8,13,24,6,6,2,102,1,27,18,...,0,0,0,0,0,0,0,1,0,0
2,0,0,1,0,0,0,8,0,0,4,...,0,0,0,0,0,0,0,0,0,0
3,0,5,22,0,5,1,51,2,10,1,...,0,0,0,0,0,0,0,0,0,0
4,7,6,17,1,5,2,57,0,9,3,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5167,2,2,2,3,0,0,32,0,0,5,...,0,0,0,0,0,0,0,0,0,0
5168,35,27,11,2,6,5,151,4,3,23,...,0,0,0,0,0,0,0,1,0,0
5169,0,0,1,1,0,0,11,0,0,1,...,0,0,0,0,0,0,0,0,0,1
5170,2,7,1,0,2,1,28,2,0,8,...,0,0,0,0,0,0,0,1,0,1


In [272]:
# Create helping variables
# Fraction of the rows (taken from the top of the frame) reserved for training
TRAIN_FRACTION = 0.75

# Take both dimensions from the shape; `cols` is printed below and must be
# assigned here, otherwise the cell raises NameError on a fresh kernel.
rows, cols = data.shape

# Index of the first test row: rows [0, threshold) train, [threshold, rows) test
threshold = math.floor(TRAIN_FRACTION*rows)

# Visualize values
print(f'There are {rows} rows.\n')
print(f'There are {cols} columns.\n')
print(f'Threshold is: {threshold}')

There are 5172 rows.

There are 3001 columns.

Threshold is: 3879


In [273]:
# Training split: the first `threshold` rows. Every column except the last is
# a feature; the last column ('Prediction') is the label.
X, Y = data.iloc[:threshold, :-1], data.iloc[:threshold, -1]

# Features and labels must cover the same number of rows
assert len(X) == len(Y)

# Preview the first five rows of each for verification
print("X data:\n", X.head())
print("Y data:\n", Y.head())

X data:
    the  to  ect  and  for  of    a  you  hou  in  ...  enhancements  connevey  \
0    0   0    1    0    0   0    2    0    0   0  ...             0         0   
1    8  13   24    6    6   2  102    1   27  18  ...             0         0   
2    0   0    1    0    0   0    8    0    0   4  ...             0         0   
3    0   5   22    0    5   1   51    2   10   1  ...             0         0   
4    7   6   17    1    5   2   57    0    9   3  ...             0         0   

   jay  valued  lay  infrastructure  military  allowing  ff  dry  
0    0       0    0               0         0         0   0    0  
1    0       0    0               0         0         0   1    0  
2    0       0    0               0         0         0   0    0  
3    0       0    0               0         0         0   0    0  
4    0       0    0               0         0         0   1    0  

[5 rows x 3000 columns]
Y data:
 0    0
1    0
2    0
3    0
4    0
Name: Prediction, dtype: int64


In [274]:
# Testing split: rows from `threshold` up to `rows`. Every column except the
# last is a feature; the last column is the label.
X_test, Y_test = data.iloc[threshold:rows, :-1], data.iloc[threshold:rows, -1]

# Features and labels must cover the same number of rows
assert len(X_test) == len(Y_test)

# Preview the first five rows of each for verification
print("X Test data:\n", X_test.head())
print("Y Test data:\n", Y_test.head())

X Test data:
       the  to  ect  and  for  of   a  you  hou  in  ...  enhancements  \
3879    2   2    1    0    0   2   9    0    0   2  ...             0   
3880    1   1    1    0    0   0   7    0    0   1  ...             0   
3881    2   4    1    0    2   3  35    3    0   8  ...             0   
3882    3   3    1    2    2   1  30    1    0  10  ...             0   
3883    4   0    1    1    2   0  24    3    0   4  ...             0   

      connevey  jay  valued  lay  infrastructure  military  allowing  ff  dry  
3879         0    0       0    0               0         0         0   0    0  
3880         0    0       0    0               0         0         0   0    0  
3881         0    0       0    0               0         0         0   0    0  
3882         0    0       0    0               0         0         0   0    0  
3883         0    0       0    0               0         0         0   0    0  

[5 rows x 3000 columns]
Y Test data:
 3879    0
3880    1
3881    

In [275]:
# Classification algorithms
# Each model is built and fitted on the training split in one expression
# (`fit` returns the estimator itself), then asked to label the test split.

# 1) Logistic Regression
# max_iter is raised to 1000 because of a ConvergenceWarning
lr = LogisticRegression(solver='lbfgs', max_iter=1000).fit(X, Y)
lr_predictions = lr.predict(X_test)

# 2) Naive Bayes
nb = GaussianNB().fit(X, Y)
nb_predictions = nb.predict(X_test)

In [282]:
# Create common function for results visualization
def show_stats(predictions, actual=None):
    """Print the classification report and the overall accuracy of a model.

    Parameters
    ----------
    predictions : array-like
        Predicted labels, forwarded to ``classification_report``.
    actual : array-like, optional
        True labels. When omitted, the notebook-level ``Y_test`` is looked up
        at call time, so re-running the split cell is picked up without
        having to re-define this function.

    Returns
    -------
    None
        Everything is printed; nothing is returned.
    """
    # Resolve the default lazily: a default of ``Y_test`` in the signature
    # would be frozen at definition time and go stale if the split changes.
    if actual is None:
        actual = Y_test

    # Stats Summary (text table)
    stats = classification_report(actual, predictions, output_dict=False)

    # Visualize all the stats
    print(stats)

    # Accuracy, read from the dict form of the same report
    accuracy = classification_report(actual, predictions, output_dict=True)['accuracy']

    # Visualize Accuracy in percentage (%), rounded to the nearest integer.
    # Rounding (rather than always rounding up) avoids overstating the score,
    # e.g. 94.01 % is shown as 94 %, not 95 %.
    print(f"> Accuracy: {round(accuracy*100)} %")

In [277]:
# Check the model results
# Build one predictions-vs-actual table per model; the rows take the index of Y_test
lr_table = pd.DataFrame({'Predictions': lr_predictions, 'Actual': Y_test})
nb_table = pd.DataFrame({'Predictions': nb_predictions, 'Actual': Y_test})

# Print each table under its heading, Logistic Regression first
for heading, table in (
    ("[Logistic Regression]\n\nPredictions Vs. Actual:\n", lr_table),
    ("[Naive Bayes]\n\nPredictions Vs. Actual:\n", nb_table),
):
    print(heading)
    print(table)

[Logistic Regression]

Predictions Vs. Actual:

      Predictions  Actual
3879            0       0
3880            1       1
3881            0       0
3882            1       1
3883            0       0
...           ...     ...
5167            0       0
5168            0       0
5169            1       1
5170            1       1
5171            0       0

[1293 rows x 2 columns]
[Naive Bayes]

Predictions Vs. Actual:

      Predictions  Actual
3879            0       0
3880            1       1
3881            0       0
3882            1       1
3883            0       0
...           ...     ...
5167            0       0
5168            0       0
5169            1       1
5170            1       1
5171            0       0

[1293 rows x 2 columns]


In [278]:
# Print the classification report and accuracy for the Logistic Regression predictions
show_stats(lr_predictions)

              precision    recall  f1-score   support

           0       0.97      0.95      0.96       889
           1       0.89      0.94      0.91       404

    accuracy                           0.95      1293
   macro avg       0.93      0.94      0.94      1293
weighted avg       0.95      0.95      0.95      1293

> Accuracy: 95 %


In [279]:
# Print the classification report and accuracy for the Naive Bayes predictions
show_stats(nb_predictions)

              precision    recall  f1-score   support

           0       0.98      0.93      0.95       889
           1       0.86      0.96      0.91       404

    accuracy                           0.94      1293
   macro avg       0.92      0.94      0.93      1293
weighted avg       0.94      0.94      0.94      1293

> Accuracy: 94 %
