Classify the email using the binary classification method. Email Spam detection has two states: a) Normal State – Not Spam, b) Abnormal State – Spam. Use K-Nearest Neighbors and Support Vector Machine for classification. Analyze their performance.
Dataset link: The emails.csv dataset on the Kaggle https://www.kaggle.com/datasets/balaka18/email-spam-classification-dataset-csv <br>
1 : spam
0 : not spam

In [18]:
import pandas as pd
import numpy  as np
import seaborn as sns
import matplotlib.pyplot as plt

In [19]:
df = pd.read_csv('data/emails.csv')

if ~df.empty == False:
    print("Loaded!")

In [20]:
df.shape

(5172, 3002)

In [21]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5172 entries, 0 to 5171
Columns: 3002 entries, Email No. to Prediction
dtypes: int64(3001), object(1)
memory usage: 118.5+ MB


In [22]:
df.columns

Index(['Email No.', 'the', 'to', 'ect', 'and', 'for', 'of', 'a', 'you', 'hou',
       ...
       'connevey', 'jay', 'valued', 'lay', 'infrastructure', 'military',
       'allowing', 'ff', 'dry', 'Prediction'],
      dtype='object', length=3002)

In [23]:
df.head(5)

Unnamed: 0,Email No.,the,to,ect,and,for,of,a,you,hou,...,connevey,jay,valued,lay,infrastructure,military,allowing,ff,dry,Prediction
0,Email 1,0,0,1,0,0,0,2,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Email 2,8,13,24,6,6,2,102,1,27,...,0,0,0,0,0,0,0,1,0,0
2,Email 3,0,0,1,0,0,0,8,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Email 4,0,5,22,0,5,1,51,2,10,...,0,0,0,0,0,0,0,0,0,0
4,Email 5,7,6,17,1,5,2,57,0,9,...,0,0,0,0,0,0,0,1,0,0


In [24]:
df.drop(columns = "Email No.",inplace=True)

In [25]:
df.head(3)

Unnamed: 0,the,to,ect,and,for,of,a,you,hou,in,...,connevey,jay,valued,lay,infrastructure,military,allowing,ff,dry,Prediction
0,0,0,1,0,0,0,2,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,8,13,24,6,6,2,102,1,27,18,...,0,0,0,0,0,0,0,1,0,0
2,0,0,1,0,0,0,8,0,0,4,...,0,0,0,0,0,0,0,0,0,0


In [28]:
df.isnull().sum().sum()

0

In [44]:
d = dict(df.sum(axis = 0))
d

{'the': 34345,
 'to': 32005,
 'ect': 26604,
 'and': 15907,
 'for': 16161,
 'of': 13587,
 'a': 287136,
 'you': 12757,
 'hou': 10470,
 'in': 54824,
 'on': 56560,
 'is': 27857,
 'this': 7182,
 'enron': 6906,
 'i': 237177,
 'be': 16702,
 'that': 4781,
 'will': 4401,
 'have': 4162,
 'with': 4860,
 'your': 4212,
 'at': 35854,
 'we': 10234,
 's': 216251,
 'are': 7285,
 'it': 23280,
 'by': 3400,
 'com': 9140,
 'as': 24864,
 'from': 4210,
 'gas': 3193,
 'or': 40078,
 'not': 4335,
 'me': 27579,
 'deal': 3799,
 'if': 6337,
 'meter': 2783,
 'hpl': 3285,
 'please': 3247,
 're': 47112,
 'e': 438561,
 'any': 4098,
 'our': 7895,
 'corp': 2136,
 'can': 3000,
 'd': 125875,
 'all': 6757,
 'has': 2531,
 'was': 1878,
 'know': 1758,
 'need': 2119,
 'an': 49889,
 'forwarded': 1297,
 'new': 2111,
 't': 312791,
 'may': 1492,
 'up': 4307,
 'j': 19556,
 'mmbtu': 1421,
 'should': 1329,
 'do': 6642,
 'am': 9870,
 'get': 1939,
 'out': 3319,
 'see': 1605,
 'no': 15403,
 'there': 1476,
 'price': 1669,
 'daren': 1904,

In [55]:
X = df.iloc[:,:-1]
y = df.iloc[:,-1]

0       0
1       0
2       0
3       0
4       0
       ..
5167    0
5168    0
5169    1
5170    1
5171    0
Name: Prediction, Length: 5172, dtype: int64

In [57]:
X.shape,y.shape

((5172, 3000), (5172,))

In [67]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

In [61]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 0.2,random_state = 42)

In [70]:
neigh = KNeighborsClassifier(n_neighbors=2)
svmModel = SVC(gamma = 'auto')

In [71]:
knn = neigh.fit(X_train,y_train)
svm = svmModel.fit(X_train,y_train)

In [73]:
print("Train")
print(f"KNN mean accuracy : {knn.score(X_train,y_train)}")
print(f"SVM mean accuracy : {svm.score(X_train,y_train)}")

Train
KNN mean accuracy : 0.9591491418902587
SVM mean accuracy : 0.9632583998066232


In [77]:
print("Test")
print(f"KNN mean accuracy : {round(knn.score(X_test,y_test),4)}")
print(f"SVM mean accuracy : {round(svm.score(X_test,y_test),4)}")

Test
KNN mean accuracy : 0.8744
SVM mean accuracy : 0.9121


In [103]:
preds = svm.predict(X_test)

In [104]:
predictions = np.array(["spam" if i else "Not spam" for i in preds])

In [105]:
predictions

array(['Not spam', 'Not spam', 'spam', ..., 'Not spam', 'spam',
       'Not spam'], dtype='<U8')