In [1]:
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import scale
from sklearn.svm import SVC

%matplotlib inline
warnings.filterwarnings('ignore')


In [2]:
# Load the e-mail dataset from the CSV file, resolved relative to the
# notebook's working directory.
df = pd.read_csv('emails.csv')

# Preview the first five rows with the notebook's rich table rendering.
print("First 5 rows of the dataset:")
display(df.head(n=5))

First 5 rows of the dataset:


Unnamed: 0,Email No.,the,to,ect,and,for,of,a,you,hou,...,connevey,jay,valued,lay,infrastructure,military,allowing,ff,dry,Prediction
0,Email 1,0,0,1,0,0,0,2,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Email 2,8,13,24,6,6,2,102,1,27,...,0,0,0,0,0,0,0,1,0,0
2,Email 3,0,0,1,0,0,0,8,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Email 4,0,5,22,0,5,1,51,2,10,...,0,0,0,0,0,0,0,0,0,0
4,Email 5,7,6,17,1,5,2,57,0,9,...,0,0,0,0,0,0,0,1,0,0


In [3]:
# Data inspection (nothing is modified in this cell):
#   1. list the column labels of the loaded frame;
#   2. count the missing values in every column.
print("\nColumn names:\n", df.columns)
print("\nMissing values in each column:\n", df.isna().sum())


Column names:
 Index(['Email No.', 'the', 'to', 'ect', 'and', 'for', 'of', 'a', 'you', 'hou',
       ...
       'connevey', 'jay', 'valued', 'lay', 'infrastructure', 'military',
       'allowing', 'ff', 'dry', 'Prediction'],
      dtype='object', length=3002)

Missing values in each column:
 Email No.     0
the           0
to            0
ect           0
and           0
             ..
military      0
allowing      0
ff            0
dry           0
Prediction    0
Length: 3002, dtype: int64


In [4]:
# Drop missing values and unnecessary column.
#
# The cleaning is written as a non-mutating chain that rebinds `df`, so the
# cell is safe to re-run: the previous in-place version raised KeyError on a
# second execution because 'Email No.' had already been removed.
#   - dropna() discards every row that contains a missing value.
#   - 'Email No.' holds per-row labels such as "Email 1" and is removed so it
#     does not end up among the features; errors='ignore' turns the drop into
#     a no-op when the column is already gone.
df = (
    df
    .dropna()
    .drop(columns=['Email No.'], errors='ignore')
)

In [5]:
# Separate the target from the predictors:
#   y -> the 'Prediction' column (the label the classifiers learn);
#   X -> every other column of df.
y = df['Prediction']
X = df.drop(columns=['Prediction'])

In [6]:
# Scale the features
X = scale(X)

In [7]:
# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [8]:
# K-Nearest Neighbors Classifier
# Build a 7-neighbour model and fit it on the training split in one step
# (fit() returns the estimator itself), then predict the held-out rows.
knn = KNeighborsClassifier(n_neighbors=7).fit(X_train, y_train)
y_pred_knn = knn.predict(X_test)

# Report the accuracy and the confusion matrix on the test split.
print("KNN Accuracy:", metrics.accuracy_score(y_true=y_test, y_pred=y_pred_knn))
print("KNN Confusion Matrix:\n", metrics.confusion_matrix(y_true=y_test, y_pred=y_pred_knn))

KNN Accuracy: 0.8009020618556701
KNN Confusion Matrix:
 [[804 293]
 [ 16 439]]


In [9]:
# Support Vector Machine Classifier
# Build an SVC with regularisation strength C=1 and fit it on the training
# split in one step (fit() returns the estimator itself), then predict the
# held-out rows.
svm_model = SVC(C=1).fit(X_train, y_train)
y_pred_svm = svm_model.predict(X_test)

# Report the accuracy and the confusion matrix on the test split.
print("SVM Accuracy:", metrics.accuracy_score(y_true=y_test, y_pred=y_pred_svm))
print("SVM Confusion Matrix:\n", metrics.confusion_matrix(y_true=y_test, y_pred=y_pred_svm))

SVM Accuracy: 0.9381443298969072
SVM Confusion Matrix:
 [[1091    6]
 [  90  365]]
