In [None]:
# Classify the email using the binary classification method. 
# Email Spam detection has two states: 
# a) Normal State – Not Spam, b) Abnormal State – Spam. 
# Use K-Nearest Neighbors and Support Vector Machine for classification. 
# Analyze their performance.

In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, classification_report
from sklearn.pipeline import make_pipeline
import pandas as pd

In [2]:
df = pd.read_csv("emails.csv")
df

Unnamed: 0,Email No.,the,to,ect,and,for,of,a,you,hou,...,connevey,jay,valued,lay,infrastructure,military,allowing,ff,dry,Prediction
0,Email 1,0,0,1,0,0,0,2,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Email 2,8,13,24,6,6,2,102,1,27,...,0,0,0,0,0,0,0,1,0,0
2,Email 3,0,0,1,0,0,0,8,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Email 4,0,5,22,0,5,1,51,2,10,...,0,0,0,0,0,0,0,0,0,0
4,Email 5,7,6,17,1,5,2,57,0,9,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5167,Email 5168,2,2,2,3,0,0,32,0,0,...,0,0,0,0,0,0,0,0,0,0
5168,Email 5169,35,27,11,2,6,5,151,4,3,...,0,0,0,0,0,0,0,1,0,0
5169,Email 5170,0,0,1,1,0,0,11,0,0,...,0,0,0,0,0,0,0,0,0,1
5170,Email 5171,2,7,1,0,2,1,28,2,0,...,0,0,0,0,0,0,0,1,0,1


In [4]:
X = df.drop(columns=["Email No.", "Prediction"])
y = df['Prediction']

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [6]:
# KNN Model
knn = KNeighborsClassifier(n_neighbors=5, metric='cosine')
knn.fit(X_train, y_train)
y_pred_knn = knn.predict(X_test)

In [8]:
# SVM Model
svm = make_pipeline(SVC(kernel='linear', C=1, probability=True))
svm.fit(X_train, y_train)
y_pred_svm = svm.predict(X_test)

In [9]:
# Performance Analysis
def evaluate(y_true, y_pred, model_name):
    print(f"Evaluation for {model_name}")
    print("Accuracy:", accuracy_score(y_true, y_pred))
    print("Precision:", precision_score(y_true, y_pred))
    print("Recall:", recall_score(y_true, y_pred))
    print("F1 Score:", f1_score(y_true, y_pred))
    print("Classification Report:\n", classification_report(y_true, y_pred))
    print("\n")

evaluate(y_test, y_pred_knn, "K-Nearest Neighbors")
evaluate(y_test, y_pred_svm, "Support Vector Machine")

Evaluation for K-Nearest Neighbors
Accuracy: 0.8820876288659794
Precision: 0.7869198312236287
Recall: 0.8197802197802198
F1 Score: 0.8030139935414424
Classification Report:
               precision    recall  f1-score   support

           0       0.92      0.91      0.92      1097
           1       0.79      0.82      0.80       455

    accuracy                           0.88      1552
   macro avg       0.86      0.86      0.86      1552
weighted avg       0.88      0.88      0.88      1552



Evaluation for Support Vector Machine
Accuracy: 0.9568298969072165
Precision: 0.9235807860262009
Recall: 0.9296703296703297
F1 Score: 0.9266155531215772
Classification Report:
               precision    recall  f1-score   support

           0       0.97      0.97      0.97      1097
           1       0.92      0.93      0.93       455

    accuracy                           0.96      1552
   macro avg       0.95      0.95      0.95      1552
weighted avg       0.96      0.96      0.96     