In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, classification_report

# Step 1: Load the dataset
# Replace 'email.csv' with the path to your dataset file
df = pd.read_csv('./emails.csv')
df

Unnamed: 0,Email No.,the,to,ect,and,for,of,a,you,hou,...,connevey,jay,valued,lay,infrastructure,military,allowing,ff,dry,Prediction
0,Email 1,0,0,1,0,0,0,2,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Email 2,8,13,24,6,6,2,102,1,27,...,0,0,0,0,0,0,0,1,0,0
2,Email 3,0,0,1,0,0,0,8,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Email 4,0,5,22,0,5,1,51,2,10,...,0,0,0,0,0,0,0,0,0,0
4,Email 5,7,6,17,1,5,2,57,0,9,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5167,Email 5168,2,2,2,3,0,0,32,0,0,...,0,0,0,0,0,0,0,0,0,0
5168,Email 5169,35,27,11,2,6,5,151,4,3,...,0,0,0,0,0,0,0,1,0,0
5169,Email 5170,0,0,1,1,0,0,11,0,0,...,0,0,0,0,0,0,0,0,0,1
5170,Email 5171,2,7,1,0,2,1,28,2,0,...,0,0,0,0,0,0,0,1,0,1


In [3]:
# Step 2: Data Preprocessing
# Drop the 'Email No.' column as it is not a feature
df = df.drop(columns=['Email No.'])

# Separate features and labels
X = df.drop(columns=['Prediction'])  # Features
y = df['Prediction']  # Labels (0 for not spam, 1 for spam)
     


In [4]:
# Step 3: Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [5]:
# Step 4: Train K-Nearest Neighbors Classifier
knn = KNeighborsClassifier(n_neighbors=5)  # You can tune this parameter
knn.fit(X_train, y_train)

# Predict and evaluate KNN model
y_pred_knn = knn.predict(X_test)
knn_accuracy = accuracy_score(y_test, y_pred_knn)
knn_precision = precision_score(y_test, y_pred_knn)
knn_recall = recall_score(y_test, y_pred_knn)
knn_f1 = f1_score(y_test, y_pred_knn)
knn_auc = roc_auc_score(y_test, y_pred_knn)

print("K-Nearest Neighbors Model Performance:")
print(f"Accuracy: {knn_accuracy:.4f}")
print(f"Precision: {knn_precision:.4f}")
print(f"Recall: {knn_recall:.4f}")
print(f"F1 Score: {knn_f1:.4f}")
print(f"ROC-AUC Score: {knn_auc:.4f}")
print("\nClassification Report for KNN:\n", classification_report(y_test, y_pred_knn))

K-Nearest Neighbors Model Performance:
Accuracy: 0.8638
Precision: 0.7273
Recall: 0.8378
F1 Score: 0.7786
ROC-AUC Score: 0.8560

Classification Report for KNN:
               precision    recall  f1-score   support

           0       0.93      0.87      0.90       739
           1       0.73      0.84      0.78       296

    accuracy                           0.86      1035
   macro avg       0.83      0.86      0.84      1035
weighted avg       0.87      0.86      0.87      1035



In [6]:
# Step 5: Train Support Vector Machine Classifier
svm = SVC(kernel='linear', probability=True)  # Using a linear kernel
svm.fit(X_train, y_train)

# Predict and evaluate SVM model
y_pred_svm = svm.predict(X_test)
svm_accuracy = accuracy_score(y_test, y_pred_svm)
svm_precision = precision_score(y_test, y_pred_svm)
svm_recall = recall_score(y_test, y_pred_svm)
svm_f1 = f1_score(y_test, y_pred_svm)
svm_auc = roc_auc_score(y_test, y_pred_svm)

print("\nSupport Vector Machine Model Performance:")
print(f"Accuracy: {svm_accuracy:.4f}")
print(f"Precision: {svm_precision:.4f}")
print(f"Recall: {svm_recall:.4f}")
print(f"F1 Score: {svm_f1:.4f}")
print(f"ROC-AUC Score: {svm_auc:.4f}")
print("\nClassification Report for SVM:\n", classification_report(y_test, y_pred_svm))

# Analyzing Performance
if knn_accuracy > svm_accuracy:
    print("\nK-Nearest Neighbors performed better based on accuracy.")
else:
    print("\nSupport Vector Machine performed better based on accuracy.")



Support Vector Machine Model Performance:
Accuracy: 0.9594
Precision: 0.9205
Recall: 0.9392
F1 Score: 0.9298
ROC-AUC Score: 0.9534

Classification Report for SVM:
               precision    recall  f1-score   support

           0       0.98      0.97      0.97       739
           1       0.92      0.94      0.93       296

    accuracy                           0.96      1035
   macro avg       0.95      0.95      0.95      1035
weighted avg       0.96      0.96      0.96      1035


Support Vector Machine performed better based on accuracy.
