In [4]:
# ---------------------- SUPPORT VECTOR MACHINE (SVM - SVC, Simple Explanation) ----------------------
# SVM is a supervised classification algorithm.
# It tries to find the best boundary (hyperplane) that separates classes.
# SVC (Support Vector Classifier) supports several kernels: a linear kernel for
# data that a flat boundary can separate, and non-linear ones (e.g. RBF) for
# data that is not linearly separable.

# ---------------------- 1. IMPORT REQUIRED LIBRARIES ----------------------

import re                                    # Regex tokenisation of raw email text
from collections import Counter              # Count word occurrences in one pass

import pandas as pd                          # For loading and handling datasets
import numpy as np                           # For numerical computations

import matplotlib.pyplot as plt              # For plotting
import seaborn as sns                        # For nicer visualizations

from sklearn.model_selection import train_test_split   # Split into train/test sets
from sklearn.preprocessing import StandardScaler       # Scale features for SVM
from sklearn.svm import SVC                            # SVM classifier
from sklearn.metrics import (
    accuracy_score,                          # Overall accuracy
    confusion_matrix,                        # Correct vs incorrect predictions
    classification_report                    # Precision, recall, f1-score
)


In [5]:
# ---------------------- 2. LOAD THE DATASET ----------------------
# Source: Kaggle "Email Spam Classification" dataset
# https://www.kaggle.com/datasets/balaka18/email-spam-classification-dataset-csv
# The file is read via a relative path, so download "emails.csv" from the link
# above into the notebook's working directory before running this cell.

df = pd.read_csv(filepath_or_buffer="emails.csv")   # Raw dataset as a DataFrame
df.head(n=5)                                        # Preview the first 5 rows


Unnamed: 0,Email No.,the,to,ect,and,for,of,a,you,hou,...,connevey,jay,valued,lay,infrastructure,military,allowing,ff,dry,Prediction
0,Email 1,0,0,1,0,0,0,2,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Email 2,8,13,24,6,6,2,102,1,27,...,0,0,0,0,0,0,0,1,0,0
2,Email 3,0,0,1,0,0,0,8,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Email 4,0,5,22,0,5,1,51,2,10,...,0,0,0,0,0,0,0,0,0,0
4,Email 5,7,6,17,1,5,2,57,0,9,...,0,0,0,0,0,0,0,1,0,0


In [6]:
# ---------------------- 3. CHECK CLASS DISTRIBUTION ----------------------
# The class label (spam / ham) is stored in the LAST column of df.
# Counting its values shows how many emails fall into each class, i.e. how
# balanced the dataset is.

label_column = df.iloc[:, -1]                 # Last column = class label
label_counts = label_column.value_counts()    # Number of emails per class
print(label_counts)


Prediction
0    3672
1    1500
Name: count, dtype: int64


In [7]:
# ---------------------- 4. SELECT FEATURES AND TARGET ----------------------
# Column layout of df: [email ID, <feature columns ...>, label]
# X = everything between the first (ID) column and the last (label) column
# y = the last column → spam (1) or ham (0)

label_position = df.shape[1] - 1        # Positional index of the last column

X = df.iloc[:, 1:label_position]        # Features (word-count columns)
y = df.iloc[:, label_position]          # Target labels


In [9]:
# ---------------------- 5. SPLIT DATA INTO TRAIN & TEST SETS ----------------------
# Hold out 20% of the emails for testing and train on the remaining 80%.
# Stratifying on y keeps the spam/ham proportion the same in both parts, and
# the fixed random_state makes the split repeatable.

split_options = {
    "test_size": 0.2,      # 20% test set
    "random_state": 42,    # Reproducible split
    "stratify": y,         # Maintain class balance
}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)


In [10]:
# ---------------------- 6. TRAIN SVM CLASSIFIER ----------------------
# A linear kernel makes the SVM look for a flat separating boundary
# (a hyperplane) between spam and ham in the high-dimensional feature space.
# NOTE: the features are passed in unscaled; the StandardScaler imported in
# the imports cell is not applied here.

svm_clf = SVC(kernel="linear")           # Support vector classifier, linear kernel
svm_clf.fit(X=X_train, y=y_train)        # Learn the boundary from the training split


In [12]:
# ---------------------- 7. PREDICT & CHECK ACCURACY ----------------------
# svm_clf.predict() returns spam (1) or ham (0) for each test email.
# accuracy_score compares those predictions with the actual labels in y_test.

y_pred = svm_clf.predict(X_test)              # Predict on test data
accuracy = accuracy_score(y_test, y_pred)     # Calculate accuracy

# Use ".3f" with no leading space: a space in that position is the "sign"
# flag, which pads positive numbers and printed two spaces before the value.
print(f"Accuracy is {accuracy:.3f}")


Accuracy is  0.967


In [13]:
# ---------------------- 8. CONFUSION MATRIX ----------------------
# Counts of test emails per (actual class, predicted class) pair.
# Rows are the actual labels and columns the predicted labels, so the diagonal
# holds the correctly classified emails.

cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print("Confusion Matrix:\n", cm)


Confusion Matrix:
 [[716  19]
 [ 15 285]]


In [14]:
# ---------------------- 9. CLASSIFICATION REPORT ----------------------
# Per-class precision, recall, f1-score and support.
# Display names are given in label order: 0 → "Not Spam", 1 → "Spam".

class_names = ["Not Spam", "Spam"]

print("Classification Report:\n")
report = classification_report(y_test, y_pred, target_names=class_names)
print(report)


Classification Report:

              precision    recall  f1-score   support

    Not Spam       0.98      0.97      0.98       735
        Spam       0.94      0.95      0.94       300

    accuracy                           0.97      1035
   macro avg       0.96      0.96      0.96      1035
weighted avg       0.97      0.97      0.97      1035



In [15]:
# ---------------------- 10. EXAMPLE SPAM EMAIL FOR INFERENCE ----------------------
# A hand-written sample email in the style of a phishing/spam message
# (an urgent "account suspended" notice with a verification link).
# It is only raw text at this point: it still has to be converted into the
# same feature columns the model was trained on before it can be classified.

spam_email_text = """
Dear Customer,

Your account has been TEMPORARILY SUSPENDED due to unusual login activity.  
To restore access, please verify your identity immediately by clicking the secure link below:

https://verify-secure-account-login.com/restore

If you do not complete the verification within 24 hours, your account will be permanently deactivated.  
This is an automated message. Please do not reply.

Thank you,
Account Security Team
"""


In [17]:
# ---------------------- 11. CONVERT NEW EMAIL INTO FEATURE VECTOR ----------------------
# The SVM model expects the SAME features (columns, in the same order) as the
# training data, so raw text must be turned into a one-row table of word
# counts over the training vocabulary before it can be classified.
# (`re` and `Counter` are imported in the imports cell at the top.)


def tokenize_email(text):
    """Return the lowercase, purely alphabetic words found in ``text``.

    Digits and punctuation act as separators, so "account-login.com" yields
    ["account", "login", "com"].
    """
    return re.findall(r'\b[a-z]+\b', text.lower())


def email_to_feature_vector(text, vocab):
    """Convert raw email text into a one-row word-count DataFrame.

    Parameters
    ----------
    text : str
        Raw email body.
    vocab : iterable of str
        Feature names in the column order the model was trained on
        (e.g. ``X.columns``).

    Returns
    -------
    pandas.DataFrame
        A single row (index 0) with one column per entry of ``vocab``; each
        value is the number of times that word occurs in ``text``. Words of
        the email that are not in ``vocab`` are ignored.
    """
    # Count every word once up front, then build the row in a single step
    # instead of updating DataFrame columns one word at a time.
    word_counts = Counter(tokenize_email(text))
    return pd.DataFrame([[word_counts[name] for name in vocab]], columns=vocab)


vocab = X.columns                                        # Training vocabulary (feature names)

words = tokenize_email(spam_email_text)                  # Alphabetic words of the sample email

new_email_vector = email_to_feature_vector(spam_email_text, vocab)

# Sanity check: exactly one row, one column per training feature.
assert new_email_vector.shape == (1, len(vocab))


In [18]:
# ---------------------- 12. PREDICT SPAM OR NOT SPAM ----------------------
# Classify the one-row feature table with the trained SVM. predict() returns
# one label per row, so the first element is the label for this email.

predicted_labels = svm_clf.predict(new_email_vector)
new_pred = predicted_labels[0]

print("Predicted label for the new email:", new_pred)
print("Meaning: 0 = Not Spam (Ham), 1 = Spam")


Predicted label for the new email: 1
Meaning: 0 = Not Spam (Ham), 1 = Spam
