In [2]:
import pandas as pd
import re
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import os

In [3]:
# Location of the merged spam/ham CSV. The original machine-specific path is
# kept as the default, but it can be overridden without editing the notebook
# by setting the SPAM_DATASET_CSV environment variable before starting the kernel.
DATA_PATH = os.environ.get(
    "SPAM_DATASET_CSV",
    "F:\\Projects_spam_email_ML&URL_models\\Data\\SpamEmail_data\\final_merged_spam_dataset.csv",
)

# Raw dataset: one row per email.
data = pd.read_csv(DATA_PATH)

# Quick sanity check of what was loaded: size, first rows, column names.
print("Shape of dataset:", data.shape)
print(data.head())
print("\nColumns:", data.columns)

Shape of dataset: (19588, 2)
                                                text  label
0  Subject: enron methanol ; meter # : 988291\r\n...      0
1  Subject: hpl nom for january 9 , 2001\r\n( see...      0
2  Subject: neon retreat\r\nho ho ho , we ' re ar...      0
3  Subject: photoshop , windows , office . cheap ...      1
4  Subject: re : indian springs\r\nthis deal is t...      0

Columns: Index(['text', 'label'], dtype='object')


In [4]:
# Report missing values per column before any rows are removed.
print("Missing values in each column:\n", data.isnull().sum())

# Drop rows without text, keep the first occurrence of each distinct text,
# and renumber the index so it is contiguous again.
data = (
    data
    .dropna(subset=['text'])
    .drop_duplicates(subset=['text'])
    .reset_index(drop=True)
)

print("\nShape after cleaning:", data.shape)

Missing values in each column:
 text     0
label    0
dtype: int64

Shape after cleaning: (18893, 2)


In [5]:
# value_counts(normalize=True) yields per-class proportions; multiplying by 100
# converts them to percentages. Both items are printed on their own line.
print(
    "\nPercentage distribution:",
    data['label'].value_counts(normalize=True) * 100,
    sep="\n",
)


Percentage distribution:
label
0    73.593394
1    26.406606
Name: proportion, dtype: float64


In [6]:
"""
Step 7: Text Preprocessing Pipeline
Includes:
1. Cleaning
2. Tokenization
3. Stopword removal
4. POS-based lemmatization
"""

import nltk
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag

# Work on a copy so the preprocessing columns added below (clean_text, tokens,
# lemmatized_tokens, text_lemmatized) are not written into the frame that
# `data` currently refers to.
df = data.copy()

def clean_text(text):
    """Normalise a raw email into lowercase, letters-only, single-spaced text.

    Processing order:
      1. line breaks -> space
      2. URLs (``http...`` / ``www....``) -> the token ``url``
      3. email addresses -> the token ``email``
      4. HTML/XML tags -> space
      5. digits and every remaining non-letter character -> space
      6. lowercase, trim, collapse runs of whitespace

    Args:
        text: Raw message text. Any non-``str`` value is treated as empty.

    Returns:
        The cleaned string; ``""`` when ``text`` is not a string.
    """
    if not isinstance(text, str):
        return ""

    text = re.sub(r'\r\n|\n|\r', ' ', text)
    # Lowercasing only happens at the end, so the URL pattern must be matched
    # case-insensitively here; otherwise "HTTP://..." or "WWW...." would not
    # be replaced and would be split into ordinary word tokens.
    text = re.sub(r'http\S+|www\.\S+', ' url ', text, flags=re.IGNORECASE)
    text = re.sub(r'\S+@\S+', ' email ', text)
    text = re.sub(r'<[^>]+>', ' ', text)
    text = re.sub(r'\d+', ' ', text)
    # Keep ASCII letters and whitespace only.
    text = re.sub(r'[^a-zA-Z\s]', ' ', text)
    text = text.lower().strip()
    text = re.sub(r'\s+', ' ', text)

    return text

# astype(str) ensures clean_text always receives a string for every row.
df['clean_text'] = df['text'].astype(str).apply(clean_text)


# English stop-word set used by tokenize_remove_stopwords.
# On a fresh environment the NLTK 'stopwords' corpus may not be installed, in
# which case NLTK raises LookupError; fetch it once and retry so the notebook
# survives Restart & Run All.
try:
    STOP_WORDS = set(stopwords.words('english'))
except LookupError:
    nltk.download('stopwords')
    STOP_WORDS = set(stopwords.words('english'))

def tokenize_remove_stopwords(text):
    """Split cleaned text on whitespace and filter out uninformative tokens.

    A token is kept only if it is longer than one character and is not a
    member of the module-level ``STOP_WORDS`` set.

    Args:
        text: Whitespace-separated string (e.g. the output of ``clean_text``).

    Returns:
        List of the surviving tokens, in their original order.
    """
    kept = []
    for word in text.split():
        if len(word) > 1 and word not in STOP_WORDS:
            kept.append(word)
    return kept

# Token list per email: cleaned text split on whitespace, with stop words and
# single-character tokens removed.
df['tokens'] = df['clean_text'].apply(tokenize_remove_stopwords)


# Single shared lemmatizer instance, used by lemmatize_words below.
lemmatizer = WordNetLemmatizer()

def map_pos_tag(tag):
    """Translate a POS tag string into the matching WordNet POS constant.

    The decision is made on the tag's leading letter: ``J`` -> adjective,
    ``V`` -> verb, ``N`` -> noun, ``R`` -> adverb. Any other tag falls back
    to noun.

    Args:
        tag: POS tag string (e.g. as produced by ``nltk.pos_tag``).

    Returns:
        One of ``wordnet.ADJ``, ``wordnet.VERB``, ``wordnet.NOUN``, ``wordnet.ADV``.
    """
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, wn_pos in prefix_to_pos:
        if tag.startswith(prefix):
            return wn_pos
    # Unrecognised tags are treated as nouns.
    return wordnet.NOUN

def lemmatize_words(tokens):
    """Lemmatise a token list using each token's part of speech.

    The tokens are POS-tagged together with ``pos_tag``; every (word, tag)
    pair is then lemmatised with the shared ``lemmatizer``, passing the
    WordNet POS obtained from ``map_pos_tag``.

    Args:
        tokens: List of word tokens. An empty (falsy) value short-circuits.

    Returns:
        List of lemmas in the same order as ``tokens``; ``[]`` for empty input.
    """
    if not tokens:
        return []

    lemmas = []
    for word, tag in pos_tag(tokens):
        lemmas.append(lemmatizer.lemmatize(word, map_pos_tag(tag)))
    return lemmas

# Lemmatise each email's token list, then glue the lemmas back into one
# space-separated string, which is the form the vectoriser consumes.
df['lemmatized_tokens'] = df['tokens'].apply(lemmatize_words)
df['text_lemmatized'] = df['lemmatized_tokens'].apply(" ".join)


# Names from the expected output schema that are actually present in df.
# NOTE(review): this list is not consumed by any cell visible here; `data`
# below keeps every column of df. Confirm whether a column subset was intended.
required_columns = [
    name
    for name in ('ID', 'label', 'label_num', 'clean_text', 'text_lemmatized')
    if name in df.columns
]

# From here on `data` refers to the frame with the preprocessing columns.
data = df

In [7]:
from sklearn.model_selection import train_test_split

# 70/30 hold-out split of the lemmatised text and its labels.
# Stratifying on the label keeps the class ratio the same in both parts;
# the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    data['text_lemmatized'],
    data['label'],
    test_size=0.30,
    random_state=42,
    stratify=data['label'],
)

print(f"Training samples: {len(X_train)}")
print(f"Testing samples : {len(X_test)}")


Training samples: 13225
Testing samples : 5668


In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Unigram + bigram TF-IDF features. Terms must appear in at least 3 training
# documents, and the vocabulary is capped at 20,000 entries. No stop-word list
# is passed here because stop words were already removed during preprocessing.
tfidf_vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),
    min_df=3,
    max_features=20000,
    stop_words=None,
)

# The vocabulary and IDF weights are learned from the training split only;
# the test split is merely projected onto them.
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train.astype(str))
X_test_tfidf = tfidf_vectorizer.transform(X_test.astype(str))

print("Training vector shape:", X_train_tfidf.shape)
print("Testing vector shape :", X_test_tfidf.shape)
print("Vocabulary size      :", len(tfidf_vectorizer.vocabulary_))


Training vector shape: (13225, 20000)
Testing vector shape : (5668, 20000)
Vocabulary size      : 20000


In [9]:
from sklearn.svm import LinearSVC
from sklearn.model_selection import GridSearchCV, StratifiedKFold

# Base estimator; the generous iteration cap is there to let the solver converge.
svm_model = LinearSVC(random_state=42, max_iter=10000)

# 6 values of C x 2 class-weight settings = 12 candidates.
param_grid = {
    "C": [0.1, 0.5, 1, 2, 5, 10],
    "class_weight": [None, "balanced"]
}

# Shuffled, seeded 5-fold split that preserves the class ratio in every fold.
cv_strategy = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Candidates are ranked by F1; the winner is refitted on the whole training
# matrix (refit=True) and all CPU cores are used (n_jobs=-1).
# NOTE(review): X_train_tfidf was vectorised on the full training split before
# these folds are made, so each validation fold contributed to the vocabulary
# and IDF weights. Wrap vectoriser + model in a Pipeline if stricter CV
# estimates are required.
grid_search = GridSearchCV(
    estimator=svm_model,
    param_grid=param_grid,
    scoring="f1",
    cv=cv_strategy,
    refit=True,
    n_jobs=-1,
    verbose=2,
)

grid_search.fit(X_train_tfidf, y_train)

print("\nBest hyperparameters:")
print(grid_search.best_params_)

# Tuned model, already refitted on the full training matrix.
best_svm_tfidf = grid_search.best_estimator_


Fitting 5 folds for each of 12 candidates, totalling 60 fits

Best hyperparameters:
{'C': 2, 'class_weight': 'balanced'}


In [10]:
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    confusion_matrix,
    classification_report
)

# Predictions of the tuned LinearSVC on the held-out test matrix.
y_test_pred = best_svm_tfidf.predict(X_test_tfidf)

print("Test Performance: TF-IDF + Linear SVM --\n")

# zero_division=0 reports 0 (instead of warning) if a ratio is undefined.
f1        = f1_score(y_test, y_test_pred, zero_division=0)
recall    = recall_score(y_test, y_test_pred, zero_division=0)
precision = precision_score(y_test, y_test_pred, zero_division=0)
accuracy  = accuracy_score(y_test, y_test_pred)

print(f"Accuracy  : {accuracy:.4f}")
print(f"Precision : {precision:.4f}")
print(f"Recall    : {recall:.4f}")
print(f"F1-score  : {f1:.4f}")

# Rows are the true class, columns the predicted class.
cm = confusion_matrix(y_test, y_test_pred)
print("\nConfusion Matrix:")
print(cm)

print("\nClassification Report:")
print(
    classification_report(y_test, y_test_pred, digits=4, zero_division=0)
)


Test Performance: TF-IDF + Linear SVM --

Accuracy  : 0.9903
Precision : 0.9756
Recall    : 0.9880
F1-score  : 0.9817

Confusion Matrix:
[[4134   37]
 [  18 1479]]

Classification Report:
              precision    recall  f1-score   support

           0     0.9957    0.9911    0.9934      4171
           1     0.9756    0.9880    0.9817      1497

    accuracy                         0.9903      5668
   macro avg     0.9856    0.9896    0.9876      5668
weighted avg     0.9904    0.9903    0.9903      5668



In [None]:
#Probability Calibration for SVM


from sklearn.calibration import CalibratedClassifierCV


# Wrap the tuned LinearSVC in a 5-fold calibrator and fit it on the training
# matrix. The calibrated model is what the stacking ensemble uses as its
# "svm" base learner.
svm_calibrated = CalibratedClassifierCV(
    estimator=best_svm_tfidf,
    cv=5,
).fit(X_train_tfidf, y_train)



## Naive Bayes

In [18]:

from sklearn.naive_bayes import MultinomialNB

# --- Model and search space -------------------------------------------------
nb_model = MultinomialNB()

# Candidate values for the additive smoothing parameter.
param_grid_nb = {
    "alpha": [0.1, 0.3, 0.5, 1.0, 2.0, 5.0]
}

# Same fold layout as the SVM search: 5 stratified, shuffled, seeded folds.
cv_strategy_nb = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Candidates are ranked by F1, the same criterion used for the SVM search.
grid_search_nb = GridSearchCV(
    estimator=nb_model,
    param_grid=param_grid_nb,
    scoring="f1",
    cv=cv_strategy_nb,
    refit=True,
    n_jobs=-1,
    verbose=2,
)

grid_search_nb.fit(X_train_tfidf, y_train)

print("\nBest hyperparameters:")
print(grid_search_nb.best_params_)

print(f"Best CV score: {grid_search_nb.best_score_:.4f}")

# --- Held-out evaluation ----------------------------------------------------
best_nb_model = grid_search_nb.best_estimator_
y_test_pred_nb = best_nb_model.predict(X_test_tfidf)

print("\n Test Performance: TF-IDF + Naive Bayes: ")

f1        = f1_score(y_test, y_test_pred_nb, zero_division=0)
recall    = recall_score(y_test, y_test_pred_nb, zero_division=0)
precision = precision_score(y_test, y_test_pred_nb, zero_division=0)
accuracy  = accuracy_score(y_test, y_test_pred_nb)

print(f"Accuracy  : {accuracy:.4f}")
print(f"Precision : {precision:.4f}")
print(f"Recall    : {recall:.4f}")
print(f"F1-score  : {f1:.4f}")

cm = confusion_matrix(y_test, y_test_pred_nb)
# Individual cells of the 2x2 matrix, flattened row by row.
# NOTE(review): these four names are not used again in this cell.
tn, fp, fn, tp = cm.ravel()

print("\nConfusion Matrix:")
print(cm)

print("\nClassification Report:")
print(
    classification_report(y_test, y_test_pred_nb, digits=4, zero_division=0)
)


Fitting 5 folds for each of 6 candidates, totalling 30 fits

Best hyperparameters:
{'alpha': 0.1}
Best CV score: 0.9640

 Test Performance: TF-IDF + Naive Bayes: 
Accuracy  : 0.9820
Precision : 0.9722
Recall    : 0.9593
F1-score  : 0.9657

Confusion Matrix:
[[4130   41]
 [  61 1436]]

Classification Report:
              precision    recall  f1-score   support

           0     0.9854    0.9902    0.9878      4171
           1     0.9722    0.9593    0.9657      1497

    accuracy                         0.9820      5668
   macro avg     0.9788    0.9747    0.9768      5668
weighted avg     0.9820    0.9820    0.9820      5668



## XGBoost

In [13]:
from xgboost import XGBClassifier



# Handling class imbalance: weight the positive class (label 1) by the
# ratio of label-0 to label-1 rows in the training labels.
ham_count  = (y_train == 0).sum()
spam_count = (y_train == 1).sum()
scale_pos_weight = ham_count / spam_count
print("scale_pos_weight =", scale_pos_weight)


# Gradient-boosted trees on the TF-IDF matrix; fixed seed for repeatability.
xgb_model = XGBClassifier(
    objective="binary:logistic",
    n_estimators=300,
    max_depth=5,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    scale_pos_weight=scale_pos_weight,
    tree_method="hist",
    n_jobs=-1,
    random_state=42,
    eval_metric="logloss"
)

xgb_model.fit(X_train_tfidf, y_train)

y_test_pred_xgb = xgb_model.predict(X_test_tfidf)

print("\n Test Performance: TF-IDF + XGBoost: ")

accuracy  = accuracy_score(y_test, y_test_pred_xgb)
precision = precision_score(y_test, y_test_pred_xgb, zero_division=0)
recall    = recall_score(y_test, y_test_pred_xgb, zero_division=0)
f1        = f1_score(y_test, y_test_pred_xgb, zero_division=0)

# These four metrics were previously computed but never shown; print them in
# the same format as the SVM and Naive Bayes cells so the models are comparable.
print(f"Accuracy  : {accuracy:.4f}")
print(f"Precision : {precision:.4f}")
print(f"Recall    : {recall:.4f}")
print(f"F1-score  : {f1:.4f}")

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_test_pred_xgb))

print("\nClassification Report:")
print(classification_report(
    y_test,
    y_test_pred_xgb,
    digits=4,
    zero_division=0
))

scale_pos_weight = 2.7872279495990835

 Test Performance: TF-IDF + XGBoost: 

Confusion Matrix:
[[4063  108]
 [  17 1480]]

Classification Report:
              precision    recall  f1-score   support

           0     0.9958    0.9741    0.9849      4171
           1     0.9320    0.9886    0.9595      1497

    accuracy                         0.9779      5668
   macro avg     0.9639    0.9814    0.9722      5668
weighted avg     0.9790    0.9779    0.9782      5668



In [None]:
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression


# Meta learner: logistic regression trained on the base models' outputs.
meta_model = LogisticRegression(max_iter=3000)

# Folds used to produce the out-of-fold base-model outputs for the meta learner.
stack_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Base learners: calibrated SVM, tuned Naive Bayes, and XGBoost.
base_learners = [
    ("svm", svm_calibrated),
    ("nb",  best_nb_model),
    ("xgb", xgb_model)
]

stacking_model = StackingClassifier(
    estimators=base_learners,
    final_estimator=meta_model,
    stack_method="auto",
    cv=stack_cv,          # K-fold stacking
    n_jobs=-1,
)
stacking_model.fit(X_train_tfidf, y_train)


# Held-out evaluation of the ensemble.
y_pred_stack = stacking_model.predict(X_test_tfidf)

print("\n Test Performance: K-Fold Stacking (SVM + NB + XGB → LR): ")

f1        = f1_score(y_test, y_pred_stack, zero_division=0)
recall    = recall_score(y_test, y_pred_stack, zero_division=0)
precision = precision_score(y_test, y_pred_stack, zero_division=0)
accuracy  = accuracy_score(y_test, y_pred_stack)

print(f"Accuracy : {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall   : {recall:.4f}")
print(f"F1-score : {f1:.4f}")


cm = confusion_matrix(y_test, y_pred_stack)

print("\nConfusion Matrix:")
print(cm)


print("\nClassification Report:")
print(
    classification_report(y_test, y_pred_stack, digits=4, zero_division=0)
)




 Model: K-Fold Stacking (SVM + NB + XGB → LR)
Accuracy : 0.9917
Precision: 0.9814
Recall   : 0.9873
F1-score : 0.9843

Confusion Matrix:
[[4143   28]
 [  19 1478]]

Binary Interpretation:
TP (spam → spam): 1478
TN (ham  → ham) : 4143
FP (ham  → spam): 28
FN (spam → ham): 19

Classification Report:
              precision    recall  f1-score   support

           0     0.9954    0.9933    0.9944      4171
           1     0.9814    0.9873    0.9843      1497

    accuracy                         0.9917      5668
   macro avg     0.9884    0.9903    0.9894      5668
weighted avg     0.9917    0.9917    0.9917      5668



In [26]:

# Train vs Test F1-Score
from sklearn.metrics import f1_score

# F1 of the stacking ensemble on the data it was fitted on ...
y_train_pred = stacking_model.predict(X_train_tfidf)
train_f1 = f1_score(y_train, y_train_pred)

# ... versus F1 on the held-out test split. A large gap between the two
# would point to overfitting.
y_test_pred = stacking_model.predict(X_test_tfidf)
test_f1 = f1_score(y_test, y_test_pred)

print("Train F1:", train_f1)
print("Test  F1:", test_f1)


Train F1: 0.9979977116704806
Test  F1: 0.9843489843489843
