# **ML : Assignment - 6 ( Q.3 )**

Mohd Talha Patrawala

CMPN - B

23102B0025

In [27]:
import pandas as pd
import numpy as np
import argparse
import os

from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.metrics import (confusion_matrix, precision_score,
                             recall_score, f1_score, roc_auc_score)

from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import VotingClassifier, StackingClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

In [28]:
data = pd.read_csv("SMSSpamCollection", sep="\t", names=["label", "message"])

data.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [29]:
le = LabelEncoder()
data['label'] = le.fit_transform(data['label'])

X = data['message']
y = data['label']

In [30]:
nb = MultinomialNB()
lr = LogisticRegression(max_iter=1000)
svm = LinearSVC()

In [31]:
def create_pipeline(model):
    return Pipeline([
        ('tfidf', TfidfVectorizer(stop_words='english')),
        ('clf', model)
    ])

pipe_nb = create_pipeline(nb)
pipe_lr = create_pipeline(lr)
pipe_svm = create_pipeline(svm)

In [32]:
voting_hard = VotingClassifier(
    estimators=[('nb', pipe_nb),
                ('lr', pipe_lr),
                ('svm', pipe_svm)],
    voting='hard'
)

voting_soft = VotingClassifier(
    estimators=[('nb', pipe_nb),
                ('lr', pipe_lr)],
    voting='soft'
)

In [33]:
stacking = StackingClassifier(
    estimators=[('nb', pipe_nb),
                ('lr', pipe_lr)],
    final_estimator=LogisticRegression(),
    cv=5
)

In [34]:
stump = DecisionTreeClassifier(max_depth=1)

adaboost = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words='english')),
    ('clf', AdaBoostClassifier(
        estimator=stump,
        n_estimators=100,
        learning_rate=1.0
    ))
])

In [35]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

models = {
    "Naive Bayes": pipe_nb,
    "Logistic Regression": pipe_lr,
    "Linear SVM": pipe_svm,
    "Voting Hard": voting_hard,
    "Voting Soft": voting_soft,
    "Stacking": stacking,
    "AdaBoost (Stumps)": adaboost
}

results = []

for name, model in models.items():
    print(f"Evaluating {name}...")

    precision_scores = cross_val_score(model, X, y, cv=skf, scoring='precision')
    recall_scores = cross_val_score(model, X, y, cv=skf, scoring='recall')
    f1_scores = cross_val_score(model, X, y, cv=skf, scoring='f1')
    roc_scores = cross_val_score(model, X, y, cv=skf, scoring='roc_auc')

    results.append([
        name,
        np.mean(precision_scores), np.std(precision_scores),
        np.mean(recall_scores), np.std(recall_scores),
        np.mean(f1_scores), np.std(f1_scores),
        np.mean(roc_scores), np.std(roc_scores)
    ])

results_df = pd.DataFrame(results, columns=[
    "Model",
    "Precision Mean", "Precision Std",
    "Recall Mean", "Recall Std",
    "F1 Mean", "F1 Std",
    "ROC Mean", "ROC Std"
])

results_df

Evaluating Naive Bayes...
Evaluating Logistic Regression...
Evaluating Linear SVM...
Evaluating Voting Hard...


Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/sklearn/metrics/_scorer.py", line 140, in __call__
    score = scorer._score(
            ^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/sklearn/metrics/_scorer.py", line 379, in _score
    response_method = _check_response_method(estimator, self._response_method)
                      ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/sklearn/utils/validation.py", line 2283, in _check_response_method
    raise AttributeError(
AttributeError: VotingClassifier has none of the following attributes: decision_function, predict_proba.

Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/sklearn/metrics/_scorer.py", line 140, in __call__
    score = scorer._score(
            ^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/sklearn/metrics/_scorer.py", line 379, in _score
    response_method = _ch

Evaluating Voting Soft...
Evaluating Stacking...
Evaluating AdaBoost (Stumps)...


Unnamed: 0,Model,Precision Mean,Precision Std,Recall Mean,Recall Std,F1 Mean,F1 Std,ROC Mean,ROC Std
0,Naive Bayes,0.998291,0.003419,0.781754,0.020496,0.87671,0.013099,0.987838,0.003843
1,Logistic Regression,0.98551,0.009038,0.725566,0.015299,0.835686,0.010683,0.990921,0.005274
2,Linear SVM,0.984164,0.009233,0.903606,0.023886,0.941921,0.011389,0.992784,0.003582
3,Voting Hard,0.993586,0.005957,0.828635,0.005721,0.903632,0.004744,,
4,Voting Soft,0.994782,0.00692,0.769745,0.014961,0.867854,0.011179,0.991442,0.004375
5,Stacking,0.980759,0.007442,0.886201,0.021272,0.930945,0.01263,0.991383,0.004343
6,AdaBoost (Stumps),0.954853,0.01448,0.428438,0.024538,0.591192,0.025574,0.926984,0.014295


In [36]:
best_model = pipe_svm

y_pred = cross_val_predict(best_model, X, y, cv=skf)

cm = confusion_matrix(y, y_pred)
precision = precision_score(y, y_pred)
recall = recall_score(y, y_pred)
f1 = f1_score(y, y_pred)

print("Confusion Matrix:\n", cm)
print("Precision:", precision)
print("Recall:", recall)
print("F1:", f1)

Confusion Matrix:
 [[4814   11]
 [  72  675]]
Precision: 0.9839650145772595
Recall: 0.9036144578313253
F1: 0.942079553384508


In [37]:
results_df.to_csv("ensemble_comparison.csv", index=False)

best_model.fit(X, y)

probs = None
if hasattr(best_model, "predict_proba"):
    probs = best_model.predict_proba(X)[:,1]
else:
    probs = np.zeros(len(X))

final_predictions = pd.DataFrame({
    "MessageId": range(len(X)),
    "Actual": y,
    "Predicted": best_model.predict(X),
    "Probability": probs
})

final_predictions.to_csv("final_model_predictions.csv", index=False)

# **Recommendation**

Linear SVM is the best-performing model. It achieved the highest F1-score (0.9419), highest recall (0.9036), and highest ROC-AUC (0.9928), indicating strong class separation and a good balance between precision and recall. Although ensemble methods such as stacking and voting improved precision, they did not surpass Linear SVM in overall performance. AdaBoost with decision stumps performed significantly worse due to the high-dimensional sparse nature of TF-IDF text features. Therefore, Linear SVM is recommended as the most effective and stable model for spam detection in this dataset.