<a href="https://colab.research.google.com/github/Riyan081/Python-CDAC/blob/main/MLA6.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Riyan Sayyad
# 23102B0002



In [None]:
import numpy as np
import pandas as pd

from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder

from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC

from sklearn.ensemble import VotingClassifier, StackingClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix

In [None]:
df = pd.read_csv("SMSSpamCollection", sep="\t", header=None, names=["label","message"])
df.to_csv("sms.csv", index=False)

print("Dataset converted successfully!")
df.head()

Dataset converted successfully!


Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [None]:
df = pd.read_csv("sms.csv")

texts = df["message"]
labels = df["label"]

# Convert spam/ham â†’ 1/0
le = LabelEncoder()
y = le.fit_transform(labels)

# TF-IDF
tfidf = TfidfVectorizer(stop_words="english", max_features=5000)
X = tfidf.fit_transform(texts)

print("Number of messages:", X.shape[0])
print("Number of features:", X.shape[1])

Number of messages: 5572
Number of features: 5000


In [None]:
nb = MultinomialNB()
lr = LogisticRegression(max_iter=2000)
svm = LinearSVC()

In [None]:
# Hard Voting
hard_vote = VotingClassifier(
    estimators=[("nb", nb), ("lr", lr), ("svm", svm)],
    voting="hard"
)

# Soft Voting (only models with probability)
soft_vote = VotingClassifier(
    estimators=[("nb", nb), ("lr", lr)],
    voting="soft"
)

# Stacking
stack = StackingClassifier(
    estimators=[("nb", nb), ("lr", lr)],
    final_estimator=LogisticRegression()
)

# AdaBoost with Decision Stumps (VERY IMPORTANT REQUIREMENT)
stump = DecisionTreeClassifier(max_depth=1)
ada = AdaBoostClassifier(estimator=stump, n_estimators=100)

models = {
    "NaiveBayes": nb,
    "LogisticRegression": lr,
    "LinearSVM": svm,
    "HardVoting": hard_vote,
    "SoftVoting": soft_vote,
    "Stacking": stack,
    "AdaBoost_Stumps": ada
}

In [None]:
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

results = []

for name, model in models.items():

    precision_list, recall_list, f1_list, auc_list = [], [], [], []

    for train_idx, test_idx in kfold.split(X, y):

        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]

        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        # ROC-AUC calculation
        try:
            y_prob = model.predict_proba(X_test)[:,1]
            auc = roc_auc_score(y_test, y_prob)
        except:
            auc = 0

        precision_list.append(precision_score(y_test, y_pred))
        recall_list.append(recall_score(y_test, y_pred))
        f1_list.append(f1_score(y_test, y_pred))
        auc_list.append(auc)

    results.append([
        name,
        np.mean(precision_list), np.std(precision_list),
        np.mean(recall_list), np.std(recall_list),
        np.mean(f1_list), np.std(f1_list),
        np.mean(auc_list), np.std(auc_list)
    ])

In [None]:
columns = [
    "Model",
    "Precision_mean","Precision_std",
    "Recall_mean","Recall_std",
    "F1_mean","F1_std",
    "AUC_mean","AUC_std"
]

results_df = pd.DataFrame(results, columns=columns)
results_df.to_csv("ensemble_comparison.csv", index=False)

print(results_df)

                Model  Precision_mean  Precision_std  Recall_mean  Recall_std  \
0          NaiveBayes        0.998473       0.003053     0.834004    0.025194   
1  LogisticRegression        0.988744       0.006895     0.708134    0.020543   
2           LinearSVM        0.989681       0.007266     0.884841    0.028388   
3          HardVoting        0.995307       0.003837     0.850076    0.022633   
4          SoftVoting        0.996595       0.004171     0.796510    0.015145   
5            Stacking        0.985394       0.004562     0.903588    0.019374   
6     AdaBoost_Stumps        0.955006       0.014014     0.428447    0.024691   

    F1_mean    F1_std  AUC_mean   AUC_std  
0  0.908620  0.013922  0.990053  0.003678  
1  0.825088  0.015011  0.990813  0.005068  
2  0.934024  0.014931  0.000000  0.000000  
3  0.916808  0.013212  0.000000  0.000000  
4  0.885339  0.010944  0.992231  0.004094  
5  0.942614  0.011275  0.992022  0.003989  
6  0.591181  0.025201  0.928148  0.013260  

In [None]:
# Holdout test set (for confusion matrix)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Train best model (AdaBoost)
ada.fit(X_train, y_train)

y_pred = ada.predict(X_test)
probs = ada.predict_proba(X_test)[:,1]

# Confusion Matrix
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", cm)

# Save predictions file (assignment requirement)
pred_df = pd.DataFrame({
    "MessageId": range(len(y_test)),
    "Actual": y_test,
    "Predicted": y_pred,
    "Probability": probs
})

pred_df.to_csv("final_model_predictions.csv", index=False)

Confusion Matrix:
 [[965   1]
 [ 86  63]]
