In [1]:
from google.colab import files
uploaded = files.upload()

Saving readme to readme
Saving SMSSpamCollection to SMSSpamCollection


In [11]:


import pandas as pd
import numpy as np

from sklearn.model_selection import StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import VotingClassifier, StackingClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline

# ------------------------------
# 1. Load Dataset
# ------------------------------
df = pd.read_csv("SMSSpamCollection", sep="\t", names=["label", "message"])
df["label"] = df["label"].map({"ham": 0, "spam": 1})

X = df["message"]
y = df["label"]

# ------------------------------
# 2. TF-IDF (reduced for speed)
# ------------------------------
vectorizer = TfidfVectorizer(max_features=3000, stop_words='english')

# ------------------------------
# 3. Define Base Models
# ------------------------------
nb = MultinomialNB()
lr = LogisticRegression(max_iter=1000)
svm = LinearSVC()

# ------------------------------
# 4. Ensemble Models
# ------------------------------
voting_hard = VotingClassifier(
    estimators=[('nb', nb), ('lr', lr), ('svm', svm)],
    voting='hard'
)

voting_soft = VotingClassifier(
    estimators=[('nb', nb), ('lr', lr)],
    voting='soft'
)

stacking = StackingClassifier(
    estimators=[('nb', nb), ('lr', lr)],
    final_estimator=LogisticRegression()
)

# AdaBoost with Decision Stump (IMPORTANT)
stump = DecisionTreeClassifier(max_depth=1)
adaboost = AdaBoostClassifier(
    estimator=stump,
    n_estimators=100,
    random_state=42
)

models = {
    "NaiveBayes": nb,
    "LogisticRegression": lr,
    "LinearSVM": svm,
    "Voting_Hard": voting_hard,
    "Voting_Soft": voting_soft,
    "Stacking": stacking,
    "AdaBoost_Stump": adaboost
}

# ------------------------------
# 5. Evaluation Function
# ------------------------------
def evaluate_model(model, X, y, k=5):
    skf = StratifiedKFold(n_splits=k, shuffle=True, random_state=42)

    precision, recall, f1, roc_auc = [], [], [], []
    final_cm = np.zeros((2,2))

    for train_idx, test_idx in skf.split(X, y):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

        pipe = Pipeline([
            ('tfidf', vectorizer),
            ('clf', model)
        ])

        pipe.fit(X_train, y_train)
        y_pred = pipe.predict(X_test)

        precision.append(precision_score(y_test, y_pred))
        recall.append(recall_score(y_test, y_pred))
        f1.append(f1_score(y_test, y_pred))
        final_cm += confusion_matrix(y_test, y_pred)

        # ROC only if available
        if hasattr(pipe.named_steps['clf'], "predict_proba"):
            y_prob = pipe.predict_proba(X_test)[:,1]
            roc_auc.append(roc_auc_score(y_test, y_prob))

    return {
        "Precision_mean": np.mean(precision),
        "Recall_mean": np.mean(recall),
        "F1_mean": np.mean(f1),
        "ROC_AUC_mean": np.mean(roc_auc) if len(roc_auc)>0 else None,
        "Confusion_Matrix": final_cm
    }

# ------------------------------
# 6. Run All Models
# ------------------------------
results = []

for name, model in models.items():
    print(f"Running {name}...")
    metrics = evaluate_model(model, X, y, k=5)
    metrics["Model"] = name
    results.append(metrics)

results_df = pd.DataFrame(results)
print("\nFinal Comparison:\n")
print(results_df)

# Save comparison file
results_df.to_csv("ensemble_comparison.csv", index=False)

# ------------------------------
# 7. Train Final Model (Best: Stacking)
# ------------------------------
best_model = stacking

final_pipe = Pipeline([
    ('tfidf', vectorizer),
    ('clf', best_model)
])

final_pipe.fit(X, y)

df["Predicted"] = final_pipe.predict(X)

if hasattr(final_pipe.named_steps['clf'], "predict_proba"):
    df["Probability"] = final_pipe.predict_proba(X)[:,1]
else:
    df["Probability"] = None

df["MessageId"] = range(len(df))

final_output = df[["MessageId", "label", "Predicted", "Probability"]]
final_output.columns = ["MessageId", "Actual", "Predicted", "Probability"]

final_output.to_csv("final_model_predictions.csv", index=False)




Running NaiveBayes...
Running LogisticRegression...
Running LinearSVM...
Running Voting_Hard...
Running Voting_Soft...
Running Stacking...
Running AdaBoost_Stump...

Final Comparison:

   Precision_mean  Recall_mean   F1_mean  ROC_AUC_mean  \
0        0.995384     0.850058  0.916819      0.989756   
1        0.986339     0.777763  0.869679      0.990823   
2        0.979882     0.898237  0.936995           NaN   
3        0.992385     0.868805  0.926358           NaN   
4        0.990374     0.831320  0.903886      0.991745   
5        0.978316     0.900895  0.937784      0.991465   
6        0.954926     0.428456  0.591101      0.927640   

                   Confusion_Matrix               Model  
0   [[4822.0, 3.0], [112.0, 635.0]]          NaiveBayes  
1   [[4817.0, 8.0], [166.0, 581.0]]  LogisticRegression  
2   [[4811.0, 14.0], [76.0, 671.0]]           LinearSVM  
3    [[4820.0, 5.0], [98.0, 649.0]]         Voting_Hard  
4   [[4819.0, 6.0], [126.0, 621.0]]         Voting_Soft  
5 

In [12]:
# Clean comparison file (remove confusion matrix for Teams preview)
clean_results = results_df.drop(columns=["Confusion_Matrix"])

clean_results.to_csv("ensemble_comparison.csv", index=False)

# Save confusion matrices separately
cm_data = []

for i, row in results_df.iterrows():
    cm = row["Confusion_Matrix"]
    cm_data.append({
        "Model": row["Model"],
        "TN": cm[0][0],
        "FP": cm[0][1],
        "FN": cm[1][0],
        "TP": cm[1][1]
    })

cm_df = pd.DataFrame(cm_data)
cm_df.to_csv("confusion_matrices.csv", index=False)

print("Files saved:")
print("1. ensemble_comparison.csv (clean)")
print("2. confusion_matrices.csv")
print("3. final_model_predictions.csv")

Files saved:
1. ensemble_comparison.csv (clean)
2. confusion_matrices.csv
3. final_model_predictions.csv


In [13]:
from google.colab import files

files.download("ensemble_comparison.csv")
files.download("confusion_matrices.csv")
files.download("final_model_predictions.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Among the combining strategies, the Stacking Classifier generally performs best because it learns how to optimally combine predictions from multiple base models using a meta-learner. Unlike simple voting, stacking adapts weights based on model performance, reducing bias and variance.

Soft voting provides strong performance with lower complexity and is suitable when computational efficiency is required.

AdaBoost with decision stumps improves performance by focusing on difficult samples, but it may overfit noisy data.

Therefore, Stacking is recommended for highest predictive performance, while Soft Voting is recommended for simplicity and speed.