In [1]:
# Cell 1: Imports & basic settings

import os

import numpy as np
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC

from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    classification_report,
    confusion_matrix,
)

import joblib

# Seed handed to the estimators below that accept `random_state`.
RANDOM_STATE = 42

# The relative "../models/..." paths used later in this notebook only resolve
# when the kernel runs from the notebooks/ directory, so show the working
# directory for a quick visual check. (`os` is imported at the top of this cell.)
print("Current working dir:", os.getcwd())


Current working dir: c:\Users\PC\Desktop\2025ML-spamEmail\notebooks


In [2]:
# Cell 2: Load preprocessed train/test data (TF-IDF)

tfidf_split_path = "../models/train_test_tfidf.pkl"

# NOTE: joblib.load unpickles the file, and unpickling can execute arbitrary
# code. Only point this at an artifact you created yourself or otherwise trust.
X_train_tfidf, X_test_tfidf, y_train, y_test = joblib.load(tfidf_split_path)

# Fail fast on a stale or mismatched artifact instead of hitting an obscure
# error later inside model.fit / model.predict.
assert X_train_tfidf.shape[0] == len(y_train), "train features/labels row count mismatch"
assert X_test_tfidf.shape[0] == len(y_test), "test features/labels row count mismatch"
assert X_train_tfidf.shape[1] == X_test_tfidf.shape[1], "train/test feature dimension mismatch"

print(">>> Loaded TF-IDF train/test split from:", tfidf_split_path)
print("X_train_tfidf shape:", X_train_tfidf.shape)
print("X_test_tfidf  shape:", X_test_tfidf.shape)
print("y_train length:", len(y_train))
print("y_test  length:", len(y_test))

print("\n>>> First 10 labels in y_train:", y_train[:10])


>>> Loaded TF-IDF train/test split from: ../models/train_test_tfidf.pkl
X_train_tfidf shape: (4458, 12076)
X_test_tfidf  shape: (1115, 12076)
y_train length: 4458
y_test  length: 1115

>>> First 10 labels in y_train: ['ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'spam' 'ham' 'ham']


In [3]:
# Cell 3: Helper function to train & evaluate a model

def train_and_evaluate_model(model, model_name, X_train, y_train, X_test, y_test, pos_label="spam"):
    """
    Fit a single model, score it on the test split, and print a short report.

    The model is fitted in place (``model.fit`` is called on the object passed
    in), then used to predict ``X_test``.

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)`` and ``predict(X)``.
    model_name : str
        Label used in the printed header and stored under ``"name"``.
    X_train, y_train : training features and labels.
    X_test, y_test : test features and labels.
    pos_label : label treated as the positive class for precision/recall/F1.
        Defaults to ``"spam"``.

    Returns
    -------
    dict
        ``{"name", "model", "accuracy", "precision", "recall", "f1",
        "average", "confusion_matrix"}``. ``"average"`` is ``"binary"`` when
        the scores refer to ``pos_label`` and ``"macro"`` when the fallback
        was used, so callers can tell the two kinds of score apart.
    """
    print(f"\n========== {model_name} ==========")

    # Train, then predict on the held-out split.
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)

    # Positive-class metrics are preferred. The scorers raise ValueError when
    # they cannot compute them (e.g. pos_label is not one of the labels), and
    # only that case triggers the macro-average fallback; any other exception
    # is a genuine error and is allowed to propagate.
    try:
        precision = precision_score(y_test, y_pred, pos_label=pos_label)
        recall = recall_score(y_test, y_pred, pos_label=pos_label)
        f1 = f1_score(y_test, y_pred, pos_label=pos_label)
        average = "binary"
    except ValueError as e:
        print("Warning: binary precision/recall failed, fallback to macro average. Error:", e)
        precision = precision_score(y_test, y_pred, average="macro")
        recall = recall_score(y_test, y_pred, average="macro")
        f1 = f1_score(y_test, y_pred, average="macro")
        average = "macro"

    print("Accuracy :", round(accuracy, 4))
    print("Precision:", round(precision, 4))
    print("Recall   :", round(recall, 4))
    print("F1-score :", round(f1, 4))

    print("\nClassification report:")
    print(classification_report(y_test, y_pred))

    cm = confusion_matrix(y_test, y_pred)
    print("Confusion matrix:\n", cm)

    return {
        "name": model_name,
        "model": model,
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1": f1,
        "average": average,
        "confusion_matrix": cm,
    }


In [4]:
# Cell 4: Train Logistic Regression

# liblinear is a solver suited to small and medium-sized datasets.
log_reg = LogisticRegression(solver="liblinear", max_iter=1000, random_state=RANDOM_STATE)

# pos_label can be changed if the data is not labelled spam/ham.
log_reg_result = train_and_evaluate_model(
    log_reg,
    "Logistic Regression",
    X_train_tfidf,
    y_train,
    X_test_tfidf,
    y_test,
    pos_label="spam",
)



Accuracy : 0.974
Precision: 0.9918
Recall   : 0.8121
F1-score : 0.893

Classification report:
              precision    recall  f1-score   support

         ham       0.97      1.00      0.99       966
        spam       0.99      0.81      0.89       149

    accuracy                           0.97      1115
   macro avg       0.98      0.91      0.94      1115
weighted avg       0.97      0.97      0.97      1115

Confusion matrix:
 [[965   1]
 [ 28 121]]


In [5]:
# Cell 5: Train Multinomial Naive Bayes

# Default hyper-parameters; fitted and scored on the same TF-IDF split as the
# other models.
nb_clf = MultinomialNB()

nb_result = train_and_evaluate_model(
    nb_clf,
    "Multinomial Naive Bayes",
    X_train_tfidf,
    y_train,
    X_test_tfidf,
    y_test,
    pos_label="spam",
)



Accuracy : 0.9677
Precision: 1.0
Recall   : 0.7584
F1-score : 0.8626

Classification report:
              precision    recall  f1-score   support

         ham       0.96      1.00      0.98       966
        spam       1.00      0.76      0.86       149

    accuracy                           0.97      1115
   macro avg       0.98      0.88      0.92      1115
weighted avg       0.97      0.97      0.97      1115

Confusion matrix:
 [[966   0]
 [ 36 113]]


In [6]:
# Cell 6: Train Linear SVM (LinearSVC)

# Only the seed is set; every other LinearSVC option keeps its default.
svm_clf = LinearSVC(random_state=RANDOM_STATE)

svm_result = train_and_evaluate_model(
    svm_clf,
    "Linear SVM (LinearSVC)",
    X_train_tfidf,
    y_train,
    X_test_tfidf,
    y_test,
    pos_label="spam",
)



Accuracy : 0.9865
Precision: 0.9786
Recall   : 0.9195
F1-score : 0.9481

Classification report:
              precision    recall  f1-score   support

         ham       0.99      1.00      0.99       966
        spam       0.98      0.92      0.95       149

    accuracy                           0.99      1115
   macro avg       0.98      0.96      0.97      1115
weighted avg       0.99      0.99      0.99      1115

Confusion matrix:
 [[963   3]
 [ 12 137]]


In [7]:
# Cell 7: Compare models

results = [log_reg_result, nb_result, svm_result]

# Build one table row per model, keeping only the display name and the scalar
# metrics (the fitted estimator and confusion matrix are left out).
# NOTE(review): these scores were computed on the test split, so the "best"
# model below is selected on the same data it is reported on.
summary_rows = [
    {"model": run["name"], **{metric: run[metric] for metric in ("accuracy", "precision", "recall", "f1")}}
    for run in results
]

# Highest F1 first; the original row index is kept by sort_values.
summary_df = pd.DataFrame(summary_rows).sort_values(by="f1", ascending=False)

print(">>> Model comparison (sorted by F1-score):")
display(summary_df)

best_model_name = summary_df["model"].iloc[0]
print("\n>>> Best model based on F1-score:", best_model_name)


>>> Model comparison (sorted by F1-score):


Unnamed: 0,model,accuracy,precision,recall,f1
2,Linear SVM (LinearSVC),0.986547,0.978571,0.919463,0.948097
0,Logistic Regression,0.973991,0.991803,0.812081,0.892989
1,Multinomial Naive Bayes,0.967713,1.0,0.758389,0.862595



>>> Best model based on F1-score: Linear SVM (LinearSVC)


In [8]:
# Cell 8: Save the best model to ../models/best_model.pkl

# Index the result dicts by model name, then pull out the winning entry and
# its fitted estimator.
name_to_result = dict((run["name"], run) for run in results)
best_result = name_to_result[best_model_name]
best_model = best_result["model"]

# Persist the fitted estimator with joblib.
best_model_path = "../models/best_model.pkl"
joblib.dump(best_model, best_model_path)

print(f">>> Best model '{best_model_name}' has been saved to: {best_model_path}")


>>> Best model 'Linear SVM (LinearSVC)' has been saved to: ../models/best_model.pkl


In [9]:
# Cell 9 (Optional): Quick manual test with a few example messages

# Load the vectorizer that was saved earlier so raw text can be turned into
# TF-IDF features.
vectorizer_path = "../models/tfidf_vectorizer.pkl"
tfidf_loaded = joblib.load(vectorizer_path)

test_messages = [
    "WIN a FREE iPhone now!!! Click this link!",
    "Hey, are we still meeting for lunch tomorrow?",
    "Congratulations, you have won a lottery! Reply with your bank details.",
]

# Vectorize the raw strings, then classify them with the in-memory best model.
X_test_manual = tfidf_loaded.transform(test_messages)
preds = best_model.predict(X_test_manual)

# Print each message next to its predicted label.
for message, label in zip(test_messages, preds):
    print("-----")
    print("Message:", message)
    print("Predicted label:", label)


-----
Message: WIN a FREE iPhone now!!! Click this link!
Predicted label: spam
-----
Message: Hey, are we still meeting for lunch tomorrow?
Predicted label: ham
-----
Message: Congratulations, you have won a lottery! Reply with your bank details.
Predicted label: spam
