In [15]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, roc_auc_score, confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import roc_curve
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC


In [16]:
# Print the kernel's current working directory.
import os

working_dir = os.getcwd()
print(working_dir)

/Users/zhicha/Project_Folder/Data Visualization Practicing/2notebooks


In [17]:
# Load the email data from a csv file and split into training and testing sets.
# The CSV location can be overridden with the EMAILS_CSV environment variable;
# when it is unset the original path is used, so existing runs are unchanged.
DATA_PATH = os.environ.get("EMAILS_CSV", "/Users/zhicha/Project_Folder/Data/emails.csv")
df = pd.read_csv(DATA_PATH)

# Columns are addressed by position: the first one is kept aside as an id,
# the last one is the label, and everything in between is used as features.
id_col = df.columns[0]
label_col = df.columns[-1]
feature_cols = df.columns[1:-1]

X_raw = df[feature_cols]
y = df[label_col]

# Stratify on the label so both splits keep the same class ratio; the fixed
# random_state makes the split repeatable.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X_raw, y, test_size = 0.2, random_state = 42, stratify = y
)

print("Train:", X_train_raw.shape, "Test:", X_test_raw.shape)
print("Spam rate train:", y_train.mean(), "Spam rate test:", y_test.mean())

Train: (4137, 3000) Test: (1035, 3000)
Spam rate train: 0.290065264684554 Spam rate test: 0.2898550724637681


In [18]:

# Keep only columns whose name is at least three characters long. The list is
# derived from the training frame and applied unchanged to the test frame.
cols_len3 = list(filter(lambda name: len(str(name)) >= 3, X_train_raw.columns))

X_train_len, X_test_len = X_train_raw[cols_len3], X_test_raw[cols_len3]
print("After length filter:", len(X_train_len.columns))

After length filter: 2847


In [19]:
# Per column: fraction of training rows holding a value greater than zero.
doc_freq = (X_train_len > 0).mean(axis=0)

# Inclusive document-frequency bounds; columns outside [min_df, max_df] are dropped.
min_df = 0.01
max_df = 0.90
frequent_enough = doc_freq >= min_df
not_too_common = doc_freq <= max_df
kept_cols = doc_freq.loc[frequent_enough & not_too_common].index.tolist()

# The column selection comes from the training split and is reused for the test split.
X_train = X_train_len[kept_cols]
X_test = X_test_len[kept_cols]

print("After DF filter:", X_train.shape[1])

After DF filter: 1574


In [20]:
# Fit the TF-IDF weighting on the training counts only, then apply that same
# fitted transformer to both splits.
tfidf = TfidfTransformer()
tfidf.fit(X_train)

X_train_tfidf = tfidf.transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

In [21]:
# Candidate classifiers. The suffix in each key records which feature matrix
# the model is trained and evaluated on: raw counts or TF-IDF weights.
models = {
    "MultinomialNB (counts)": MultinomialNB(),
    "LogReg (tfidf)": LogisticRegression(max_iter=2000),
    # Seeded so repeated runs produce the same fit when the dual solver
    # (which shuffles the data) is in use.
    "LinearSVM (tfidf)": LinearSVC(random_state=42),
}

results = []   # one metrics record per model, turned into results_df below
trained = {}   # fitted estimators keyed by display name, reused by later cells

for name, model in models.items():
    uses_counts = "counts" in name

    # Select the train/test matrices matching the representation named in the key.
    if uses_counts:
        X_fit, X_eval = X_train, X_test
    else:
        X_fit, X_eval = X_train_tfidf, X_test_tfidf

    model.fit(X_fit, y_train)
    y_pred = model.predict(X_eval)

    # ROC-AUC needs a continuous score: class-1 probability for the count
    # model, decision_function margin for the TF-IDF models. A TF-IDF model
    # offering neither gets NaN instead of an AUC.
    if uses_counts:
        y_score = model.predict_proba(X_eval)[:, 1]
    elif hasattr(model, "decision_function"):
        y_score = model.decision_function(X_eval)
    else:
        y_score = None
    auc = np.nan if y_score is None else roc_auc_score(y_test, y_score)

    f1 = f1_score(y_test, y_pred)
    results.append({"model": name, "F1": f1, "ROC-AUC": auc})
    trained[name] = model

# Best F1 first; later cells rely on row 0 being the top model.
results_df = pd.DataFrame(results, columns = ["model", "F1", "ROC-AUC"]).sort_values("F1", ascending = False)
results_df

Unnamed: 0,model,F1,ROC-AUC
2,LinearSVM (tfidf),0.970492,0.996685
1,LogReg (tfidf),0.962233,0.995442
0,MultinomialNB (counts),0.900929,0.9671


In [None]:
# Visualize the models

In [None]:
# Confusion matrix for the model in the first row of results_df

best_model_name = results_df.iloc[0]["model"]
best_model = trained[best_model_name]

# Feed the model the representation indicated by its name (counts vs. TF-IDF).
best_features = X_test if "counts" in best_model_name else X_test_tfidf
best_pred = best_model.predict(best_features)

cm = confusion_matrix(y_test, best_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels = ["Not Spam", "Spam"])
disp.plot()
plt.title(f"Confusion Matrix: {best_model_name}")
plt.show()

In [None]:
# Features with the largest-magnitude logistic-regression weights

logreg = trained["LogReg (tfidf)"]
coefs = logreg.coef_.ravel()
feature_names = np.array(kept_cols)   # same column order the model was fitted on

top_n = 20
ranking = np.argsort(coefs)           # ascending: most negative weight first
top_ham_idx = ranking[:top_n]         # most negative weights
top_spam_idx = ranking[-top_n:]       # most positive weights (presumably the spam class; confirm label encoding)

top_spam = pd.Series(coefs[top_spam_idx], index = feature_names[top_spam_idx]).sort_values()
top_ham = pd.Series(coefs[top_ham_idx], index = feature_names[top_ham_idx]).sort_values()

display(top_spam, top_ham)

In [None]:
# Grouped bar chart comparing F1 and ROC-AUC for every row of results_df

models_order = results_df["model"].tolist()
f1_vals = results_df["F1"].tolist()
auc_vals = results_df["ROC-AUC"].tolist()

x = np.arange(len(models_order))
width = 0.35   # bar width; the two metrics sit side by side around each tick

plt.figure()
plt.bar(x - width/2, f1_vals, width, label="F1")
plt.bar(x + width/2, auc_vals, width, label="ROC-AUC")
plt.xticks(x, models_order, rotation=20, ha="right")
plt.ylim(0, 1)
plt.title("Model Comparison (F1 & ROC-AUC)")
plt.legend()
plt.tight_layout()
plt.show()


In [None]:
# ROC Curves

plt.figure()

for name, model in trained.items():
    # Pick the test matrix matching the representation named in the model key
    X_in = X_test if "counts" in name else X_test_tfidf

    # Get a continuous score for the ROC curve
    if hasattr(model, "predict_proba"):
        y_score = model.predict_proba(X_in)[:, 1]
    elif hasattr(model, "decision_function"):
        y_score = model.decision_function(X_in)
    else:
        continue  # no score available, skip (not expected to happen)

    # The thresholds are not needed for plotting; a named throwaway avoids
    # overwriting IPython's `_` (last output) variable.
    fpr, tpr, _thresholds = roc_curve(y_test, y_score)
    auc = roc_auc_score(y_test, y_score)

    plt.plot(fpr, tpr, label=f"{name} (AUC={auc:.3f})")

# Random-classifier baseline
plt.plot([0, 1], [0, 1], linestyle="--", label="Random")

plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curves")
plt.legend()
plt.tight_layout()
plt.show()


In [None]:
# Confusion Matrices for all models

names = list(trained.keys())

# squeeze=False always yields a 2-D array of axes, so a single model needs no
# special-casing before iteration.
fig, axes = plt.subplots(1, len(names), figsize=(5*len(names), 4), squeeze=False)

for ax, name in zip(axes.ravel(), names):
    model = trained[name]
    # Pick the test matrix matching the representation named in the model key
    X_in = X_test if "counts" in name else X_test_tfidf
    y_pred = model.predict(X_in)

    cm = confusion_matrix(y_test, y_pred)

    # ConfusionMatrixDisplay writes the cell counts itself and picks a text
    # colour that contrasts with each cell's background.
    cm_display = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=["Not Spam", "Spam"])
    cm_display.plot(ax=ax, colorbar=False, values_format="d")

    ax.set_title(name)
    ax.set_xlabel("Predicted")
    ax.set_ylabel("True")
    plt.setp(ax.get_xticklabels(), rotation=45, ha="right")

plt.tight_layout()
plt.show()


In [None]:
# PCA visualization

# Densify the TF-IDF test matrix and project it onto two principal components.
X2 = PCA(n_components=2, random_state=42).fit_transform(X_test_tfidf.toarray())

plt.figure()
points = plt.scatter(X2[:, 0], X2[:, 1], c=y_test, s=10)

# One legend handle per distinct label value, in ascending order. The names
# assume labels are 0 = not spam, 1 = spam, matching the confusion-matrix
# display labels used earlier; confirm against the dataset.
class_handles, _class_values = points.legend_elements()
plt.legend(class_handles, ["Not Spam", "Spam"], title="Class")

plt.title("PCA (2D) projection of TF-IDF features")
plt.xlabel("PC1")
plt.ylabel("PC2")
plt.tight_layout()
plt.show()
