In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_predict
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, roc_curve, confusion_matrix, ConfusionMatrixDisplay
)


I use the UCI Spambase dataset (4601 rows, 58 columns).  
57 numeric engineered features + 1 binary target (`class`).  
No text preprocessing needed.


In [None]:
# The raw Spambase file is a headerless CSV, so column names are attached after
# reading: 57 generic feature names followed by the target column "class".
uci_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/spambase/spambase.data"
cols = ["f%02d" % i for i in range(57)]

df = pd.read_csv(uci_url, header=None)
df.columns = [*cols, "class"]
df = df.astype({"class": int})  # make sure the target is an integer label

df.head()


Class balance: Ham=2788, Spam=1813 (~61% ham, 39% spam).  
Standardization may help the scale-sensitive linear models (LR/SVM); Naive Bayes is fit on the raw features.


In [None]:
# Quick structural overview of the data set.
# df.info() prints on its own; the two summaries below are passed to display()
# because a notebook only auto-renders the *last* expression of a cell, and
# here the last statement is the plot.
df.info()
display(df["class"].value_counts())
display(df.describe().T.head(5))

# Bar chart of the target distribution, ordered by class label.
fig, ax = plt.subplots()
df["class"].value_counts().sort_index().plot(kind="bar", ax=ax)
ax.set_title("Class balance")
ax.set_xlabel("class (0 = ham, 1 = spam)")
ax.set_ylabel("number of emails")
plt.show()


Train/test split (80/20), stratified.

In [None]:
# Separate the 57 feature columns from the binary target.
X = df.drop(columns=["class"])
y = df["class"]

# Stratify so both partitions keep the same ham/spam ratio, and fix the seed so
# that Restart & Run All reproduces exactly the same split (and therefore the
# same metrics in every cell below).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, stratify=y, random_state=42
)

print(X_train.shape, X_test.shape)


In [None]:
# Helper: calculates metrics for a given probability vector
def evaluate_predictions(y_true, y_proba, threshold=0.5):
    """Score positive-class probabilities against the true labels.

    Parameters
    ----------
    y_true : array-like of shape (n_samples,)
        Ground-truth binary labels.
    y_proba : array-like of shape (n_samples,)
        Predicted probability (or score) of the positive class. Lists, NumPy
        arrays and pandas Series are all accepted.
    threshold : float, default 0.5
        Samples with ``y_proba >= threshold`` are predicted as class 1.

    Returns
    -------
    dict
        Keys ``"accuracy"``, ``"precision"``, ``"recall"``, ``"f1"``, ``"auc"``
        and ``"cm"`` (the confusion matrix). ``"auc"`` is computed from the raw
        probabilities and therefore does not depend on ``threshold``.
    """
    # Coerce once so plain Python lists work with the vectorized comparison.
    y_proba = np.asarray(y_proba, dtype=float)
    y_pred = (y_proba >= threshold).astype(int)
    return {
        "accuracy": accuracy_score(y_true, y_pred),
        # zero_division=0 keeps extreme thresholds (no positive predictions)
        # from emitting warnings; the affected metric is reported as 0.
        "precision": precision_score(y_true, y_pred, zero_division=0),
        "recall": recall_score(y_true, y_pred, zero_division=0),
        "f1": f1_score(y_true, y_pred, zero_division=0),
        "auc": roc_auc_score(y_true, y_proba),
        "cm": confusion_matrix(y_true, y_pred)
    }

# Store model results (re-running this cell resets the collection)
results = {}


Logistic Regression (threshold=0.50)

In [None]:
# Scale the features, then fit logistic regression; both steps live in one
# pipeline so the scaler is fit on the training data only.
lr_pipe = Pipeline(steps=[
    ("scale", StandardScaler()),
    ("clf", LogisticRegression(max_iter=200, solver="liblinear")),
]).fit(X_train, y_train)

# Second column of predict_proba holds the probability of class 1.
y_proba_lr = lr_pipe.predict_proba(X_test)[:, 1]

lr_metrics = evaluate_predictions(y_test, y_proba_lr, threshold=0.50)
results.update({"LR@0.50": lr_metrics})
results


In [None]:
# Confusion matrix + ROC for logistic regression at the default 0.50 threshold.
cm = lr_metrics["cm"]
cm_display = ConfusionMatrixDisplay(cm, display_labels=[0, 1]).plot()
cm_display.ax_.set_title("Logistic Regression @ 0.50 - confusion matrix")
plt.show()

# The ROC curve is built from the raw probabilities, independent of threshold.
fpr, tpr, _ = roc_curve(y_test, y_proba_lr)
fig, ax = plt.subplots()
ax.plot(fpr, tpr, label=f"AUC={lr_metrics['auc']:.3f}")
# Diagonal reference line: performance of a random classifier.
ax.plot([0, 1], [0, 1], linestyle="--", color="grey", label="chance")
ax.set_xlabel("False positive rate")
ax.set_ylabel("True positive rate")
ax.set_title("Logistic Regression - ROC curve")
ax.legend()
plt.show()


Threshold sweep to improve F1.

In [None]:
# Pick the decision threshold on out-of-fold predictions from the TRAINING set.
# Selecting it on the test set and then reporting test metrics at that same
# threshold would leak test information and inflate the reported F1.
# cross_val_predict fits clones of the pipeline, so the already fitted lr_pipe
# is left untouched.
oof_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
y_proba_lr_oof = cross_val_predict(
    lr_pipe, X_train, y_train, cv=oof_cv, method="predict_proba"
)[:, 1]

thresholds = np.linspace(0.05, 0.95, 19)
scores = []
for t in thresholds:
    m = evaluate_predictions(y_train, y_proba_lr_oof, t)
    scores.append((t, m["precision"], m["recall"], m["f1"]))

thr_df = pd.DataFrame(scores, columns=["threshold", "precision", "recall", "f1"])
# idxmax() returns an index label, so look it up with .loc rather than .iloc.
best_threshold = thr_df.loc[thr_df["f1"].idxmax(), "threshold"]

# Evaluate LR on the untouched test set at the threshold chosen above
lr_best = evaluate_predictions(y_test, y_proba_lr, best_threshold)
results[f"LR@{best_threshold:.2f}"] = lr_best
results


Gaussian Naive Bayes

In [None]:
# Gaussian Naive Bayes baseline, trained on the unscaled features.
nb = GaussianNB().fit(X_train, y_train)

# Keep only the probability of class 1 for scoring.
y_proba_nb = nb.predict_proba(X_test)[:, 1]

nb_metrics = evaluate_predictions(y_test, y_proba_nb, threshold=0.50)
results.update({"NB@0.50": nb_metrics})
results


Linear SVM (calibrated to get probabilities)

In [None]:
# LinearSVC has no predict_proba, so it is wrapped in CalibratedClassifierCV
# (sigmoid calibration) to obtain probabilities. Both the SVM and the shuffled
# calibration folds are seeded so the cell gives the same numbers on every
# Restart & Run All.
svm_pipe = Pipeline([
    ("scale", StandardScaler()),
    ("clf", CalibratedClassifierCV(
        estimator=LinearSVC(max_iter=5000, random_state=42),
        cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
        method="sigmoid"
    ))
])

svm_pipe.fit(X_train, y_train)
y_proba_svm = svm_pipe.predict_proba(X_test)[:, 1]
svm_metrics = evaluate_predictions(y_test, y_proba_svm, 0.50)
results["SVM@0.50"] = svm_metrics
# A bare `results` here would not render (it is not the cell's last
# expression), so display it explicitly.
display(results)

# ROC for SVM
fpr_svm, tpr_svm, _ = roc_curve(y_test, y_proba_svm)
fig, ax = plt.subplots()
ax.plot(fpr_svm, tpr_svm, label=f"AUC={svm_metrics['auc']:.3f}")
# Diagonal reference line: performance of a random classifier.
ax.plot([0, 1], [0, 1], linestyle="--", color="grey", label="chance")
ax.set_xlabel("False positive rate")
ax.set_ylabel("True positive rate")
ax.set_title("Calibrated Linear SVM - ROC curve")
ax.legend()
plt.show()


Compare models by F1 and AUC

In [None]:
# Flatten the per-model metric dicts into rows of one summary table; the
# confusion matrix ("cm") is deliberately left out.
metric_keys = ["accuracy", "precision", "recall", "f1", "auc"]

comparison = []
for name, m in results.items():
    comparison.append([name] + [m[key] for key in metric_keys])

comp_df = pd.DataFrame(
    comparison,
    columns=["model", "acc", "prec", "rec", "f1", "auc"],
)
# Best F1 first.
comp_df.sort_values("f1", ascending=False)


Feature importance (from the Logistic Regression coefficients, which are on standardized features)

In [None]:
# Pull the fitted classifier out of the pipeline and rank features by the
# magnitude of their coefficient. The classifier sits behind a StandardScaler,
# so the coefficients refer to the scaled features.
lr_clf = lr_pipe["clf"]
coef = lr_clf.coef_.reshape(-1)  # flatten coef_ to one value per feature
feature_names = X_train.columns

fi = pd.DataFrame({"feature": feature_names, "coef": coef}).assign(
    abs=lambda frame: frame["coef"].abs()
)
fi.sort_values("abs", ascending=False).head(15)
