# 04 — Advanced Evaluation & Testing (Aligned with 02)

This notebook strengthens evaluation by **reusing the same configuration and models from `02_machine_learning.ipynb`** and adds:
- Cross-validated metrics and robust **test-set evaluation**
- **ROC** & **PR** curves (macro-averaged, one-vs-rest)
- **Calibration** (reliability) curves and Brier scores
- **Threshold tuning** for F1 (and optional custom cost ratios)
- **Bootstrap 95% CIs** for key metrics
- **Learning curves** (data size vs. score) — optional
- **Error analysis**: top false positives/negatives
- Consolidated artifacts: tables/CSVs under `results/` and PNG figures under `results/visualizations/`

> Notes
> 1. Plots use **matplotlib** only (no seaborn), and are saved as static PNGs.
> 2. If your data or splits are prepared in `02`, point the paths below to reuse them.
> 3. All random seeds are fixed to **42** for reproducibility.


In [1]:
# ==== CONFIG (aligned with 02) ====
from pathlib import Path
import json, os, sys, math, random
import numpy as np
import pandas as pd

# Reproducibility and split settings.
RANDOM_STATE = 42
TEST_SIZE = 0.2

# TF-IDF settings. A value of None means the keyword is not passed to the
# vectorizer at all, so the library default applies.
NGRAM_RANGE = (1, 3)
MAX_FEATURES = None      # set to an int if used in 02 (e.g., 20000)
MIN_DF = 1               # defined exactly once; set to the value used in 02 (e.g., 2)
STOP_WORDS = None        # set e.g., 'english' if used in 02

# Data config — EDIT these to match your project if needed
# If 02 exported splits, point to those files instead.
DATA_CSV = '../data/processed/misinformation_dataset.csv'          # e.g., '../data/processed/dataset.csv'
TEXT_COL = 'text'        # change if your column name differs
LABEL_COL = 'label'      # change if your label column differs

# Optional: pre-made split files from 02 (if available)
X_TRAIN_NPZ = None       # e.g., '../data/processed/X_train_tfidf.npz'
X_TEST_NPZ  = None
Y_TRAIN_NPY = None
Y_TEST_NPY  = None

# Artifact folders: tables go in results/, figures in results/visualizations/.
RES_DIR = Path('results')
RES_DIR.mkdir(parents=True, exist_ok=True)
VIS_DIR = RES_DIR / 'visualizations'
VIS_DIR.mkdir(parents=True, exist_ok=True)

# Seed NumPy's global RNG and the stdlib RNG with the shared constant.
np.random.seed(RANDOM_STATE)
random.seed(RANDOM_STATE)

# Record the interpreter and library versions alongside the results.
print("Python:", sys.version.split()[0])
import sklearn
import matplotlib
print("sklearn:", sklearn.__version__)
print("matplotlib:", matplotlib.__version__)

Python: 3.11.11
sklearn: 1.6.1
matplotlib: 3.10.0


In [2]:
# ==== DATA LOADING ====
from pathlib import Path
from typing import Tuple, Optional
from sklearn.model_selection import train_test_split

def try_load_precomputed() -> Optional[Tuple]:
    """Load pre-vectorised train/test splits from disk when they are configured.

    Reads the module-level paths ``X_TRAIN_NPZ``, ``X_TEST_NPZ``,
    ``Y_TRAIN_NPY`` and ``Y_TEST_NPY``.

    Returns:
        ``(X_train, X_test, y_train, y_test, None, None)`` when all four files
        exist (feature matrices via ``scipy.sparse.load_npz``, labels via
        ``np.load``); ``None`` when nothing is configured or any file is missing.
    """
    configured = {
        "X_TRAIN_NPZ": X_TRAIN_NPZ,
        "X_TEST_NPZ": X_TEST_NPZ,
        "Y_TRAIN_NPY": Y_TRAIN_NPY,
        "Y_TEST_NPY": Y_TEST_NPY,
    }
    # Nothing configured at all: the common case, stay quiet.
    if not any(configured.values()):
        return None

    missing = [key for key, path in configured.items() if not (path and Path(path).exists())]
    if missing:
        # A partial configuration is almost certainly a mistake; say so instead
        # of silently ignoring the files that were provided.
        print(f"⚠ Ignoring precomputed splits; unset or missing: {', '.join(missing)}")
        return None

    # Imported here so scipy is only needed when the files are actually read.
    from scipy.sparse import load_npz

    X_train = load_npz(X_TRAIN_NPZ)
    X_test = load_npz(X_TEST_NPZ)
    y_train = np.load(Y_TRAIN_NPY)
    y_test = np.load(Y_TEST_NPY)
    return X_train, X_test, y_train, y_test, None, None

def _resolve_csv_path():
    """Return a usable CSV path, auto-detecting one if ``DATA_CSV`` is unusable.

    ``DATA_CSV`` is returned unchanged when it is set and exists. Otherwise the
    current directory is searched recursively for ``*.csv`` files (skipping any
    path containing "checkpoint"); a single hit is adopted and announced.

    Raises:
        FileNotFoundError: when zero or several candidates are found. The
            message lists up to 50 candidates.
    """
    configured = DATA_CSV
    if configured is not None and Path(configured).exists():
        return configured

    found = sorted(
        p for p in Path('.').rglob('*.csv')
        if 'checkpoint' not in str(p).lower()
    )
    if len(found) != 1:
        # Ambiguous (or empty) search result: make the user choose explicitly.
        hint = "\n".join(map(str, found[:50])) if found else "(no CSVs found)"
        raise FileNotFoundError(
            "Set DATA_CSV to a valid CSV path, or provide NPZ/NPY splits from 02.\n"
            "CSV candidates I found:\n" + hint
        )

    path = str(found[0])
    print(f"Auto-detected DATA_CSV={path}")
    return path

def load_or_split_from_csv() -> Tuple:
    """Read the dataset CSV and make a stratified train/test split of raw text.

    Returns:
        ``(X_train, X_test, y_train, y_test, df, None)`` where the X arrays hold
        the ``TEXT_COL`` values cast to ``str``, the y arrays hold ``LABEL_COL``
        values, and ``df`` is the full frame as read from disk.
    """
    frame = pd.read_csv(_resolve_csv_path())
    has_required = TEXT_COL in frame.columns and LABEL_COL in frame.columns
    assert has_required, f"CSV must have columns: {TEXT_COL}, {LABEL_COL}"

    # NOTE(review): astype(str) turns missing text into the literal "nan" —
    # confirm the processed CSV has no empty text cells.
    texts = frame[TEXT_COL].astype(str).values
    targets = frame[LABEL_COL].values

    split_parts = train_test_split(
        texts,
        targets,
        test_size=TEST_SIZE,
        random_state=RANDOM_STATE,
        stratify=targets,
    )
    return (*split_parts, frame, None)

# Prefer split files from disk when available; otherwise read the CSV and split here.
pre = try_load_precomputed()
if pre is not None:
    X_train, X_test, y_train, y_test, _, _ = pre
    print("✓ Loaded precomputed TF-IDF splits from NPZ/NPY files")
else:
    X_train, X_test, y_train, y_test, df_all, _ = load_or_split_from_csv()
    print(f"✓ Split from CSV -> train={len(y_train)}  test={len(y_test)}")

# Show label distribution
import numpy as np
unique, counts = np.unique(y_train, return_counts=True)
print("Train label distribution:", {label: count for label, count in zip(unique, counts)})

✓ Split from CSV -> train=62559  test=15640
Train label distribution: {0: 42887, 1: 19672}


In [None]:
# ==== VECTORIZATION (aligned with 02) ====
from sklearn.feature_extraction.text import TfidfVectorizer

from scipy import sparse as _sparse

# Only pass optional keywords that were explicitly configured, so that a
# None setting leaves the TfidfVectorizer default in effect.
tfidf_kwargs = dict(
    ngram_range=NGRAM_RANGE,
    dtype=np.float32
)
if MAX_FEATURES is not None:
    tfidf_kwargs["max_features"] = MAX_FEATURES
if MIN_DF is not None:
    tfidf_kwargs["min_df"] = MIN_DF
if STOP_WORDS is not None:
    tfidf_kwargs["stop_words"] = STOP_WORDS

if _sparse.issparse(X_train):
    # X_train/X_test are already sparse feature matrices (loaded from NPZ),
    # not raw text. Re-running TfidfVectorizer on them would fail, so reuse
    # them as-is. No fitted vectorizer exists in this mode.
    tfidf = None
    X_train_tfidf = X_train
    X_test_tfidf = X_test
else:
    tfidf = TfidfVectorizer(**tfidf_kwargs)

    # Fit the vocabulary/IDF on the training text only, then apply it to test.
    X_train_tfidf = tfidf.fit_transform(X_train)
    X_test_tfidf  = tfidf.transform(X_test)

print("TF-IDF:", X_train_tfidf.shape, "->", X_test_tfidf.shape)


In [4]:
# ==== MODELS (copied/aligned from 02) ====
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier

# Registry of estimators keyed by display name. Insertion order is the order
# in which later cells fit and report them.
models = {
    # 1) Logistic Regression (liblinear), class-weighted
    "Logistic Regression (liblinear)": LogisticRegression(
        solver="liblinear",
        class_weight="balanced",
        max_iter=1000,
        random_state=RANDOM_STATE,
    ),
    # 2) Linear SVM
    "Linear SVM": LinearSVC(
        class_weight="balanced",
        random_state=RANDOM_STATE,
    ),
    # 3) SGD (logistic loss)
    "SGD (logistic loss)": SGDClassifier(
        loss="log_loss",
        class_weight="balanced",
        max_iter=2000,
        early_stopping=True,
        n_iter_no_change=5,
        random_state=RANDOM_STATE,
    ),
}

# 4) Random Forest (reduced, dense)
USE_RF = True
if not USE_RF:
    Xtr_rf = Xte_rf = None
else:
    from sklearn.feature_selection import SelectKBest, chi2

    # Keep at most 20000 chi2-ranked columns, then densify to float32.
    # The dense train matrix has shape (n_train, rf_k).
    rf_k = min(20000, X_train_tfidf.shape[1])
    selector = SelectKBest(chi2, k=rf_k)
    # The selector is fit on the training split only and reused for test.
    Xtr_rf = selector.fit_transform(X_train_tfidf, y_train).toarray().astype(np.float32)
    Xte_rf = selector.transform(X_test_tfidf).toarray().astype(np.float32)
    models["Random Forest (reduced)"] = RandomForestClassifier(
        n_estimators=200,
        n_jobs=-1,
        random_state=RANDOM_STATE,
    )

print("Models:", list(models))

Models: ['Logistic Regression (liblinear)', 'Linear SVM', 'SGD (logistic loss)', 'Random Forest (reduced)']


In [None]:
# ==== TRAIN & EVALUATE (test set) ====
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, roc_auc_score, average_precision_score
from sklearn.preprocessing import label_binarize
import numpy as np

def get_scores(y_true, scores_or_proba, labels):
    """Compute macro ROC-AUC and macro PR-AUC from per-class scores.

    Args:
        y_true: true labels.
        scores_or_proba: 2-D array with one score column per entry in ``labels``
            (probabilities or decision scores).
        labels: class labels, in the column order of ``scores_or_proba``.

    Returns:
        ``(roc_auc_macro, pr_auc_macro)``; the PR value is the unweighted mean
        of the per-class average precision.
    """
    indicator = label_binarize(y_true, classes=labels)
    # label_binarize yields a single column for two classes; expand it to
    # [negative, positive] so it lines up with two score columns.
    if indicator.shape[1] == 1:
        indicator = np.hstack((1 - indicator, indicator))

    rocauc = roc_auc_score(indicator, scores_or_proba, average="macro", multi_class="ovr")
    per_class_ap = [
        average_precision_score(indicator[:, col], scores_or_proba[:, col])
        for col in range(indicator.shape[1])
    ]
    return rocauc, float(np.mean(per_class_ap))

from collections import OrderedDict
results = []

labels_sorted = np.unique(y_test)

for name, model in models.items():
    # The random forest uses its own reduced dense matrices; all other models
    # use the sparse TF-IDF matrices.
    if name.startswith("Random Forest"):
        X_fit, X_eval = Xtr_rf, Xte_rf
    else:
        X_fit, X_eval = X_train_tfidf, X_test_tfidf

    model.fit(X_fit, y_train)
    y_pred = model.predict(X_eval)

    # Prefer probabilities, then decision scores, then hard predictions.
    if hasattr(model, "predict_proba"):
        proba = model.predict_proba(X_eval)
        scores = proba
    elif hasattr(model, "decision_function"):
        dec = model.decision_function(X_eval)
        # Convert a 1-D binary margin to 2-D (n_samples, n_classes)
        if dec.ndim == 1:
            dec = np.vstack([-dec, dec]).T
        scores = dec
        proba = None
    else:
        # One-hot fallback (not ideal for AUC). Map labels to column positions
        # first so this also works when labels are not 0..K-1.
        scores = np.eye(len(labels_sorted))[np.searchsorted(labels_sorted, y_pred)]
        proba = None

    acc = accuracy_score(y_test, y_pred)
    prec, rec, f1, _ = precision_recall_fscore_support(y_test, y_pred, average="macro", zero_division=0)
    try:
        rocauc, prauc = get_scores(y_test, scores, labels_sorted)
    except Exception as e:
        # Keep the row with empty AUC cells, but report why instead of hiding it.
        print(f"⚠ Could not compute ROC/PR AUC for {name}: {e!r}")
        rocauc, prauc = None, None

    results.append(OrderedDict([
        ("model", name),
        ("accuracy", round(acc, 4)),
        ("precision", round(prec, 4)),
        ("recall", round(rec, 4)),
        ("f1_macro", round(f1, 4)),
        ("roc_auc_macro", None if rocauc is None else round(rocauc, 4)),
        ("pr_auc_macro", None if prauc is None else round(prauc, 4)),
    ]))

# Best macro-F1 first.
res_df = pd.DataFrame(results).sort_values("f1_macro", ascending=False).reset_index(drop=True)
display(res_df)
res_df.to_csv(RES_DIR / "model_comparison.csv", index=False)
print("✓ Saved results/model_comparison.csv")

In [None]:
# ==== CURVES: ROC & PR (macro, per-model) ====
import matplotlib.pyplot as plt
from sklearn.preprocessing import label_binarize
from sklearn.metrics import roc_curve, auc, precision_recall_curve, average_precision_score

def plot_roc_pr_curves(model, name, X, y, labels, use_rf=False):
    """Save macro-averaged one-vs-rest ROC and PR curves for a fitted model.

    Args:
        model: fitted estimator; ``predict_proba`` is preferred, then
            ``decision_function``, then one-hot encoded ``predict`` output.
        name: display name; spaces become underscores in the file names.
        X: feature matrix to score. Ignored when ``use_rf`` is true.
        y: true labels for the scored rows.
        labels: class labels, in score-column order.
        use_rf: when true, the module-level ``Xte_rf`` matrix is scored
            instead of ``X``.

    Writes ``roc_<name>.png`` and ``pr_<name>.png`` into ``VIS_DIR`` and prints
    each saved path.
    """
    features = Xte_rf if use_rf else X

    # Get scores/proba as a 2-D (n_samples, n_classes) array.
    if hasattr(model, "predict_proba"):
        scores = model.predict_proba(features)
    elif hasattr(model, "decision_function"):
        margins = model.decision_function(features)
        scores = np.vstack([-margins, margins]).T if margins.ndim == 1 else margins
    else:
        scores = np.eye(len(labels))[model.predict(features)]

    y_bin = label_binarize(y, classes=labels)
    if y_bin.shape[1] == 1:
        y_bin = np.hstack((1 - y_bin, y_bin))
    n_classes = y_bin.shape[1]
    file_stem = name.replace(' ', '_')

    # ROC (macro): interpolate every class curve onto the union of FPR points
    # and average the TPRs.
    roc_points = [roc_curve(y_bin[:, k], scores[:, k])[:2] for k in range(n_classes)]
    all_fpr = np.unique(np.concatenate([fpr_k for fpr_k, _ in roc_points]))
    mean_tpr = np.zeros_like(all_fpr)
    for fpr_k, tpr_k in roc_points:
        mean_tpr += np.interp(all_fpr, fpr_k, tpr_k)
    mean_tpr /= n_classes
    roc_auc_macro = auc(all_fpr, mean_tpr)

    fig_roc, ax_roc = plt.subplots(figsize=(6, 5))
    ax_roc.plot(all_fpr, mean_tpr, label=f"macro AUC={roc_auc_macro:.3f}")
    ax_roc.plot([0, 1], [0, 1], linestyle="--")
    ax_roc.set_xlabel("False Positive Rate")
    ax_roc.set_ylabel("True Positive Rate")
    ax_roc.set_title(f"ROC — {name}")
    ax_roc.legend(loc="lower right")
    fn_roc = VIS_DIR / f"roc_{file_stem}.png"
    fig_roc.savefig(fn_roc, dpi=150, bbox_inches="tight")
    plt.close(fig_roc)
    print("Saved", fn_roc)

    # PR (macro): average the class precisions over a common recall grid.
    pr_grid = np.linspace(0, 1, 500)
    mean_precision = np.zeros_like(pr_grid)
    class_ap = []
    for k in range(n_classes):
        prec_k, rec_k, _ = precision_recall_curve(y_bin[:, k], scores[:, k])
        class_ap.append(average_precision_score(y_bin[:, k], scores[:, k]))
        # precision_recall_curve returns recall in decreasing order; np.interp
        # needs increasing x, hence the reversal.
        mean_precision += np.interp(pr_grid, rec_k[::-1], prec_k[::-1], left=1.0, right=0.0)
    mean_precision /= n_classes
    ap_macro = float(np.mean(class_ap))

    fig_pr, ax_pr = plt.subplots(figsize=(6, 5))
    ax_pr.plot(pr_grid, mean_precision, label=f"macro AP={ap_macro:.3f}")
    ax_pr.set_xlabel("Recall")
    ax_pr.set_ylabel("Precision")
    ax_pr.set_title(f"PR — {name}")
    ax_pr.legend(loc="lower left")
    fn_pr = VIS_DIR / f"pr_{file_stem}.png"
    fig_pr.savefig(fn_pr, dpi=150, bbox_inches="tight")
    plt.close(fig_pr)
    print("Saved", fn_pr)

# Plot for all trained models; the random forest is scored on its reduced matrix.
for name in models:
    model = models[name]
    use_rf = name.startswith("Random Forest")
    plot_roc_pr_curves(model, name, X_test_tfidf, y_test, labels_sorted, use_rf=use_rf)

In [None]:
# ==== CALIBRATION: Reliability curves & Brier score ====
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import brier_score_loss
from sklearn.model_selection import StratifiedShuffleSplit
import matplotlib.pyplot as plt

def reliability_plot(probs, y_true, name):
    """Save a 10-bin reliability (calibration) curve for the positive class.

    The positive class is the largest label in ``y_true``; its probability is
    taken from the last column of ``probs``.

    Args:
        probs: 2-D array of predicted probabilities, one column per class.
        y_true: true labels for the same rows.
        name: display name; spaces become underscores in the file name.

    Writes ``calibration_<name>.png`` into ``VIS_DIR`` and prints the path.
    """
    # binary-only plot; for multi-class, the last label is treated as positive
    y_bin = (y_true == np.unique(y_true)[-1]).astype(int)
    prob_pos = probs[:, -1] if probs.shape[1] > 1 else probs[:, 0]

    n_bins = 10
    bins = np.linspace(0.0, 1.0, n_bins + 1)
    # Digitize against the interior edges only, so ids run 0..n_bins-1 and a
    # probability of exactly 1.0 lands in the last bin. Digitizing against all
    # edges and subtracting 1 would give it id n_bins and drop it from the plot.
    binids = np.digitize(prob_pos, bins[1:-1])
    bin_true = [y_bin[binids == i].mean() if np.any(binids == i) else np.nan for i in range(n_bins)]
    # x-coordinate of each point is the bin centre.
    bin_pred = [(bins[i] + bins[i + 1]) / 2 for i in range(n_bins)]

    plt.figure(figsize=(6,5))
    plt.plot([0,1],[0,1], linestyle="--")
    plt.plot(bin_pred, bin_true, marker="o")
    plt.xlabel("Predicted probability"); plt.ylabel("Empirical positive rate")
    plt.title(f"Reliability — {name}")
    fn = VIS_DIR / f"calibration_{name.replace(' ', '_')}.png"
    plt.savefig(fn, dpi=150, bbox_inches="tight"); plt.close()
    print("Saved", fn)

def calibrate_if_needed(model, name, Xtr, ytr, Xte):
    """Return class probabilities for ``Xte``, calibrating score-only models.

    Models that expose ``predict_proba`` are used as-is. Otherwise a clone of
    ``model`` is fit on 80% of the training data and a sigmoid (Platt)
    calibrator is fit on the remaining, held-out 20%. Calibrating on rows the
    classifier was trained on would bias the calibrator, so the held-out part
    is what the calibrator sees. The probabilities therefore come from the
    clone, not from ``model`` itself.

    Args:
        model: fitted estimator (left untouched).
        name: display name; unused, kept for interface compatibility.
        Xtr, ytr: training features and labels.
        Xte: features to produce probabilities for.

    Returns:
        2-D array of probabilities, one column per class.
    """
    if hasattr(model, "predict_proba"):
        return model.predict_proba(Xte)

    from sklearn.base import clone

    sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=RANDOM_STATE)
    idx_fit, idx_cal = next(sss.split(Xtr, ytr))
    ytr = np.asarray(ytr)

    base = clone(model).fit(Xtr[idx_fit], ytr[idx_fit])
    try:
        # scikit-learn >= 1.6: cv="prefit" is deprecated in favour of wrapping
        # the already-fitted estimator in FrozenEstimator.
        from sklearn.frozen import FrozenEstimator
    except ImportError:
        calib = CalibratedClassifierCV(base, method="sigmoid", cv="prefit")
    else:
        calib = CalibratedClassifierCV(FrozenEstimator(base), method="sigmoid")
    calib.fit(Xtr[idx_cal], ytr[idx_cal])
    return calib.predict_proba(Xte)

# Open with "w" (not "a") so re-running the cell replaces the file instead of
# appending a duplicate set of rows. Rows are "<model name>,<brier score>".
with open(RES_DIR / "calibration_brier_scores.csv", "w") as brier_file:
    for name, model in models.items():
        if name.startswith("Random Forest"):
            probs = calibrate_if_needed(model, name, Xtr_rf, y_train, Xte_rf)
        else:
            probs = calibrate_if_needed(model, name, X_train_tfidf, y_train, X_test_tfidf)
        # Brier score (binary or multi-class one-vs-rest average)
        if probs.shape[1] == 2:
            # Positive class is the largest label; its probability is column 1.
            y_bin = (y_test == np.unique(y_test)[-1]).astype(int)
            bs = brier_score_loss(y_bin, probs[:,1])
        else:
            # macro average Brier
            y_bin_all = label_binarize(y_test, classes=np.unique(y_test))
            if y_bin_all.shape[1] == 1:
                y_bin_all = np.hstack((1 - y_bin_all, y_bin_all))
            bs = np.mean([brier_score_loss(y_bin_all[:,k], probs[:,k]) for k in range(y_bin_all.shape[1])])
        reliability_plot(probs, y_test, name)
        brier_file.write(f"{name},{bs}\n")
print("✓ Saved results/calibration_brier_scores.csv")

In [None]:
# ==== THRESHOLD TUNING (optimize F1 on validation) ====
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score

def best_threshold(y_true, scores):
    """Grid-search the positive-class score threshold that maximises F1.

    Binary only: the positive class is the largest label in ``y_true`` and its
    score is the last column of ``scores``.

    For scores inside [0, 1] the candidates are 41 evenly spaced values in
    [0.1, 0.9]. For scores outside that range (e.g. decision-function margins)
    a fixed probability grid does not cover the data, so the candidates are
    the 10th..90th percentiles of the observed scores. The returned threshold
    is then in the units of the score.

    Returns:
        ``(best_t, best_f1)``; ties keep the smallest threshold.
    """
    y_bin = (y_true == np.unique(y_true)[-1]).astype(int)
    s = scores[:, -1] if scores.shape[1] > 1 else scores[:, 0]

    grid = np.linspace(0.1, 0.9, 41)
    if s.size and (s.min() < 0.0 or s.max() > 1.0):
        ts = np.unique(np.quantile(s, grid))
    else:
        ts = grid

    best_t, best_f1 = 0.5, -1.0
    for t in ts:
        pred = (s >= t).astype(int)
        f1 = f1_score(y_bin, pred, zero_division=0)
        if f1 > best_f1:
            best_f1, best_t = f1, t
    return best_t, best_f1

from sklearn.base import clone

# One splitter for every model: with a fixed seed each split() call yields the
# same folds, so building it once is equivalent.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

# Open with "w" (not "a") so re-running the cell replaces the file instead of
# appending duplicate rows. Rows are "<model name>,<threshold>".
with open(RES_DIR / "thresholds.csv", "w") as thresholds_file:
    for name, model in models.items():
        f1s = []; ts = []
        for tr, va in skf.split(X_train_tfidf, y_train):
            if name.startswith("Random Forest"):
                # Same row indices apply to the reduced RF matrix.
                # NOTE(review): its columns were selected on the full training
                # set, so RF folds are not independent of the selection step.
                Xtr, Xva = Xtr_rf[tr], Xtr_rf[va]
            else:
                Xtr, Xva = X_train_tfidf[tr], X_train_tfidf[va]
            # clone() gives an unfitted copy with identical hyper-parameters
            # (random_state included) and leaves the fitted model untouched.
            m = clone(model)
            m.fit(Xtr, y_train[tr])
            if hasattr(m, "predict_proba"):
                sc = m.predict_proba(Xva)
            elif hasattr(m, "decision_function"):
                dec = m.decision_function(Xva)
                if dec.ndim == 1:
                    dec = np.vstack([-dec, dec]).T
                sc = dec
            else:
                # fallback not meaningful; skip
                continue
            t, fold_f1 = best_threshold(y_train[va], sc)
            f1s.append(fold_f1); ts.append(t)
        if f1s:
            t_final = float(np.median(ts))
            thresholds_file.write(f"{name},{t_final:.3f}\n")
            print(f"{name}: tuned threshold ≈ {t_final:.3f} (median over CV)")

In [None]:
# ==== BOOTSTRAP 95% CIs (macro-F1, ROC-AUC, PR-AUC) ====
# Module-level generator used by bootstrap_ci for resampling; seeded with the shared constant.
rng = np.random.RandomState(RANDOM_STATE)

def bootstrap_ci(metric_fn, y_true, scores, n_boot=1000, alpha=0.05, random_state=None):
    """Percentile-bootstrap confidence interval for a metric.

    Rows of ``y_true`` and ``scores`` are resampled together, with replacement,
    ``n_boot`` times and ``metric_fn(y_resampled, scores_resampled)`` is
    evaluated on each resample.

    Args:
        metric_fn: callable ``(y_true, scores) -> float``.
        y_true: 1-D array of labels.
        scores: array of scores with one row per label.
        n_boot: number of bootstrap resamples (must be >= 1).
        alpha: total tail mass; the interval covers ``1 - alpha``.
        random_state: optional seed for a private generator. When ``None`` the
            module-level ``rng`` is used.

    Returns:
        ``(mean, lo, hi)`` as floats: the mean of the bootstrap values and the
        ``alpha/2`` and ``1 - alpha/2`` quantiles.

    Raises:
        ValueError: if ``n_boot`` is smaller than 1.
    """
    if n_boot < 1:
        raise ValueError("n_boot must be >= 1")
    gen = rng if random_state is None else np.random.RandomState(random_state)

    n = len(y_true)
    vals = np.empty(n_boot, dtype=float)
    for b in range(n_boot):
        idx = gen.randint(0, n, n)
        yt = y_true[idx]
        sc = scores[idx] if isinstance(scores, np.ndarray) else scores[idx,:]
        vals[b] = metric_fn(yt, sc)

    # np.quantile interpolates and stays in range for any alpha in [0, 1].
    # Direct indexing into the sorted values overflows when
    # (1 - alpha/2) * n_boot equals n_boot.
    lo, hi = np.quantile(vals, [alpha / 2, 1 - alpha / 2])
    return float(vals.mean()), float(lo), float(hi)

from sklearn.metrics import f1_score

def macro_f1_from_scores(y_true, scores):
    """Macro-F1 of the arg-max predictions implied by a score matrix.

    The winning column index of each row is mapped to a label through the
    sorted unique values of ``y_true``.
    NOTE(review): this assumes every class occurs in ``y_true``, so that the
    unique labels line up with the score columns — confirm for small resamples.
    """
    present_labels = np.unique(y_true)
    winning_col = np.argmax(scores, axis=1)
    predicted = present_labels[winning_col]
    return f1_score(y_true, predicted, average="macro", zero_division=0)

def roc_macro_from_scores(y_true, scores):
    """Macro one-vs-rest ROC-AUC of ``scores`` against ``y_true``.

    Classes are the sorted unique values of ``y_true``; a single binarised
    column (two classes) is expanded to [negative, positive].
    """
    indicator = label_binarize(y_true, classes=np.unique(y_true))
    two_class = indicator.shape[1] == 1
    if two_class:
        indicator = np.hstack((1 - indicator, indicator))
    return roc_auc_score(indicator, scores, average="macro", multi_class="ovr")

def pr_macro_from_scores(y_true, scores):
    """Unweighted mean of the per-class average precision.

    Classes are the sorted unique values of ``y_true``; a single binarised
    column (two classes) is expanded to [negative, positive].
    """
    indicator = label_binarize(y_true, classes=np.unique(y_true))
    if indicator.shape[1] == 1:
        indicator = np.hstack((1 - indicator, indicator))
    per_class = [
        average_precision_score(indicator[:, col], scores[:, col])
        for col in range(indicator.shape[1])
    ]
    return float(np.mean(per_class))

ci_rows = []
test_labels = np.unique(y_test)
for name, model in models.items():
    # The random forest is scored on its reduced dense test matrix.
    X = Xte_rf if name.startswith("Random Forest") else X_test_tfidf

    # Score the test set once per model; only the resampling is repeated.
    if hasattr(model, "predict_proba"):
        sc = model.predict_proba(X)
    elif hasattr(model, "decision_function"):
        dec = model.decision_function(X)
        if dec.ndim == 1:
            dec = np.vstack([-dec, dec]).T
        sc = dec
    else:
        # One-hot of hard predictions; map labels to column positions first so
        # this also works when labels are not 0..K-1.
        y_pred = model.predict(X)
        sc = np.eye(len(test_labels))[np.searchsorted(test_labels, y_pred)]

    for metric_name, fn in [("f1_macro", macro_f1_from_scores),
                            ("roc_auc_macro", roc_macro_from_scores),
                            ("pr_auc_macro", pr_macro_from_scores)]:
        try:
            mean, lo, hi = bootstrap_ci(fn, y_test, sc, n_boot=500, alpha=0.05)
            ci_rows.append([name, metric_name, round(mean,4), round(lo,4), round(hi,4)])
        except Exception as e:
            # Keep an empty row so the table stays complete, but report the
            # reason instead of hiding it.
            print(f"⚠ Bootstrap failed for {name} / {metric_name}: {e!r}")
            ci_rows.append([name, metric_name, None, None, None])

ci_df = pd.DataFrame(ci_rows, columns=["model","metric","mean","ci_lo","ci_hi"])
display(ci_df.pivot(index="model", columns="metric", values=["mean","ci_lo","ci_hi"]))
ci_df.to_csv(RES_DIR / "bootstrap_cis.csv", index=False)
print("✓ Saved results/bootstrap_cis.csv")

In [None]:
# ==== ERROR ANALYSIS: Top FP/FN ====
import numpy as np
from pathlib import Path

def collect_errors(model, name, X, y, text_source):
    """Write the 20 most confident false positives and false negatives to CSV.

    The positive class is the largest label in ``y``. "Confidence" is the
    model's score for the class it predicted, so the listed rows are the
    errors the model was most sure about.

    Args:
        model: fitted estimator; ``predict_proba`` is preferred, then
            ``decision_function``, then one-hot encoded ``predict`` output.
        name: display name; spaces become underscores in the file name.
        X: feature matrix to score.
        y: true labels for the rows of ``X``.
        text_source: per-row raw text, or ``None`` to emit ``sample_<i>``
            placeholders.

    Writes ``errors_<name>.csv`` into ``RES_DIR`` with columns
    ``type, index, true, pred, confidence, text`` and prints the path.
    """
    # NOTE(review): score columns are assumed to follow the sorted unique
    # labels of y — confirm this matches model.classes_.
    labels = np.unique(y)

    # scores -> predictions
    if hasattr(model, "predict_proba"):
        sc = model.predict_proba(X)
    elif hasattr(model, "decision_function"):
        dec = model.decision_function(X)
        if dec.ndim == 1:
            dec = np.vstack([-dec, dec]).T
        sc = dec
    else:
        # predict() returns labels; convert them to column positions before
        # one-hot encoding so non 0..K-1 labels work too.
        sc = np.eye(len(labels))[np.searchsorted(labels, model.predict(X))]
    yhat = sc.argmax(axis=1)
    yhat_lab = labels[yhat]

    if text_source is None:
        # fall back to index as text if we don't have raw text
        text_vals = [f"sample_{i}" for i in range(len(y))]
    else:
        text_vals = text_source

    # score for predicted class
    conf = sc[np.arange(len(y)), yhat]
    # false positives and false negatives for positive class (last label)
    pos_label = labels[-1]
    idx_fp = np.where((yhat_lab == pos_label) & (y != pos_label))[0]
    idx_fn = np.where((yhat_lab != pos_label) & (y == pos_label))[0]
    # Sort both lists by confidence descending. conf is already the score of
    # the (wrong) predicted class, so FNs use the same ordering as FPs.
    # Sorting FNs by 1 - conf would list the least confident mistakes first.
    fp_sorted = idx_fp[np.argsort(conf[idx_fp])[::-1]][:20]
    fn_sorted = idx_fn[np.argsort(conf[idx_fn])[::-1]][:20]

    rows = [
        [kind, int(i), str(y[i]), str(yhat_lab[i]), float(conf[i]), text_vals[i]]
        for kind, picked in (("FP", fp_sorted), ("FN", fn_sorted))
        for i in picked
    ]

    df = pd.DataFrame(rows, columns=["type","index","true","pred","confidence","text"])
    out = RES_DIR / f"errors_{name.replace(' ','_')}.csv"
    df.to_csv(out, index=False)
    print("✓ Saved", out)

# Raw text is only available when X_test is a NumPy array; otherwise pass None
# so collect_errors falls back to placeholders.
text_source_test = X_test if isinstance(X_test, np.ndarray) else None
for name, model in models.items():
    is_forest = name.startswith("Random Forest")
    collect_errors(
        model,
        name,
        Xte_rf if is_forest else X_test_tfidf,
        y_test,
        text_source_test,
    )

In [None]:
# ==== (Optional) LEARNING CURVES ====
from sklearn.model_selection import learning_curve
import numpy as np, matplotlib.pyplot as plt

def plot_learning_curve(estimator, name, X, y, cv=3):
    """Save a macro-F1 learning curve (train vs. cross-validation) as a PNG.

    Args:
        estimator: unfitted-or-fitted estimator passed to ``learning_curve``.
        name: display name; spaces become underscores in the file name.
        X, y: training features and labels.
        cv: cross-validation setting forwarded to ``learning_curve``.

    Five training sizes from 10% to 100% are evaluated. Writes
    ``learning_curve_<name>.png`` into ``VIS_DIR`` and prints the path.
    """
    curve = learning_curve(
        estimator,
        X,
        y,
        cv=cv,
        n_jobs=-1,
        train_sizes=np.linspace(0.1, 1.0, 5),
        return_times=True,
        scoring="f1_macro",
        shuffle=True,
        random_state=RANDOM_STATE,
    )
    # The timing arrays returned because of return_times=True are not plotted.
    train_sizes, train_scores, test_scores = curve[:3]

    fig, ax = plt.subplots(figsize=(6, 5))
    for fold_scores, series_label in ((train_scores, "train"), (test_scores, "cv")):
        # Average over CV folds for each training size.
        ax.plot(train_sizes, fold_scores.mean(axis=1), marker="o", label=series_label)
    ax.set_xlabel("Training examples")
    ax.set_ylabel("F1 (macro)")
    ax.set_title(f"Learning Curve — {name}")
    ax.legend()

    fn = VIS_DIR / f"learning_curve_{name.replace(' ','_')}.png"
    fig.savefig(fn, dpi=150, bbox_inches="tight")
    plt.close(fig)
    print("Saved", fn)

# Only the linear SVM is plotted here, to keep the cell quick.
plot_learning_curve(
    estimator=models["Linear SVM"],
    name="Linear SVM",
    X=X_train_tfidf,
    y=y_train,
    cv=3,
)

In [None]:
# ==== WRITE ARTIFACT SUMMARY ====
# Index of the files produced by this notebook: fixed CSV paths plus whatever
# error lists and figures currently exist on disk.
manifest = dict(
    model_comparison=str(RES_DIR / "model_comparison.csv"),
    bootstrap_cis=str(RES_DIR / "bootstrap_cis.csv"),
    brier_scores=str(RES_DIR / "calibration_brier_scores.csv"),
    thresholds=str(RES_DIR / "thresholds.csv"),
    error_lists=list(map(str, sorted(RES_DIR.glob("errors_*.csv")))),
    visualizations=list(map(str, sorted(VIS_DIR.glob("*.png")))),
)
with open(RES_DIR / "README.json", "w") as f:
    f.write(json.dumps(manifest, indent=2))
print("✓ Wrote results/README.json")
# One-row overview for display; list entries are joined into a single cell.
pd.DataFrame({
    key: [", ".join(value) if isinstance(value, list) else value]
    for key, value in manifest.items()
})