# 03 — Evaluate Models (Base & Zero-Day)

This notebook evaluates all trained models on **validation** and **test** splits for both scenarios:

- **Base scenario** — all classes included in training/validation.  
- **Zero-Day scenario** — `Bot`, `Web Attack - Brute Force`, and `Infiltration` excluded from train/val, appear only in test.  

### Evaluation procedure
- Load models and metadata from `models/`.  
- Rebuild preprocessing (imputer, scaler, PCA) where needed.  
- Re-implement ET-SSL scoring from saved encoder + centroids.  
- Compute **macro metrics**: confusion matrix, precision, recall, F1-score, ROC-AUC, PR-AUC.  
- Measure **inference times**.  

### Outputs
- **Consolidated metrics**:  
  - `results/metrics_summary.csv`  
  - `results/metrics_summary.json`  
- **Detailed exports**: full confusion-matrix stats (`metrics_full.csv`, LaTeX table),  
- **Plots**: ROC and PR curves, confusion matrices, Macro-F1 barplots, zero-day subset results.  

This ensures all three models are directly comparable across the same splits and metrics.


In [1]:
# %% [markdown]
# ## Imports & configuration

import os, json, time, math
from pathlib import Path
from typing import Dict, Tuple, Any
import numpy as np
import pandas as pd

import joblib
from tqdm.auto import tqdm

from sklearn.metrics import (
    confusion_matrix, classification_report, precision_recall_fscore_support,
    roc_auc_score, average_precision_score, precision_recall_curve, roc_curve
)
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import VarianceThreshold

import torch
import torch.nn as nn

SEED = 42
np.random.seed(SEED)

ROOT = Path(".").resolve()
SPLITS_DIR = ROOT / "data" / "splits"
MODELS_DIR = ROOT / "models"
RESULTS_DIR = ROOT / "results"
RESULTS_DIR.mkdir(parents=True, exist_ok=True)

LABEL_COL = "label"

In [2]:
# ==== Figure & results helpers ====
import os
from pathlib import Path

RESULTS_DIR = Path(RESULTS_DIR) if "RESULTS_DIR" in globals() else Path("./results")
FIG_DIR     = RESULTS_DIR / "figures"
RESULTS_DIR.mkdir(parents=True, exist_ok=True)
FIG_DIR.mkdir(parents=True, exist_ok=True)

def save_fig(fig, name: str, dpi: int = 300):
    p = FIG_DIR / f"{name}.png"
    fig.savefig(p, dpi=dpi, bbox_inches="tight")
    print(f"[saved] {p}")

def save_current_fig(name: str, dpi: int = 300):
    import matplotlib.pyplot as plt
    fig = plt.gcf()
    save_fig(fig, name, dpi=dpi)
    plt.close(fig)

def save_text(lines: str, name: str):
    p = RESULTS_DIR / f"{name}.txt"
    with open(p, "w", encoding="utf-8") as f:
        f.write(lines)
    print(f"[saved] {p}")


In [3]:
# %% [markdown]
# ## Utilities: loader, metrics, CorrelatedGroupsPCA (for RF/IF)

def load_split(approach: str, scenario: str, split: str) -> pd.DataFrame:
    p = SPLITS_DIR / approach / scenario / f"{split}.parquet"
    if not p.exists():
        raise FileNotFoundError(p)
    return pd.read_parquet(p)

def features_and_labels(df: pd.DataFrame) -> Tuple[pd.DataFrame, np.ndarray]:
    X = df.drop(columns=[LABEL_COL, "Label"], errors="ignore")
    y = df[LABEL_COL].values.astype(np.int64)
    return X, y

def compute_macro_metrics(y_true: np.ndarray, y_pred: np.ndarray, scores: np.ndarray=None) -> Dict[str, Any]:
    pre, rec, f1, _ = precision_recall_fscore_support(y_true, y_pred, average="macro", zero_division=0)
    acc = (y_true == y_pred).mean()
    out = {"macro_precision": float(pre), "macro_recall": float(rec), "macro_f1": float(f1), "accuracy": float(acc)}
    if scores is not None:
        try:
            out["roc_auc"] = float(roc_auc_score(y_true, scores))
        except Exception:
            out["roc_auc"] = None
        try:
            out["pr_auc"] = float(average_precision_score(y_true, scores))
        except Exception:
            out["pr_auc"] = None
    return out

class CorrelatedGroupsPCA:
    def __init__(self, rho: float = 0.95, var_keep: float = 0.99):
        self.rho = float(rho)
        self.var_keep = float(var_keep)
        self.groups_ = None
        self.columns_ = None
        self.pca_models_ = None
    def fit(self, X, y=None):
        import numpy as np
        import pandas as pd
        from sklearn.decomposition import PCA
        if isinstance(X, np.ndarray):
            X_df = pd.DataFrame(X, columns=[f"f{i}" for i in range(X.shape[1])])
        else:
            X_df = X.copy()
        self.columns_ = list(X_df.columns)
        corr = X_df.corr(numeric_only=True).fillna(0.0).values
        n = corr.shape[0]
        visited = np.zeros(n, dtype=bool)
        groups = []
        for i in range(n):
            if visited[i]: continue
            g = [i]
            visited[i] = True
            for j in range(i+1, n):
                if visited[j]: continue
                if abs(corr[i, j]) >= self.rho:
                    g.append(j); visited[j] = True
            groups.append(sorted(g))
        self.pca_models_ = []
        for g in groups:
            if len(g) == 1:
                self.pca_models_.append(("pass", g, None))
            else:
                from sklearn.decomposition import PCA
                pca = PCA(n_components=None, svd_solver="full", random_state=SEED).fit(X_df.iloc[:, g])
                csum = np.cumsum(pca.explained_variance_ratio_)
                k = int(np.searchsorted(csum, self.var_keep) + 1)
                k = max(1, min(k, len(g)))
                pca_k = PCA(n_components=k, svd_solver="full", random_state=SEED).fit(X_df.iloc[:, g])
                self.pca_models_.append(("pca", g, pca_k))
        self.groups_ = groups
        return self
    def transform(self, X):
        import numpy as np
        import pandas as pd
        if isinstance(X, np.ndarray):
            X_df = pd.DataFrame(X, columns=self.columns_)
        else:
            X_df = X.copy()
        outs = []
        for kind, g, model in self.pca_models_:
            if kind == "pass":
                outs.append(X_df.iloc[:, g].values)
            else:
                outs.append(model.transform(X_df.iloc[:, g]))
        return np.concatenate(outs, axis=1)
    def fit_transform(self, X, y=None):
        return self.fit(X, y).transform(X)

In [4]:
# %% [markdown]
# ## RF & IF evaluation helpers

def eval_rf_or_if(model_path: Path, split_df: pd.DataFrame, use_prob: bool=True):
    model = joblib.load(model_path)  # Pipeline(var0,impute,scaler,cgpca,estimator)
    X, y = features_and_labels(split_df)
    t0 = time.time()
    if use_prob and hasattr(model, "predict_proba"):
        scores = model.predict_proba(X)[:, 1]
    else:
        # IsolationForest: negative score_samples means more anomalous, but
        # in training we saved a Pipeline; here we try both ways if needed
        try:
            scores = -model[-1].score_samples(model[:-1].transform(X))
        except Exception:
            # fallback to decision_function (shifted)
            scores = -model[-1].decision_function(model[:-1].transform(X))
    infer_s = time.time() - t0
    return y, scores, infer_s

In [None]:
# %% [markdown]
# ## ET-SSL evaluation helpers (rebuild encoder + preprocessors)

def load_etssl_meta(scenario: str):
    with open(MODELS_DIR / f"etssl_{scenario}_meta.json", "r") as f:
        return json.load(f)

class EncoderV2(nn.Module):
    """V2: 512→256→emb mit BN/Dropout, proj: BN→ReLU→Linear"""
    def __init__(self, d, emb=64, proj=128, p_drop=0.1):
        super().__init__()
        self.back = nn.Sequential(
            nn.Linear(d, 512), nn.BatchNorm1d(512), nn.ReLU(), nn.Dropout(p_drop),
            nn.Linear(512, 256), nn.BatchNorm1d(256), nn.ReLU(), nn.Dropout(p_drop),
            nn.Linear(256, emb)
        )
        self.proj = nn.Sequential(
            nn.BatchNorm1d(emb), nn.ReLU(),
            nn.Linear(emb, proj)
        )
    def forward(self, x):
        h = self.back(x)
        z = self.proj(h)
        return z, h

def rebuild_imputer_scaler(meta, n_features: int):
    imp = SimpleImputer(strategy="median")
    stats = np.array(meta["imputer_statistics_"], dtype=float)
    if stats.shape[0] != n_features:
        if stats.shape[0] > n_features:
            stats = stats[:n_features]
        else:
            stats = np.pad(stats, (0, n_features - stats.shape[0]), constant_values=np.nan)
    imp.statistics_ = stats
    imp.n_features_in_ = int(n_features)
    imp._fit_dtype = np.dtype("float64")

    sc = MinMaxScaler()
    sc.min_   = np.array(meta["scaler_min_"],   dtype=float) if meta["scaler_min_"]   is not None else np.zeros(n_features)
    sc.scale_ = np.array(meta["scaler_scale_"], dtype=float) if meta["scaler_scale_"] is not None else np.ones(n_features)
    sc.data_min_      = np.zeros(n_features)
    sc.data_max_      = np.ones(n_features)
    sc.data_range_    = np.ones(n_features)
    sc.n_features_in_ = int(n_features)
    sc.n_samples_seen_ = np.full(n_features, 1, dtype=np.int64)
    return imp, sc



def etssl_scores_for_df(scenario: str, split_df: pd.DataFrame):
    # --- Meta & Weights ---
    with open(MODELS_DIR / f"etssl_{scenario}_meta.json", "r") as f:
        meta = json.load(f)
    state = torch.load(MODELS_DIR / f"etssl_{scenario}_encoder.pt", map_location="cpu")

    # --- Features / Labels ---
    X = split_df.drop(columns=[LABEL_COL, "Label"], errors="ignore")
    y = split_df[LABEL_COL].to_numpy().astype(np.int64)
    d = X.shape[1]

    # --- Imputer/Scaler ---
    imp, sc = rebuild_imputer_scaler(meta, n_features=d)

    # --- Encoder V2 ---
    device = "cuda" if torch.cuda.is_available() else "cpu"
    enc = EncoderV2(d, emb=int(meta["emb_dim"]), proj=int(meta["proj_dim"])).to(device)
    enc.load_state_dict(state, strict=True)
    enc.eval()

    # --- Transform + encode ---
    X_np = X.to_numpy(dtype=float, copy=False)
    Xp = sc.transform(imp.transform(X_np)).astype(np.float32, copy=False)

    with torch.no_grad():
        Z = []
        for i in range(0, len(Xp), 4096):
            xb = torch.from_numpy(Xp[i:i+4096]).to(device)
            _, h = enc(xb)
            Z.append(nn.functional.normalize(h, dim=1).cpu().numpy())
        Z = np.vstack(Z)

    # --- Scores ---
    mu_norm = np.array(meta["mu_norm"], dtype=float)
    mu_anom = np.array(meta["mu_anom"], dtype=float)
    kappa   = float(meta["kappa"])

    dn = ((Z - mu_norm)**2).sum(1)
    da = ((Z - mu_anom)**2).sum(1)
    scores = dn - kappa * da
    return y, scores, 0.0

In [6]:
# %% [markdown]
# ## Run evaluation for both scenarios and all models

rows = []
for scenario in ["base", "zeroday"]:
    print("\n==============================")
    print(f"Scenario: {scenario.upper()}")
    print("==============================")

    for split in ["val", "test"]:
        # RF
        y, scores, secs = eval_rf_or_if(MODELS_DIR / f"rf_{scenario}.joblib", load_split("rf", scenario, split), use_prob=True)
        # Load threshold from meta
        with open(MODELS_DIR / f"rf_{scenario}_meta.json", "r") as f:
            thr = json.load(f)["threshold"]
        yhat = (scores >= thr).astype(int)
        m = compute_macro_metrics(y, yhat, scores)
        rows.append({"model":"rf","scenario":scenario,"split":split,"infer_s":secs, **m})

        # IF
        y, scores, secs = eval_rf_or_if(MODELS_DIR / f"if_{scenario}.joblib", load_split("if", scenario, split), use_prob=False)
        with open(MODELS_DIR / f"if_{scenario}_meta.json", "r") as f:
            thr = json.load(f)["threshold"]
        yhat = (scores >= thr).astype(int)
        m = compute_macro_metrics(y, yhat, scores)
        rows.append({"model":"if","scenario":scenario,"split":split,"infer_s":secs, **m})

        # ETSSL
        y, scores, secs = etssl_scores_for_df(scenario, load_split("etssl", scenario, split))
        with open(MODELS_DIR / f"etssl_{scenario}_meta.json", "r") as f:
            meta = json.load(f)
            thr = meta["theta"]
        yhat = (scores >= thr).astype(int)
        m = compute_macro_metrics(y, yhat, scores)
        rows.append({"model":"etssl","scenario":scenario,"split":split,"infer_s":secs, **m})

df_res = pd.DataFrame(rows)
df_res = df_res.sort_values(["scenario","model","split"]).reset_index(drop=True)
df_res.to_csv(RESULTS_DIR / "metrics_summary.csv", index=False)
with open(RESULTS_DIR / "metrics_summary.json", "w") as f:
    json.dump(df_res.to_dict(orient="records"), f, indent=2)

df_res.head(12)


Scenario: BASE

Scenario: ZERODAY


Unnamed: 0,model,scenario,split,infer_s,macro_precision,macro_recall,macro_f1,accuracy,roc_auc,pr_auc
0,etssl,base,test,0.0,0.847067,0.878041,0.861127,0.908212,0.947739,0.82138
1,etssl,base,val,0.0,0.846831,0.877339,0.860704,0.908174,0.947572,0.821606
2,if,base,test,1.646487,0.744153,0.685819,0.706915,0.835908,0.729613,0.473934
3,if,base,val,1.62724,0.742872,0.6854,0.706265,0.835679,0.729674,0.472114
4,rf,base,test,1.064283,0.990767,0.997674,0.994175,0.996292,0.99997,0.99987
5,rf,base,val,1.084327,0.990567,0.997642,0.994057,0.996223,0.999966,0.999876
6,etssl,zeroday,test,0.0,0.837887,0.867065,0.851165,0.901769,0.924782,0.652962
7,etssl,zeroday,val,0.0,0.837795,0.868488,0.8517,0.902357,0.925336,0.650409
8,if,zeroday,test,1.612712,0.744153,0.685819,0.706915,0.835908,0.729613,0.473934
9,if,zeroday,val,1.615562,0.742872,0.6854,0.706265,0.835679,0.729674,0.472114


In [None]:
# %% [markdown]
# ## Summaries, Tables & Plots  (auto-save)

import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import ConfusionMatrixDisplay

# names
MODEL_NAME = {"rf": "Random Forest", "if": "Isolation Forest", "etssl": "ET-SSL"}
SCEN_NAME  = {"base": "Base", "zeroday": "Zero-Day"}

# --- Pretty-print summary + save as text ---
summary_lines = []
for scenario in ["base", "zeroday"]:
    header = f"\n=== Scenario: {scenario} ==="
    print(header); summary_lines.append(header)
    for model in ["rf", "if", "etssl"]:
        r = df_res.query("scenario == @scenario and split == 'test' and model == @model").iloc[0]
        line = (f"[{model.upper()}/{scenario}] "
                f"F1={r['macro_f1']:.4f} AUC={r['roc_auc']:.4f} AP={r['pr_auc']:.4f} "
                f"test_s={r['infer_s']:.2f}")
        print(line); summary_lines.append(line)
save_text("\n".join(summary_lines).strip(), "summary_test_metrics")

# ---------------------------------------------------
# Compute detailed confusion matrix stats for all 6
# ---------------------------------------------------
details = []
for scenario in ["base","zeroday"]:
    for model in ["rf","if","etssl"]:
        split_df = load_split(model, scenario, "test")
        X, y = features_and_labels(split_df)

        if model in ["rf","if"]:
            y, scores, _ = eval_rf_or_if(MODELS_DIR/f"{model}_{scenario}.joblib", split_df, use_prob=(model=="rf"))
            with open(MODELS_DIR/f"{model}_{scenario}_meta.json") as f: thr = json.load(f)["threshold"]
        else:
            y, scores, _ = etssl_scores_for_df(scenario, split_df)
            with open(MODELS_DIR/f"etssl_{scenario}_meta.json") as f: thr = json.load(f)["theta"]

        yhat = (scores >= thr).astype(int)
        cm = confusion_matrix(y, yhat, labels=[0,1])
        tn, fp, fn, tp = cm.ravel()
        tpr = tp/(tp+fn+1e-12)
        tnr = tn/(tn+fp+1e-12)
        fpr = fp/(fp+tn+1e-12)
        fnr = fn/(fn+tp+1e-12)
        acc = (tp+tn)/(tp+tn+fp+fn)
        row = dict(approach=model, scenario=scenario,
                   macro_f1=float(precision_recall_fscore_support(y,yhat,average="macro")[2]),
                   roc_auc=float(roc_auc_score(y, scores)),
                   pr_ap=float(average_precision_score(y, scores)),
                   TP=tp, TN=tn, FP=fp, FN=fn,
                   TPR=tpr, TNR=tnr, FPR=fpr, FNR=fnr,
                   ACC=acc, thr=thr, test_time_sec=df_res.query("scenario==@scenario and model==@model and split=='test'").iloc[0]["infer_s"])
        details.append(row)

df_full = pd.DataFrame(details)
display(df_full)

# Save table variants
latex_path = RESULTS_DIR / "metrics_latex.tex"
csv_path   = RESULTS_DIR / "metrics_full.csv"
with open(latex_path,"w") as f:
    f.write(df_full.to_latex(index=False, float_format="%.4f"))
df_full.to_csv(csv_path, index=False)
print(f"[saved] {latex_path}")
print(f"[saved] {csv_path}")

# ---------------------------------------------------
# ROC and PR curves (saved)
# ---------------------------------------------------
from sklearn.metrics import roc_curve, precision_recall_curve

def plot_curves(kind="roc", scenario="base"):
    plt.figure(figsize=(6,5))
    for model in ["rf","if","etssl"]:
        split_df = load_split(model, scenario, "test")
        if model in ["rf","if"]:
            y, scores, _ = eval_rf_or_if(MODELS_DIR/f"{model}_{scenario}.joblib", split_df, use_prob=(model=="rf"))
        else:
            y, scores, _ = etssl_scores_for_df(scenario, split_df)

        if kind=="roc":
            fpr, tpr, _ = roc_curve(y, scores)
            plt.plot(fpr, tpr, label=MODEL_NAME[model])  
            plt.xlabel("False Positive Rate"); plt.ylabel("True Positive Rate")
            plt.title(f"ROC — {SCEN_NAME[scenario]}")
            fname = f"roc_{scenario}"
        else:
            prec, rec, _ = precision_recall_curve(y, scores)
            plt.plot(rec, prec, label=MODEL_NAME[model])
            plt.xlabel("Recall"); plt.ylabel("Precision")
            plt.title(f"PR — {SCEN_NAME[scenario]}")
            fname = f"pr_{scenario}"
    plt.legend(); plt.grid(True)
    save_current_fig(fname)

for s in ["base","zeroday"]:
    plot_curves("roc", s)
    plot_curves("pr",  s)

# ---------------------------------------------------
# Macro-F1 barplots (saved)
# ---------------------------------------------------
plt.figure(figsize=(6,5))
sns.barplot(data=df_full, x="approach", y="macro_f1", hue="scenario")
plt.title("Macro-F1 by Approach & Scenario")
save_current_fig("macro_f1_by_approach_scenario")

# ---------------------------------------------------
# Confusion matrices (saved)
# ---------------------------------------------------
for s in ["base","zeroday"]:
    for m in ["rf","if","etssl"]:
        split_df = load_split(m, s, "test")
        if m in ["rf","if"]:
            y, scores, _ = eval_rf_or_if(MODELS_DIR/f"{m}_{s}.joblib", split_df, use_prob=(m=="rf"))
            with open(MODELS_DIR/f"{m}_{s}_meta.json") as f: thr = json.load(f)["threshold"]
        else:
            y, scores, _ = etssl_scores_for_df(s, split_df)
            with open(MODELS_DIR/f"etssl_{s}_meta.json") as f: thr = json.load(f)["theta"]

        yhat = (scores >= thr).astype(int)
        cm = confusion_matrix(y, yhat, labels=[0,1])
        disp = ConfusionMatrixDisplay(cm, display_labels=["Benign","Attack"])
        disp.plot(values_format="d")
        plt.title(f"Confusion Matrix — {MODEL_NAME[m]} / {SCEN_NAME[s]}")
        save_current_fig(f"cm_{m}_{s}")



=== Scenario: base ===
[RF/base] F1=0.9942 AUC=1.0000 AP=0.9999 test_s=1.06
[IF/base] F1=0.7069 AUC=0.7296 AP=0.4739 test_s=1.65
[ETSSL/base] F1=0.8611 AUC=0.9477 AP=0.8214 test_s=0.00

=== Scenario: zeroday ===
[RF/zeroday] F1=0.9931 AUC=0.9989 AP=0.9982 test_s=1.20
[IF/zeroday] F1=0.7069 AUC=0.7296 AP=0.4739 test_s=1.61
[ETSSL/zeroday] F1=0.8512 AUC=0.9248 AP=0.6530 test_s=0.00
[saved] /home/user/project/results/summary_test_metrics.txt


Unnamed: 0,approach,scenario,macro_f1,roc_auc,pr_ap,TP,TN,FP,FN,TPR,TNR,FPR,FNR,ACC,thr,test_time_sec
0,rf,base,0.994175,0.99997,0.99987,83454,339155,1569,4,0.999952,0.995395,0.004605,4.8e-05,0.996292,0.009451,1.064283
1,if,base,0.706915,0.729613,0.473934,36584,317993,22731,46874,0.438352,0.933286,0.066714,0.561648,0.835908,0.503404,1.646487
2,etssl,base,0.861127,0.947739,0.82138,69128,316119,24605,14330,0.828297,0.927786,0.072214,0.171703,0.908212,0.000822,0.0
3,rf,zeroday,0.993104,0.998901,0.998231,83185,339137,1587,273,0.996729,0.995342,0.004658,0.003271,0.995615,0.003259,1.199695
4,if,zeroday,0.706915,0.729613,0.473934,36584,317993,22731,46874,0.438352,0.933286,0.066714,0.561648,0.835908,0.503404,1.612712
5,etssl,zeroday,0.851165,0.924782,0.652962,67588,314926,25798,15870,0.809844,0.924285,0.075715,0.190156,0.901769,0.000218,0.0


[saved] /home/user/project/results/metrics_latex.tex
[saved] /home/user/project/results/metrics_full.csv
[saved] /home/user/project/results/figures/roc_base.png
[saved] /home/user/project/results/figures/pr_base.png
[saved] /home/user/project/results/figures/roc_zeroday.png
[saved] /home/user/project/results/figures/pr_zeroday.png
[saved] /home/user/project/results/figures/macro_f1_by_approach_scenario.png
[saved] /home/user/project/results/figures/cm_rf_base.png
[saved] /home/user/project/results/figures/cm_if_base.png
[saved] /home/user/project/results/figures/cm_etssl_base.png
[saved] /home/user/project/results/figures/cm_rf_zeroday.png
[saved] /home/user/project/results/figures/cm_if_zeroday.png
[saved] /home/user/project/results/figures/cm_etssl_zeroday.png


In [None]:
# %% [markdown]
# ## Zero-Day Only — Confusion Matrices (all models, both scenarios)  (auto-save)

from sklearn.metrics import ConfusionMatrixDisplay

MODEL_NAME = {"rf": "Random Forest", "if": "Isolation Forest", "etssl": "ET-SSL"}
SCEN_NAME  = {"base": "Base", "zeroday": "Zero-Day"}

def infer_zero_day_labels() -> set:
    """
    Infer zero-day labels as attack labels present in zeroday/test but absent in zeroday/train.
    Uses RF splits (identical label space across approaches).
    """
    df_train = load_split("rf", "zeroday", "train")
    df_test  = load_split("rf", "zeroday", "test")

    train_attacks = set(df_train.loc[df_train[LABEL_COL] == 1, "Label"].dropna().unique())
    test_attacks  = set(df_test.loc[df_test[LABEL_COL]  == 1, "Label"].dropna().unique())
    zero_day = sorted(list(test_attacks - train_attacks))
    return set(zero_day)

ZD_LABELS = infer_zero_day_labels()
print(f"Zero-day labels inferred ({len(ZD_LABELS)}): {sorted(ZD_LABELS)}")
save_text("Zero-day labels:\n" + "\n".join(sorted(ZD_LABELS)), "zero_day_labels")

def zero_day_subset(df: pd.DataFrame) -> pd.DataFrame:
    if "Label" not in df.columns:
        raise ValueError("Column 'Label' (string labels) not found; needed to filter zero-day classes.")
    return df[df["Label"].isin(ZD_LABELS)].copy()

def cm_for_model_scenario_zero_day(model: str, scenario: str):
    """
    Compute confusion matrix restricted to zero-day rows of the *test* split.
    """
    df_test = load_split(model, scenario, "test")
    df_zd   = zero_day_subset(df_test)
    if df_zd.empty:
        print(f"[{model.upper()}/{scenario}] No zero-day rows in test split.")
        return None, None, None, None

    # Scores & threshold
    if model in ["rf", "if"]:
        y, scores, _ = eval_rf_or_if(MODELS_DIR / f"{model}_{scenario}.joblib", df_zd, use_prob=(model=="rf"))
        with open(MODELS_DIR / f"{model}_{scenario}_meta.json") as f:
            thr = json.load(f)["threshold"]
    else:
        y, scores, _ = etssl_scores_for_df(scenario, df_zd)
        with open(MODELS_DIR / f"etssl_{scenario}_meta.json") as f:
            thr = json.load(f)["theta"]

    yhat = (scores >= thr).astype(int)
    cm = confusion_matrix(y, yhat, labels=[0,1])
    tn, fp, fn, tp = cm.ravel()
    return cm, (tn, fp, fn, tp), y, yhat

# --- Print counts, plot, and save ---
zd_rows = []
for scenario in ["base", "zeroday"]:
    print("\n" + "="*30)
    print(f"Zero-Day Confusion Matrices — {SCEN_NAME[scenario]}")
    print("="*30)
    for model in ["rf","if","etssl"]:
        try:
            cm, (tn, fp, fn, tp), y, yhat = cm_for_model_scenario_zero_day(model, scenario)
            if cm is None:
                continue
            # Numeric dump
            line = f"[{model.upper()}/{scenario}] TP={tp}  TN={tn}  FP={fp}  FN={fn}  |  N={len(y)}"
            print(line)
            zd_rows.append(dict(approach=model, scenario=scenario, TP=tp, TN=tn, FP=fp, FN=fn, N=len(y)))
            # Plot & save CM
            disp = ConfusionMatrixDisplay(cm, display_labels=["Benign","Attack"])
            disp.plot(values_format="d")
            plt.title(f"Zero-Day CM — {MODEL_NAME[model]} / {SCEN_NAME[scenario]}")
            save_current_fig(f"cm_zeroday_only_{model}_{scenario}")
        except Exception as e:
            print(f"[warn] {model}/{scenario} failed on zero-day subset: {e}")

# Export LaTeX + CSV der Zero-Day CMs
df_zd = pd.DataFrame(zd_rows)
display(df_zd)
zd_tex = RESULTS_DIR / "zero_day_confusion_matrices.tex"
zd_csv = RESULTS_DIR / "zero_day_confusion_matrices.csv"
with open(zd_tex, "w") as f:
    f.write(df_zd.to_latex(index=False))
df_zd.to_csv(zd_csv, index=False)
print(f"[saved] {zd_tex}")
print(f"[saved] {zd_csv}")


Zero-day labels inferred (3): ['Bot', 'Infiltration', 'Web Attack - Brute Force']
[saved] /home/user/project/results/zero_day_labels.txt

Zero-Day Confusion Matrices — Base
[RF/base] TP=509  TN=0  FP=0  FN=4  |  N=513
[saved] /home/user/project/results/figures/cm_zeroday_only_rf_base.png
[IF/base] TP=24  TN=0  FP=0  FN=489  |  N=513
[saved] /home/user/project/results/figures/cm_zeroday_only_if_base.png
[ETSSL/base] TP=206  TN=0  FP=0  FN=307  |  N=513
[saved] /home/user/project/results/figures/cm_zeroday_only_etssl_base.png

Zero-Day Confusion Matrices — Zero-Day
[RF/zeroday] TP=240  TN=0  FP=0  FN=273  |  N=513
[saved] /home/user/project/results/figures/cm_zeroday_only_rf_zeroday.png
[IF/zeroday] TP=24  TN=0  FP=0  FN=489  |  N=513
[saved] /home/user/project/results/figures/cm_zeroday_only_if_zeroday.png
[ETSSL/zeroday] TP=195  TN=0  FP=0  FN=318  |  N=513
[saved] /home/user/project/results/figures/cm_zeroday_only_etssl_zeroday.png


Unnamed: 0,approach,scenario,TP,TN,FP,FN,N
0,rf,base,509,0,0,4,513
1,if,base,24,0,0,489,513
2,etssl,base,206,0,0,307,513
3,rf,zeroday,240,0,0,273,513
4,if,zeroday,24,0,0,489,513
5,etssl,zeroday,195,0,0,318,513


[saved] /home/user/project/results/zero_day_confusion_matrices.tex
[saved] /home/user/project/results/zero_day_confusion_matrices.csv
