# 6. Classification Model

### Cell 1 — Overview, imports, global config

In [1]:
# ================================================================
# Cell 1: Imports & global configuration
# ================================================================

import warnings
warnings.filterwarnings("ignore")

import os
from pathlib import Path
from datetime import datetime

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    recall_score,
    confusion_matrix,
)

# Optional XGBoost; if not available, fall back to GradientBoosting
try:
    from xgboost import XGBClassifier
    HAS_XGB = True
except Exception:
    HAS_XGB = False

print("=" * 70)
print("CLASSIFICATION MODEL EVALUATION – FINAL EXPERIMENT PIPELINE")
print("=" * 70)

# ---------------- Paths & configuration ----------------
PROJECT_ROOT = Path("..")

PREP_DIR = PROJECT_ROOT / "output_Preprocessing_TemporalDataSplitting"
FE_DIR   = PROJECT_ROOT / "output_FeatureEngineering"

FE_TRAIN_ORIG_DIR  = FE_DIR / "train" / "orig"
FE_TRAIN_CLEAN_DIR = FE_DIR / "train" / "cleaned"
FE_TEST_DIR        = FE_DIR / "test"

RESULTS_DIR  = PROJECT_ROOT / "results_cls"
FIG_DIR      = RESULTS_DIR / "figs"
ANALYSIS_DIR = RESULTS_DIR / "analysis"

RESULTS_DIR.mkdir(parents=True, exist_ok=True)
FIG_DIR.mkdir(parents=True, exist_ok=True)
ANALYSIS_DIR.mkdir(parents=True, exist_ok=True)

HORIZONS    = [1, 6, 12, 24]
FE_TYPES    = ["hourly", "daily", "merge"]
DATA_VERS   = ["orig", "cleaned"]
CLASS_NAMES = ["Low", "Medium", "High"]

RUN_TAG = datetime.now().strftime("%Y%m%d_%H%M%S")

print("\nExperiment configuration:")
print(f"Horizons: {HORIZONS}")
print(f"FE types: {FE_TYPES}")
print(f"Data vers: {DATA_VERS}")
print(f"Results dir: {RESULTS_DIR}")
print(f"Run tag: {RUN_TAG}")

# ---------------- Load base data for labels ----------------
print("\n[STEP 0] Loading base data for label construction...")

base_df = pd.read_csv(
    PREP_DIR / "preprocessed_data.csv",
    index_col="DateTime",
    parse_dates=True,
)
base_df = base_df.sort_index()

if "CO(GT)" not in base_df.columns:
    raise KeyError("Column 'CO(GT)' not found in preprocessed_data.csv")

print(f"base_df shape: {base_df.shape}")
print(f"base_df range: {base_df.index.min()} → {base_df.index.max()}")
print(f"Years present: {sorted(base_df.index.year.unique())}")

CLASSIFICATION MODEL EVALUATION – FINAL EXPERIMENT PIPELINE

Experiment configuration:
Horizons: [1, 6, 12, 24]
FE types: ['hourly', 'daily', 'merge']
Data vers: ['orig', 'cleaned']
Results dir: ../results_cls
Run tag: 20251119_014125

[STEP 0] Loading base data for label construction...
base_df shape: (8833, 12)
base_df range: 2004-03-10 18:00:00 → 2005-04-04 14:00:00
Years present: [2004, 2005]


### Cell 2 — Helpers: loading data, labels, models, confusion matrix

In [2]:
# ================================================================
# Cell 2: Helper functions (labels, loaders, models, CM plotting)
# ================================================================

# ---------------- Label construction ----------------
def make_future_cls_label(df: pd.DataFrame, h: int) -> pd.Series:
    """
    Build classification labels y_{t+h} from CO(GT) with 3 bins:
      0: CO(GT) < 1.5
      1: 1.5 ≤ CO(GT) < 2.5
      2: CO(GT) ≥ 2.5
    """
    y_future = df["CO(GT)"].shift(-h)
    bins = [-np.inf, 1.5, 2.5, np.inf]
    y_cls = pd.cut(y_future, bins=bins, labels=[0, 1, 2]).astype("Int64")
    y_cls = y_cls.dropna().astype(int)
    return y_cls


# ---------------- Feature loaders ----------------
def load_train_test_fe(fe_type: str, data_ver: str):
    """
    Load train (2004) and test (2005) feature tables for a given
    FE type (hourly/daily/merge) and data version (orig/cleaned).
    """
    if fe_type not in FE_TYPES:
        raise ValueError(f"Unknown FE type: {fe_type}")

    if data_ver == "orig":
        train_path = FE_TRAIN_ORIG_DIR / f"train_2004_fe_{fe_type}_orig.csv"
    elif data_ver == "cleaned":
        train_path = FE_TRAIN_CLEAN_DIR / f"train_2004_fe_{fe_type}_cleaned.csv"
    else:
        raise ValueError(f"Unknown data version: {data_ver}")

    test_path = FE_TEST_DIR / f"test_2005_fe_{fe_type}.csv"

    if not train_path.exists():
        raise FileNotFoundError(f"Train FE file not found: {train_path}")
    if not test_path.exists():
        raise FileNotFoundError(f"Test FE file not found: {test_path}")

    X_tr = pd.read_csv(train_path, index_col="DateTime", parse_dates=True)
    X_te = pd.read_csv(test_path, index_col="DateTime", parse_dates=True)

    X_tr = X_tr.sort_index()
    X_te = X_te.sort_index()

    return X_tr, X_te


def build_train_test_for_fe(fe_type: str, data_ver: str, h: int):
    """
    For a given FE type + data version + horizon, construct aligned
    (X_train, y_train, X_test, y_test) with time-respecting splits.
    """
    X_tr, X_te = load_train_test_fe(fe_type, data_ver)
    y_full = make_future_cls_label(base_df, h)

    # Train: 2004 only, and avoid using labels that look into 2005
    idx_tr = X_tr.index.intersection(y_full.index)
    boundary = pd.Timestamp("2004-12-31 23:00:00") - pd.Timedelta(hours=h - 1)
    idx_tr = idx_tr[idx_tr <= boundary]
    X_train = X_tr.loc[idx_tr]
    y_train = y_full.loc[idx_tr]

    # Test: 2005 only
    idx_te = X_te.index.intersection(y_full.index)
    X_test = X_te.loc[idx_te]
    y_test = y_full.loc[idx_te]

    return X_train, y_train, X_test, y_test


# ---------------- Model zoo ----------------
def model_zoo():
    """
    Return a dict of candidate models:
      - Logistic Regression
      - Random Forest
      - XGBoost (if available) or GradientBoosting as fallback
    """
    models = {}

    models["LogReg"] = LogisticRegression(
        max_iter=2000,
        multi_class="auto",
        n_jobs=-1,
    )

    models["RF"] = RandomForestClassifier(
        n_estimators=400,
        random_state=42,
        n_jobs=-1,
    )

    if HAS_XGB:
        models["XGB"] = XGBClassifier(
            n_estimators=400,
            max_depth=5,
            learning_rate=0.05,
            subsample=0.8,
            colsample_bytree=0.8,
            objective="multi:softmax",
            num_class=3,
            random_state=42,
            n_jobs=-1,
        )
    else:
        print("xgboost not available, using GradientBoostingClassifier as GB.")
        models["GB"] = GradientBoostingClassifier(
            n_estimators=300,
            learning_rate=0.05,
            max_depth=3,
            random_state=42,
        )

    return models


# ---------------- Metrics ----------------
def eval_metrics(y_true, y_pred) -> dict:
    return {
        "Accuracy": accuracy_score(y_true, y_pred),
        "Macro-F1": f1_score(y_true, y_pred, average="macro"),
        "Macro-Recall": recall_score(y_true, y_pred, average="macro"),
    }


# ---------------- Confusion matrix plotting ----------------
def save_confusion(y_true, y_pred, title: str, path_png: Path):
    """
    Save a 3×3 confusion matrix figure with labels:
        Low / Medium / High
    """
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1, 2])

    plt.figure(figsize=(4.5, 4.0))
    im = plt.imshow(cm, interpolation="nearest", cmap="Blues")
    plt.title(title, fontsize=10)
    plt.colorbar(im, fraction=0.046, pad=0.04)

    tick_marks = np.arange(len(CLASS_NAMES))
    plt.xticks(tick_marks, CLASS_NAMES, rotation=45, ha="right")
    plt.yticks(tick_marks, CLASS_NAMES)

    thresh = cm.max() / 2.0 if cm.max() > 0 else 0.5
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            val = cm[i, j]
            plt.text(
                j,
                i,
                str(val),
                ha="center",
                va="center",
                color="white" if val > thresh else "black",
                fontsize=9,
            )

    plt.ylabel("True label")
    plt.xlabel("Predicted label")
    plt.tight_layout()
    plt.savefig(path_png, dpi=220)
    plt.close()

### Cell 3 — Run full experiment grid, save summary_full_run_*.csv + confusion matrices

In [3]:
# ================================================================
# Cell 3: Run full experiment grid
# ================================================================

results = []

print("\n[STEP 1] Running full experiment grid...")

# ---------------- Naive baseline ----------------
def eval_naive_baseline(h: int):
    """
    Naive baseline:
      y_{t+h} = class of CO(GT) at t+h
      y_hat   = class of CO(GT) at t
    """
    y = make_future_cls_label(base_df, h)

    bins = [-np.inf, 1.5, 2.5, np.inf]
    c_t_cls = pd.cut(base_df["CO(GT)"], bins=bins, labels=[0, 1, 2]).astype("Int64")
    c_t_cls = c_t_cls.dropna().astype(int)

    idx = y.index.intersection(c_t_cls.index)

    y = y.loc[idx]
    yhat = c_t_cls.loc[idx]

    boundary = pd.Timestamp("2004-12-31 23:00:00") - pd.Timedelta(hours=h - 1)
    train_idx = idx[(idx.year == 2004) & (idx <= boundary)]
    test_idx = idx[idx.year == 2005]

    y_te = y.loc[test_idx]
    yhat_te = yhat.loc[test_idx]

    metrics = eval_metrics(y_te, yhat_te)
    return metrics, y_te, yhat_te


# 1) Naive baseline per horizon
for h in HORIZONS:
    print(f"\n[Naive] Horizon h = {h}")
    naive_metrics, y_te_naive, yhat_naive = eval_naive_baseline(h)

    results.append({
        "h": h,
        "fe_type": "N/A",
        "data_ver": "N/A",
        "model": "Naive",
        "train_n": np.nan,
        "train_n_eff": np.nan,
        "clean_removed": np.nan,
        **naive_metrics,
    })

    cm_path = FIG_DIR / f"cm_naive_h{h}.png"
    save_confusion(y_te_naive, yhat_naive, f"Naive baseline (h={h})", cm_path)


# 2) Models with feature sets
model_dict = model_zoo()

for h in HORIZONS:
    print("\n" + "=" * 60)
    print(f"[GRID] Horizon h = {h}")
    print("=" * 60)

    for fe_type in FE_TYPES:
        for data_ver in DATA_VERS:
            print(f"FE = {fe_type:6s} | data = {data_ver:7s}")

            X_tr, y_tr, X_te, y_te = build_train_test_for_fe(fe_type, data_ver, h)
            n_train = len(X_tr)

            if data_ver == "orig":
                clean_removed = 0
            else:
                X_tr_orig, _, _, _ = build_train_test_for_fe(fe_type, "orig", h)
                clean_removed = max(0, len(X_tr_orig) - len(X_tr))

            print(f"Train size: {len(X_tr)}, Test size: {len(X_te)}, Removed by cleaning: {clean_removed}")

            scaler = StandardScaler().fit(X_tr)
            X_tr_s = scaler.transform(X_tr)
            X_te_s = scaler.transform(X_te)

            for m_name, proto in model_dict.items():
                print(f"  Model: {m_name}")

                model_cls = proto.__class__
                model = model_cls(**proto.get_params())

                model.fit(X_tr_s, y_tr)
                yhat = model.predict(X_te_s)
                metrics = eval_metrics(y_te, yhat)

                results.append({
                    "h": h,
                    "fe_type": fe_type,
                    "data_ver": data_ver,
                    "model": m_name,
                    "train_n": int(n_train),
                    "train_n_eff": int(len(X_tr)),
                    "clean_removed": int(clean_removed),
                    **metrics,
                })

                cm_title = f"{m_name} | FE={fe_type} | {data_ver} | h={h}"
                safe_data_ver = data_ver.replace("(", "").replace(")", "").replace(" ", "_")
                cm_path = FIG_DIR / f"cm_{m_name}_FE-{fe_type}_{safe_data_ver}_h{h}.png"
                save_confusion(y_te, yhat, cm_title, cm_path)

# 3) Save summary
df_res = pd.DataFrame(results)
df_res = df_res[
    ["h", "fe_type", "data_ver", "model", "train_n", "train_n_eff",
     "clean_removed", "Accuracy", "Macro-F1", "Macro-Recall"]
].sort_values(["h", "model", "fe_type", "data_ver"])

summary_path = RESULTS_DIR / f"summary_full_run_{RUN_TAG}.csv"
df_res.to_csv(summary_path, index=False)

print("\nFull experiment grid finished.")
print(f"Summary saved to: {summary_path}")
df_res.head()


[STEP 1] Running full experiment grid...

[Naive] Horizon h = 1

[Naive] Horizon h = 6

[Naive] Horizon h = 12

[Naive] Horizon h = 24

[GRID] Horizon h = 1
FE = hourly | data = orig   
Train size: 6590, Test size: 2230, Removed by cleaning: 0
  Model: LogReg
  Model: RF
  Model: XGB
FE = hourly | data = cleaned
Train size: 6414, Test size: 2230, Removed by cleaning: 176
  Model: LogReg
  Model: RF
  Model: XGB
FE = daily  | data = orig   
Train size: 252, Test size: 93, Removed by cleaning: 0
  Model: LogReg
  Model: RF
  Model: XGB
FE = daily  | data = cleaned
Train size: 252, Test size: 93, Removed by cleaning: 0
  Model: LogReg
  Model: RF
  Model: XGB
FE = merge  | data = orig   
Train size: 6006, Test size: 2230, Removed by cleaning: 0
  Model: LogReg
  Model: RF
  Model: XGB
FE = merge  | data = cleaned
Train size: 5845, Test size: 2230, Removed by cleaning: 161
  Model: LogReg
  Model: RF
  Model: XGB

[GRID] Horizon h = 6
FE = hourly | data = orig   
Train size: 6585, Test si

Unnamed: 0,h,fe_type,data_ver,model,train_n,train_n_eff,clean_removed,Accuracy,Macro-F1,Macro-Recall
13,1,daily,cleaned,LogReg,252.0,252.0,0.0,0.677419,0.515093,0.517472
10,1,daily,orig,LogReg,252.0,252.0,0.0,0.688172,0.583563,0.58203
7,1,hourly,cleaned,LogReg,6414.0,6414.0,176.0,0.760538,0.728104,0.737527
4,1,hourly,orig,LogReg,6590.0,6590.0,0.0,0.767265,0.739668,0.748551
19,1,merge,cleaned,LogReg,5845.0,5845.0,161.0,0.742152,0.736794,0.751057


### Cell 4 — Analysis by four dimensions (RQ1–RQ4) + plots + CSVs

In [4]:
# ================================================================
# Cell 4: Post-hoc analysis for RQ1–RQ4
# ================================================================

print("\n[STEP 2] Post-hoc analysis for 4 evaluation dimensions...")

df_all    = df_res.copy()
df_naive  = df_all[df_all["model"] == "Naive"].copy()
df_models = df_all[df_all["model"] != "Naive"].copy()

PRIMARY = "Macro-F1"

# ------------------------------------------------------------
# RQ1: Model family comparison (cleaned + merge only)
# ------------------------------------------------------------
print("\n[RQ1] Model comparison (cleaned data, FE = merge)...")

rq1 = df_models[
    (df_models["data_ver"] == "cleaned") &
    (df_models["fe_type"] == "merge")
].copy()

rq1_summary = (
    rq1.groupby(["h", "model"])[["Accuracy", "Macro-F1", "Macro-Recall"]]
       .mean().reset_index()
)

rq1_path = ANALYSIS_DIR / "rq1_model_comparison.csv"
rq1_summary.to_csv(rq1_path, index=False)
print(f"Saved RQ1 summary to: {rq1_path}")

plt.figure(figsize=(6, 4))
for m in sorted(rq1["model"].unique()):
    sub = rq1_summary[rq1_summary["model"] == m]
    plt.plot(sub["h"], sub[PRIMARY], marker="o", label=m)

plt.xlabel("Horizon h (hours)")
plt.ylabel(PRIMARY)
plt.title(f"RQ1 – Model comparison (cleaned, FE=merge, metric={PRIMARY})")
plt.legend()
plt.grid(alpha=0.3)
plt.tight_layout()
plt.savefig(FIG_DIR / "rq1_model_comparison.png", dpi=220)
plt.close()

# ------------------------------------------------------------
# RQ2: Anomaly detection effect (orig vs cleaned)
# ------------------------------------------------------------
print("\n[RQ2] Effect of anomaly cleaning (orig vs cleaned)...")

def compute_clean_delta(group):
    if set(group["data_ver"]) >= {"orig", "cleaned"}:
        m_clean = group[group["data_ver"] == "cleaned"][["Accuracy", "Macro-F1", "Macro-Recall"]].mean()
        m_orig  = group[group["data_ver"] == "orig"][["Accuracy", "Macro-F1", "Macro-Recall"]].mean()
        return (m_clean - m_orig)
    else:
        return pd.Series({"Accuracy": np.nan, "Macro-F1": np.nan, "Macro-Recall": np.nan})

rq2_delta = (
    df_models.groupby(["h", "fe_type", "model"]).apply(compute_clean_delta)
             .reset_index()
)

rq2_path = ANALYSIS_DIR / "rq2_anomaly_effect.csv"
rq2_delta.to_csv(rq2_path, index=False)
print(f"Saved RQ2 delta table to: {rq2_path}")

rq2_fe_summary = (
    rq2_delta.groupby("fe_type")[["Accuracy", "Macro-F1", "Macro-Recall"]]
             .mean().reset_index()
)

plt.figure(figsize=(5, 3.5))
plt.bar(rq2_fe_summary["fe_type"], rq2_fe_summary["Macro-F1"])
plt.xlabel("FE type")
plt.ylabel("Δ Macro-F1 (cleaned - orig)")
plt.title("RQ2 – Average cleaning uplift by FE type")
plt.grid(axis="y", alpha=0.3)
plt.tight_layout()
plt.savefig(FIG_DIR / "rq2_cleaning_uplift_by_fe.png", dpi=220)
plt.close()

# ------------------------------------------------------------
# RQ3: Feature engineering effect (cleaned data only)
# ------------------------------------------------------------
print("\n[RQ3] Feature engineering comparison (cleaned data only)...")

rq3 = df_models[df_models["data_ver"] == "cleaned"].copy()

rq3_summary = (
    rq3.groupby(["h", "fe_type"])[["Accuracy", "Macro-F1", "Macro-Recall"]]
       .mean().reset_index()
)

rq3_path = ANALYSIS_DIR / "rq3_fe_effect.csv"
rq3_summary.to_csv(rq3_path, index=False)
print(f"Saved RQ3 summary to: {rq3_path}")

pivot = rq3_summary.pivot(index="fe_type", columns="h", values=PRIMARY)

plt.figure(figsize=(6, 3.5))
im = plt.imshow(pivot.values, aspect="auto", cmap="viridis")
plt.colorbar(im, fraction=0.046, pad=0.04, label=PRIMARY)
plt.xticks(range(len(pivot.columns)), pivot.columns)
plt.yticks(range(len(pivot.index)), pivot.index)
plt.xlabel("Horizon h (hours)")
plt.ylabel("FE type")
plt.title(f"RQ3 – Feature engineering effect ({PRIMARY})")

for i in range(pivot.shape[0]):
    for j in range(pivot.shape[1]):
        val = pivot.values[i, j]
        plt.text(
            j,
            i,
            f"{val:.2f}",
            ha="center",
            va="center",
            color="white" if val < pivot.values.max() / 2 else "black",
            fontsize=8,
        )

plt.tight_layout()
plt.savefig(FIG_DIR / "rq3_fe_effect_heatmap.png", dpi=220)
plt.close()

# ------------------------------------------------------------
# RQ4: Horizon effect (performance decay vs h)
# ------------------------------------------------------------
print("\n[RQ4] Horizon effect (average over models/FE/versions)...")

rq4_summary = (
    df_models.groupby("h")[["Accuracy", "Macro-F1", "Macro-Recall"]]
             .mean().reset_index()
)

rq4_path = ANALYSIS_DIR / "rq4_horizon_effect.csv"
rq4_summary.to_csv(rq4_path, index=False)
print(f"Saved RQ4 summary to: {rq4_path}")

plt.figure(figsize=(6, 4))
plt.plot(rq4_summary["h"], rq4_summary["Accuracy"], marker="o", label="Accuracy")
plt.plot(rq4_summary["h"], rq4_summary["Macro-F1"], marker="o", label="Macro-F1")
plt.plot(rq4_summary["h"], rq4_summary["Macro-Recall"], marker="o", label="Macro-Recall")
plt.xlabel("Horizon h (hours)")
plt.ylabel("Score")
plt.title("RQ4 – Average performance vs horizon")
plt.legend()
plt.grid(alpha=0.3)
plt.tight_layout()
plt.savefig(FIG_DIR / "rq4_perf_vs_horizon.png", dpi=220)
plt.close()

print("\nAnalysis complete.")
print(f"Full grid results: {summary_path}")
print(f"Analysis CSVs    : {ANALYSIS_DIR}")
print(f"Figures          : {FIG_DIR}")


[STEP 2] Post-hoc analysis for 4 evaluation dimensions...

[RQ1] Model comparison (cleaned data, FE = merge)...
Saved RQ1 summary to: ../results_cls/analysis/rq1_model_comparison.csv

[RQ2] Effect of anomaly cleaning (orig vs cleaned)...
Saved RQ2 delta table to: ../results_cls/analysis/rq2_anomaly_effect.csv

[RQ3] Feature engineering comparison (cleaned data only)...
Saved RQ3 summary to: ../results_cls/analysis/rq3_fe_effect.csv

[RQ4] Horizon effect (average over models/FE/versions)...
Saved RQ4 summary to: ../results_cls/analysis/rq4_horizon_effect.csv

Analysis complete.
Full grid results: ../results_cls/summary_full_run_20251119_014125.csv
Analysis CSVs    : ../results_cls/analysis
Figures          : ../results_cls/figs
