"""
Program 5 — Unified Evaluation & Statistical Analysis

- Aggregates predictions from all routers
- Normalizes schemas
- Computes accuracy, F1, and bootstrap confidence intervals
- Performs lightweight error analysis

Outputs:
- router_performance_with_ci.csv
"""


In [1]:
# 📌 Step 0 — Setup & Reproducibility

import random
import numpy as np
import pandas as pd
from pathlib import Path

from sklearn.metrics import accuracy_score, classification_report

RANDOM_SEED = 42

# Seed the stdlib and the NumPy global generators from the same constant so
# that anything drawing from either of them is repeatable across runs.
for _seed_fn in (random.seed, np.random.seed):
    _seed_fn(RANDOM_SEED)

print("Reproducibility seed set:", RANDOM_SEED)


Reproducibility seed set: 42


In [2]:
# 📌 Step 1 — Mount Drive & Define Paths

from google.colab import drive

# Make Google Drive visible inside the Colab runtime before building paths
# that point into it.
drive.mount("/content/drive")

# Project root on Drive; router prediction files live in its "results" folder.
BASE_DIR = Path("/content/drive", "MyDrive", "FinGuardSDG")
RESULTS_DIR = BASE_DIR.joinpath("results")

print("BASE_DIR:", BASE_DIR)


Mounted at /content/drive
BASE_DIR: /content/drive/MyDrive/FinGuardSDG


In [3]:
# 📌 Step 2 — Load Router Prediction Files

# Each router's predictions are read from
# RESULTS_DIR/<router>/<router>_router_predictions.csv.
# The tuple order fixes the insertion order of the resulting dict.
router_dfs = {
    router: pd.read_csv(
        RESULTS_DIR / router / f"{router}_router_predictions.csv"
    )
    for router in ("keyword", "encoder", "hybrid", "llm_fallback")
}

# Report the raw shape of every loaded prediction table.
for name, df in router_dfs.items():
    print(f"{name} loaded â†’ shape:", df.shape)


keyword loaded â†’ shape: (174, 7)
encoder loaded â†’ shape: (174, 5)
hybrid loaded â†’ shape: (174, 7)
llm_fallback loaded â†’ shape: (174, 7)


In [4]:
# 📌 Step 3 — Normalize Router Outputs

# Every router is reduced to the same three columns so that the evaluation
# steps can treat all of them uniformly.
_SCHEMA = ["id", "true_category", "predicted_category"]

# Keyword and hybrid routers already use the target column names.
df_kw = router_dfs["keyword"].copy()
df_hyb = router_dfs["hybrid"].copy()

# The encoder router stores its labels as true_label / predicted_label.
_enc_renames = {
    "true_label": "true_category",
    "predicted_label": "predicted_category",
}
df_enc = router_dfs["encoder"].copy().rename(columns=_enc_renames)

# The LLM fallback router stores its labels as true_label / final_pred.
_llm_renames = {
    "true_label": "true_category",
    "final_pred": "predicted_category",
}
df_llm = router_dfs["llm_fallback"].copy().rename(columns=_llm_renames)

# Selecting the shared schema raises a KeyError if a router file lacks one of
# the expected columns.
normalized_router_dfs = {
    "keyword": df_kw[_SCHEMA],
    "encoder": df_enc[_SCHEMA],
    "hybrid": df_hyb[_SCHEMA],
    "llm_fallback": df_llm[_SCHEMA],
}

# Sanity check: every normalized table should have exactly three columns.
for name, df in normalized_router_dfs.items():
    print(f"{name} normalized â†’", df.shape)


keyword normalized â†’ (174, 3)
encoder normalized â†’ (174, 3)
hybrid normalized â†’ (174, 3)
llm_fallback normalized â†’ (174, 3)


In [5]:
# 📌 Step 4 — Compute Core Metrics (Accuracy + F1)

# One summary row per router: accuracy plus macro- and weighted-average F1.
metrics = []

for name, df in normalized_router_dfs.items():
    y_true, y_pred = df["true_category"], df["predicted_category"]

    # zero_division=0 scores classes without any predictions as 0 instead
    # of emitting a warning.
    report = classification_report(
        y_true,
        y_pred,
        output_dict=True,
        zero_division=0,
    )
    macro_avg = report["macro avg"]
    weighted_avg = report["weighted avg"]

    metrics.append(
        dict(
            router=name,
            accuracy=accuracy_score(y_true, y_pred),
            macro_f1=macro_avg["f1-score"],
            weighted_f1=weighted_avg["f1-score"],
        )
    )

metrics_df = pd.DataFrame(metrics)
metrics_df


Unnamed: 0,router,accuracy,macro_f1,weighted_f1
0,keyword,0.465517,0.341334,0.482668
1,encoder,0.873563,0.869472,0.875993
2,hybrid,0.87931,0.874289,0.88048
3,llm_fallback,0.793103,0.78901,0.791748


In [6]:
# 📌 Step 5 — Bootstrap Confidence Intervals (Accuracy)

def bootstrap_ci(y_true, y_pred, n_boot=1000, alpha=0.05, rng=None):
    """Return a percentile-bootstrap confidence interval for accuracy.

    The labels are compared once, position by position; each bootstrap
    replicate then resamples that boolean "correct" mask with replacement
    and records its mean, which is the accuracy of the resampled set.

    Args:
        y_true: 1-D array-like of reference labels (pandas Series, NumPy
            array, list, ...). Any index on the input is ignored.
        y_pred: 1-D array-like of predicted labels, same length as y_true.
        n_boot: Number of bootstrap replicates (must be >= 1).
        alpha: Significance level; the interval covers 1 - alpha
            (0.05 gives a 95% interval).
        rng: Optional object with a ``choice(a, size, replace=...)`` method,
            e.g. ``np.random.default_rng(seed)`` or
            ``np.random.RandomState(seed)``. When None, NumPy's global
            generator is used, so ``np.random.seed`` controls the result.

    Returns:
        Tuple ``(lower, upper)`` holding the ``alpha / 2`` and
        ``1 - alpha / 2`` percentiles of the bootstrap accuracies.

    Raises:
        ValueError: If the inputs are not 1-D, differ in length, are empty,
            or if n_boot / alpha are out of range.
    """
    truth = np.asarray(y_true)
    preds = np.asarray(y_pred)

    if truth.ndim != 1 or preds.ndim != 1:
        raise ValueError("y_true and y_pred must be one-dimensional")
    if truth.shape != preds.shape:
        raise ValueError(
            f"y_true and y_pred differ in length: "
            f"{truth.shape[0]} != {preds.shape[0]}"
        )
    n = truth.shape[0]
    if n == 0:
        raise ValueError("cannot bootstrap an empty sample")
    if n_boot < 1:
        raise ValueError("n_boot must be at least 1")
    if not 0 < alpha < 1:
        raise ValueError("alpha must lie strictly between 0 and 1")

    # Compare labels a single time; resampling the mask is equivalent to
    # resampling both label vectors and recomputing accuracy, but avoids
    # re-validating the inputs on every replicate.
    is_correct = np.asarray(truth == preds, dtype=bool)

    # np.random (the module) exposes the same `choice` call as Generator and
    # RandomState instances, so one code path serves all three.
    sampler = np.random if rng is None else rng

    scores = np.empty(n_boot, dtype=float)
    for b in range(n_boot):
        idx = sampler.choice(n, n, replace=True)
        scores[b] = is_correct[idx].mean()

    lower = np.percentile(scores, 100 * alpha / 2)
    upper = np.percentile(scores, 100 * (1 - alpha / 2))
    return lower, upper


# One row per router holding the bounds of its bootstrap accuracy interval.
# Routers are processed in dict order, which also fixes the order in which
# random draws are consumed.
ci_rows = []

for name, df in normalized_router_dfs.items():
    interval = bootstrap_ci(df["true_category"], df["predicted_category"])
    lo, hi = interval
    ci_rows.append(
        dict(
            router=name,
            accuracy_ci_lower=lo,
            accuracy_ci_upper=hi,
        )
    )

ci_df = pd.DataFrame(ci_rows)
ci_df


Unnamed: 0,router,accuracy_ci_lower,accuracy_ci_upper
0,keyword,0.396552,0.54023
1,encoder,0.821839,0.925287
2,hybrid,0.827586,0.925287
3,llm_fallback,0.729885,0.850575


In [7]:
# 📌 Step 6 — Merge Metrics + Confidence Intervals

# Join the point estimates with their confidence bounds on the router name.
# An inner join keeps only routers present in both tables.
final_metrics_df = pd.merge(metrics_df, ci_df, how="inner", on="router")
final_metrics_df


Unnamed: 0,router,accuracy,macro_f1,weighted_f1,accuracy_ci_lower,accuracy_ci_upper
0,keyword,0.465517,0.341334,0.482668,0.396552,0.54023
1,encoder,0.873563,0.869472,0.875993,0.821839,0.925287
2,hybrid,0.87931,0.874289,0.88048,0.827586,0.925287
3,llm_fallback,0.793103,0.78901,0.791748,0.729885,0.850575


In [8]:
# 📌 Step 7 — Save Final Evaluation Tables

# Evaluation artifacts go into their own folder under the results directory;
# create it (and any missing parents) if it does not exist yet.
OUT_DIR = RESULTS_DIR / "evaluation"
OUT_DIR.mkdir(parents=True, exist_ok=True)

# Name the target once so the write and the log message cannot drift apart.
_report_path = OUT_DIR / "router_performance_with_ci.csv"
final_metrics_df.to_csv(_report_path, index=False)

print("Saved evaluation table to:", _report_path)


Saved evaluation table to: /content/drive/MyDrive/FinGuardSDG/results/evaluation/router_performance_with_ci.csv


In [9]:
# 📌 Step 8 — Per-Category Error Analysis

name = "hybrid"  # change to any router
df = normalized_router_dfs[name]

# Keep only the rows where the prediction disagrees with the reference label,
# then preview the first ten of them.
_is_wrong = df["true_category"].ne(df["predicted_category"])
errors = df.loc[_is_wrong]
errors.head(10)


Unnamed: 0,id,true_category,predicted_category
3,C-RR-020,conceptual,quantitative
5,Q-PORT-052,quantitative,conceptual
7,C-BF-017,conceptual,quantitative
15,Q-PORT-030,quantitative,conceptual
17,E-SOC-018,esg,conceptual
25,Q-RISK-035,quantitative,conceptual
52,Q-PORT-026,quantitative,advisory
60,C-CF-038,conceptual,esg
62,A-ADV-170,advisory,conceptual
73,A-ADV-252,advisory,esg
