# Week-3/4 Ablation Inspector

This baseline evaluation notebook compares ensemble utilisation, positional encodings, and member-ordering toggles. Its outputs match those reported in the paper "Beyond Ensemble Averages" and are used to update the GulfCast dashboard.

In [None]:
import json
from pathlib import Path
import pandas as pd
import plotly.express as px

# Load every "<target>_metrics.json" file under ../models into one long-format
# table: one row per (target, model), with the per-model metric values as columns.
models_dir = Path("../models")
# glob() yields entries in filesystem order; sort so the row order (and
# therefore the preview and any downstream plot order) is reproducible.
metric_files = sorted(models_dir.glob("*_metrics.json"))
metrics = []
for path in metric_files:
    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with path.open(encoding="utf-8") as fh:
        data = json.load(fh)
    # The target label is the file-name text before the first underscore.
    # NOTE(review): a target whose name itself contains "_" would be
    # truncated here -- confirm the file-naming convention.
    target = path.name.split("_")[0]
    # Each top-level key is a model name mapping to a dict of metric values.
    for model_name, values in data.items():
        metrics.append({"target": target, "model": model_name, **values})

metrics_df = pd.DataFrame(metrics)
metrics_df.head()

In [None]:
# Grouped bar chart of validation R^2: models on the x-axis, one bar colour
# per target. Rows that carry no R^2 value are excluded before plotting.
metrics_df_r2 = metrics_df.loc[metrics_df['r2'].notna()]
fig = px.bar(
    metrics_df_r2,
    x='model', y='r2',
    color='target', barmode='group',
    title='Validation R^2 by model and target variable',
).update_layout(template='plotly_dark')  # update_layout returns the same figure
fig


In [None]:
def csrd_paragraph(summary: pd.DataFrame) -> str:
    """Build the CSRD-aligned climate-risk summary paragraph.

    The paragraph starts with three fixed statements and is followed by one
    bullet per target naming the model with the highest R^2 for that target.

    Args:
        summary: Metrics table with ``target`` and ``model`` columns and an
            ``r2`` column. Rows whose ``r2`` is missing are ignored. If the
            table has no ``r2`` column at all (for example an empty frame
            built from zero metric files) only the fixed statements are
            returned, matching the result for an all-missing ``r2`` column.

    Returns:
        The paragraph as a single newline-joined string.
    """
    lines = [
        'Climate risk assessment (CSRD-aligned):',
        '- Full ensemble stacking delivers the strongest skill across Gulf Coast t2m and precipitation.',
        '- Quantile heads sharpen VaR/ES guidance; precipitation tails remain the dominant loss driver.',
    ]
    # No R^2 column means there is nothing to rank; avoid a KeyError in dropna.
    if 'r2' not in summary.columns:
        return '\n'.join(lines)

    scored = summary.dropna(subset=['r2'])
    # Stable sort so that models tied on R^2 keep their input order, making
    # the per-target winner deterministic.
    ranked = scored.sort_values('r2', ascending=False, kind='mergesort')
    # After sorting, the first row of each target group is its best model.
    best = ranked.groupby('target').head(1)
    lines.extend(
        f"- {target} best model {model} (R^2={r2:.2f})."
        for target, model, r2 in zip(best['target'], best['model'], best['r2'])
    )
    return '\n'.join(lines)

# Print the CSRD summary paragraph built from the loaded metrics table.
print(csrd_paragraph(metrics_df))
