# 🛡️ Robustness Analysis

**Analyze how well policies generalize** across different perturbation regimes.

## What this measures:
- **Delta Noise** = V(No Noise) - V(Noise)
- **Delta Delay** = V(No Noise) - V(Delay)  
- **Delta Combo** = V(No Noise) - V(Combo)

**Lower deltas = more robust** (performance doesn't degrade much under perturbations)

⚠️ **Requires training results CSV from 02_training.ipynb**

In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Apply seaborn's "whitegrid" theme once, up front, so every figure drawn
# later in the notebook picks it up.
sns.set_theme(style="whitegrid")
# Simple confirmation that the import cell ran to completion.
print("Imports loaded!")

## Configuration

In [None]:
# ------------------------------------------
# CONFIGURATION
# ------------------------------------------
# Edit the three paths below to point at your own files.

# Input: CSV of training results that this notebook analyses.
RESULTS_CSV = "/content/drive/MyDrive/results_combined_new.csv"

# Outputs: one table with a row per (baseline, seed), and one table
# aggregated over seeds.
ROBUST_PER_SEED = "/content/drive/MyDrive/robustness_deltas_per_seed.csv"
ROBUST_AGG = "/content/drive/MyDrive/robustness_deltas_agg.csv"

# Echo the resolved paths so a wrong location is noticed before any work runs.
for _label, _path in (
    ("Loading data from", RESULTS_CSV),
    ("Per-seed output", ROBUST_PER_SEED),
    ("Aggregated output", ROBUST_AGG),
):
    print(f"{_label}: {_path}")

## Load and Prepare Data

In [None]:
# Load the raw training results and keep only the rows for final policies.
print(f"Loading training results from: {RESULTS_CSV}")
df = pd.read_csv(RESULTS_CSV)

# Sanity check columns BEFORE any column is accessed: filtering on "phase"
# first would turn a missing "phase" column into a bare KeyError instead of
# the descriptive error below.
expected_cols = {"baseline", "env", "seed", "phase", "mean_return", "std_dev", "train_minutes"}
missing = expected_cols - set(df.columns)
if missing:
    raise ValueError(f"CSV is missing columns: {missing}")

# We only care about final policies
df_final = df[df["phase"] == "final"].copy()

# Without any "final" rows every later table and plot would be empty, so
# stop here with a message that shows which phases the file does contain.
if df_final.empty:
    raise ValueError(
        "CSV contains no rows with phase == 'final'; "
        f"phases found: {sorted(df['phase'].dropna().astype(str).unique())}"
    )

print(f"Loaded {len(df_final)} final results")
print(f"Baselines: {df_final['baseline'].unique()}")
print(f"Environments: {df_final['env'].unique()}")

In [None]:
# Canonical display label for each environment, keyed by the trimmed,
# lower-cased form of the name found in the CSV.
env_map = {
    "no noise": "No Noise",
    "noise": "Noise",
    "delay": "Delay",
    "combo": "Combo",
}

# Step 1: strip surrounding whitespace and lower-case the raw names.
# Step 2: translate the normalized names to their canonical labels.
df_final["env_norm"] = df_final["env"].str.strip().str.lower()
df_final["env_clean"] = df_final["env_norm"].map(env_map)

# .map() leaves NaN for any name that has no entry in env_map; list the
# offending raw names (each once) and abort.
unmapped = df_final["env_clean"].isna()
if unmapped.any():
    bad_rows = df_final.loc[unmapped, ["env"]].drop_duplicates()
    raise ValueError(f"Unexpected env names found:\n{bad_rows}")

print("Environment names normalized!")

## Compute Robustness Deltas

In [None]:
# Collapse to a single mean_return per (baseline, env, seed) by averaging
# any repeated rows.
group_keys = ["baseline", "env_clean", "seed"]
grouped = df_final.groupby(group_keys, as_index=False)["mean_return"].mean()

# Reshape to wide form: one row per (baseline, seed), one column per
# environment holding that environment's return.
pivot = grouped.pivot_table(
    values="mean_return",
    index=["baseline", "seed"],
    columns="env_clean",
).reset_index()

# An environment that never appears in the data still gets a column (all NaN)
# so the subtractions below cannot fail on a missing column.
for env_name in ("No Noise", "Noise", "Delay", "Combo"):
    if env_name not in pivot.columns:
        pivot[env_name] = np.nan

# Per-seed robustness delta: return without perturbation minus the return
# under the given perturbation.
for delta_name, perturbed_env in (
    ("delta_noise", "Noise"),
    ("delta_delay", "Delay"),
    ("delta_combo", "Combo"),
):
    pivot[delta_name] = pivot["No Noise"] - pivot[perturbed_env]

print("Deltas computed!")
pivot.head()

## Save Per-Seed Results

In [None]:
# Save per-seed robustness table: identifiers first, then the raw return in
# each environment, then the three deltas derived from them.
per_seed_cols = [
    "baseline", "seed",
    "No Noise", "Noise", "Delay", "Combo",
    "delta_noise", "delta_delay", "delta_combo",
]
# index=False: the row index carries no information, so keep it out of the CSV.
pivot[per_seed_cols].to_csv(ROBUST_PER_SEED, index=False)
# The check-mark emoji was previously mis-encoded in this message.
print(f"✅ Saved per-seed robustness deltas to: {ROBUST_PER_SEED}")

## Aggregate Over Seeds

In [None]:
# Aggregate over seeds: mean and std for each baseline.
#
# Maps the prefix of each pair of output columns to the `pivot` column it
# summarizes; every entry yields "<prefix>_mean" and "<prefix>_std".
stat_sources = {
    "V_no_noise": "No Noise",
    "V_noise": "Noise",
    "V_delay": "Delay",
    "V_combo": "Combo",
    "delta_noise": "delta_noise",
    "delta_delay": "delta_delay",
    "delta_combo": "delta_combo",
}

# Named-aggregation spec: output column name -> (source column, statistic).
# Insertion order fixes the column order of the resulting table.
agg_spec = {}
for prefix, source_col in stat_sources.items():
    agg_spec[f"{prefix}_mean"] = (source_col, "mean")
    agg_spec[f"{prefix}_std"] = (source_col, "std")

# One row per baseline; reset_index() turns the group key back into an
# ordinary "baseline" column so it is written to the CSV.
agg_df = pivot.groupby("baseline").agg(**agg_spec).reset_index()

agg_df.to_csv(ROBUST_AGG, index=False)
# The check-mark emoji was previously mis-encoded in this message.
print(f"✅ Saved aggregated robustness stats to: {ROBUST_AGG}")

## 📊 View Results

In [None]:
# Show every aggregated statistic, rounded to two decimals for readability.
# (The chart emoji in the heading was previously mis-encoded.)
print("\n📊 Aggregated Robustness Results:")
display(agg_df.round(2))

In [None]:
# Focus on deltas
delta_cols = ["baseline", "delta_noise_mean", "delta_delay_mean", "delta_combo_mean"]
print("\n🔍 Robustness Deltas (lower = more robust):")
# Sort on the full-precision values and round only for display: rounding
# first can turn nearly-equal deltas into ties and rank baselines incorrectly.
display(agg_df[delta_cols].sort_values("delta_combo_mean").round(2))

## 📈 Visualization

In [None]:
# Bar chart of robustness deltas: one horizontal-bar panel per perturbation,
# with baselines ordered by that panel's delta.
panels = [
    ("delta_noise_mean", "Delta Noise"),
    ("delta_delay_mean", "Delta Delay"),
    ("delta_combo_mean", "Delta Combo"),
]

fig, axes = plt.subplots(1, 3, figsize=(15, 5))

for ax, (delta_col, title) in zip(axes, panels):
    data = agg_df.sort_values(delta_col)

    # Bars strictly below the panel's median delta are green, all others red.
    median_delta = data[delta_col].median()
    colors = []
    for v in data[delta_col]:
        colors.append("green" if v < median_delta else "red")

    ax.barh(data["baseline"], data[delta_col], color=colors)
    ax.set_title(title)
    ax.set_xlabel("Performance Drop")
    # Dashed reference line at zero, i.e. no performance drop.
    ax.axvline(0, color="black", linestyle="--", alpha=0.5)

plt.tight_layout()
# y=1.02 places the overall title slightly above the panels.
plt.suptitle("Robustness Deltas (Lower = More Robust)", y=1.02, fontsize=14)
plt.show()

In [None]:
# Heatmap of performance across environments: rows are baselines, columns
# are environments, each cell is the mean return over seeds.
env_labels = {
    "V_no_noise_mean": "No Noise",
    "V_noise_mean": "Noise",
    "V_delay_mean": "Delay",
    "V_combo_mean": "Combo",
}
perf_cols = ["baseline", *env_labels]
# Index by baseline and swap the internal column names for display labels.
perf_df = agg_df[perf_cols].set_index("baseline").rename(columns=env_labels)

plt.figure(figsize=(10, 6))
# annot/fmt print each cell's value with one decimal place.
heat_ax = sns.heatmap(perf_df, annot=True, fmt=".1f", cmap="YlGnBu", center=0)
heat_ax.set_title("Mean Return by Baseline and Environment")
heat_ax.set_xlabel("Environment")
heat_ax.set_ylabel("Baseline")
plt.tight_layout()
plt.show()