# ✅ Task Completion Analysis

**Evaluate actual task performance** in the sparse reward Overcooked environment.

## What this measures:
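- **Completion Score** = Mean Return - Minimum Penalty (`R_MIN`, the penalty floor), i.e. how far the return sits above the floor
- **Completion Norm** = Completion Score / |`R_MIN`| — normalized score (0 = worst, 1 = best) for returns between `R_MIN` and 0; positive returns give values above 1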
- Proxy for "dishes served" in Overcooked

⚠️ **Requires training results CSV from 02_training.ipynb**

In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Use seaborn's "whitegrid" theme for the plots drawn later in this notebook.
sns.set_theme(style="whitegrid")
# Visible confirmation that the import cell finished running.
print("Imports loaded!")

## Configuration

In [None]:
# ------------------------------------------
# Notebook settings
# ------------------------------------------

# Input: CSV of training results to analyse
RESULTS_CSV = "/content/drive/MyDrive/results_combined_new.csv"

# Outputs: per-seed metrics and the across-seed aggregate
TASK_PER_SEED = "/content/drive/MyDrive/task_completion_per_seed.csv"
TASK_AGG = "/content/drive/MyDrive/task_completion_agg.csv"

# Penalty floor of the sparse reward regime, used as the zero point of the
# completion score (roughly -40 for asymmetric_advantages)
R_MIN = -40.0

# Echo the active settings, one per line
print(
    f"Loading data from: {RESULTS_CSV}",
    f"Per-seed output: {TASK_PER_SEED}",
    f"Aggregated output: {TASK_AGG}",
    f"Reward floor: {R_MIN}",
    sep="\n",
)

## Load and Prepare Data

In [None]:
# Read the raw training results produced by the training notebook.
print(f"Loading training results from: {RESULTS_CSV}")
df = pd.read_csv(RESULTS_CSV)

# Check the schema BEFORE any column is accessed: filtering on "phase" first
# would raise a bare KeyError when that column is absent and the descriptive
# error below would never be reached.
expected_cols = {
    "baseline", "env", "seed",
    "phase", "mean_return", "std_dev", "train_minutes"
}
missing = expected_cols - set(df.columns)
if missing:
    raise ValueError(f"CSV is missing columns: {missing}")

# Only use final policies; copy so later column assignments do not touch `df`
df_final = df[df["phase"] == "final"].copy()

print(f"Loaded {len(df_final)} final results")

In [None]:
# Normalise environment names by trimming surrounding whitespace
df_final["env_clean"] = df_final["env"].str.strip()

# Collapse the data to a single row for each (baseline, env, seed) combination
seed_keys = ["baseline", "env_clean", "seed"]
seed_stats = {
    "mean_return": ("mean_return", "mean"),
    "std_return": ("mean_return", "std"),
    "mean_std_dev": ("std_dev", "mean"),
}
grouped = df_final.groupby(seed_keys, as_index=False).agg(**seed_stats)

print(f"Grouped into {len(grouped)} unique configurations")

## Compute Completion Metrics

In [None]:
# Completion metrics
# The normalisation below divides by |R_MIN|; with a zero floor pandas would
# silently fill the column with inf/NaN instead of failing, so stop early.
if R_MIN == 0:
    raise ValueError("R_MIN must be non-zero to normalize completion scores")

# completion_score = distance above penalty floor
grouped["completion_score"] = grouped["mean_return"] - R_MIN

# normalized to approx [0, 1] for R in [R_MIN, 0]; returns above 0 map above 1
grouped["completion_norm"] = grouped["completion_score"] / abs(R_MIN)

print("Completion metrics computed!")
# Last expression of the cell: the notebook renders this preview
grouped.head()

## Save Per-Seed Results

In [None]:
# Save per-seed metrics (one row per baseline/env/seed) without the index column
grouped.to_csv(TASK_PER_SEED, index=False)
# Status message; the leading emoji was mis-encoded and is restored here
print(f"✅ Saved per-seed task completion metrics to: {TASK_PER_SEED}")

## Aggregate Over Seeds

In [None]:
# Aggregate over seeds per (baseline, env): mean and standard deviation of
# each per-seed metric. The std is NaN for groups that contain a single seed.
agg = (
    grouped
    .groupby(["baseline", "env_clean"], as_index=False)
    .agg(
        mean_return_mean=("mean_return", "mean"),
        mean_return_std=("mean_return", "std"),

        completion_score_mean=("completion_score", "mean"),
        completion_score_std=("completion_score", "std"),

        completion_norm_mean=("completion_norm", "mean"),
        completion_norm_std=("completion_norm", "std"),
    )
)

agg.to_csv(TASK_AGG, index=False)
# Status message; the leading emoji was mis-encoded and is restored here
print(f"✅ Saved aggregated task completion metrics to: {TASK_AGG}")

## 📊 View Results

In [None]:
# Heading for the table below; the emoji was mis-encoded and is restored here
print("\n📊 Aggregated Task Completion Stats:")
# Show the aggregate ordered by environment, then baseline, at 2 decimals
display(agg.sort_values(["env_clean", "baseline"]).round(2))

In [None]:
# Summary by baseline (averaged across envs)
# Sort on the full-precision values and round only afterwards, so baselines
# whose scores differ by less than 0.01 are still ranked in the right order.
baseline_summary = (
    agg.groupby("baseline")
    .agg({
        "mean_return_mean": "mean",
        "completion_score_mean": "mean",
        "completion_norm_mean": "mean",
    })
    .sort_values("completion_norm_mean", ascending=False)
    .round(2)
)

# Heading for the ranking; the emoji was mis-encoded and is restored here
print("\n🏆 Baseline Ranking (by completion norm):")
display(baseline_summary)

## 📈 Visualization

In [None]:
# Grouped bars: one cluster per baseline, one bar per environment
plt.figure(figsize=(14, 6))
bar_ax = sns.barplot(
    data=agg,
    x="baseline",
    y="completion_score_mean",
    hue="env_clean",
)
bar_ax.set_title("Task Completion Score by Baseline and Environment")
bar_ax.set_xlabel("Baseline")
bar_ax.set_ylabel("Completion Score (higher = better)")
# Slant the baseline names so long labels do not overlap
plt.xticks(rotation=45)
bar_ax.legend(title="Environment")
plt.tight_layout()
plt.show()

In [None]:
# Baseline x environment grid of the mean normalized completion
pivot_spec = {
    "index": "baseline",
    "columns": "env_clean",
    "values": "completion_norm_mean",
}
pivot = agg.pivot(**pivot_spec)

plt.figure(figsize=(10, 6))
# Colour scale is pinned to [0, 1]; cells are annotated with 2 decimals
heat_ax = sns.heatmap(
    pivot,
    annot=True,
    fmt=".2f",
    cmap="RdYlGn",
    vmin=0,
    vmax=1,
)
heat_ax.set_title("Normalized Task Completion (0 = worst, 1 = best)")
heat_ax.set_xlabel("Environment")
heat_ax.set_ylabel("Baseline")
plt.tight_layout()
plt.show()

In [None]:
# Compare PPO+LLM vs the "Baseline" runs, environment by environment
# Both labels are hard-coded; if one is absent from the results its column
# would silently come out as NaN, so say so explicitly.
present = set(agg["baseline"])
absent = [name for name in ("PPO+LLM", "Baseline") if name not in present]
if absent:
    print(f"Warning: no rows for baseline(s) {absent}; affected comparison values will be NaN")

# Mean normalized completion per environment for each of the two baselines
ppo_llm = agg[agg["baseline"] == "PPO+LLM"].set_index("env_clean")["completion_norm_mean"]
baseline = agg[agg["baseline"] == "Baseline"].set_index("env_clean")["completion_norm_mean"]

# Series are aligned on the environment index; positive Improvement means
# PPO+LLM scored higher than Baseline in that environment
comparison = pd.DataFrame({
    "PPO+LLM": ppo_llm,
    "Baseline": baseline,
    "Improvement": ppo_llm - baseline
})

# Heading for the table; the emoji was mis-encoded and is restored here
print("\n🔍 PPO+LLM vs Baseline Comparison:")
display(comparison.round(3))