In [22]:
import sys
from pathlib import Path

# Add analysis tools to path
NOTEBOOK_DIR = Path.cwd()
ANALYSIS_ROOT = NOTEBOOK_DIR.parent if NOTEBOOK_DIR.name == "notebooks" else NOTEBOOK_DIR
if str(ANALYSIS_ROOT) not in sys.path:
    sys.path.insert(0, str(ANALYSIS_ROOT))

import pandas as pd
import numpy as np
import plotly.graph_objects as go
from plotly.subplots import make_subplots

from tools import wandb_io, config, paths
from tools.aggregations import runs_history_to_frame

## Configuration

In [23]:
# Sweep config
SWEEP_ID = "qbykqthl"
ENTITY = "thomasevers9"
PROJECT = "tdmpc2-tdmpc2"

# Plot config
MAX_STEP = 1000_000  # This sweep runs for 100k steps
METRIC_KEY = "eval/episode_reward"

# Task name mapping: sweep task name -> baseline CSV name
TASK_NAME_MAP = {
    "ball_in_cup-catch": "cup-catch",
    "finger-turn_easy": "finger-turn-easy",
    "finger-turn_hard": "finger-turn-hard",
    "cartpole-balance_sparse": "cartpole-balance-sparse",
    "cartpole-swingup_sparse": "cartpole-swingup-sparse",
}

# Baseline roots
BASELINE_ROOTS = {
    "TDMPC2": paths.BASELINE_TDMPC2,
    "DreamerV3": paths.BASELINE_DREAMERV3,
    "SAC": paths.BASELINE_SAC,
    "EZ2": paths.PROJECT_ROOT / "results" / "ez2_parsed",
    "SimbaV2": paths.PROJECT_ROOT / "results" / "simbav2_parsed",
    "BMPC": paths.PROJECT_ROOT / "results" / "BMPC_parsed" / "dmcontrol",
}

# Colors for methods
COLORS = {
    "Ours": "#1f77b4",      # Blue
    "TDMPC2": "#2ca02c",    # Green
    "DreamerV3": "#ff7f0e", # Orange
    "EZ2": "#d62728",       # Red
    "SimbaV2": "#9467bd",   # Purple
    "SAC": "#8c564b",       # Brown
    "BMPC": "#17becf",      # Cyan
}

## Load Our Results from W&B

In [24]:
# Fetch sweep runs from W&B (with caching)
runs, manifest, source = wandb_io.fetch_sweep_runs(
    entity=ENTITY,
    project=PROJECT,
    sweep_id=SWEEP_ID,
    history_keys=[METRIC_KEY, "_step"],
    use_cache=True,
    force_refresh=False,
)
print(f"Loaded {len(runs)} runs from {source}")
print(f"Fetched at: {manifest.get('fetched_at', 'N/A')}")

[34m[1mwandb[0m: Currently logged in as: [33mthomasevers9[0m to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin
Downloading sweep qbykqthl: 100%|██████████| 6/6 [00:17<00:00,  2.91s/it]

Loaded 6 runs from remote
Fetched at: 2026-01-26T13:16:09.874565+00:00





In [25]:
# Convert to DataFrame
ours_df = runs_history_to_frame(
    runs,
    metric_key=METRIC_KEY,
    step_keys=["_step"],
    config_to_columns={"task": "task", "seed": "seed"},
)

# Normalize task names to match baseline naming
ours_df["task"] = ours_df["task"].replace(TASK_NAME_MAP)

# Rename metric column to 'reward' for consistency
ours_df = ours_df.rename(columns={METRIC_KEY: "reward"})

print(f"Our results: {len(ours_df)} rows")
print(f"Tasks: {sorted(ours_df['task'].unique().tolist())}")
print(f"Seeds: {sorted(ours_df['seed'].unique().tolist())}")
print(f"Max step: {ours_df['step'].max()}")
ours_df.head()

Our results: 72 rows
Tasks: ['dog-run', 'dog-stand', 'dog-walk', 'humanoid-run', 'humanoid-stand', 'humanoid-walk']
Seeds: [1]
Max step: 600000


Unnamed: 0,task,seed,run_id,step,reward
0,dog-run,1,41dke6a2,50000,133.744781
1,dog-run,1,41dke6a2,100000,189.670563
2,dog-run,1,41dke6a2,150000,212.449631
3,dog-run,1,41dke6a2,200000,287.821472
4,dog-run,1,41dke6a2,250000,358.939484


## Load Baseline Results

In [26]:
def load_baseline_for_task(task: str, method: str, root: Path) -> pd.DataFrame:
    """Load baseline CSV for a task.
    
    Args:
        task: Normalized task name (e.g., 'cup-catch').
        method: Baseline method name.
        root: Path to baseline CSV directory.
    
    Returns:
        DataFrame with columns [step, reward, seed, task, method], or empty if not found.
    """
    csv_path = root / f"{task}.csv"
    if csv_path.exists():
        df = pd.read_csv(csv_path)
        df["task"] = task
        df["method"] = method
        return df
    return pd.DataFrame()


def load_all_baselines(tasks: list, max_step: int) -> pd.DataFrame:
    """Load all baseline results for given tasks.
    
    Args:
        tasks: List of normalized task names.
        max_step: Maximum step to include.
    
    Returns:
        Combined DataFrame of all baselines.
    """
    frames = []
    missing = {method: [] for method in BASELINE_ROOTS}
    
    for method, root in BASELINE_ROOTS.items():
        if not root.exists():
            print(f"Warning: {method} root not found: {root}")
            continue
            
        for task in tasks:
            df = load_baseline_for_task(task, method, root)
            if df.empty:
                missing[method].append(task)
            else:
                df = df[df["step"] <= max_step]
                frames.append(df)
    
    # Report missing
    for method, tasks_missing in missing.items():
        if tasks_missing:
            print(f"{method}: missing {len(tasks_missing)} tasks: {tasks_missing[:5]}..." 
                  if len(tasks_missing) > 5 else f"{method}: missing {tasks_missing}")
    
    if not frames:
        return pd.DataFrame()
    return pd.concat(frames, ignore_index=True)

In [27]:
# Get unique tasks from our results
our_tasks = sorted(ours_df["task"].unique().tolist())
print(f"Our tasks ({len(our_tasks)}): {our_tasks}")

Our tasks (6): ['dog-run', 'dog-stand', 'dog-walk', 'humanoid-run', 'humanoid-stand', 'humanoid-walk']


In [28]:
# Load baselines
baselines_df = load_all_baselines(our_tasks, MAX_STEP)
print(f"\nBaseline results: {len(baselines_df)} rows")

if not baselines_df.empty:
    coverage = baselines_df.groupby('method')['task'].nunique()
    print(f"\nTask coverage per method:")
    for method, count in coverage.items():
        print(f"  {method}: {count}/{len(our_tasks)} tasks")

EZ2: missing 6 tasks: ['dog-run', 'dog-stand', 'dog-walk', 'humanoid-run', 'humanoid-stand']...

Baseline results: 1728 rows

Task coverage per method:
  BMPC: 6/6 tasks
  DreamerV3: 6/6 tasks
  SAC: 6/6 tasks
  SimbaV2: 6/6 tasks
  TDMPC2: 6/6 tasks


## Combine Data

In [29]:
# Add method column to our results
ours_df["method"] = "Ours"

# Filter our results to max step
ours_filtered = ours_df[ours_df["step"] <= MAX_STEP].copy()

# Combine all data
all_data = pd.concat([ours_filtered, baselines_df], ignore_index=True)
print(f"Total rows: {len(all_data)}")
print(f"Methods: {sorted(all_data['method'].unique().tolist())}")

Total rows: 1800
Methods: ['BMPC', 'DreamerV3', 'Ours', 'SAC', 'SimbaV2', 'TDMPC2']


## Plotting Functions

In [30]:
def hex_to_rgba(hex_color: str, alpha: float) -> str:
    """Convert hex color to rgba string."""
    hex_color = hex_color.lstrip('#')
    r, g, b = tuple(int(hex_color[i:i+2], 16) for i in (0, 2, 4))
    return f"rgba({r},{g},{b},{alpha})"


def compute_stats(df: pd.DataFrame, group_cols: list) -> pd.DataFrame:
    """Compute mean and std across seeds."""
    return df.groupby(group_cols).agg(
        mean_reward=("reward", "mean"),
        std_reward=("reward", "std"),
        n_seeds=("reward", "count")
    ).reset_index()

In [31]:
def plot_task_comparison(
    data: pd.DataFrame,
    task: str,
    colors: dict,
) -> go.Figure:
    """Create a comparison plot for a single task.
    
    Args:
        data: DataFrame with columns [step, reward, method, task].
        task: Task name to plot.
        colors: Dict mapping method names to hex colors.
    
    Returns:
        Plotly Figure, or None if no data for task.
    """
    task_data = data[data["task"] == task]
    if task_data.empty:
        return None
    
    stats = compute_stats(task_data, ["method", "step"])
    fig = go.Figure()
    
    # Plot Ours first, then baselines
    methods = ["Ours"] + [m for m in sorted(stats["method"].unique()) if m != "Ours"]
    
    for method in methods:
        method_stats = stats[stats["method"] == method].sort_values("step")
        if method_stats.empty:
            continue
        
        color = colors.get(method, "#666666")
        x = method_stats["step"].values
        y_mean = method_stats["mean_reward"].values
        y_std = np.nan_to_num(method_stats["std_reward"].values, nan=0)
        
        # Shaded std region - use numpy arrays
        x_fill = np.concatenate([x, x[::-1]])
        y_fill = np.concatenate([y_mean + y_std, (y_mean - y_std)[::-1]])
        
        fig.add_trace(go.Scatter(
            x=x_fill,
            y=y_fill,
            fill="toself",
            fillcolor=hex_to_rgba(color, 0.2),
            line=dict(color="rgba(0,0,0,0)"),
            showlegend=False,
            hoverinfo="skip",
        ))
        
        # Mean line (solid for Ours, dashed for baselines)
        line_style = dict(color=color, width=2)
        if method != "Ours":
            line_style["dash"] = "dash"
        
        fig.add_trace(go.Scatter(
            x=x,
            y=y_mean,
            mode="lines",
            name=method,
            line=line_style,
            hovertemplate=f"{method}<br>Step: %{{x:,}}<br>Reward: %{{y:.1f}}<extra></extra>",
        ))
    
    fig.update_layout(
        title=dict(text=f"<b>{task}</b>", x=0.5),
        xaxis_title="Environment Steps",
        yaxis_title="Episode Reward",
        legend=dict(x=0.02, y=0.98, bgcolor="rgba(255,255,255,0.8)"),
        hovermode="x unified",
        template="plotly_white",
        width=700,
        height=450,
    )
    
    return fig

## Individual Task Plots

In [32]:
# Generate plots for all tasks
figures = {}
for task in our_tasks:
    fig = plot_task_comparison(all_data, task, COLORS)
    if fig is not None:
        figures[task] = fig

print(f"Generated {len(figures)} task plots")

Generated 6 task plots


In [33]:
# Show first 4 plots
for task in list(figures.keys())[:4]:
    print(f"\n--- {task} ---")
    figures[task].show()


--- dog-run ---



--- dog-stand ---



--- dog-walk ---



--- humanoid-run ---


## Grid View (All Tasks)

In [34]:
def create_grid_figure(
    data: pd.DataFrame,
    tasks: list,
    colors: dict,
    cols: int = 3,
) -> go.Figure:
    """Create grid of comparison plots for all tasks.
    
    Args:
        data: Combined DataFrame with all methods.
        tasks: List of task names to plot.
        colors: Dict mapping method names to hex colors.
        cols: Number of columns in grid.
    
    Returns:
        Plotly Figure with subplots.
    """
    n_tasks = len(tasks)
    rows = (n_tasks + cols - 1) // cols
    
    fig = make_subplots(
        rows=rows,
        cols=cols,
        subplot_titles=tasks,
        vertical_spacing=0.08,
        horizontal_spacing=0.05,
    )
    
    legend_added = set()
    
    for idx, task in enumerate(tasks):
        row = idx // cols + 1
        col = idx % cols + 1
        
        task_data = data[data["task"] == task]
        if task_data.empty:
            continue
        
        stats = compute_stats(task_data, ["method", "step"])
        methods = ["Ours"] + [m for m in sorted(stats["method"].unique()) if m != "Ours"]
        
        for method in methods:
            method_stats = stats[stats["method"] == method].sort_values("step")
            if method_stats.empty:
                continue
            
            color = colors.get(method, "#666666")
            x = method_stats["step"].values
            y_mean = method_stats["mean_reward"].values
            y_std = np.nan_to_num(method_stats["std_reward"].values, nan=0)
            
            show_legend = method not in legend_added
            legend_added.add(method)
            
            # Shaded region - use numpy arrays
            x_fill = np.concatenate([x, x[::-1]])
            y_fill = np.concatenate([y_mean + y_std, (y_mean - y_std)[::-1]])
            
            fig.add_trace(
                go.Scatter(
                    x=x_fill,
                    y=y_fill,
                    fill="toself",
                    fillcolor=hex_to_rgba(color, 0.15),
                    line=dict(color="rgba(0,0,0,0)"),
                    showlegend=False,
                    hoverinfo="skip",
                    legendgroup=method,
                ),
                row=row, col=col
            )
            
            # Mean line
            line_style = dict(color=color, width=1.5)
            if method != "Ours":
                line_style["dash"] = "dash"
            
            fig.add_trace(
                go.Scatter(
                    x=x,
                    y=y_mean,
                    mode="lines",
                    name=method,
                    line=line_style,
                    showlegend=show_legend,
                    legendgroup=method,
                    hovertemplate=f"{method}: %{{y:.0f}}<extra></extra>",
                ),
                row=row, col=col
            )
    
    fig.update_layout(
        height=280 * rows,
        width=1000,
        title_text="<b>DM-Control Benchmark (2 Enc Layers): Ours vs Baselines</b>",
        title_x=0.5,
        showlegend=True,
        legend=dict(x=1.01, y=1, bgcolor="rgba(255,255,255,0.9)"),
        template="plotly_white",
    )
    
    return fig

In [35]:
grid_fig = create_grid_figure(all_data, our_tasks, COLORS, cols=3)
grid_fig.show()

## Normalized Aggregate Performance

Average performance across all tasks, normalized by max performance per task.

In [36]:
def compute_normalized_aggregate(
    data: pd.DataFrame,
    eval_step: int = 100_000,
    common_steps: list = None,
) -> pd.DataFrame:
    """Compute normalized aggregate performance across tasks.
    
    For each task, normalize rewards by max performance at eval_step.
    Interpolate to common steps, then average across tasks per method.
    
    Args:
        data: Combined DataFrame with [step, reward, method, task].
        eval_step: Step at which to compute max for normalization.
        common_steps: List of steps to interpolate to. If None, uses [0, 10k, ..., eval_step].
    
    Returns:
        DataFrame with [method, step, mean_normalized, std_normalized, n_tasks].
    """
    if common_steps is None:
        common_steps = list(range(0, eval_step + 1, 10_000))
    
    # First, compute max reward per task at eval_step (across all methods)
    task_max = {}
    for task in data["task"].unique():
        task_data = data[data["task"] == task]
        max_reward = 0
        for method in task_data["method"].unique():
            method_data = task_data[task_data["method"] == method]
            available_steps = method_data["step"].unique()
            valid_steps = available_steps[available_steps <= eval_step]
            if len(valid_steps) > 0:
                step = valid_steps.max()
                rewards_at_step = method_data[method_data["step"] == step]["reward"]
                max_reward = max(max_reward, rewards_at_step.mean())
        task_max[task] = max_reward if max_reward > 0 else 1
    
    print("Max rewards per task (for normalization):")
    for task, max_r in sorted(task_max.items()):
        print(f"  {task}: {max_r:.1f}")
    
    # Normalize all rewards
    data_normalized = data.copy()
    data_normalized["normalized_reward"] = data_normalized.apply(
        lambda r: r["reward"] / task_max[r["task"]], axis=1
    )
    
    # Interpolate each (method, task) to common steps
    interpolated_rows = []
    for (method, task), group in data_normalized.groupby(["method", "task"]):
        # Average across seeds first, then interpolate
        seed_avg = group.groupby("step")["normalized_reward"].mean().reset_index()
        seed_avg = seed_avg.sort_values("step")
        
        # Interpolate to common steps
        for target_step in common_steps:
            if target_step < seed_avg["step"].min():
                val = seed_avg["normalized_reward"].iloc[0] if len(seed_avg) > 0 else 0
            elif target_step > seed_avg["step"].max():
                val = seed_avg["normalized_reward"].iloc[-1]
            else:
                val = np.interp(target_step, seed_avg["step"], seed_avg["normalized_reward"])
            
            interpolated_rows.append({
                "method": method,
                "task": task,
                "step": target_step,
                "normalized_reward": val,
            })
    
    interpolated_df = pd.DataFrame(interpolated_rows)
    
    # Aggregate across tasks
    aggregate = interpolated_df.groupby(["method", "step"]).agg(
        mean_normalized=("normalized_reward", "mean"),
        std_normalized=("normalized_reward", "std"),
        n_tasks=("normalized_reward", "count")
    ).reset_index()
    
    return aggregate

In [37]:
def plot_normalized_aggregate(
    aggregate_df: pd.DataFrame,
    colors: dict,
) -> go.Figure:
    """Plot normalized aggregate performance across tasks.
    
    Args:
        aggregate_df: DataFrame from compute_normalized_aggregate.
        colors: Dict mapping method names to hex colors.
    
    Returns:
        Plotly Figure.
    """
    fig = go.Figure()
    
    # Plot Ours first, then baselines
    methods = ["Ours"] + [m for m in sorted(aggregate_df["method"].unique()) if m != "Ours"]
    
    for method in methods:
        method_data = aggregate_df[aggregate_df["method"] == method].sort_values("step")
        if method_data.empty:
            continue
        
        color = colors.get(method, "#666666")
        x = method_data["step"].values
        y_mean = method_data["mean_normalized"].values
        y_std = np.nan_to_num(method_data["std_normalized"].values, nan=0)
        
        # Shaded std region - use numpy arrays
        x_fill = np.concatenate([x, x[::-1]])
        y_fill = np.concatenate([y_mean + y_std, (y_mean - y_std)[::-1]])
        
        fig.add_trace(go.Scatter(
            x=x_fill,
            y=y_fill,
            fill="toself",
            fillcolor=hex_to_rgba(color, 0.2),
            line=dict(color="rgba(0,0,0,0)"),
            showlegend=False,
            hoverinfo="skip",
        ))
        
        # Mean line
        line_style = dict(color=color, width=2.5)
        if method != "Ours":
            line_style["dash"] = "dash"
            line_style["width"] = 2
        
        fig.add_trace(go.Scatter(
            x=x,
            y=y_mean,
            mode="lines",
            name=method,
            line=line_style,
            hovertemplate=f"{method}<br>Step: %{{x:,}}<br>Normalized: %{{y:.3f}}<extra></extra>",
        ))
    
    fig.update_layout(
        title=dict(text="<b>Normalized Aggregate Performance (Mean over Tasks)</b>", x=0.5),
        xaxis_title="Environment Steps",
        yaxis_title="Normalized Reward (fraction of max)",
        legend=dict(x=0.02, y=0.98, bgcolor="rgba(255,255,255,0.8)"),
        hovermode="x unified",
        template="plotly_white",
        width=800,
        height=500,
        yaxis=dict(range=[0, 1.05]),
    )
    
    return fig

In [38]:
# Compute and plot normalized aggregate
aggregate_df = compute_normalized_aggregate(all_data, eval_step=MAX_STEP)
aggregate_fig = plot_normalized_aggregate(aggregate_df, COLORS)
aggregate_fig.show()

Max rewards per task (for normalization):
  dog-run: 555.7
  dog-stand: 972.1
  dog-walk: 949.7
  humanoid-run: 428.9
  humanoid-stand: 933.0
  humanoid-walk: 917.4


## Summary Table

In [39]:
def create_summary_table(data: pd.DataFrame, eval_step: int = None) -> pd.DataFrame:
    """Create summary table with final mean/std per method per task.
    
    Args:
        data: Combined DataFrame.
        eval_step: Step at which to evaluate. If None, uses max step per method.
    
    Returns:
        DataFrame with one row per task, columns for each method's mean/std.
    """
    results = []
    methods = sorted(data["method"].unique())
    
    for task in sorted(data["task"].unique()):
        task_data = data[data["task"] == task]
        row = {"task": task}
        
        for method in methods:
            method_data = task_data[task_data["method"] == method]
            if method_data.empty:
                row[f"{method}_mean"] = np.nan
                row[f"{method}_std"] = np.nan
                continue
            
            # Get data at eval step or max available
            if eval_step is not None:
                available_steps = method_data["step"].unique()
                valid_steps = available_steps[available_steps <= eval_step]
                if len(valid_steps) == 0:
                    step = available_steps.max()
                else:
                    step = valid_steps.max()
            else:
                step = method_data["step"].max()
            
            final_data = method_data[method_data["step"] == step]
            row[f"{method}_mean"] = final_data["reward"].mean()
            row[f"{method}_std"] = final_data["reward"].std()
        
        results.append(row)
    
    return pd.DataFrame(results)

In [40]:
# Summary at 100k steps (this sweep's max)
summary_100k = create_summary_table(all_data, eval_step=100_000)
print("Summary at 100k steps (or max available):")
summary_100k.round(1)

Summary at 100k steps (or max available):


Unnamed: 0,task,BMPC_mean,BMPC_std,DreamerV3_mean,DreamerV3_std,Ours_mean,Ours_std,SAC_mean,SAC_std,SimbaV2_mean,SimbaV2_std,TDMPC2_mean,TDMPC2_std
0,dog-run,35.1,23.8,0.0,,189.7,,8.3,2.5,106.3,9.5,26.6,16.7
1,dog-stand,174.3,99.2,50.6,,843.4,,32.6,9.0,511.5,61.5,146.5,115.7
2,dog-walk,67.9,16.0,9.0,,399.2,,12.6,3.3,229.6,48.3,34.6,14.8
3,humanoid-run,0.9,0.1,1.2,,1.3,,0.8,0.2,16.4,23.9,1.2,0.3
4,humanoid-stand,5.6,0.4,7.6,,5.3,,4.6,1.5,13.5,15.0,5.9,0.5
5,humanoid-walk,1.1,0.1,0.6,,140.6,,1.1,0.3,33.5,62.7,1.5,0.1


In [41]:
def format_summary_pretty(df: pd.DataFrame) -> pd.DataFrame:
    """Format summary with 'mean ± std' columns."""
    methods = sorted(set(col.replace("_mean", "") for col in df.columns if col.endswith("_mean")))
    
    formatted = df[["task"]].copy()
    for method in methods:
        mean_col = f"{method}_mean"
        std_col = f"{method}_std"
        if mean_col in df.columns:
            formatted[method] = df.apply(
                lambda r: f"{r[mean_col]:.0f} ± {r[std_col]:.0f}" 
                if pd.notna(r[mean_col]) else "—",
                axis=1
            )
    return formatted

pretty_summary = format_summary_pretty(summary_100k)
pretty_summary

Unnamed: 0,task,BMPC,DreamerV3,Ours,SAC,SimbaV2,TDMPC2
0,dog-run,35 ± 24,0 ± nan,190 ± nan,8 ± 2,106 ± 9,27 ± 17
1,dog-stand,174 ± 99,51 ± nan,843 ± nan,33 ± 9,512 ± 62,147 ± 116
2,dog-walk,68 ± 16,9 ± nan,399 ± nan,13 ± 3,230 ± 48,35 ± 15
3,humanoid-run,1 ± 0,1 ± nan,1 ± nan,1 ± 0,16 ± 24,1 ± 0
4,humanoid-stand,6 ± 0,8 ± nan,5 ± nan,5 ± 1,14 ± 15,6 ± 0
5,humanoid-walk,1 ± 0,1 ± nan,141 ± nan,1 ± 0,34 ± 63,1 ± 0


## Save Results

In [42]:
# Create output directory
output_dir = paths.notebook_results_dir("11_dmcontrol_2enc_benchmark")
print(f"Output directory: {output_dir}")

# Save summary tables
summary_100k.to_csv(output_dir / "summary_100k.csv", index=False)
pretty_summary.to_csv(output_dir / "summary_100k_pretty.csv", index=False)

# Save grid figure
grid_fig.write_html(output_dir / "grid_comparison.html")

# Save normalized aggregate results
aggregate_df.to_csv(output_dir / "normalized_aggregate.csv", index=False)
aggregate_fig.write_html(output_dir / "normalized_aggregate.html")

# Save individual task figures
for task, fig in figures.items():
    safe_name = task.replace("-", "_")
    fig.write_html(output_dir / f"{safe_name}.html")

print(f"\nSaved:")
print(f"  - summary_100k.csv")
print(f"  - summary_100k_pretty.csv")
print(f"  - grid_comparison.html")
print(f"  - normalized_aggregate.csv")
print(f"  - normalized_aggregate.html")
print(f"  - {len(figures)} individual task plots")

Output directory: /gpfs/work4/0/prjs0951/Thomas/Thesis/RL_weather/tdmpc2-with-return-based-auxiliary-tasks.worktrees/main/analysis/results/11_dmcontrol_2enc_benchmark

Saved:
  - summary_100k.csv
  - summary_100k_pretty.csv
  - grid_comparison.html
  - normalized_aggregate.csv
  - normalized_aggregate.html
  - 6 individual task plots
