# Humanoid-Bench Comparison

Compare **Ours** (scaled architecture: 8Q + 3 enc layers) against **SimbaV2** on Humanoid-Bench H1 tasks.

Sweep: `63_humanoid_bench_baseline_2`
- Tasks: balance_simple, walk, slide
- Seeds: 2 per task
- Steps: 1M

In [1]:
import sys
from pathlib import Path

# Add analysis tools to path
NOTEBOOK_DIR = Path.cwd()
ANALYSIS_ROOT = NOTEBOOK_DIR.parent if NOTEBOOK_DIR.name == "notebooks" else NOTEBOOK_DIR
if str(ANALYSIS_ROOT) not in sys.path:
    sys.path.insert(0, str(ANALYSIS_ROOT))

import pandas as pd
import numpy as np
import plotly.graph_objects as go
from plotly.subplots import make_subplots

from tools import wandb_io, config, paths
from tools.aggregations import runs_history_to_frame

## Configuration

In [2]:
# Sweep config
SWEEP_ID = "hhcmrr4v"
ENTITY = "thomasevers9"
PROJECT = "tdmpc2-tdmpc2"

# Plot config
MAX_STEP = 1_000_000  # Plot up to 1M steps
METRIC_KEY = "eval/episode_reward"

# Task name mapping: sweep task name -> SimbaV2 CSV name
TASK_NAME_MAP = {
    "humanoid_h1-balance_simple-v0": "h1_balance_simple_v0",
    "humanoid_h1-walk-v0": "h1_walk_v0",
    "humanoid_h1-slide-v0": "h1_slide_v0",
}

# Display names for plots
TASK_DISPLAY_NAMES = {
    "h1_balance_simple_v0": "H1 Balance Simple",
    "h1_walk_v0": "H1 Walk",
    "h1_slide_v0": "H1 Slide",
}

# SimbaV2 data path
SIMBAV2_CSV = paths.PROJECT_ROOT / "results" / "simbav2" / "simbaV2_utd8.csv"

# Colors for methods
COLORS = {
    "Ours": "#1f77b4",      # Blue
    "SimbaV2": "#9467bd",   # Purple
}

## Load Our Results from W&B

In [3]:
# Fetch sweep runs from W&B (with caching)
runs, manifest, source = wandb_io.fetch_sweep_runs(
    entity=ENTITY,
    project=PROJECT,
    sweep_id=SWEEP_ID,
    history_keys=[METRIC_KEY, "_step"],
    use_cache=True,
    force_refresh=False,
)
print(f"Loaded {len(runs)} runs from {source}")
print(f"Fetched at: {manifest.get('fetched_at', 'N/A')}")

[34m[1mwandb[0m: Currently logged in as: [33mthomasevers9[0m to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin
Downloading sweep hhcmrr4v: 100%|██████████| 6/6 [00:15<00:00,  2.66s/it]

Loaded 6 runs from remote
Fetched at: 2025-12-18T09:08:12.204885+00:00





In [4]:
# Convert to DataFrame
ours_df = runs_history_to_frame(
    runs,
    metric_key=METRIC_KEY,
    step_keys=["_step"],
    config_to_columns={"task": "task", "seed": "seed"},
)

# Normalize task names to match SimbaV2 naming
ours_df["task"] = ours_df["task"].replace(TASK_NAME_MAP)

# Rename metric column to 'reward' for consistency
ours_df = ours_df.rename(columns={METRIC_KEY: "reward"})
ours_df["method"] = "Ours"

print(f"Our results: {len(ours_df)} rows")
print(f"Tasks: {sorted(ours_df['task'].unique().tolist())}")
print(f"Seeds: {sorted(ours_df['seed'].unique().tolist())}")
print(f"Max step: {ours_df['step'].max()}")
ours_df.head()

Our results: 24 rows
Tasks: ['h1_balance_simple_v0', 'h1_slide_v0', 'h1_walk_v0']
Seeds: [42, 43]
Max step: 200172


Unnamed: 0,task,seed,run_id,step,reward,method
0,h1_walk_v0,43,57auulc8,50090,25.594549,Ours
1,h1_walk_v0,43,57auulc8,100087,46.200176,Ours
2,h1_walk_v0,43,57auulc8,150180,124.997009,Ours
3,h1_walk_v0,43,57auulc8,200172,216.589874,Ours
4,h1_walk_v0,42,bx9wig46,50017,12.627457,Ours


## Load SimbaV2 Baseline

In [5]:
def load_simbav2_tasks(csv_path: Path, tasks: list, max_step: int) -> pd.DataFrame:
    """Load SimbaV2 results for specific tasks.
    
    Args:
        csv_path: Path to SimbaV2 CSV file.
        tasks: List of task names (SimbaV2 naming convention).
        max_step: Maximum step to include.
    
    Returns:
        DataFrame with columns [step, reward, seed, task, method].
    """
    df = pd.read_csv(csv_path)
    
    # Filter to avg_return metric and requested tasks
    df = df[(df["metric"] == "avg_return") & (df["env_name"].isin(tasks))]
    
    # Rename columns to standard format
    df = df.rename(columns={
        "env_step": "step",
        "value": "reward",
        "env_name": "task",
    })
    
    # Keep only needed columns and add method
    df = df[["step", "reward", "seed", "task"]].copy()
    df["method"] = "SimbaV2"
    
    # Filter to max step
    df = df[df["step"] <= max_step]
    
    return df

In [6]:
# Get tasks from our results
our_tasks = sorted(ours_df["task"].unique().tolist())
print(f"Our tasks: {our_tasks}")

# Load SimbaV2 data for these tasks
simbav2_df = load_simbav2_tasks(SIMBAV2_CSV, our_tasks, MAX_STEP)
print(f"\nSimbaV2 results: {len(simbav2_df)} rows")
print(f"Tasks: {sorted(simbav2_df['task'].unique().tolist())}")
print(f"Seeds: {sorted(simbav2_df['seed'].unique().tolist())}")
simbav2_df.head()

Our tasks: ['h1_balance_simple_v0', 'h1_slide_v0', 'h1_walk_v0']

SimbaV2 results: 396 rows
Tasks: ['h1_balance_simple_v0', 'h1_slide_v0', 'h1_walk_v0']
Seeds: [0, 1000, 2000, 3000, 4000]


Unnamed: 0,step,reward,seed,task,method
4482,0.0,4.097499,0,h1_walk_v0,SimbaV2
4483,100000.0,33.929088,0,h1_walk_v0,SimbaV2
4484,200000.0,126.686775,0,h1_walk_v0,SimbaV2
4485,300000.0,261.875216,0,h1_walk_v0,SimbaV2
4486,400000.0,463.215019,0,h1_walk_v0,SimbaV2


## Combine Data

In [7]:
# Filter our results to max step
ours_filtered = ours_df[ours_df["step"] <= MAX_STEP].copy()

# Combine all data
all_data = pd.concat([ours_filtered, simbav2_df], ignore_index=True)
print(f"Total rows: {len(all_data)}")
print(f"Methods: {sorted(all_data['method'].unique().tolist())}")

Total rows: 420
Methods: ['Ours', 'SimbaV2']


## Plotting Functions

In [8]:
def hex_to_rgba(hex_color: str, alpha: float) -> str:
    """Convert hex color to rgba string."""
    hex_color = hex_color.lstrip('#')
    r, g, b = tuple(int(hex_color[i:i+2], 16) for i in (0, 2, 4))
    return f"rgba({r},{g},{b},{alpha})"


def compute_stats(df: pd.DataFrame, group_cols: list) -> pd.DataFrame:
    """Compute mean and std across seeds."""
    return df.groupby(group_cols).agg(
        mean_reward=("reward", "mean"),
        std_reward=("reward", "std"),
        n_seeds=("reward", "count")
    ).reset_index()

In [9]:
def plot_task_comparison(
    data: pd.DataFrame,
    task: str,
    colors: dict,
    display_name: str = None,
) -> go.Figure:
    """Create a comparison plot for a single task."""
    task_data = data[data["task"] == task]
    if task_data.empty:
        return None
    
    stats = compute_stats(task_data, ["method", "step"])
    fig = go.Figure()
    
    # Plot Ours first, then baselines
    methods = ["Ours"] + [m for m in sorted(stats["method"].unique()) if m != "Ours"]
    
    for method in methods:
        method_stats = stats[stats["method"] == method].sort_values("step")
        if method_stats.empty:
            continue
        
        color = colors.get(method, "#666666")
        x = method_stats["step"]
        y_mean = method_stats["mean_reward"]
        y_std = method_stats["std_reward"].fillna(0)
        
        # Shaded std region
        fig.add_trace(go.Scatter(
            x=pd.concat([x, x[::-1]]),
            y=pd.concat([y_mean + y_std, (y_mean - y_std)[::-1]]),
            fill="toself",
            fillcolor=hex_to_rgba(color, 0.2),
            line=dict(color="rgba(0,0,0,0)"),
            showlegend=False,
            hoverinfo="skip",
        ))
        
        # Mean line (solid for Ours, dashed for baselines)
        line_style = dict(color=color, width=2)
        if method != "Ours":
            line_style["dash"] = "dash"
        
        fig.add_trace(go.Scatter(
            x=x,
            y=y_mean,
            mode="lines",
            name=method,
            line=line_style,
            hovertemplate=f"{method}<br>Step: %{{x:,}}<br>Reward: %{{y:.1f}}<extra></extra>",
        ))
    
    title = display_name if display_name else task
    fig.update_layout(
        title=dict(text=f"<b>{title}</b>", x=0.5),
        xaxis_title="Environment Steps",
        yaxis_title="Episode Reward",
        legend=dict(x=0.02, y=0.98, bgcolor="rgba(255,255,255,0.8)"),
        hovermode="x unified",
        template="plotly_white",
        width=700,
        height=450,
    )
    
    return fig

## Individual Task Plots

In [10]:
# Generate plots for all tasks
figures = {}
for task in our_tasks:
    display_name = TASK_DISPLAY_NAMES.get(task, task)
    fig = plot_task_comparison(all_data, task, COLORS, display_name)
    if fig is not None:
        figures[task] = fig

print(f"Generated {len(figures)} task plots")

Generated 3 task plots


In [11]:
# Show all plots
for task, fig in figures.items():
    fig.show()

## Grid View (All Tasks)

In [12]:
def create_grid_figure(
    data: pd.DataFrame,
    tasks: list,
    colors: dict,
    display_names: dict = None,
    cols: int = 3,
) -> go.Figure:
    """Create grid of comparison plots for all tasks."""
    n_tasks = len(tasks)
    rows = (n_tasks + cols - 1) // cols
    
    titles = [display_names.get(t, t) if display_names else t for t in tasks]
    
    fig = make_subplots(
        rows=rows,
        cols=cols,
        subplot_titles=titles,
        vertical_spacing=0.12,
        horizontal_spacing=0.08,
    )
    
    legend_added = set()
    
    for idx, task in enumerate(tasks):
        row = idx // cols + 1
        col = idx % cols + 1
        
        task_data = data[data["task"] == task]
        if task_data.empty:
            continue
        
        stats = compute_stats(task_data, ["method", "step"])
        methods = ["Ours"] + [m for m in sorted(stats["method"].unique()) if m != "Ours"]
        
        for method in methods:
            method_stats = stats[stats["method"] == method].sort_values("step")
            if method_stats.empty:
                continue
            
            color = colors.get(method, "#666666")
            x = method_stats["step"]
            y_mean = method_stats["mean_reward"]
            y_std = method_stats["std_reward"].fillna(0)
            
            show_legend = method not in legend_added
            legend_added.add(method)
            
            # Shaded region
            fig.add_trace(
                go.Scatter(
                    x=pd.concat([x, x[::-1]]),
                    y=pd.concat([y_mean + y_std, (y_mean - y_std)[::-1]]),
                    fill="toself",
                    fillcolor=hex_to_rgba(color, 0.2),
                    line=dict(color="rgba(0,0,0,0)"),
                    showlegend=False,
                    hoverinfo="skip",
                    legendgroup=method,
                ),
                row=row, col=col
            )
            
            # Mean line
            line_style = dict(color=color, width=2)
            if method != "Ours":
                line_style["dash"] = "dash"
            
            fig.add_trace(
                go.Scatter(
                    x=x,
                    y=y_mean,
                    mode="lines",
                    name=method,
                    line=line_style,
                    showlegend=show_legend,
                    legendgroup=method,
                    hovertemplate=f"{method}: %{{y:.0f}}<extra></extra>",
                ),
                row=row, col=col
            )
    
    fig.update_layout(
        height=350 * rows,
        width=1000,
        title_text="<b>Humanoid-Bench (H1): Ours vs SimbaV2</b>",
        title_x=0.5,
        showlegend=True,
        legend=dict(x=1.01, y=1, bgcolor="rgba(255,255,255,0.9)"),
        template="plotly_white",
    )
    
    # Update x-axis labels
    fig.update_xaxes(title_text="Steps", row=rows)
    fig.update_yaxes(title_text="Reward", col=1)
    
    return fig

In [13]:
grid_fig = create_grid_figure(all_data, our_tasks, COLORS, TASK_DISPLAY_NAMES, cols=3)
grid_fig.show()

## Summary Table

In [14]:
def create_summary_table(data: pd.DataFrame, eval_step: int = None) -> pd.DataFrame:
    """Create summary table with mean/std per method per task at eval_step."""
    results = []
    methods = sorted(data["method"].unique())
    
    for task in sorted(data["task"].unique()):
        task_data = data[data["task"] == task]
        display_name = TASK_DISPLAY_NAMES.get(task, task)
        row = {"task": display_name}
        
        for method in methods:
            method_data = task_data[task_data["method"] == method]
            if method_data.empty:
                row[f"{method}_mean"] = np.nan
                row[f"{method}_std"] = np.nan
                continue
            
            # Get data at eval step or max available
            if eval_step is not None:
                available_steps = method_data["step"].unique()
                valid_steps = available_steps[available_steps <= eval_step]
                step = valid_steps.max() if len(valid_steps) > 0 else available_steps.max()
            else:
                step = method_data["step"].max()
            
            final_data = method_data[method_data["step"] == step]
            row[f"{method}_mean"] = final_data["reward"].mean()
            row[f"{method}_std"] = final_data["reward"].std()
            row[f"{method}_step"] = step
        
        results.append(row)
    
    return pd.DataFrame(results)

In [15]:
# Get current max step from our runs
our_max_step = ours_filtered["step"].max()
print(f"Our current max step: {our_max_step:,}")

# Summary at our current max step
summary = create_summary_table(all_data, eval_step=our_max_step)
summary.round(1)

Our current max step: 200,172


Unnamed: 0,task,Ours_mean,Ours_std,Ours_step,SimbaV2_mean,SimbaV2_std,SimbaV2_step
0,H1 Balance Simple,69.7,,200073.0,135.4,27.7,200000.0
1,H1 Slide,226.2,,200143.0,103.8,29.3,200000.0
2,H1 Walk,216.6,,200172.0,104.3,21.2,200000.0


In [16]:
def format_summary_pretty(df: pd.DataFrame) -> pd.DataFrame:
    """Format summary with 'mean ± std' columns."""
    methods = sorted(set(col.replace("_mean", "") for col in df.columns if col.endswith("_mean")))
    
    formatted = df[["task"]].copy()
    for method in methods:
        mean_col = f"{method}_mean"
        std_col = f"{method}_std"
        if mean_col in df.columns:
            formatted[method] = df.apply(
                lambda r: f"{r[mean_col]:.0f} ± {r[std_col]:.0f}" 
                if pd.notna(r[mean_col]) else "—",
                axis=1
            )
    return formatted

pretty_summary = format_summary_pretty(summary)
pretty_summary

Unnamed: 0,task,Ours,SimbaV2
0,H1 Balance Simple,70 ± nan,135 ± 28
1,H1 Slide,226 ± nan,104 ± 29
2,H1 Walk,217 ± nan,104 ± 21


## Save Results

In [17]:
# Create output directory
output_dir = paths.notebook_results_dir("09_humanoid_bench")
print(f"Output directory: {output_dir}")

# Save summary tables
summary.to_csv(output_dir / "summary.csv", index=False)
pretty_summary.to_csv(output_dir / "summary_pretty.csv", index=False)

# Save grid figure
grid_fig.write_html(output_dir / "grid_comparison.html")

# Save individual task figures
for task, fig in figures.items():
    safe_name = task.replace("-", "_")
    fig.write_html(output_dir / f"{safe_name}.html")

print(f"\nSaved:")
print(f"  - summary.csv")
print(f"  - summary_pretty.csv")
print(f"  - grid_comparison.html")
print(f"  - {len(figures)} individual task plots")

Output directory: /gpfs/work4/0/prjs0951/Thomas/Thesis/RL_weather/tdmpc2-with-return-based-auxiliary-tasks/analysis/results/09_humanoid_bench

Saved:
  - summary.csv
  - summary_pretty.csv
  - grid_comparison.html
  - 3 individual task plots
