# Midterm Sweep — Imagination RGB

Analyze the `8finalsweep/imagination_rgb` sweep, which uses RGB observations with hyperparameters tuned for proprioceptive control.

In [None]:
from pathlib import Path
import sys

# Find the repository root two levels above the notebook's working directory
# and put it on sys.path so that the `analysis.tools` package can be imported.
NOTEBOOK_DIR = Path.cwd().resolve()
REPO_ROOT = NOTEBOOK_DIR.parent.parent
if not REPO_ROOT.joinpath("analysis", "tools").exists():
    # Fail early: the imports in the following cell would not resolve.
    raise RuntimeError("Unable to locate analysis/tools package from notebook directory")
if str(REPO_ROOT) not in sys.path:
    # Prepend only once so re-running the cell does not add duplicates.
    sys.path.insert(0, str(REPO_ROOT))

In [None]:
import pandas as pd

from IPython.display import Image
from analysis.tools import aggregations, baselines, naming, plotting, selection, wandb_io, paths
from analysis.tools.paths import ensure_dir

In [None]:
# Sweep identity. The sweep id and the W&B project name are read from text
# files stored in the sweep directory; surrounding whitespace is stripped.
NOTEBOOK_STEM = "imagination_rgb_sweep"
SWEEP_ROOT = Path("../../sweep_list/midterm_sweep/8finalsweep/imagination_rgb").resolve()
SWEEP_ID = (SWEEP_ROOT / "id.txt").read_text().strip()
WANDB_PROJECT = (SWEEP_ROOT / "project.txt").read_text().strip()
WANDB_ENTITY = "thomasevers9"
# Keys passed as `history_keys` when fetching the sweep's runs from W&B.
HISTORY_KEYS = [
    "eval/episode_reward",
    "eval/step",
    "train/step",
    "global_step",
    "total_env_steps",
    "step",
    "_step",
]
# NOTE(review): later cells reference METRIC_KEY, STEP_KEYS, RESULTS_DIR,
# STEP_TARGET, BASELINE_LABEL, MODEL_LABEL, HYPERPARAM_COLUMNS and
# ENCODING_SPECS, none of which are defined in the visible cells — confirm
# they are defined (this cell looks truncated) before running the notebook.

In [None]:
# Fetch every run of the sweep. The local cache is allowed (use_cache=True)
# and no refresh is forced, so `data_source` reports where the data came from.
runs_payload, manifest, data_source = wandb_io.fetch_sweep_runs(
    sweep_id=SWEEP_ID,
    project=WANDB_PROJECT,
    entity=WANDB_ENTITY,
    history_keys=HISTORY_KEYS,
    force_refresh=False,
    use_cache=True,
)
print(f"Loaded {manifest['run_count']} runs from {data_source}")

In [None]:
# Mapping handed to `aggregations.runs_history_to_frame` as
# `config_to_columns`. Every config key maps to a column of the same name.
CONFIG_TO_COLUMNS = {
    key: key
    for key in (
        "task",
        "seed",
        "utd_ratio",
        "model_size",
        "multi_gamma_gammas",
        "num_rollouts",
        "entropy_coef",
        "enc_lr_scale",
        "obs",
    )
}

# Build the runs frame from the fetched payload, then add a `task_baseline`
# column by mapping each W&B task name through
# `naming.wandb_task_to_baseline`.
runs_df = aggregations.runs_history_to_frame(
    runs_payload,
    config_to_columns=CONFIG_TO_COLUMNS,
    step_keys=STEP_KEYS,
    metric_key=METRIC_KEY,
)
runs_df["task_baseline"] = runs_df["task"].map(naming.wandb_task_to_baseline)
# Last expression of the cell: preview the first rows in the notebook output.
runs_df.head()

In [None]:
# Select the baseline root that matches the sweep's observation modality and
# load baselines for every task that has one available.
#
# Raises ValueError when runs_df has no observation modality, has more than
# one, has an unsupported one, or when no task has a baseline under the root.
obs_modes = sorted(runs_df["obs"].unique())
if not obs_modes:
    raise ValueError("No observation modalities detected in runs_df")
if len(obs_modes) > 1:
    raise ValueError(
        f"Multiple observation modalities found {obs_modes}; per-task baseline handling not yet implemented"
    )
obs_mode = obs_modes[0]
if obs_mode == "rgb":
    baseline_root = baselines.PIXEL_BASELINE_ROOT
elif obs_mode == "state":
    baseline_root = baselines.STATE_BASELINE_ROOT
else:
    raise ValueError(f"Unsupported observation modality '{obs_mode}' for baselines")

# Keep only the tasks for which `baselines.has_task` reports a baseline.
baseline_tasks = sorted(runs_df["task_baseline"].unique())
available_tasks = [
    task for task in baseline_tasks if baselines.has_task(task, root=baseline_root)
]
missing_tasks = sorted(set(baseline_tasks) - set(available_tasks))
if missing_tasks:
    # Report skipped tasks instead of failing; only a total absence of
    # baselines (checked below) is treated as an error.
    print(
        "Skipping tasks without baseline CSV:",
        ", ".join(missing_tasks),
    )
if not available_tasks:
    raise ValueError(
        f"No baselines available under {baseline_root} for tasks {baseline_tasks}"
    )

baseline_df = baselines.load_many(available_tasks, root=baseline_root)
# Last expression of the cell: preview the loaded baselines.
baseline_df.head()

In [None]:
# UTD-ratio view across all tasks: write a sample-efficiency figure grouped by
# `utd_ratio` and a CSV of the metric aggregated at STEP_TARGET.
variant_dir = ensure_dir(RESULTS_DIR / "utd_ratio")

# An empty DataFrame is passed as the baseline frame for this figure.
variant_fig = plotting.sample_efficiency_figure(
    runs_df,
    task_name="All Tasks",
    variant_column="utd_ratio",
    metric_key=METRIC_KEY,
    baseline_label=BASELINE_LABEL,
    baseline_frame=pd.DataFrame(),
)
plotting.write_png(variant_fig, output_path=variant_dir / "utd_ratio.png")

utd_summary = aggregations.aggregate_at_step(
    runs_df,
    group_cols=["utd_ratio"],
    metric_key=METRIC_KEY,
    step_value=STEP_TARGET,
)
utd_summary_path = variant_dir / f"utd_ratio_{STEP_TARGET}.csv"
utd_summary.to_csv(utd_summary_path, index=False)
# Last expression of the cell: show the summary table.
utd_summary

In [None]:
# Per-task pass: for each task that has a baseline, write an encoded
# sample-efficiency figure and collect the metric aggregated at STEP_TARGET
# per hyperparameter group. Tasks without a baseline are reported and skipped.
baseline_task_set = set(baseline_df["task"].unique())
summary_tables = []
evaluated_tasks = []

for task in sorted(runs_df["task"].unique()):
    task_df = runs_df.loc[runs_df["task"] == task]
    if task_df.empty:
        continue

    baseline_name = naming.wandb_task_to_baseline(task)
    if baseline_name not in baseline_task_set:
        print(f"Skipping task {task} — baseline CSV not available")
        continue

    baseline_task_df = baseline_df.loc[baseline_df["task"] == baseline_name]
    if baseline_task_df.empty:
        # Guard against the set and the frame disagreeing about this task.
        raise ValueError(f"Baseline DataFrame unexpectedly missing task {baseline_name}")

    # Figure: this task's runs plotted together with the matching baseline.
    task_dir = ensure_dir(RESULTS_DIR / task)
    encoded_fig = plotting.sample_efficiency_encoded_figure(
        frame=task_df,
        task_name=task,
        metric_key=METRIC_KEY,
        encodings=ENCODING_SPECS,
        baseline_label=BASELINE_LABEL,
        baseline_frame=baseline_task_df,
    )
    plotting.write_png(encoded_fig, output_path=task_dir / "sample_efficiency.png")

    # Table: aggregate at the target step and tag the rows with the task name.
    agg = aggregations.aggregate_at_step(
        task_df,
        group_cols=HYPERPARAM_COLUMNS,
        metric_key=METRIC_KEY,
        step_value=STEP_TARGET,
    )
    agg["task"] = task
    summary_tables.append(agg)
    evaluated_tasks.append(task)

if not summary_tables:
    raise ValueError("No summary tables were generated")

summary_df = pd.concat(summary_tables, ignore_index=True)
summary_csv_path = RESULTS_DIR / f"reward_summary_{STEP_TARGET}.csv"
summary_df.to_csv(summary_csv_path, index=False)
# Last expression of the cell: show the combined summary.
summary_df

In [None]:
# Restrict the runs to tasks that were evaluated against a baseline, then ask
# `selection.best_configuration_at_step` for the best configuration at
# STEP_TARGET.
evaluated_tasks = sorted(set(evaluated_tasks))
if not evaluated_tasks:
    raise ValueError("No tasks with baseline coverage are available for comparison")

evaluated_runs_df = runs_df.loc[runs_df["task"].isin(evaluated_tasks)]
if evaluated_runs_df.empty:
    raise ValueError("No runs found after filtering to tasks with baseline coverage")

best_result = selection.best_configuration_at_step(
    evaluated_runs_df,
    hyperparam_columns=HYPERPARAM_COLUMNS,
    step_value=STEP_TARGET,
    metric_key=METRIC_KEY,
)

# Last expression of the cell: show the selected configuration.
best_result.config

In [None]:
# Rank configurations by mean reward (highest first) and build a one-line
# "name=value" description of the best configuration for the figure footer.
config_summary = best_result.config_summary.sort_values("mean_reward", ascending=False)
best_config_row = config_summary.iloc[0]
best_config_mean_reward = float(best_config_row["mean_reward"])
# NOTE(review): the separator literal was lost in the source and has been
# reconstructed as ", " — confirm this matches the intended footer format.
best_config_description = ", ".join(
    f"{name}={best_result.config[name]}" for name in HYPERPARAM_COLUMNS
)
# Last expression of the cell: show the ten best configurations.
config_summary.head(10)

In [None]:
# Compare the best configuration with the baseline at STEP_TARGET: write the
# comparison table as CSV, render it as a PNG, and display the PNG inline.
baseline_names = list(map(naming.wandb_task_to_baseline, evaluated_tasks))
comparison_baseline_df = baselines.load_many(baseline_names, root=baseline_root)
comparison_baselines = {BASELINE_LABEL: comparison_baseline_df}

comparison_df = selection.comparison_table(
    model_task_summary=best_result.task_summary,
    baselines=comparison_baselines,
    model_label=MODEL_LABEL,
    step_value=STEP_TARGET,
)

comparison_csv = RESULTS_DIR / f"comparison_table_{STEP_TARGET}.csv"
comparison_df.to_csv(comparison_csv, index=False)

# The footer reports the best configuration and its mean reward; the task
# count is the number of rows in the best result's per-task summary.
footer_text = (
    f"Best configuration: {best_config_description} — mean reward "
    f"{best_config_mean_reward:.2f} averaged across {best_result.task_summary.shape[0]} tasks."
)

comparison_title = f"Imagination RGB Sweep vs TD-MPC2-Pixels @ {STEP_TARGET:,} Steps"
comparison_fig = plotting.comparison_table_figure(
    comparison_df,
    footer_text=footer_text,
    title=comparison_title,
)
comparison_png = RESULTS_DIR / f"comparison_table_{STEP_TARGET}.png"
plotting.write_png(comparison_fig, output_path=comparison_png)
# Last expression of the cell: display the rendered table image.
Image(filename=str(comparison_png))