# NonCurriculum_MetaDrive_SB3_Experiments

This notebook contains a full, reproducible experiment pipeline for **non-curriculum** based reinforcement learning for autonomous driving using **MetaDrive** and **Stable Baselines3 (SB3)**. It includes:

- Full environment factory and wrappers (including a discrete-action wrapper for DQN).
- Exact stage definitions (C0..C3) and matching budgets.
- Non-curriculum runner (train each target map separately for the same total sample budget).
- Evaluation harness (metrics logging, CSV saving, TensorBoard integration, video recording).
- Hyperparameters and experiment folder conventions.



## 1. Setup


In [1]:
!pip uninstall -y metadrive metadrive-simulator metadrive-simulator-py3-12 || true


[0m

In [None]:
!pip install -q "stable-baselines3[extra]" "metadrive-simulator-py3-12" tensorboard opencv-python # for colab

## for kaggle:
# !pip install "numpy<2.0" "protobuf==3.20.3" "metadrive-simulator" "stable-baselines3[extra]" tensorboard opencv-python

Found existing installation: numpy 1.26.4
Uninstalling numpy-1.26.4:
  Successfully uninstalled numpy-1.26.4
Found existing installation: protobuf 6.33.0
Uninstalling protobuf-6.33.0:
  Successfully uninstalled protobuf-6.33.0
Collecting numpy<2.0
  Downloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting protobuf==3.20.3
  Downloading protobuf-3.20.3-py2.py3-none-any.whl.metadata (720 bytes)
Collecting metadrive-simulator
  Downloading metadrive_simulator-0.4.3-py3-none-any.whl.metadata (11 kB)
Collecting yapf (from metadrive-simulator)
  Downloading yapf-0.43.0-py3-none-any.whl.metadata (46 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m46.8/46.8 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
Collecting progressbar (from metadrive-simulator)
  Downloading progressbar-2.5.tar.gz (10 kB)


In [3]:
import metadrive

# Sanity check: show which metadrive installation the kernel actually imported
# (useful right after the uninstall/reinstall cells above).
print("metadrive.__file__ :", getattr(metadrive, "__file__", None))
print("metadrive.__path__ :", getattr(metadrive, "__path__", None))




metadrive.__file__ : /usr/local/lib/python3.11/dist-packages/metadrive/__init__.py
metadrive.__path__ : ['/usr/local/lib/python3.11/dist-packages/metadrive']


## 2. Imports

In [9]:
import os
import time
import json
import math
import random
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import csv
import logging

# SB3 imports
from metadrive.envs.metadrive_env import MetaDriveEnv
from metadrive.envs.varying_dynamics_env import VaryingDynamicsEnv
from stable_baselines3 import PPO, SAC, DQN
from stable_baselines3.common.callbacks import EvalCallback, CheckpointCallback, BaseCallback
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.vec_env import DummyVecEnv, SubprocVecEnv
from stable_baselines3.common.evaluation import evaluate_policy

# MetaDrive import guard
# try:
# from metadrive.envs.metadrive_env import MetaDriveEnv
# except Exception as e:
    # MetaDriveEnv = None
    # print('MetaDrive import failed.')

2025-11-28 16:27:55.317757: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1764347275.498266      47 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1764347275.549195      47 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [10]:
import warnings

# NOTE(review): the first call installs a blanket "ignore" filter (no category
# or module restriction), so the four narrower filters after it add nothing.
# They are kept unchanged; remove the blanket filter if real warnings should
# surface again.
warnings.filterwarnings("ignore")                       # blanket filter: silences every warning
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", module="stable_baselines3")
warnings.filterwarnings("ignore", module="gymnasium")


## 3. Experiment configuration
Setup: maps, stages, budgets, and hyperparameters.

In [None]:
# Stage definitions. Each entry provides:
#   id / name : identifiers used in logs and output folder names
#   map       : value passed to MetaDrive's "map" config key
#   traffic   : value passed to MetaDrive's "traffic_density" config key
#   budget    : number of training timesteps for the stage
#   reward    : per-stage shaping weights consumed by CurriculumRewardWrapper
# NOTE(review): "env_type" is present only on C0/C1 and is not read by any code
# visible in this notebook — confirm whether it is still needed.
STAGES = [
    # C0: straight road, no traffic – just learn to go forward safely.
    {
        "id": "C0",
        "name": "C0_Straight",
        "env_type": "general",
        "map": "S", # Straight
        "traffic": 0.0,
        "budget": 100_000,
        "reward": {
            "base_w": 1.0,              # scale MetaDrive base reward
            "speed_w": 0.05,            # weak speed shaping
            "max_speed_kmh": 80.0,
            "collision_penalty": -5.0,
            "offroad_penalty": -3.0,
            "traffic_violation_penalty": -2.0,
            "success_bonus": 10.0,
            "step_penalty": -0.001,
        },
    },

    # C1: roundabout, no traffic – topology harder, still single-ego.
    {
        "id": "C1",
        "name": "C1_Roundabout",
        "env_type": "general",
        "map": "O", # Roundabout
        "traffic": 0.0,
        "budget": 150_000,
        "reward": {
            "base_w": 1.0,
            "speed_w": 0.05,
            "max_speed_kmh": 80.0,
            "collision_penalty": -6.0,  # slightly harsher for bad manoeuvres
            "offroad_penalty": -4.0,
            "traffic_violation_penalty": -3.0,
            "success_bonus": 10.0,
            "step_penalty": -0.005,
        },
    },

    # C2: procedurally generated (PG) map with light traffic – first exposure
    # to traffic.
    # NOTE(review): this stage was originally described as a "20-block" map,
    # but the configured value is 10 — confirm which one is intended.
    {
        "id": "C2",
        "name": "C2_LightTraffic",
        "map": 10, # integer PG map spec (presumably the block count — TODO confirm)
        "traffic": 0.05,               # light traffic
        "budget": 200_000,
        "reward": {
            "base_w": 1.0,
            "speed_w": 0.08,            # encourage moving at speed in traffic
            "max_speed_kmh": 80.0,
            "collision_penalty": -8.0,
            "offroad_penalty": -6.0,
            "traffic_violation_penalty": -4.0,
            "success_bonus": 12.0,
            "step_penalty": -0.01,
        },
    },

    # C3: PG map with dense traffic – "multi-agent-ish" final stage.
    # Still a single learning ego, but many interacting vehicles (like CuRLA's
    # higher-traffic final curriculum stage).
    # NOTE(review): the map value here (20) differs from C2's (10), so this is
    # not the same map as C2 even though it was originally described that way.
    {
        "id": "C3",
        "name": "C3_DenseTraffic",
        "map": 20,
        "traffic": 0.30,               # dense traffic ≈ mild multi-agent
        "budget": 200_000,
        "reward": {
            "base_w": 1.0,
            "speed_w": 0.10,
            "max_speed_kmh": 80.0,
            "collision_penalty": -10.0, # strong safety pressure
            "offroad_penalty": -8.0,
            "traffic_violation_penalty": -5.0,
            "success_bonus": 15.0,
            "step_penalty": -0.05,
        },
    },
]

# Sum of all per-stage budgets; printed for reference when matching sample
# budgets against a curriculum run.
TOTAL_CURRICULUM_BUDGET = sum(s["budget"] for s in STAGES)
print("Total curriculum budget (per algorithm) =", TOTAL_CURRICULUM_BUDGET)

Total curriculum budget (per algorithm) = 650000


In [12]:
# Held-out 1: fixed 6-block map SCrRXO, evaluation only (budget is 0).
# NOTE(review): traffic 0.30 is labelled "medium" here but the same value is
# called "dense" in stage C3 — the two labels describe the same density.
HELDOUT_SCENARIO_STAGE = {
    "id": "HELDOUT_SCENARIO",
    "name": "HELDOUT_SCrRXO_MedTraffic",
    "map": "SCrRXO", # Straight -> Circular -> in-ramp -> out-ramp -> intersection -> roundabout
    "traffic": 0.30, # medium-ish traffic
    "budget": 0, # no training budget, eval only
    # Same dict object as the last stage's reward config (a reference, not a
    # copy), so editing one in place also changes the other.
    "reward": STAGES[-1]["reward"],  # reuse hardest-stage shaping for comparability
}

# Held-out 2: VaryingDynamics environment (dynamics robustness test).
# Config dict intended for VaryingDynamicsEnv; only dynamics-independent keys
# are set here.
VARYING_DYNAMICS_CONFIG = dict(
    num_scenarios=100,
    map=5,                  # small PG map
    log_level=logging.ERROR,
    # random_dynamics is left unset — presumably falling back to the library
    # defaults; TODO confirm against the MetaDrive docs.
)

In [None]:
# Folder convention: every run writes below this directory, which is created
# relative to the current working directory if it does not exist yet.
EXPERIMENT_ROOT = Path('experiments')
EXPERIMENT_ROOT.mkdir(exist_ok=True)

# Seeds and workers
SEEDS = [0]             # one training run per seed listed here
N_ENVS = 1 # should be 8 but metadrive issues
EVAL_FREQ = 10_000      # callback period (in callback calls) for eval and checkpoints
EVAL_EPISODES = 5       # episodes per in-training evaluation

In [14]:
# Hyperparameters, keyed by algorithm name. make_model() reads the entries of
# the selected algorithm; 'policy', 'policy_kwargs' and 'learning_rate' are
# required for every algorithm.
HYPERS = {
    'PPO': {
        'policy':'MlpPolicy',
        'policy_kwargs':{'net_arch':[64,64]}, # two hidden layers of 64 units
        'learning_rate':3e-4,
        'n_steps':1024, # compute issue, can do 2048 if faster
        'batch_size':64,
        'n_epochs':10,
        'gamma':0.99,
        'clip_range':0.2
    },
    # Earlier, larger SAC configuration kept for reference:
    # 'SAC': {
    #     'policy':'MlpPolicy',
    #     'policy_kwargs':{'net_arch':[256,256]},
    #     'learning_rate':3e-4,
    #     'batch_size':256,
    #     'buffer_size':100_000,
    #     'gamma':0.99
    # },
    'SAC':{
        'policy':'MlpPolicy',
        'policy_kwargs':{'net_arch':[128,128]},
        'learning_rate':3e-4,
        'batch_size':128,
        'buffer_size':100_000,
        'gamma':0.99
    },
    # NOTE(review): originally labelled "Atari setup"; the policy is an MLP over
    # vector observations, so only the numeric values resemble that preset.
    'DQN': {
        'policy':'MlpPolicy',
        'policy_kwargs':{'net_arch':[64,64]},
        'learning_rate':1e-4,
        'buffer_size':100_000, # can do 50,000 if needed
        'batch_size':32,
        'train_freq':4
    }
}

Includes a discrete-action wrapper for DQN (maps discrete indices -> continuous steer/throttle).

In [15]:
import gymnasium as gym
from gymnasium import spaces

class DiscreteActionWrapper(gym.ActionWrapper):
    """
    Present a finite set of actions on top of a continuous-action env.

    `mapping` is an indexable collection; entry i is the continuous action
    sent to the wrapped env when the agent picks discrete action i.
    """

    def __init__(self, env, mapping):
        super().__init__(env)
        self.mapping = mapping
        n_actions = len(mapping)
        self.action_space = spaces.Discrete(n_actions)

    def action(self, action):
        """Translate a discrete index into its float32 continuous action."""
        continuous = self.mapping[action]
        return np.array(continuous, dtype=np.float32)

In [16]:
class CurriculumRewardWrapper(gym.Wrapper):
    """
    Per-stage reward shaping on top of the wrapped env's own reward.

    The shaped reward is built, in this order, from:
      1. base reward scaled by `base_w`, plus a speed term weighted by `speed_w`
      2. a constant `step_penalty`
      3. `collision_penalty` when any crash flag is set in `info`
      4. `offroad_penalty` when `out_of_road` is set
      5. `traffic_violation_penalty` when off-road OR a traffic-light violation
         is flagged (so an off-road step receives both 4 and 5)
      6. `success_bonus` on a terminated step that reached the destination

    Missing config keys fall back to neutral defaults (weight 1.0 for the base
    reward, 0.0 for everything else). The wrapper also writes the keys
    "collision", "traffic_violation", "success", "avg_speed" and
    "shaped_reward" into `info` for downstream logging.
    """

    def __init__(self, env, reward_cfg):
        super().__init__(env)
        self.cfg = reward_cfg
        # Upper clamp for the speed term; also its normaliser.
        self.max_speed = self.cfg.get("max_speed_kmh", 80.0)

    def step(self, action):
        """Step the wrapped env and return the transition with the shaped reward."""
        obs, base_r, terminated, truncated, info = self.env.step(action)
        cfg = self.cfg

        # Speed is read from "speed", falling back to "velocity", then 0.
        # NOTE(review): the clamp assumes the value is in km/h — confirm the
        # unit MetaDrive reports under these keys.
        fallback_speed = info.get("velocity", 0.0)
        raw_speed = float(info.get("speed", fallback_speed))
        speed = max(0.0, min(raw_speed, self.max_speed))
        speed_bonus = cfg.get("speed_w", 0.0) * (speed / self.max_speed)

        # Scaled base reward + speed shaping, then the per-step cost.
        shaped = cfg.get("base_w", 1.0) * base_r + speed_bonus
        shaped += cfg.get("step_penalty", 0.0)

        # Any of the three crash flags counts as a collision.
        crash_keys = ("crash_vehicle", "crash_object", "crash_building")
        collided = any(info.get(key, False) for key in crash_keys)
        if collided:
            shaped += cfg.get("collision_penalty", 0.0)
        info["collision"] = collided

        # Leaving the road is penalised on its own ...
        left_road = info.get("out_of_road", False)
        if left_road:
            shaped += cfg.get("offroad_penalty", 0.0)

        # ... and also counts as a generic traffic violation.
        violated = bool(left_road or info.get("traffic_light_violation", False))
        if violated:
            shaped += cfg.get("traffic_violation_penalty", 0.0)
        info["traffic_violation"] = violated

        # The bonus is paid only on a terminal step that reached the goal.
        reached_goal = bool(info.get("arrive_dest", False) or info.get("success", False))
        if terminated and reached_goal:
            shaped += cfg.get("success_bonus", 0.0)
        info["success"] = reached_goal

        # Extra fields for logging (this is the clamped per-step speed).
        info["avg_speed"] = speed
        info["shaped_reward"] = shaped

        return obs, shaped, terminated, truncated, info

## 4. Functions

### 4.1 Environment factory

In [17]:
import gymnasium as gym

class MetaDriveGymCompatibilityWrapper(gym.Wrapper):
    """
    Adapter that gives the wrapped env a Gymnasium-style reset()/step().

    reset() accepts the keyword-only `seed` and `options` arguments but never
    forwards `options`; `seed` is forwarded only when it is not None.
    """

    def reset(self, *, seed=None, options=None):
        """Reset the wrapped env and return (obs, info)."""
        reset_kwargs = {} if seed is None else {"seed": seed}
        obs, info = self.env.reset(**reset_kwargs)
        return obs, info

    def step(self, action):
        """Forward the action and return the 5-tuple from the wrapped env as is."""
        obs, reward, terminated, truncated, info = self.env.step(action)
        return obs, reward, terminated, truncated, info

In [18]:
from functools import partial
import logging

def make_metadrive_env(stage, use_discrete=False, seed=0, render=False):
    """
    Build a zero-argument env factory for one stage.

    stage: dict providing "map", "traffic" and "reward" (see STAGES). The three
        keys are read immediately, so a missing key fails here rather than
        when the factory is called.
    use_discrete: when True the env is additionally wrapped so that it exposes
        five discrete [steering, throttle] actions (used for DQN).
    seed: forwarded as MetaDrive's "start_seed".
    render: forwarded as MetaDrive's "use_render".

    Returns a callable that creates the env wrapped, innermost first, as:
    MetaDriveEnv -> MetaDriveGymCompatibilityWrapper -> CurriculumRewardWrapper
    -> (optional) DiscreteActionWrapper -> Monitor.
    """
    stage_map = stage["map"]
    density = stage["traffic"]
    shaping = stage["reward"]

    def _init():
        env = MetaDriveEnv({
            "map": stage_map,
            "traffic_density": density,
            "use_render": render,
            "start_seed": seed,
            "debug": False,
            "log_level": logging.ERROR,
            # Cap the episode length so evaluation cannot run forever, and
            # report hitting the cap as termination.
            "horizon": 1000,
            "truncate_as_terminate": True,
        })

        # Normalise the reset() signature, then apply stage reward shaping.
        env = CurriculumRewardWrapper(MetaDriveGymCompatibilityWrapper(env), shaping)

        if not use_discrete:
            return Monitor(env)

        # Discrete action table for DQN, each entry is (steering, throttle).
        discrete_actions = [
            (-1.0, 0.0),  # hard left, no throttle
            (-1.0, 0.3),  # left + some accel
            (0.0, 0.5),   # go straight, accel
            (1.0, 0.3),   # right + some accel
            (1.0, 0.0),   # hard right, no throttle
        ]
        return Monitor(DiscreteActionWrapper(env, discrete_actions))

    return _init


In [None]:
from stable_baselines3.common.vec_env import DummyVecEnv

def make_vec_env(stage, n_envs=1, use_discrete=False, seed=0):
    """
    Create a vectorized MetaDrive env for `stage`.

    stage: stage dict accepted by make_metadrive_env.
    n_envs: number of parallel environments (must be >= 1).
    use_discrete: forwarded to make_metadrive_env (discrete actions for DQN).
    seed: start seed of the first worker; worker i uses seed + i.

    Returns:
        - n_envs == 1: a DummyVecEnv, i.e. the env lives in the current
          process and stays reachable through `.envs[0]`.
        - n_envs  > 1: a SubprocVecEnv, because several MetaDrive instances
          cannot share one process (only one engine per process).

    Raises:
        ValueError: if n_envs is smaller than 1.
    """
    if n_envs < 1:
        raise ValueError(f"n_envs must be >= 1, got {n_envs}")

    if n_envs == 1:
        return DummyVecEnv([
            make_metadrive_env(stage, use_discrete=use_discrete, seed=seed)
        ])

    # One factory per worker, each with its own start seed so the workers do
    # not all replay the same scenario.
    env_fns = [
        make_metadrive_env(stage, use_discrete=use_discrete, seed=seed + rank)
        for rank in range(n_envs)
    ]
    return SubprocVecEnv(env_fns)

PPO and SAC use MetaDrive's continuous control space (`Box`), while DQN uses a manually defined discrete action space.

In [20]:
# Build a stage-C2 env with the native continuous actions and print its spaces.
vec = make_vec_env(STAGES[2], n_envs=1, use_discrete=False, seed=0)
base_env = vec.envs[0]    # DummyVecEnv keeps the wrapped env objects in .envs
print("PPO/SAC action space:", base_env.action_space)
print("obs space:", base_env.observation_space)

PPO/SAC action space: Box(-1.0, 1.0, (2,), float32)
obs space: Box(-0.0, 1.0, (259,), float32)


In [21]:
# Same stage with the discrete-action wrapper enabled (the DQN setup).
# Note: this rebinds `vec`, so the env built in the previous cell is no longer
# referenced from here.
vec = make_vec_env(STAGES[2], n_envs=1, use_discrete=True, seed=0)
base_env = vec.envs[0]    # DummyVecEnv keeps the wrapped env objects in .envs
print("DQN action space:", base_env.action_space)
print("obs space:", base_env.observation_space)

DQN action space: Discrete(5)
obs space: Box(-0.0, 1.0, (259,), float32)


In [22]:
def make_eval_vec_env(stage, use_discrete=False, seed=0):
    """
    Build a single-worker evaluation env that runs in its own subprocess.

    The env factory is handed to SubprocVecEnv even though there is only one
    worker, so the MetaDriveEnv instance is created in a child process rather
    than in the calling process.
    """
    factory = make_metadrive_env(stage, use_discrete=use_discrete, seed=seed)
    return SubprocVecEnv([factory])


### 4.2 log per-eval metrics (success rate, collisions, speed etc)

In [None]:
class MetricsCallback(BaseCallback):
    """
    Run a short evaluation every `eval_freq` callback calls and append one row
    to a CSV file with:
      - timesteps
      - mean_reward
      - success_rate
      - collision_rate          (crashes AND leaving the road both count)
      - traffic_violation_rate
      - avg_speed
      - avg_episode_length

    Parameters:
        eval_env: vectorized env (VecEnv-style reset()/step()); only the first
            sub-env's outputs are used.
        csv_path: destination file. Its parent directory is created if needed
            and the header is written when the file does not exist yet.
        eval_freq: evaluation period, counted in callback calls.
        eval_episodes: number of episodes per evaluation.
        verbose: > 0 prints a one-line summary after each evaluation.

    NOTE(review): every evaluation calls eval_env.reset(). If eval_env is the
    same object the model is training on, this interrupts the episode the
    training loop was in the middle of — prefer a dedicated eval env.
    """

    # Column order shared by the header row and every appended data row.
    FIELDNAMES = (
        "timesteps",
        "mean_reward",
        "success_rate",
        "collision_rate",
        "traffic_violation_rate",
        "avg_speed",
        "avg_episode_length",
    )

    def __init__(self, eval_env, csv_path, eval_freq=50_000, eval_episodes=10, verbose=0):
        super().__init__(verbose)
        self.eval_env = eval_env
        self.csv_path = csv_path
        self.eval_freq = eval_freq
        self.eval_episodes = eval_episodes

        # A bare filename has an empty dirname, and os.makedirs("") raises,
        # so only create the parent directory when there is one.
        parent_dir = os.path.dirname(csv_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

        # Write the header only once; later runs append to the same file.
        if not os.path.exists(self.csv_path):
            with open(self.csv_path, "w", newline="") as f:
                csv.DictWriter(f, fieldnames=self.FIELDNAMES).writeheader()

    @staticmethod
    def _first(batch):
        """Return element 0 of a vec-env batch, or `batch` itself if unbatched."""
        if isinstance(batch, (np.ndarray, list, tuple)):
            return batch[0]
        return batch

    @staticmethod
    def _mean(values):
        """Arithmetic mean as a float; 0.0 for an empty sequence."""
        return float(sum(values) / len(values)) if values else 0.0

    def _run_episode(self):
        """
        Play one deterministic episode on the eval env.

        Returns a dict with the episode's total reward, step count, the
        success / collision / traffic-violation flags and `mean_speed`
        (None when no step reported a speed).
        """
        # VecEnv.reset() returns only the (batched) observation.
        obs = self.eval_env.reset()

        total_reward = 0.0
        steps = 0
        succeeded = False
        collided = False
        violated = False
        speeds = []
        done = False

        while not done:
            action, _ = self.model.predict(obs, deterministic=True)
            obs, rewards, dones, infos = self.eval_env.step(action)

            total_reward += float(self._first(rewards))
            steps += 1
            done = bool(self._first(dones))

            if isinstance(infos, (list, tuple)) and len(infos) > 0:
                info = infos[0]
            else:
                info = infos
            if not isinstance(info, dict):
                continue

            # Success: the reward wrapper's flag or MetaDrive's own arrive_dest.
            if info.get("success", False) or info.get("arrive_dest", False):
                succeeded = True

            # Collision: the wrapper's flag, any crash flag, or leaving the road.
            if (
                info.get("collision", False)
                or info.get("crash_vehicle", False)
                or info.get("crash_object", False)
                or info.get("crash_building", False)
                or info.get("out_of_road", False)
            ):
                collided = True

            # Stays False if nothing ever reports a traffic violation.
            if info.get("traffic_violation", False):
                violated = True

            # Prefer the wrapper's clamped speed, else the raw "speed" entry.
            if "avg_speed" in info:
                speeds.append(float(info["avg_speed"]))
            elif "speed" in info:
                speeds.append(float(info["speed"]))

        return {
            "reward": total_reward,
            "length": steps,
            "success": succeeded,
            "collision": collided,
            "traffic_violation": violated,
            "mean_speed": (sum(speeds) / len(speeds)) if speeds else None,
        }

    def _on_step(self) -> bool:
        """Evaluate and log when due; always returns True (never stops training)."""
        # Only evaluate every eval_freq calls.
        if self.n_calls % self.eval_freq != 0:
            return True

        episodes = [self._run_episode() for _ in range(self.eval_episodes)]

        # Episodes without any speed reading are left out of the speed average.
        speed_samples = [ep["mean_speed"] for ep in episodes if ep["mean_speed"] is not None]

        row = {
            "timesteps": int(self.num_timesteps),
            "mean_reward": self._mean([ep["reward"] for ep in episodes]),
            "success_rate": self._mean([1.0 if ep["success"] else 0.0 for ep in episodes]),
            "collision_rate": self._mean([1.0 if ep["collision"] else 0.0 for ep in episodes]),
            "traffic_violation_rate": self._mean(
                [1.0 if ep["traffic_violation"] else 0.0 for ep in episodes]
            ),
            "avg_speed": self._mean(speed_samples),
            "avg_episode_length": self._mean([ep["length"] for ep in episodes]),
        }

        with open(self.csv_path, "a", newline="") as f:
            csv.DictWriter(f, fieldnames=self.FIELDNAMES).writerow(row)

        if self.verbose > 0:
            print(f"[Metrics] t={self.num_timesteps}  succ={row['success_rate']:.2f}  "
                  f"coll={row['collision_rate']:.2f}  len={row['avg_episode_length']:.1f}")

        return True


In [24]:
from stable_baselines3.common.callbacks import EvalCallback

class PrettyEvalCallback(EvalCallback):
    """
    EvalCallback that frames its console output with separator lines.

    A dashed separator is printed once, right before the first evaluation is
    due, and once more when training ends (only if an evaluation was reached).
    Everything else is delegated to EvalCallback.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Set to True once the opening separator has been printed.
        self._started = False

    def _on_step(self) -> bool:
        eval_due = self.n_calls % self.eval_freq == 0
        if eval_due and not self._started:
            # Opening separator, printed a single time.
            print("\n" + "-" * 60 + "\n")
            self._started = True

        return super()._on_step()

    def _on_training_end(self) -> None:
        # Closing separator, only when the opening one was shown.
        if self._started:
            print("\n" + "-" * 60 + "\n")
        return super()._on_training_end()


In [25]:
def save_run_config(out_dir, algo, stage, seed):
    """
    Write `config.json` into `out_dir` describing one training run.

    The file records the algorithm name, the seed, the stage's id / name /
    map / traffic / budget / reward, the hyperparameters registered for
    `algo` in HYPERS, and the held-out scenario's map / traffic / reward.

    out_dir must support the `/` operator (e.g. a pathlib.Path).
    """
    stage_fields = ("id", "name", "map", "traffic", "budget", "reward")
    heldout_fields = ("map", "traffic", "reward")

    cfg = {
        "algo": algo,
        "seed": seed,
        "stage": {field: stage[field] for field in stage_fields},
        "hyperparams": HYPERS[algo],
        "heldout_map": {field: HELDOUT_SCENARIO_STAGE[field] for field in heldout_fields},
    }

    config_path = out_dir / "config.json"
    with open(config_path, "w") as f:
        json.dump(cfg, f, indent=2)

In [26]:
print(HELDOUT_SCENARIO_STAGE)

{'id': 'HELDOUT_SCENARIO', 'name': 'HELDOUT_SCrRXO_MedTraffic', 'map': 'SCrRXO', 'traffic': 0.3, 'budget': 0, 'reward': {'base_w': 1.0, 'speed_w': 0.1, 'max_speed_kmh': 80.0, 'collision_penalty': -10.0, 'offroad_penalty': -8.0, 'traffic_violation_penalty': -5.0, 'success_bonus': 15.0, 'step_penalty': -0.05}}


### 4.3 Training functions

These functions create models, attach callbacks, and run training. Each run saves checkpoints, the best model, and CSV metrics.


In [27]:
import torch
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

def make_model(algo, env, hyperparams):
    """
    Instantiate an SB3 model for `algo` on `env`.

    algo: 'PPO', 'SAC' or 'DQN'.
    env: environment handed to the SB3 constructor.
    hyperparams: dict for this algorithm (see HYPERS). 'policy',
        'policy_kwargs' and 'learning_rate' are always required; PPO also
        requires 'n_steps', 'batch_size', 'n_epochs' and 'gamma'. Every other
        entry is optional and falls back to the default shown below.

    PPO is always placed on the CPU; SAC and DQN use DEVICE.

    Raises:
        ValueError: if `algo` is not one of the supported names.
    """
    common_kwargs = dict(
        verbose=0,  # make SB3 quiet in console
        tensorboard_log=str(EXPERIMENT_ROOT / 'tensorboard'),
        policy_kwargs=hyperparams['policy_kwargs'],
        learning_rate=hyperparams['learning_rate'],
    )

    if algo == 'PPO':
        return PPO(
            hyperparams['policy'],
            env,
            n_steps=hyperparams['n_steps'],
            batch_size=hyperparams['batch_size'],
            n_epochs=hyperparams['n_epochs'],
            gamma=hyperparams['gamma'],
            # Forward the configured clip range instead of silently ignoring
            # it; 0.2 is the value used when the key is absent.
            clip_range=hyperparams.get('clip_range', 0.2),
            device="cpu",
            **common_kwargs,
        )

    if algo == 'SAC':
        return SAC(
            hyperparams['policy'],
            env,
            batch_size=hyperparams.get('batch_size', 256),
            buffer_size=hyperparams.get('buffer_size', 100_000),
            gamma=hyperparams.get('gamma', 0.99),
            device=DEVICE,
            **common_kwargs,
        )

    if algo == 'DQN':
        return DQN(
            hyperparams['policy'],
            env,
            buffer_size=hyperparams.get('buffer_size', 50_000),
            batch_size=hyperparams.get('batch_size', 32),
            train_freq=hyperparams.get('train_freq', 4),
            # Same optional discount handling as SAC.
            gamma=hyperparams.get('gamma', 0.99),
            device=DEVICE,
            **common_kwargs,
        )

    raise ValueError('Unknown algo')


In [28]:
def evaluate_env_episodes(model, env, n_episodes=20, deterministic=True):
    """
    Roll out `n_episodes` episodes of `model` on a single (non-vectorized) env.

    Both API flavours are accepted: reset() may return `obs` or `(obs, info)`,
    and step() may return the 4-tuple `(obs, reward, done, info)` or the
    5-tuple `(obs, reward, terminated, truncated, info)`. With the 5-tuple an
    episode ends when either flag is set.

    Returns:
        (mean_reward, std_reward, mean_ep_len) as Python floats, computed from
        float32 arrays of the per-episode totals.

    Raises:
        RuntimeError: if step() returns neither 4 nor 5 values.
    """
    episode_returns = []
    episode_steps = []

    for _ in range(n_episodes):
        first = env.reset()
        if isinstance(first, tuple):
            obs, _ = first
        else:
            obs = first

        total = 0.0
        n_steps = 0
        finished = False

        while not finished:
            action, _ = model.predict(obs, deterministic=deterministic)
            step_out = env.step(action)

            if len(step_out) == 5:
                # Gymnasium style: either flag ends the episode.
                obs, reward, terminated, truncated, _info = step_out
                finished = bool(terminated or truncated)
            elif len(step_out) == 4:
                # Old Gym style: a single done flag.
                obs, reward, finished, _info = step_out
            else:
                raise RuntimeError(f"Unexpected env.step() output length: {len(step_out)}")

            total += float(reward)
            n_steps += 1

        episode_returns.append(total)
        episode_steps.append(n_steps)

    returns_arr = np.array(episode_returns, dtype=np.float32)
    steps_arr = np.array(episode_steps, dtype=np.float32)
    return float(returns_arr.mean()), float(returns_arr.std()), float(steps_arr.mean())


In [29]:
def train_noncurriculum(algo, stage, total_timesteps, seed, n_envs=1):
    """
    Non-curriculum baseline: train from scratch on a SINGLE stage.

    Steps:
      1. Seed the Python / NumPy / Torch RNGs with `seed`.
      2. Train `algo` on `stage` for `total_timesteps`, with periodic
         evaluation (PrettyEvalCallback), checkpoints and a metrics CSV.
      3. Save the final model and the run config.
      4. Evaluate on held-out scenario 1 (SCrRXO) and save its metrics CSV.

    The second held-out evaluation (VaryingDynamicsEnv) is currently disabled;
    its code is kept commented out at the end of the function.

    Parameters:
        algo: key into HYPERS ('PPO', 'SAC' or 'DQN'); 'DQN' switches the envs
            to discrete actions.
        stage: stage dict (see STAGES).
        total_timesteps: training budget in environment steps.
        seed: RNG seed and MetaDrive start seed (held-out uses seed + 1000).
        n_envs: forwarded to make_vec_env.

    Returns:
        The output directory (pathlib.Path) that holds all artifacts.

    NOTE(review): both evaluation callbacks reuse the TRAINING env as their
    eval env. Each evaluation therefore resets the env in the middle of
    training. This looks like a workaround for MetaDrive allowing only one
    engine per process; a subprocess-based eval env (make_eval_vec_env) would
    avoid it — confirm that it works in the target runtime before switching.
    """
    # Function-scope import keeps this cell self-contained.
    from stable_baselines3.common.utils import set_random_seed

    out_dir = EXPERIMENT_ROOT / f"{algo}/noncurriculum/seed_{seed}/{stage['name']}"
    out_dir.mkdir(parents=True, exist_ok=True)
    print(f"Training NON-CURRICULUM: {algo} {stage['name']} seed {seed}\n")

    use_discrete = (algo == "DQN")

    # Make weight initialisation and action sampling repeatable for this seed;
    # without this the seed would only select the MetaDrive scenario.
    set_random_seed(seed)

    # ---- training env (single vec env) ----
    env = make_vec_env(stage, n_envs=n_envs, use_discrete=use_discrete, seed=seed)

    # The env is closed even when training fails, so a crashed run does not
    # leave a live MetaDrive engine behind that blocks the next run.
    try:
        model = make_model(algo, env, HYPERS[algo])

        # in-training eval (same env, see NOTE in the docstring)
        eval_cb = PrettyEvalCallback(
            env,
            best_model_save_path=str(out_dir / "best_model"),
            log_path=str(out_dir / "eval"),
            eval_freq=EVAL_FREQ,
            n_eval_episodes=EVAL_EPISODES,
            deterministic=True,
            verbose=1,
        )

        ckpt_cb = CheckpointCallback(
            save_freq=EVAL_FREQ,
            save_path=str(out_dir / "checkpoints"),
            name_prefix="ckpt",
        )

        metrics_csv = out_dir / "metrics.csv"
        metrics_cb = MetricsCallback(
            eval_env=env,       # shares the training env (see NOTE in the docstring)
            csv_path=str(metrics_csv),
            eval_freq=EVAL_FREQ,
            eval_episodes=EVAL_EPISODES,
            verbose=0,
        )

        save_run_config(out_dir, algo, stage, seed)

        # ---- train ----
        model.learn(
            total_timesteps=total_timesteps,
            callback=[eval_cb, ckpt_cb, metrics_cb],
        )
        model.save(str(out_dir / "model.zip"))

        print(f"\nNon-curriculum training complete and saved to {out_dir}")
    finally:
        # Close the training vec env to avoid engine conflicts before new envs.
        env.close()

    # =====================================================================
    # HELD-OUT 1: SCrRXO + medium traffic
    # =====================================================================
    print("\n[HELD-OUT 1] Evaluating on SCrRXO (fixed 6-block) with medium traffic...")

    # Build a *single* env instance. The factory already applies every
    # wrapper (including the compatibility wrapper), so nothing is added here.
    held1_env = make_metadrive_env(
        HELDOUT_SCENARIO_STAGE,
        use_discrete=use_discrete,
        seed=seed + 1000,
        render=False,
    )()

    try:
        h1_mean, h1_std, h1_len = evaluate_env_episodes(
            model,
            held1_env,
            n_episodes=20,
            deterministic=True,
        )
    finally:
        held1_env.close()

    print(
        f"HELD-OUT 1 (SCrRXO): mean_reward={h1_mean:.2f} ± {h1_std:.2f}, "
        f"mean_ep_len={h1_len:.1f}"
    )

    pd.DataFrame(
        [{
            "algo": algo,
            "train_stage": stage["name"],
            "heldout_name": HELDOUT_SCENARIO_STAGE["name"],
            "mean_reward": h1_mean,
            "std_reward": h1_std,
            "mean_ep_len": h1_len,
        }]
    ).to_csv(out_dir / "heldout_scrrxo_metrics.csv", index=False)

    # =====================================================================
    # HELD-OUT 2: VaryingDynamicsEnv (currently disabled)
    # =====================================================================
    # print("\n[HELD-OUT 2] Evaluating on VaryingDynamicsEnv (randomized dynamics)...")

    # # build varying dynamics env
    # vd_env = VaryingDynamicsEnv(VARYING_DYNAMICS_CONFIG)
    # vd_env = MetaDriveGymCompatibilityWrapper(vd_env)

    # if use_discrete:
    #     # same discrete mapping you used for training DQN
    #     discrete_mapping = [(-1.0, 0.0), (-1.0, 0.3), (0.0, 0.5), (1.0, 0.3), (1.0, 0.0)]
    #     vd_env = DiscreteActionWrapper(vd_env, discrete_mapping)

    # h2_mean, h2_std, h2_len = evaluate_env_episodes(
    #     model,
    #     vd_env,
    #     n_episodes=20,
    #     deterministic=True,
    # )

    # print(
    #     f"HELD-OUT 2 (VaryingDynamics): mean_reward={h2_mean:.2f} ± {h2_std:.2f}, "
    #     f"mean_ep_len={h2_len:.1f}"
    # )

    # pd.DataFrame(
    #     [{
    #         "algo": algo,
    #         "train_stage": stage["name"],
    #         "heldout_name": "VaryingDynamicsEnv",
    #         "mean_reward": h2_mean,
    #         "std_reward": h2_std,
    #         "mean_ep_len": h2_len,
    #     }]
    # ).to_csv(out_dir / "heldout_varying_metrics.csv", index=False)

    # vd_env.close()

    return out_dir


### 4.4. Visualization (plot metrics, learning curves, and display videos)

In [30]:
def plot_metrics(csv_path, title=None):
    """Plot four evaluation metrics against training timesteps on a 2x2 grid.

    Args:
        csv_path: Location of the metrics CSV. The columns ``total_timesteps``,
            ``mean_reward``, ``success_rate``, ``collision_rate`` and
            ``avg_speed`` are read from it.
        title: Optional figure-level title; skipped when falsy.

    Returns:
        None. When ``csv_path`` does not exist a notice is printed and
        nothing is plotted.
    """
    if not os.path.exists(csv_path):
        print('CSV not found:', csv_path)
        return

    metrics = pd.read_csv(csv_path)
    fig, grid = plt.subplots(2, 2, figsize=(12, 8))

    # (CSV column, panel heading) pairs, placed row by row on the grid.
    panels = (
        ('mean_reward', 'Mean reward'),
        ('success_rate', 'Success rate'),
        ('collision_rate', 'Collision rate'),
        ('avg_speed', 'Avg speed'),
    )
    for axis, (column, heading) in zip(grid.flatten(), panels):
        axis.plot(metrics['total_timesteps'], metrics[column], marker='o')
        axis.set_title(heading)

    if title:
        fig.suptitle(title)
    plt.tight_layout()
    plt.show()


print('Plot helper ready')

Plot helper ready


## 5. Full experiment

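For each algorithm and each seed, call `train_noncurriculum` once per stage in `STAGES`, using that stage's own `budget` as the timestep budget.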


In [None]:
# Algorithms to sweep in this run; add more names (e.g. "DQN") to widen it.
algos = ["SAC"]

for algo in algos:
    for seed in SEEDS:
        # Banner separating the log output of each (algorithm, seed) pair.
        print("=" * 80, f"ALGO={algo}  SEED={seed}", sep="\n")

        # One independent training call per stage, each with that stage's
        # own timestep budget.
        for stage in STAGES:
            run_kwargs = dict(
                algo=algo,
                stage=stage,
                total_timesteps=stage["budget"],
                seed=seed,
                n_envs=N_ENVS,
            )
            train_noncurriculum(**run_kwargs)

ALGO=SAC  SEED=0
Training NON-CURRICULUM: SAC C0_Straight seed 0


------------------------------------------------------------

Eval num_timesteps=10000, episode_reward=10.08 +/- 0.00
Episode length: 119.00 +/- 0.00
New best mean reward!
Eval num_timesteps=20000, episode_reward=-12.81 +/- 0.00
Episode length: 941.00 +/- 0.00
Eval num_timesteps=30000, episode_reward=-11.19 +/- 0.00
Episode length: 325.00 +/- 0.00
Eval num_timesteps=40000, episode_reward=-3.19 +/- 0.00
Episode length: 1000.00 +/- 0.00
Eval num_timesteps=50000, episode_reward=-3.97 +/- 0.00
Episode length: 1000.00 +/- 0.00
Eval num_timesteps=60000, episode_reward=-3.09 +/- 0.00
Episode length: 1000.00 +/- 0.00
Eval num_timesteps=70000, episode_reward=-3.23 +/- 0.00
Episode length: 1000.00 +/- 0.00
Eval num_timesteps=80000, episode_reward=-0.73 +/- 0.00
Episode length: 1000.00 +/- 0.00
Eval num_timesteps=90000, episode_reward=-1.70 +/- 0.00
Episode length: 1000.00 +/- 0.00
Eval num_timesteps=100000, episode_reward=93.99 +

## 6. Visualization

In [34]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path
import os

# Root folder holding every experiment artifact; point it elsewhere if your
# runs were written under a different directory name.
EXPERIMENT_ROOT = Path("experiments")

# Destination for the figures saved by the plotting helpers in this cell.
# Created up front (including missing parents) so savefig never hits a
# missing directory.
CURVES_DIR = EXPERIMENT_ROOT / "curves"
os.makedirs(CURVES_DIR, exist_ok=True)

def load_eval_npz(eval_npz_path: Path):
    """
    Load an SB3 EvalCallback ``evaluations.npz`` archive and summarise it.

    The archive is opened in a ``with`` block so the underlying file handle
    is released as soon as the arrays have been read; a bare ``np.load``
    leaves the ``.npz`` open until garbage collection.

    Args:
        eval_npz_path: Path to the ``.npz`` file. It must contain the keys
            ``timesteps``, ``results`` and ``ep_lengths``.

    Returns:
        Tuple ``(timesteps, mean_rewards, std_rewards, mean_ep_len)``:
        the flattened timesteps plus the per-evaluation mean/std of
        ``results`` and mean of ``ep_lengths`` (all reduced over axis 1).
    """
    with np.load(eval_npz_path) as data:
        # Arrays are materialised on access, so they stay valid after close.
        timesteps = data["timesteps"].flatten()          # [n_eval]
        results = data["results"]                        # [n_eval, n_episodes]
        ep_lengths = data["ep_lengths"]                  # [n_eval, n_episodes]

    mean_rewards = results.mean(axis=1)
    std_rewards = results.std(axis=1)
    mean_ep_len = ep_lengths.mean(axis=1)

    return timesteps, mean_rewards, std_rewards, mean_ep_len


def plot_stage_curves(algo, seed_name, stage_name, eval_npz_path):
    """
    Render the two per-stage evaluation figures and write them to CURVES_DIR.

    Figures produced (file names are prefixed with algo, seed and stage):
      - ``*_reward.png``: eval mean return vs timesteps with a +/- 1 std band
      - ``*_ep_len.png``: mean episode length vs timesteps

    Args:
        algo: Algorithm label used in titles and file names.
        seed_name: Seed label used in titles and file names.
        stage_name: Stage label used in titles and file names.
        eval_npz_path: Archive passed to ``load_eval_npz``.
    """
    steps, reward_mean, reward_std, length_mean = load_eval_npz(eval_npz_path)

    # 1) Reward curve
    reward_fig, reward_ax = plt.subplots(figsize=(6, 4))
    reward_ax.plot(steps, reward_mean, marker="o")
    reward_ax.fill_between(
        steps,
        reward_mean - reward_std,
        reward_mean + reward_std,
        alpha=0.2,
    )
    reward_ax.set_xlabel("Timesteps")
    reward_ax.set_ylabel("Eval mean return")
    reward_ax.set_title(f"{algo} {seed_name} – {stage_name} (eval reward)")
    reward_ax.grid(True)
    reward_fig.tight_layout()
    reward_fig.savefig(
        CURVES_DIR / f"{algo}_{seed_name}_{stage_name}_reward.png", dpi=150
    )
    plt.close(reward_fig)

    # 2) Episode length curve
    length_fig, length_ax = plt.subplots(figsize=(6, 4))
    length_ax.plot(steps, length_mean, marker="o")
    length_ax.set_xlabel("Timesteps")
    length_ax.set_ylabel("Mean episode length")
    length_ax.set_title(f"{algo} {seed_name} – {stage_name} (episode length)")
    length_ax.grid(True)
    length_fig.tight_layout()
    length_fig.savefig(
        CURVES_DIR / f"{algo}_{seed_name}_{stage_name}_ep_len.png", dpi=150
    )
    plt.close(length_fig)


def plot_combined_curves(algo, seed_name, stage_to_npz):
    """
    Overlay all stages of one algo+seed on two shared figures.

    Figures written to CURVES_DIR:
      - ``{algo}_{seed_name}_ALL_reward.png``: eval mean return per stage
      - ``{algo}_{seed_name}_ALL_ep_len.png``: mean episode length per stage

    Args:
        algo: Algorithm label used in titles and file names.
        seed_name: Seed label used in titles and file names.
        stage_to_npz: dict {stage_name: eval_npz_path}. Curves are drawn in
            the dict's iteration order, which also fixes the legend order.
    """
    # Read every archive exactly once; both figures reuse the same arrays
    # instead of hitting the disk a second time for the episode lengths.
    curves = {
        stage_name: load_eval_npz(npz_path)
        for stage_name, npz_path in stage_to_npz.items()
    }

    # Reward
    plt.figure(figsize=(7, 5))
    for stage_name, (t, mean_r, _std_r, _mean_len) in curves.items():
        plt.plot(t, mean_r, marker="o", label=stage_name)

    plt.xlabel("Timesteps")
    plt.ylabel("Eval mean return")
    plt.title(f"{algo} {seed_name} – eval reward (all stages)")
    plt.legend()
    plt.grid(True)
    plt.tight_layout()

    out_path = CURVES_DIR / f"{algo}_{seed_name}_ALL_reward.png"
    plt.savefig(out_path, dpi=150)
    plt.close()

    # Episode length
    plt.figure(figsize=(7, 5))
    for stage_name, (t, _mean_r, _std_r, mean_len) in curves.items():
        plt.plot(t, mean_len, marker="o", label=stage_name)

    plt.xlabel("Timesteps")
    plt.ylabel("Mean episode length")
    plt.title(f"{algo} {seed_name} – episode length (all stages)")
    plt.legend()
    plt.grid(True)
    plt.tight_layout()

    out_path = CURVES_DIR / f"{algo}_{seed_name}_ALL_ep_len.png"
    plt.savefig(out_path, dpi=150)
    plt.close()


def plot_heldout_bars(algo, seed_name, seed_dir: Path):
    """
    If held-out CSVs exist for this algo+seed, make bar plots:
      - heldout_scrrxo_metrics.csv across stages
      - heldout_varying_metrics.csv across stages

    Args:
        algo: Algorithm label used in titles and file names.
        seed_name: Seed label used in titles and file names.
        seed_dir: Directory whose non-hidden sub-directories are the stages.

    Each figure is only produced when at least one stage has the
    corresponding CSV; otherwise that figure is skipped.
    """
    # Path.iterdir() yields entries in arbitrary order, so sort to keep the
    # bar order identical from run to run and across filesystems.
    stage_dirs = sorted(
        d for d in seed_dir.iterdir()
        if d.is_dir() and not d.name.startswith(".")
    )

    # ----- Held-out 1: SCrRXO -----
    _plot_heldout_metric(
        algo, seed_name, stage_dirs, "heldout_scrrxo_metrics.csv", "SCrRXO"
    )

    # ----- Held-out 2: VaryingDynamics -----
    _plot_heldout_metric(
        algo, seed_name, stage_dirs, "heldout_varying_metrics.csv", "VaryingDynamics"
    )


def _plot_heldout_metric(algo, seed_name, stage_dirs, csv_name, label):
    """Bar-plot the first-row ``mean_reward`` of ``csv_name`` for each stage dir.

    Stages without the CSV (or with an empty one) are skipped; when no stage
    contributes a row, no figure is created.
    """
    rows = []
    for sd in stage_dirs:
        csv_path = sd / csv_name
        if not csv_path.exists():
            continue
        df = pd.read_csv(csv_path)
        if df.empty:
            continue
        # Only the first row is used for the bar of this stage.
        r = df.iloc[0].to_dict()
        r["stage"] = sd.name
        rows.append(r)

    if not rows:
        return

    df_heldout = pd.DataFrame(rows)
    plt.figure(figsize=(6, 4))
    plt.bar(df_heldout["stage"], df_heldout["mean_reward"])
    plt.xlabel("Training stage")
    plt.ylabel("Held-out mean reward")
    plt.title(f"{algo} {seed_name} – Held-out {label} performance")
    plt.grid(axis="y")
    plt.tight_layout()
    out_path = CURVES_DIR / f"{algo}_{seed_name}_heldout_{label}.png"
    plt.savefig(out_path, dpi=150)
    plt.close()


In [35]:
# ======================================================================
# MAIN: walk experiments/ and generate all possible plots
# ======================================================================
# Path.iterdir() yields entries in arbitrary, filesystem-dependent order.
# Every level is sorted so the processing log, the legend order and the line
# colours of the combined plots are reproducible from run to run.

for algo_dir in sorted(EXPERIMENT_ROOT.iterdir()):
    if not algo_dir.is_dir():
        continue
    algo = algo_dir.name  # e.g. "PPO", "SAC", "DQN"

    # Folders without a noncurriculum/ sub-folder are not algorithm results.
    noncurr_dir = algo_dir / "noncurriculum"
    if not noncurr_dir.exists():
        continue

    for seed_dir in sorted(noncurr_dir.iterdir()):
        if not seed_dir.is_dir() or not seed_dir.name.startswith("seed_"):
            continue
        seed_name = seed_dir.name  # e.g. "seed_0"

        print(f"Processing {algo} / {seed_name} ...")

        # collect per-stage npz paths for combined plots
        stage_to_npz = {}

        # per-stage plots
        for stage_dir in sorted(seed_dir.iterdir()):
            if not stage_dir.is_dir():
                continue
            stage_name = stage_dir.name  # e.g. "C0_Straight"

            # Stages that have no evaluation archive are skipped.
            eval_npz_path = stage_dir / "eval" / "evaluations.npz"
            if not eval_npz_path.exists():
                continue

            # individual plots
            plot_stage_curves(algo, seed_name, stage_name, eval_npz_path)
            stage_to_npz[stage_name] = eval_npz_path

        # combined curves across stages
        if stage_to_npz:
            plot_combined_curves(algo, seed_name, stage_to_npz)

        # held-out bar plots
        plot_heldout_bars(algo, seed_name, seed_dir)

print("All curves saved under:", CURVES_DIR)

Processing SAC / seed_0 ...
All curves saved under: experiments/curves


In [36]:
import os
import glob
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tensorboard.backend.event_processing.event_accumulator import EventAccumulator
from pathlib import Path

# Root of the TensorBoard logs; every sub-directory of it is handed to
# plot_stage() by the loop at the bottom of this cell.
BASE = Path("experiments/tensorboard")

def load_tb_scalars(event_file):
    """Read every scalar series from one TensorBoard event file.

    Args:
        event_file: Path (``str`` or path-like) of the event file; it is
            converted to ``str`` before being given to ``EventAccumulator``.

    Returns:
        dict mapping each scalar tag to a ``(steps, values)`` pair of lists.
    """
    accumulator = EventAccumulator(str(event_file))
    accumulator.Reload()

    series_by_tag = {}
    for tag in accumulator.Tags()["scalars"]:
        records = accumulator.Scalars(tag)
        series_by_tag[tag] = (
            [record.step for record in records],
            [record.value for record in records],
        )
    return series_by_tag


def plot_scalar(steps, values, title, out_path):
    """Save a line plot of ``values`` against ``steps`` to ``out_path``.

    ``title`` is used both as the figure title and as the y-axis label.
    The figure is closed after saving so repeated calls do not pile up
    open figures.
    """
    fig, ax = plt.subplots(figsize=(8, 4))
    ax.plot(steps, values)
    ax.set_xlabel("Timesteps")
    ax.set_ylabel(title)
    ax.set_title(title)
    ax.grid(True)
    fig.tight_layout()
    fig.savefig(out_path)
    plt.close(fig)

def plot_stage(stage_path):
    """Plot every TensorBoard scalar found under ``stage_path``.

    One PNG per scalar tag is written to ``stage_path / "curves"``, with
    ``/`` in the tag replaced by ``_`` for the file name.

    Args:
        stage_path: Directory searched recursively for
            ``events.out.tfevents.*`` files.

    Returns:
        None. When no event file exists a notice is printed and nothing is
        created on disk.
    """
    print(f"\nProcessing stage: {stage_path.name}")

    # Find tensorboard event file. glob() order is filesystem-dependent, so
    # sort to make the chosen file reproducible.
    event_files = sorted(stage_path.glob("**/events.out.tfevents.*"))
    if not event_files:
        print("No TB file found for eval.")
        return
    tb_file = event_files[0]
    if len(event_files) > 1:
        # Only one file is plotted; say which, so the choice is not silent.
        print(f"Found {len(event_files)} TB files; using {tb_file.name}")
    tb_scalars = load_tb_scalars(tb_file)

    # Create the output folder only once there is something to write, so
    # directories without logs are not littered with empty curves/ folders.
    curves_dir = stage_path / "curves"
    curves_dir.mkdir(exist_ok=True)

    # Plot all general scalars
    for tag, (steps, values) in tb_scalars.items():
        clean_name = tag.replace("/", "_")
        out = curves_dir / f"{clean_name}.png"
        plot_scalar(steps, values, f"{stage_path.name}: {tag}", out)

    print(f"Saved all curves for {stage_path.name} → {curves_dir}")


# =============================
# PROCESS EACH STAGE
# =============================
# Every sub-directory of BASE is plotted, in sorted order.
all_stage_dirs = sorted(filter(Path.is_dir, BASE.glob("*")))

for stage_dir in all_stage_dirs:
    plot_stage(stage_dir)

print("\nFinished generating all plots.")



Processing stage: SAC_1
Saved all curves for SAC_1 → experiments/tensorboard/SAC_1/curves

Processing stage: SAC_2
Saved all curves for SAC_2 → experiments/tensorboard/SAC_2/curves

Processing stage: SAC_3
Saved all curves for SAC_3 → experiments/tensorboard/SAC_3/curves

Processing stage: SAC_4
Saved all curves for SAC_4 → experiments/tensorboard/SAC_4/curves

Finished generating all plots.


In [37]:
import shutil

# Bundle the whole experiments/ tree into experiments_SAC.zip, written to the
# current working directory, for download from the notebook host.
shutil.make_archive(
    base_name="experiments_SAC",
    format="zip",
    root_dir="experiments",
)

print("Zipped to experiments_SAC.zip")

Zipped to experiments_SAC.zip
