# NonCurriculum_MetaDrive_SB3_Experiments

This notebook contains a full, reproducible experiment pipeline for **non-curriculum** reinforcement learning for autonomous driving, using **MetaDrive** and **Stable Baselines3 (SB3)**. It includes:

- Full environment factory and wrappers (including a discrete-action wrapper for DQN).
- Exact stage definitions (C0..C3) and matching budgets.
- Non-curriculum runner (train each target map separately for the same total sample budget).
- Evaluation harness (metrics logging, CSV saving, TensorBoard integration, video recording).
- Hyperparameters and experiment folder conventions.



## 0. Setup


In [1]:
!pip uninstall -y metadrive metadrive-simulator metadrive-simulator-py3-12 || true


[0m

In [2]:
# !pip install stable-baselines3[extra] gymnasium metadrive numpy pandas matplotlib tensorboard opencv-python
# !pip install stable-baselines3[extra] tensorboard opencv-python
!pip install -q "stable-baselines3[extra]" "metadrive-simulator-py3-12" tensorboard opencv-python

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/46.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m46.8/46.8 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m55.0/55.0 MB[0m [31m17.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m55.5/55.5 MB[0m [31m11.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m187.2/187.2 kB[0m [31m18.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m256.2/256.2 kB[0m [31m26.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m99.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for progressbar (setup.py) ... [?25l[?25hdone


In [3]:
import metadrive

# Sanity check after (re)installing: show which MetaDrive package the kernel
# actually imported, one attribute per output line.
print(
    *(
        f"metadrive.{attr} : {getattr(metadrive, attr, None)}"
        for attr in ("__file__", "__path__")
    ),
    sep="\n",
)




metadrive.__file__ : /usr/local/lib/python3.12/dist-packages/metadrive/__init__.py
metadrive.__path__ : ['/usr/local/lib/python3.12/dist-packages/metadrive']


In [None]:
# !git clone https://github.com/metadriverse/metadrive.git
# %cd metadrive
# !pip install -e .


In [None]:
# !python -m metadrive.pull_asset


In [None]:
# !python -m metadrive.examples.profile_metadrive


In [None]:
# %cd /content


In [4]:
import metadrive
import inspect
import os

# Report the import location again and list the top-level entries of the
# installed package directory.
print("Using metadrive from:", metadrive.__file__)
print("Contents:", os.listdir(os.path.split(metadrive.__file__)[0]))


Using metadrive from: /usr/local/lib/python3.12/dist-packages/metadrive/__init__.py
Contents: ['pull_asset.py', 'constants.py', 'type.py', '__init__.py', 'render_pipeline', 'component', 'obs', 'policy', 'manager', 'shaders', 'scenario', 'base_class', 'examples', '__pycache__', 'tests', 'version.py', 'utils', 'engine', 'envs', 'third_party']


## 1. Imports

In [5]:
import os
import time
import json
import math
import random
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import csv
import logging

# SB3 imports
from metadrive.envs.metadrive_env import MetaDriveEnv
from metadrive.envs.varying_dynamics_env import VaryingDynamicsEnv
from stable_baselines3 import PPO, SAC, DQN
from stable_baselines3.common.callbacks import EvalCallback, CheckpointCallback, BaseCallback
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.vec_env import DummyVecEnv, SubprocVecEnv
from stable_baselines3.common.evaluation import evaluate_policy

# MetaDrive import guard
# try:
# from metadrive.envs.metadrive_env import MetaDriveEnv
# except Exception as e:
    # MetaDriveEnv = None
    # print('MetaDrive import failed.')

Gym has been unmaintained since 2022 and does not support NumPy 2.0 amongst other critical functionality.
Please upgrade to Gymnasium, the maintained drop-in replacement of Gym, or contact the authors of your software and request that they upgrade.
See the migration guide at https://gymnasium.farama.org/introduction/migration_guide/ for additional information.
  return datetime.utcnow().replace(tzinfo=utc)


In [6]:
import warnings

warnings.filterwarnings("ignore")                       # ignore everything
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", module="stable_baselines3")
warnings.filterwarnings("ignore", module="gymnasium")


## 2. Experiment configuration
Setup: maps, stages, budgets, and hyperparameters.

In [7]:
### Maps, stages and budgets
# STAGES = [
#     ("C0_Straight", "Straight", 0.0, 200_000),
#     ("C1_Curve", "Curve", 0.0, 300_000),
#     ("C2_Roundabout","Roundabout",0.0,400_000),
#     ("C3_Dynamic", "20-block", 0.3, 400_000),
# ]

### Maps, curriculum stages and budgets (with reward configs)

# Stage definitions. Each entry is a dict with:
#   id / name : short and long identifiers of the stage
#   env_type  : only present on C0 and C1
#               (NOTE(review): not read by any code visible in this section — confirm use)
#   map       : MetaDrive map spec, forwarded as the env's "map" config
#               (a block string such as "S", or an int — presumably the number
#               of procedurally generated blocks; confirm against MetaDrive docs)
#   traffic   : forwarded as the env's "traffic_density" config
#   budget    : per-stage training budget (summed into TOTAL_CURRICULUM_BUDGET below)
#   reward    : config dict consumed by CurriculumRewardWrapper
STAGES = [
    # C0: straight road, no traffic – just learn to go forward safely.
    {
        "id": "C0",
        "name": "C0_Straight",
        "env_type": "general",
        "map": "S", # Straight
        "traffic": 0.0,
        "budget": 200_000,
        "reward": {
            "base_w": 1.0,              # scale MetaDrive base reward
            "speed_w": 0.05,            # weak speed shaping
            "max_speed_kmh": 80.0,
            "collision_penalty": -5.0,
            "offroad_penalty": -3.0,
            "traffic_violation_penalty": -2.0,
            "success_bonus": 10.0,
            "step_penalty": -0.001,
        },
    },

    # C1: roundabout, no traffic – topology harder, still single-ego.
    {
        "id": "C1",
        "name": "C1_Roundabout",
        "env_type": "general",
        "map": "O", # Roundabout
        "traffic": 0.0,
        "budget": 300_000,
        "reward": {
            "base_w": 1.0,
            "speed_w": 0.05,
            "max_speed_kmh": 80.0,
            "collision_penalty": -6.0,  # slightly harsher for bad manoeuvres
            "offroad_penalty": -4.0,
            "traffic_violation_penalty": -3.0,
            "success_bonus": 10.0,
            "step_penalty": -0.005,
        },
    },

    # C2: procedurally generated (PG) map with **light traffic** – first exposure to traffic.
    {
        "id": "C2",
        "name": "C2_LightTraffic",
        "map": 10, # NOTE(review): previously labelled "20-block", but the value is 10 — confirm intended size
        "traffic": 0.05,               # light traffic
        "budget": 400_000,
        "reward": {
            "base_w": 1.0,
            "speed_w": 0.08,            # encourage moving at speed in traffic
            "max_speed_kmh": 80.0,
            "collision_penalty": -8.0,
            "offroad_penalty": -6.0,
            "traffic_violation_penalty": -4.0,
            "success_bonus": 12.0,
            "step_penalty": -0.01,
        },
    },

    # C3: PG map with **dense traffic** – “multi-agent-ish” final stage.
    # Still single learning ego, but many interacting vehicles (like CuRLA's
    # higher-traffic final curriculum stage).
    # NOTE(review): "map" is 20 here versus 10 in C2, so this is not literally
    # the same map spec as C2 — confirm which one is intended.
    {
        "id": "C3",
        "name": "C3_DenseTraffic",
        "map": 20,
        "traffic": 0.30,               # dense traffic ≈ mild multi-agent
        "budget": 400_000,
        "reward": {
            "base_w": 1.0,
            "speed_w": 0.10,
            "max_speed_kmh": 80.0,
            "collision_penalty": -10.0, # strong safety pressure
            "offroad_penalty": -8.0,
            "traffic_violation_penalty": -5.0,
            "success_bonus": 15.0,
            "step_penalty": -0.05,
        },
    },
]

# Sum of all per-stage budgets; printed so the total is visible in the notebook output.
TOTAL_CURRICULUM_BUDGET = sum(s["budget"] for s in STAGES)
print("Total curriculum budget (per algorithm) =", TOTAL_CURRICULUM_BUDGET)

Total curriculum budget (per algorithm) = 1300000


In [8]:
# Held-out scenario 1: a fixed block-sequence map ("SCrRXO") with traffic.
# It has the same keys as a STAGES entry (minus "env_type") and a zero budget,
# i.e. it is meant for evaluation only.
HELDOUT_SCENARIO_STAGE = dict(
    id="HELDOUT_SCENARIO",
    name="HELDOUT_SCrRXO_MedTraffic",
    # Straight -> Circular -> in-ramp -> out-ramp -> intersection -> roundabout
    map="SCrRXO",
    traffic=0.30,
    budget=0,
    # Same dict object as the last stage's reward config (shared, not copied),
    # so both are shaped identically.
    reward=STAGES[-1]["reward"],
)

# Held-out scenario 2: config for a VaryingDynamics environment (dynamics
# robustness test). No dynamics ranges are set here, so the environment's
# defaults apply.
VARYING_DYNAMICS_CONFIG = {
    "num_scenarios": 100,
    "map": 5,  # small PG map
    "log_level": logging.ERROR,
}

In [9]:
# Root directory for experiment outputs; created on first run if missing.
EXPERIMENT_ROOT = Path("experiments")
EXPERIMENT_ROOT.mkdir(parents=False, exist_ok=True)

# Run-level knobs.
SEEDS = [0]
N_ENVS = 1  # should be 8 but metadrive issues
EVAL_FREQ = 20_000
EVAL_EPISODES = 5

# Held-out test map, stored as a (map, traffic) pair.
# NOTE(review): confirm that "Fork" is a map spec MetaDrive accepts.
HELDOUT_MAP = ("Fork", 0.2)

In [10]:
# Hyperparameters per algorithm, keyed by algorithm name ("PPO", "SAC", "DQN").
HYPERS = {
    "PPO": dict(
        policy="MlpPolicy",
        policy_kwargs=dict(net_arch=[64, 64]),
        learning_rate=3e-4,
        n_steps=1024,  # reduced for compute reasons; 2048 is an option if training is fast enough
        batch_size=64,
        n_epochs=10,
        gamma=0.99,
        clip_range=0.2,
    ),
    "SAC": dict(
        policy="MlpPolicy",
        policy_kwargs=dict(net_arch=[256, 256]),
        learning_rate=3e-4,
        batch_size=256,
        buffer_size=100_000,
        gamma=0.99,
    ),
    "DQN": dict(
        policy="MlpPolicy",
        policy_kwargs=dict(net_arch=[64, 64]),
        learning_rate=1e-4,
        buffer_size=100_000,  # can be lowered to 50,000 if needed
        batch_size=32,
        train_freq=4,
    ),
}

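## 3. Environment wrappers

This section defines a discrete-action wrapper for DQN (it maps discrete action indices to continuous steer/throttle pairs) and a stage-dependent reward-shaping wrapper.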

In [11]:
import gymnasium as gym
from gymnasium import spaces

class DiscreteActionWrapper(gym.ActionWrapper):
    """
    Present a continuous-action env as a discrete one.

    `mapping` is an indexable collection of continuous actions; the wrapper's
    action space is `Discrete(len(mapping))`, and discrete action `i` is
    translated to `mapping[i]` before being passed to the wrapped env.
    """

    def __init__(self, env, mapping):
        super().__init__(env)
        self.mapping = mapping
        n_choices = len(mapping)
        self.action_space = spaces.Discrete(n_choices)

    def action(self, action):
        """Return the continuous action for a discrete index, as a float32 array."""
        continuous_action = self.mapping[action]
        return np.array(continuous_action, dtype=np.float32)

In [12]:
class CurriculumRewardWrapper(gym.Wrapper):
    """
    Stage-dependent reward shaping on top of the wrapped env's reward.

    The shaped reward for one step is the sum of:

    - ``base_w * base_reward``
    - ``speed_w * clip(speed, 0, max_speed_kmh) / max_speed_kmh``
    - ``step_penalty`` (every step)
    - ``collision_penalty`` if any of ``crash_vehicle`` / ``crash_object`` /
      ``crash_building`` is set in ``info``
    - ``offroad_penalty`` if ``info["out_of_road"]`` is set
    - ``traffic_violation_penalty`` if off-road OR ``traffic_light_violation``
      is set (so an off-road step receives both penalties)
    - ``success_bonus`` if the step is terminal and ``arrive_dest`` / ``success``
      is set

    Missing config keys fall back to neutral defaults (weight 1.0 for the base
    reward, 0.0 for every other term, 80.0 for ``max_speed_kmh``).

    The wrapper also writes these keys into ``info`` for downstream logging:
    ``collision``, ``traffic_violation``, ``success``, ``avg_speed`` and
    ``shaped_reward``.

    Parameters
    ----------
    env : the environment to wrap (must use the 5-tuple ``step`` API).
    reward_cfg : dict, the per-stage ``"reward"`` config from STAGES.

    Raises
    ------
    ValueError
        If ``max_speed_kmh`` is not strictly positive; it is used as a divisor
        when normalising the speed term.
    """
    def __init__(self, env, reward_cfg):
        super().__init__(env)
        self.cfg = reward_cfg
        self.max_speed = self.cfg.get("max_speed_kmh", 80.0)
        # Fail fast with a clear message instead of a ZeroDivisionError (or a
        # silently zeroed speed term) on the first step.
        if not self.max_speed > 0:
            raise ValueError(
                f"reward_cfg['max_speed_kmh'] must be > 0, got {self.max_speed!r}"
            )

    def step(self, action):
        """Step the wrapped env and return its transition with the shaped reward."""
        obs, base_r, terminated, truncated, info = self.env.step(action)

        # --- speed term ---
        # Speed is read from info["speed"], falling back to info["velocity"].
        # NOTE(review): the value is clipped against max_speed_kmh; confirm the
        # env reports speed in km/h rather than m/s.
        raw_speed = float(info.get("speed", info.get("velocity", 0.0)))
        speed = max(0.0, min(raw_speed, self.max_speed))
        speed_term = self.cfg.get("speed_w", 0.0) * (speed / self.max_speed)

        # --- start from scaled base reward + speed shaping ---
        r = self.cfg.get("base_w", 1.0) * base_r + speed_term

        # --- per-step cost (encourage finishing sooner) ---
        r += self.cfg.get("step_penalty", 0.0)

        # --- collision penalties ---
        crashed = (
            info.get("crash_vehicle", False)
            or info.get("crash_object", False)
            or info.get("crash_building", False)
        )
        if crashed:
            r += self.cfg.get("collision_penalty", 0.0)
        info["collision"] = bool(crashed)

        # --- off-road / traffic-violation penalties ---
        offroad = info.get("out_of_road", False)
        if offroad:
            r += self.cfg.get("offroad_penalty", 0.0)
        # Going off-road also counts as a traffic violation, so it is penalised twice.
        traffic_violation = bool(offroad or info.get("traffic_light_violation", False))
        if traffic_violation:
            r += self.cfg.get("traffic_violation_penalty", 0.0)
        info["traffic_violation"] = traffic_violation

        # --- success bonus at terminal step ---
        success = bool(info.get("arrive_dest", False) or info.get("success", False))
        if terminated and success:
            r += self.cfg.get("success_bonus", 0.0)
        info["success"] = success

        # For logging. Despite its name, "avg_speed" is this step's clipped
        # speed; averaging over the episode is left to the consumer.
        info["avg_speed"] = speed
        info["shaped_reward"] = r

        return obs, r, terminated, truncated, info

## 4. Functions

### 4.1 Environment factory

In [None]:
# from functools import partial

# def make_metadrive_env(map_name, traffic_density=0.0, use_discrete=False, seed=0, render=False):
#     def _init():
#         cfg = {
#             'map': map_name,
#             'traffic_density': traffic_density,
#             'use_render': False,
#             'start_seed': seed,
#             'random_spawn': True,
#             'debug': False,
#         }
#         env = MetaDriveEnv(cfg)
#         # Optionally wrap for DQN
#         if use_discrete:
#             mapping = [(-1.0,0.0),(-1.0,0.3),(0.0,0.5),(1.0,0.3),(1.0,0.0)]
#             env = DiscreteActionWrapper(env, mapping)
#         return Monitor(env)
#     return _init

In [None]:
# def make_vec_env(map_name, traffic_density=0.0, n_envs=8, use_discrete=False, seed=0, parallel=False):
#     factories = [make_metadrive_env(map_name, traffic_density, use_discrete, seed+i) for i in range(n_envs)]
#     if parallel:
#         return SubprocVecEnv(factories)
#     else:
#         return DummyVecEnv(factories)

In [13]:
import gymnasium as gym

class MetaDriveGymCompatibilityWrapper(gym.Wrapper):
    """
    Adapter that exposes the Gymnasium `reset(seed=..., options=...)` signature.

    `reset` accepts `options` but never forwards it to the wrapped env, and
    only forwards `seed` when one is given. `step` passes the wrapped env's
    5-tuple through unchanged.
    """

    def reset(self, *, seed=None, options=None):
        # `options` is intentionally dropped: the wrapped env does not take it.
        reset_kwargs = {} if seed is None else {"seed": seed}
        obs, info = self.env.reset(**reset_kwargs)
        return obs, info

    def step(self, action):
        # Unpack explicitly so an env returning a different arity fails here.
        obs, reward, terminated, truncated, info = self.env.step(action)
        return obs, reward, terminated, truncated, info

In [14]:
from functools import partial
import logging

def make_metadrive_env(stage, use_discrete=False, seed=0, render=False):
    """
    Build a zero-argument factory that creates one wrapped MetaDrive env.

    Parameters
    ----------
    stage : dict providing "map", "traffic" and "reward" (e.g. an entry of STAGES).
    use_discrete : if True, add DiscreteActionWrapper on top (used for DQN).
    seed : value passed as the env's "start_seed".
    render : value passed as the env's "use_render".

    Returns
    -------
    A callable that, when invoked, constructs
    MetaDriveEnv -> MetaDriveGymCompatibilityWrapper -> CurriculumRewardWrapper
    [-> DiscreteActionWrapper] -> Monitor.

    Raises
    ------
    KeyError
        Immediately (not inside the factory) if `stage` lacks a required key.
    """
    # Read the stage eagerly so a malformed stage fails at call time.
    map_name, traffic_density, reward_cfg = stage["map"], stage["traffic"], stage["reward"]

    def _init():
        env = MetaDriveEnv(
            dict(
                map=map_name,
                traffic_density=traffic_density,
                use_render=render,
                start_seed=seed,
                debug=False,
                log_level=logging.ERROR,
                # Cap episode length so evaluation cannot run forever, and
                # report hitting the cap as a regular episode end.
                horizon=1000,
                truncate_as_terminate=True,
            )
        )

        # Signature adapter first, then the stage-specific reward shaping.
        env = CurriculumRewardWrapper(MetaDriveGymCompatibilityWrapper(env), reward_cfg)

        if use_discrete:
            # Five discrete choices, each mapped to a 2-component continuous action.
            discrete_choices = [(-1.0, 0.0), (-1.0, 0.3), (0.0, 0.5), (1.0, 0.3), (1.0, 0.0)]
            env = DiscreteActionWrapper(env, discrete_choices)

        return Monitor(env)

    return _init


In [15]:
# def make_vec_env(stage, n_envs=8, use_discrete=False, seed=0, parallel=False):
#     factories = [
#         make_metadrive_env(stage, use_discrete=use_discrete, seed=seed + i)
#         for i in range(n_envs)
#     ]
#     if parallel:
#         return SubprocVecEnv(factories)
#     else:
#         return DummyVecEnv(factories)

# from stable_baselines3.common.vec_env import DummyVecEnv, SubprocVecEnv

# def make_vec_env(stage, n_envs=1, use_discrete=False, seed=0):
#     """
#     Create vectorized MetaDrive envs.
#     IMPORTANT: MetaDrive can only have one engine per process.
#     So:
#       - n_envs == 1  -> use DummyVecEnv (single process, single env)
#       - n_envs > 1   -> use SubprocVecEnv (one env per process)
#     """
#     if n_envs == 1:
#         return DummyVecEnv([
#             make_metadrive_env(stage, use_discrete=use_discrete, seed=seed)
#         ])
#     else:
#         env_fns = [
#             make_metadrive_env(stage, use_discrete=use_discrete, seed=seed + i)
#             for i in range(n_envs)
#         ]
#         return SubprocVecEnv(env_fns)

from stable_baselines3.common.vec_env import DummyVecEnv

def make_vec_env(stage, n_envs=1, use_discrete=False, seed=0):
    """
    Create the vectorized MetaDrive env used for training.

    MetaDrive allows only one engine per process, so:

    - ``n_envs == 1`` -> ``DummyVecEnv`` (the single env lives in the current
      process; this is the behaviour the notebook has always had).
    - ``n_envs > 1``  -> ``SubprocVecEnv`` (one env per worker process), with
      worker ``i`` seeded ``seed + i``.

    Previously ``n_envs`` was silently ignored and a single env was always
    created.

    Parameters
    ----------
    stage : stage dict passed through to ``make_metadrive_env``.
    n_envs : number of parallel environments (must be >= 1).
    use_discrete : passed through to ``make_metadrive_env``.
    seed : seed of the first environment.

    Raises
    ------
    ValueError
        If ``n_envs`` is smaller than 1.
    """
    if n_envs < 1:
        raise ValueError(f"n_envs must be >= 1, got {n_envs}")

    env_fns = [
        make_metadrive_env(stage, use_discrete=use_discrete, seed=seed + i)
        for i in range(n_envs)
    ]

    if n_envs == 1:
        return DummyVecEnv(env_fns)
    # More than one engine cannot share a process, so use worker subprocesses.
    return SubprocVecEnv(env_fns)

In [16]:
def make_eval_vec_env(stage, use_discrete=False, seed=0):
    """
    Create the single-env vectorized environment used for evaluation.

    The env is wrapped in a SubprocVecEnv even though there is only one
    worker, so the MetaDrive env for evaluation is constructed in a
    subprocess rather than in the main process.
    """
    return SubprocVecEnv(
        [make_metadrive_env(stage, use_discrete=use_discrete, seed=seed)]
    )


### 4.2 Logging per-evaluation metrics (success rate, collisions, speed, etc.)

In [None]:
# class MetricsCallback(BaseCallback):
#     """
#     Custom callback to compute and append evaluation metrics to CSV on each eval.
#     Assumes the eval_env yields episode info dicts with keys 'success','collision','avg_speed','traffic_violation'.
#     """
#     def __init__(self, eval_env, out_csv, eval_episodes=10, verbose=0):
#         super().__init__(verbose)
#         self.eval_env = eval_env
#         self.out_csv = out_csv
#         self.eval_episodes = eval_episodes
#         self._cols = ['timestamp','total_timesteps','mean_reward','std_reward','success_rate','collision_rate','avg_speed','traffic_violations']
#         if not os.path.exists(out_csv):
#             pd.DataFrame(columns=self._cols).to_csv(out_csv, index=False)

#     def _on_step(self) -> bool:
#         return True

#     def on_training_end(self):
#         pass

#     def record_eval(self, model, total_timesteps):
#         # run evaluation episodes and compute metrics
#         rewards = []
#         successes = []
#         collisions = []
#         speeds = []
#         traffic_viol = []
#         for ep in range(self.eval_episodes):
#             obs, _ = self.eval_env.reset()
#             done = False
#             ep_r = 0.0
#             while not done:
#                 action, _ = model.predict(obs, deterministic=True)
#                 obs, reward, terminated, truncated, info = self.eval_env.step(action)
#                 ep_r += reward
#                 done = terminated or truncated
#             rewards.append(ep_r)
#             info_ep = info if isinstance(info, dict) else {}
#             # fallback if env does not provide keys
#             successes.append(info_ep.get('success', 1.0 if ep_r>0 else 0.0))
#             collisions.append(info_ep.get('collision', 0.0))
#             speeds.append(info_ep.get('avg_speed', info_ep.get('speed', 0.0)))
#             traffic_viol.append(info_ep.get('traffic_violation', 0.0))

#         mean_r = np.mean(rewards)
#         std_r = np.std(rewards)
#         success_rate = np.mean(successes)
#         collision_rate = np.mean(collisions)
#         avg_speed = np.mean(speeds)
#         tv = np.mean(traffic_viol)
#         row = {
#             'timestamp': time.time(), 'total_timesteps': total_timesteps, 'mean_reward': mean_r, 'std_reward': std_r,
#             'success_rate': success_rate, 'collision_rate': collision_rate, 'avg_speed': avg_speed, 'traffic_violations': tv
#         }
#         df = pd.read_csv(self.out_csv)
#         df = df.append(row, ignore_index=True)
#         df.to_csv(self.out_csv, index=False)
#         return row

In [None]:
# class MetricsCallback(BaseCallback):
#     """
#     Run a short evaluation every eval_freq steps and log:
#       - mean_reward
#       - success_rate
#       - collision_rate
#       - traffic_violation_rate
#       - avg_speed
#       - avg_episode_length

#     Saves to a CSV at csv_path.
#     """

#     def __init__(self, eval_env, csv_path, eval_freq=50_000, eval_episodes=10, verbose=0):
#         super().__init__(verbose)
#         self.eval_env = eval_env
#         self.csv_path = csv_path
#         self.eval_freq = eval_freq
#         self.eval_episodes = eval_episodes

#         # Create dir if needed
#         os.makedirs(os.path.dirname(csv_path), exist_ok=True)

#         # Write header if file doesn't exist
#         if not os.path.exists(self.csv_path):
#             with open(self.csv_path, "w", newline="") as f:
#                 writer = csv.DictWriter(
#                     f,
#                     fieldnames=[
#                         "timesteps",
#                         "mean_reward",
#                         "success_rate",
#                         "collision_rate",
#                         "traffic_violation_rate",
#                         "avg_speed",
#                         "avg_episode_length",
#                     ],
#                 )
#                 writer.writeheader()

#     def _on_step(self) -> bool:
#         # Only evaluate every eval_freq calls
#         if self.n_calls % self.eval_freq != 0:
#             return True

#         rewards = []
#         successes = []
#         collisions = []
#         traffic_violations = []
#         speeds = []
#         ep_lengths = []

#         for _ in range(self.eval_episodes):
#             obs, _ = self.eval_env.reset()
#             done = False
#             truncated = False

#             ep_reward = 0.0
#             ep_success = False
#             ep_collision = False
#             ep_traffic_violation = False
#             ep_steps = 0
#             ep_speeds = []

#             while not (done or truncated):
#                 action, _ = self.model.predict(obs, deterministic=True)
#                 obs, r, done, truncated, info = self.eval_env.step(action)

#                 ep_reward += float(r)
#                 ep_steps += 1

#                 # flags from CurriculumRewardWrapper / MetaDrive info
#                 if info.get("success", False):
#                     ep_success = True
#                 if info.get("collision", False):
#                     ep_collision = True
#                 if info.get("traffic_violation", False):
#                     ep_traffic_violation = True

#                 if "avg_speed" in info:
#                     ep_speeds.append(float(info["avg_speed"]))
#                 elif "speed" in info:
#                     ep_speeds.append(float(info["speed"]))

#             rewards.append(ep_reward)
#             successes.append(1.0 if ep_success else 0.0)
#             collisions.append(1.0 if ep_collision else 0.0)
#             traffic_violations.append(1.0 if ep_traffic_violation else 0.0)
#             ep_lengths.append(ep_steps)
#             if ep_speeds:
#                 speeds.append(sum(ep_speeds) / len(ep_speeds))

#         mean_reward = float(sum(rewards) / len(rewards)) if rewards else 0.0
#         success_rate = float(sum(successes) / len(successes)) if successes else 0.0
#         collision_rate = float(sum(collisions) / len(collisions)) if collisions else 0.0
#         traffic_violation_rate = float(sum(traffic_violations) / len(traffic_violations)) if traffic_violations else 0.0
#         avg_speed = float(sum(speeds) / len(speeds)) if speeds else 0.0
#         avg_episode_length = float(sum(ep_lengths) / len(ep_lengths)) if ep_lengths else 0.0

#         row = {
#             "timesteps": int(self.num_timesteps),
#             "mean_reward": mean_reward,
#             "success_rate": success_rate,
#             "collision_rate": collision_rate,
#             "traffic_violation_rate": traffic_violation_rate,
#             "avg_speed": avg_speed,
#             "avg_episode_length": avg_episode_length,
#         }

#         with open(self.csv_path, "a", newline="") as f:
#             writer = csv.DictWriter(f, fieldnames=row.keys())
#             writer.writerow(row)

#         if self.verbose > 0:
#             print(f"[Metrics] t={self.num_timesteps}  succ={success_rate:.2f}  "
#                   f"coll={collision_rate:.2f}  len={avg_episode_length:.1f}")

#         return True

In [17]:
class MetricsCallback(BaseCallback):
    """
    Run a short evaluation every `eval_freq` callback calls and append one row
    to a CSV with:
      - timesteps
      - mean_reward
      - success_rate
      - collision_rate
      - traffic_violation_rate
      - avg_speed
      - avg_episode_length

    Parameters
    ----------
    eval_env : vectorized env (SB3 VecEnv API: `reset()` returns obs only and
        `step()` returns `(obs, rewards, dones, infos)`). Only the first
        sub-environment is evaluated.
    csv_path : path of the CSV file; its parent directory is created if
        needed and a header row is written if the file does not exist yet.
    eval_freq : evaluate every this many calls of the callback; values <= 0
        disable evaluation.
    eval_episodes : number of episodes per evaluation.
    verbose : if > 0, print a one-line summary after each evaluation.
    """

    # Column order of the CSV, shared by the header and every data row.
    FIELDNAMES = (
        "timesteps",
        "mean_reward",
        "success_rate",
        "collision_rate",
        "traffic_violation_rate",
        "avg_speed",
        "avg_episode_length",
    )

    def __init__(self, eval_env, csv_path, eval_freq=50_000, eval_episodes=10, verbose=0):
        super().__init__(verbose)
        self.eval_env = eval_env
        self.csv_path = csv_path
        self.eval_freq = eval_freq
        self.eval_episodes = eval_episodes

        # A bare file name has no directory part; os.makedirs("") would raise.
        parent_dir = os.path.dirname(csv_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

        # Write header if file doesn't exist
        if not os.path.exists(self.csv_path):
            with open(self.csv_path, "w", newline="") as f:
                csv.DictWriter(f, fieldnames=self.FIELDNAMES).writeheader()

    @staticmethod
    def _mean(values):
        """Arithmetic mean as a float; 0.0 for an empty sequence."""
        return float(sum(values) / len(values)) if values else 0.0

    @staticmethod
    def _first(batched):
        """Return the first sub-environment's entry of a vec-env output (scalars pass through)."""
        return np.asarray(batched).reshape(-1)[0]

    def _run_episode(self):
        """Play one deterministic episode on the first sub-environment and summarise it."""
        obs = self.eval_env.reset()
        done = False

        ep_reward = 0.0
        ep_success = False
        ep_collision = False
        ep_traffic_violation = False
        ep_steps = 0
        ep_speeds = []

        while not done:
            action, _ = self.model.predict(obs, deterministic=True)
            obs, rewards, dones, infos = self.eval_env.step(action)

            ep_reward += float(self._first(rewards))
            ep_steps += 1
            done = bool(self._first(dones))

            # infos is normally a list with one dict per sub-environment.
            if isinstance(infos, (list, tuple)) and len(infos) > 0:
                info = infos[0]
            else:
                info = infos
            if not isinstance(info, dict):
                continue

            # success: the reward wrapper's "success" OR the raw "arrive_dest" flag
            if info.get("success", False) or info.get("arrive_dest", False):
                ep_success = True

            # collision: the wrapper's "collision" OR any raw crash / out-of-road flag
            if (
                info.get("collision", False)
                or info.get("crash_vehicle", False)
                or info.get("crash_object", False)
                or info.get("crash_building", False)
                or info.get("out_of_road", False)
            ):
                ep_collision = True

            if info.get("traffic_violation", False):
                ep_traffic_violation = True

            # speed: prefer the wrapper's per-step value, else the raw "speed"
            if "avg_speed" in info:
                ep_speeds.append(float(info["avg_speed"]))
            elif "speed" in info:
                ep_speeds.append(float(info["speed"]))

        return {
            "reward": ep_reward,
            "success": ep_success,
            "collision": ep_collision,
            "traffic_violation": ep_traffic_violation,
            "length": ep_steps,
            # None marks an episode without any speed reading; it is then
            # left out of the speed average instead of counting as 0.
            "mean_speed": (sum(ep_speeds) / len(ep_speeds)) if ep_speeds else None,
        }

    def _on_step(self) -> bool:
        # Only evaluate every eval_freq calls; a non-positive eval_freq
        # disables evaluation instead of raising ZeroDivisionError.
        if self.eval_freq <= 0 or self.n_calls % self.eval_freq != 0:
            return True

        episodes = [self._run_episode() for _ in range(self.eval_episodes)]
        episode_speeds = [ep["mean_speed"] for ep in episodes if ep["mean_speed"] is not None]

        def rate(flag):
            # Fraction of episodes in which the given boolean flag was raised.
            return self._mean([1.0 if ep[flag] else 0.0 for ep in episodes])

        row = {
            "timesteps": int(self.num_timesteps),
            "mean_reward": self._mean([ep["reward"] for ep in episodes]),
            "success_rate": rate("success"),
            "collision_rate": rate("collision"),
            "traffic_violation_rate": rate("traffic_violation"),
            "avg_speed": self._mean(episode_speeds),
            "avg_episode_length": self._mean([ep["length"] for ep in episodes]),
        }

        with open(self.csv_path, "a", newline="") as f:
            csv.DictWriter(f, fieldnames=self.FIELDNAMES).writerow(row)

        if self.verbose > 0:
            print(f"[Metrics] t={self.num_timesteps}  succ={row['success_rate']:.2f}  "
                  f"coll={row['collision_rate']:.2f}  len={row['avg_episode_length']:.1f}")

        return True


In [18]:
from stable_baselines3.common.callbacks import EvalCallback

class PrettyEvalCallback(EvalCallback):
    """
    EvalCallback variant that frames its console output with separator lines.

    Printing behaviour:
      - One separator right before the first evaluation.
      - Nothing extra between evaluations.
      - One closing separator when training ends, but only if at least
        one evaluation was announced.

    All constructor arguments are forwarded unchanged to ``EvalCallback``.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self._started = False    # whether we have printed the first separator

    def _on_step(self) -> bool:
        # The `eval_freq > 0` guard matters: with eval_freq=0 the bare modulo
        # would raise ZeroDivisionError before the parent class gets a chance
        # to decide whether an evaluation is due.
        eval_due = self.eval_freq > 0 and self.n_calls % self.eval_freq == 0
        if eval_due and not self._started:
            # First evaluation of this run: print the top separator once.
            print("\n" + "-" * 60 + "\n")
            self._started = True

        return super()._on_step()

    def _on_training_end(self) -> None:
        # Close the framed section only if it was ever opened.
        if self._started:
            print("\n" + "-" * 60 + "\n")
        return super()._on_training_end()


In [19]:
def save_run_config(out_dir, algo, stage, seed):
    """
    Write the run configuration to ``<out_dir>/config.json`` so the run can be
    reproduced or inspected later.

    Args:
        out_dir: target directory (``pathlib.Path`` or plain string); it must
            already exist.
        algo: algorithm name, used as the key into ``HYPERS``.
        stage: stage dict; the keys 'id', 'name', 'map', 'traffic', 'budget'
            and 'reward' are copied into the config.
        seed: random seed of the run.

    Raises:
        KeyError: if ``algo`` is not in ``HYPERS`` or ``stage`` lacks one of
            the copied keys.
    """
    from pathlib import Path  # local import keeps this cell self-contained

    stage_keys = ("id", "name", "map", "traffic", "budget", "reward")
    cfg = {
        "algo": algo,
        "seed": seed,
        "stage": {key: stage[key] for key in stage_keys},
        "hyperparams": HYPERS[algo],
        "heldout_map": {"map": HELDOUT_MAP[0], "traffic": HELDOUT_MAP[1]},
    }

    # Serialize before touching the file so a failure cannot leave a truncated
    # config.json behind. default=str stores values JSON cannot represent
    # (e.g. classes inside policy_kwargs) as their string form instead of
    # raising TypeError.
    payload = json.dumps(cfg, indent=2, default=str)
    (Path(out_dir) / "config.json").write_text(payload)

### 4.3 Training functions

These functions create models, attach callbacks, and run training. Each run saves checkpoints, the best model, and CSV metrics.


In [None]:
# def make_model(algo, env, hyperparams):
#     if algo == 'PPO':
#         model = PPO(hyperparams['policy'], env, verbose=1, tensorboard_log=str(EXPERIMENT_ROOT/'tensorboard'),
#                     policy_kwargs=hyperparams['policy_kwargs'], learning_rate=hyperparams['learning_rate'],
#                     n_steps=hyperparams['n_steps'], batch_size=hyperparams['batch_size'], n_epochs=hyperparams['n_epochs'], gamma=hyperparams['gamma'])
#         return model
#     if algo == 'SAC':
#         model = SAC(hyperparams['policy'], env, verbose=1, tensorboard_log=str(EXPERIMENT_ROOT/'tensorboard'),
#                     policy_kwargs=hyperparams['policy_kwargs'], learning_rate=hyperparams['learning_rate'],
#                     batch_size=hyperparams.get('batch_size',256), buffer_size=hyperparams.get('buffer_size',100000), gamma=hyperparams.get('gamma',0.99))
#         return model
#     if algo == 'DQN':
#         model = DQN(hyperparams['policy'], env, verbose=1, tensorboard_log=str(EXPERIMENT_ROOT/'tensorboard'),
#                     policy_kwargs=hyperparams['policy_kwargs'], learning_rate=hyperparams['learning_rate'],
#                     buffer_size=hyperparams.get('buffer_size',50000), batch_size=hyperparams.get('batch_size',32), train_freq=hyperparams.get('train_freq',4))
#         return model
#     raise ValueError('Unknown algo')

In [20]:
import torch
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

def make_model(algo, env, hyperparams):
    """
    Build an untrained SB3 model for the requested algorithm.

    Args:
        algo: one of 'PPO', 'SAC' or 'DQN'.
        env: environment handed straight to the SB3 constructor.
        hyperparams: dict of settings. 'policy', 'policy_kwargs' and
            'learning_rate' are required for every algorithm. PPO also
            requires 'n_steps', 'batch_size', 'n_epochs' and 'gamma'.
            SAC and DQN read 'batch_size', 'buffer_size' and 'gamma'
            (plus 'train_freq' for DQN) and fall back to the defaults
            written below when a key is absent.

    Returns:
        The constructed PPO, SAC or DQN instance.

    Raises:
        KeyError: if a required hyperparameter is missing.
        ValueError: if ``algo`` is not one of the supported names.
    """
    # Settings shared by all three algorithms.
    common_kwargs = dict(
        verbose=0,  # make SB3 quiet in console
        tensorboard_log=str(EXPERIMENT_ROOT / 'tensorboard'),
        policy_kwargs=hyperparams['policy_kwargs'],
        learning_rate=hyperparams['learning_rate'],
    )

    if algo == 'PPO':
        return PPO(
            hyperparams['policy'],
            env,
            n_steps=hyperparams['n_steps'],
            batch_size=hyperparams['batch_size'],
            n_epochs=hyperparams['n_epochs'],
            gamma=hyperparams['gamma'],
            # NOTE(review): PPO is pinned to CPU while SAC/DQN follow DEVICE;
            # presumably intentional -- confirm.
            device="cpu",
            **common_kwargs,
        )

    if algo == 'SAC':
        return SAC(
            hyperparams['policy'],
            env,
            batch_size=hyperparams.get('batch_size', 256),
            buffer_size=hyperparams.get('buffer_size', 100_000),
            gamma=hyperparams.get('gamma', 0.99),
            device=DEVICE,
            **common_kwargs,
        )

    if algo == 'DQN':
        return DQN(
            hyperparams['policy'],
            env,
            buffer_size=hyperparams.get('buffer_size', 50_000),
            batch_size=hyperparams.get('batch_size', 32),
            train_freq=hyperparams.get('train_freq', 4),
            # Honour a configured discount factor like PPO and SAC do; without
            # this a 'gamma' entry in the DQN hyperparameters was silently ignored.
            gamma=hyperparams.get('gamma', 0.99),
            device=DEVICE,
            **common_kwargs,
        )

    raise ValueError('Unknown algo')


In [None]:
# def train_noncurriculum(algo, stage, total_timesteps, seed, n_envs=N_ENVS):
#     """
#     Non-curriculum baseline:
#     - Train *from scratch* on a single stage for TOTAL_CURRICULUM_BUDGET steps.
#     - Stage carries map, traffic and reward config.
#     """
#     map_name = stage["map"]
#     out_dir = EXPERIMENT_ROOT / f"{algo}/noncurriculum/seed_{seed}/{stage['name']}"
#     out_dir.mkdir(parents=True, exist_ok=True)
#     print("Training non-curriculum:", algo, stage["name"], "seed", seed)

#     use_discrete = (algo == "DQN")

#     # training envs
#     env = make_vec_env(stage, n_envs=n_envs, use_discrete=use_discrete, seed=seed)

#     # eval env (single copy)
#     eval_env_vec = make_vec_env(stage, n_envs=1, use_discrete=use_discrete, seed=seed + 100)
#     eval_env = eval_env_vec.env_fns[0]() if hasattr(eval_env_vec, "env_fns") else eval_env_vec

#     model = make_model(algo, env, HYPERS[algo])

#     # callbacks
#     eval_cb = EvalCallback(
#         eval_env,
#         best_model_save_path=str(out_dir / "best_model"),
#         log_path=str(out_dir / "eval"),
#         eval_freq=EVAL_FREQ,
#         n_eval_episodes=EVAL_EPISODES,
#         deterministic=True,
#     )
#     ckpt_cb = CheckpointCallback(
#         save_freq=EVAL_FREQ,
#         save_path=str(out_dir / "checkpoints"),
#         name_prefix="ckpt",
#     )
#     metrics_csv = out_dir / "metrics.csv"
#     metrics_cb = MetricsCallback(eval_env, str(metrics_csv), eval_episodes=EVAL_EPISODES)

#     # train
#     model.learn(total_timesteps=total_timesteps, callback=[eval_cb, ckpt_cb])
#     model.save(str(out_dir / "model.zip"))

#     # held-out evaluation (reuse same reward shaping config for fairness)
#     held_stage = {
#         "id": "HELDOUT",
#         "name": f"HELDOUT_{HELDOUT_MAP[0]}",
#         "map": HELDOUT_MAP[0],
#         "traffic": HELDOUT_MAP[1],
#         "budget": 0,
#         "reward": stage["reward"],  # same shaping as training stage
#     }
#     held_env_vec = make_vec_env(held_stage, n_envs=1, use_discrete=use_discrete, seed=seed + 500)
#     held_env = held_env_vec.env_fns[0]() if hasattr(held_env_vec, "env_fns") else held_env_vec
#     mean_reward, std_reward = evaluate_policy(model, held_env, n_eval_episodes=100)
#     pd.DataFrame([{"mean_reward": mean_reward, "std_reward": std_reward}]).to_csv(
#         out_dir / "heldout_metrics.csv", index=False
#     )

#     print("Non-curriculum training complete and saved to", out_dir)
#     return out_dir


In [None]:
# def train_noncurriculum(algo, stage, total_timesteps, seed, n_envs=N_ENVS):
#     """
#     Non-curriculum baseline:
#     - Train from scratch on a SINGLE (hard) stage for 'total_timesteps'.
#     - Evaluate on held-out map at the end.
#     """

#     out_dir = EXPERIMENT_ROOT / f"{algo}/noncurriculum/seed_{seed}/{stage['name']}"
#     out_dir.mkdir(parents=True, exist_ok=True)
#     print("Training NON-CURRICULUM:", algo, stage["name"], "seed", seed)

#     use_discrete = (algo == "DQN")

#     # training envs
#     env = make_vec_env(stage, n_envs=n_envs, use_discrete=use_discrete, seed=seed)

#     # eval env (for callbacks)
#     eval_env = env
#     # eval_env = make_eval_vec_env(stage, use_discrete=use_discrete, seed=seed + 100)
#     # eval_env = eval_env_vec.env_fns[0]() if hasattr(eval_env_vec, "env_fns") else eval_env_vec

#     model = make_model(algo, env, HYPERS[algo])

#     # callbacks
#     eval_cb = PrettyEvalCallback(
#         eval_env,
#         best_model_save_path=str(out_dir / "best_model"),
#         log_path=str(out_dir / "eval"),
#         eval_freq=EVAL_FREQ,
#         n_eval_episodes=EVAL_EPISODES,
#         deterministic=True,
#         verbose=1
#     )
#     ckpt_cb = CheckpointCallback(
#         save_freq=EVAL_FREQ,
#         save_path=str(out_dir / "checkpoints"),
#         name_prefix="ckpt",
#     )
#     metrics_csv = out_dir / "metrics.csv"
#     metrics_cb = MetricsCallback(
#         eval_env,
#         csv_path=str(metrics_csv),
#         eval_freq=EVAL_FREQ,
#         eval_episodes=EVAL_EPISODES,
#         verbose=0,
#     )

#     # save run config
#     save_run_config(out_dir, algo, stage, seed)

#     # train
#     model.learn(
#         total_timesteps=total_timesteps,
#         callback=[eval_cb, ckpt_cb, metrics_cb],
#     )
#     model.save(str(out_dir / "model.zip"))

#     # held-out evaluation
#     # held_stage = {
#     #     "id": "HELDOUT",
#     #     "name": f"HELDOUT_{HELDOUT_MAP[0]}",
#     #     "map": HELDOUT_MAP[0],
#     #     "traffic": HELDOUT_MAP[1],
#     #     "budget": 0,
#     #     "reward": stage["reward"],  # use same shaping as training stage
#     # }
#     # Held-out evaluation map: 20-block PG map with medium traffic
#     HELDOUT_STAGE = {
#         "id": "HELDOUT",
#         "name": "HELDOUT_20Block_MedTraffic",
#         "map": 20,                   # ✅ 20-block PG map (int, not "20-block")
#         "traffic": 0.15,             # medium traffic
#         "budget": 0,                 # no training budget, eval only
#         # reuse the hardest-stage reward shaping so it's comparable
#         "reward": STAGES[-1]["reward"],
#     }

#     # held_env = make_eval_vec_env(held_stage, use_discrete=use_discrete, seed=seed + 500)
#     # held_env = held_env_vec.env_fns[0]() if hasattr(held_env_vec, "env_fns") else held_env_vec

#     # mean_reward, std_reward = evaluate_policy(model, held_env, n_eval_episodes=100)
#     # pd.DataFrame([{"mean_reward": mean_reward, "std_reward": std_reward}]).to_csv(
#         # out_dir / "heldout_metrics.csv", index=False
#     # )

#     env.close()
#     eval_env.close()
#     # held_env.close()

#     print("Non-curriculum training complete and saved to", out_dir)
#     return out_dir


In [21]:
def evaluate_env_episodes(model, env, n_episodes=20, deterministic=True):
    """
    Roll out ``n_episodes`` complete episodes on one non-vectorized env.

    The env must follow the 5-tuple step API used below
    (obs, reward, done, truncated, info) and return (obs, info) from reset().
    Actions come from ``model.predict(obs, deterministic=deterministic)``.

    Returns:
        (mean_reward, std_reward, mean_ep_len) as plain Python floats,
        computed over the per-episode return and step count.
    """
    episode_returns, episode_steps = [], []

    for _episode in range(n_episodes):
        observation, _reset_info = env.reset()
        total_return, n_steps = 0.0, 0

        # Every episode takes at least one step; stop on termination or truncation.
        while True:
            action, _state = model.predict(observation, deterministic=deterministic)
            observation, reward, terminated, cut_short, _info = env.step(action)
            total_return += float(reward)
            n_steps += 1
            if terminated or cut_short:
                break

        episode_returns.append(total_return)
        episode_steps.append(n_steps)

    returns_arr = np.array(episode_returns, dtype=np.float32)
    steps_arr = np.array(episode_steps, dtype=np.float32)
    mean_return = float(returns_arr.mean())
    std_return = float(returns_arr.std())
    mean_steps = float(steps_arr.mean())
    return mean_return, std_return, mean_steps


In [22]:
def train_noncurriculum(algo, stage, total_timesteps, seed, n_envs=1):
    """
    Non-curriculum baseline: train from scratch on a SINGLE stage, then run a
    held-out evaluation.

    Steps:
      1. Build the training vec env and a fresh model for ``algo``.
      2. Train for ``total_timesteps`` with PrettyEvalCallback (evaluating on
         the training env itself) and CheckpointCallback attached.
      3. Save the final model to ``model.zip`` and close the training env.
      4. Evaluate on HELDOUT_SCENARIO_STAGE for 20 episodes and write
         ``heldout_scrrxo_metrics.csv``.
    The second held-out evaluation (VaryingDynamicsEnv) is currently disabled;
    its code is kept commented out at the end of the function.

    Args:
        algo: algorithm name; 'DQN' switches the envs to discrete actions.
        stage: stage dict describing the training environment; ``stage['name']``
            names the output folder.
        total_timesteps: training budget passed to ``model.learn``.
        seed: seed for the training env; the held-out env uses ``seed + 1000``.
        n_envs: number of parallel training envs.

    Returns:
        Path of the output directory
        ``EXPERIMENT_ROOT/<algo>/noncurriculum/seed_<seed>/<stage name>``.
    """

    out_dir = EXPERIMENT_ROOT / f"{algo}/noncurriculum/seed_{seed}/{stage['name']}"
    out_dir.mkdir(parents=True, exist_ok=True)
    print(f"Training NON-CURRICULUM: {algo} {stage['name']} seed {seed}\n")

    use_discrete = (algo == "DQN")

    # ---- training env (single vec env) ----
    env = make_vec_env(stage, n_envs=n_envs, use_discrete=use_discrete, seed=seed)

    # try/finally: the training env must be released even when training fails,
    # otherwise the next run in the same kernel starts with a stale env open.
    try:
        model = make_model(algo, env, HYPERS[algo])

        # in-training eval (same env)
        # NOTE(review): evaluating on the training env resets it between
        # rollouts -- confirm this is acceptable for the reported metrics.
        eval_cb = PrettyEvalCallback(
            env,
            best_model_save_path=str(out_dir / "best_model"),
            log_path=str(out_dir / "eval"),
            eval_freq=EVAL_FREQ,
            n_eval_episodes=EVAL_EPISODES,
            deterministic=True,
            verbose=1,
        )

        ckpt_cb = CheckpointCallback(
            save_freq=EVAL_FREQ,
            save_path=str(out_dir / "checkpoints"),
            name_prefix="ckpt",
        )

        save_run_config(out_dir, algo, stage, seed)

        # ---- train ----
        model.learn(
            total_timesteps=total_timesteps,
            callback=[eval_cb, ckpt_cb],
        )
        model.save(str(out_dir / "model.zip"))

        print(f"\nNon-curriculum training complete and saved to {out_dir}")
    finally:
        # Close training vec env to avoid engine conflicts before new envs
        env.close()

    # =====================================================================
    # HELD-OUT 1: SCrRXO + medium traffic
    # =====================================================================
    print("\n[HELD-OUT 1] Evaluating on SCrRXO (fixed 6-block) with medium traffic...")

    # build a *single* env instance using same wrappers
    held1_make = make_metadrive_env(
        HELDOUT_SCENARIO_STAGE,
        use_discrete=use_discrete,
        seed=seed + 1000,
        render=False,
    )
    held1_env = held1_make()

    # Release the held-out env even if evaluation raises.
    try:
        # evaluate_policy yields (mean, std) only; the manual
        # evaluate_env_episodes helper would additionally give h1_len.
        h1_mean, h1_std = evaluate_policy(
            model,
            held1_env,
            n_eval_episodes=20,
            deterministic=True,
        )
    finally:
        held1_env.close()

    print(
        f"HELD-OUT 1 (SCrRXO): mean_reward={h1_mean:.2f} ± {h1_std:.2f}, "
        # f"mean_ep_len={h1_len:.1f}"
    )

    pd.DataFrame(
        [{
            "algo": algo,
            "train_stage": stage["name"],
            "heldout_name": HELDOUT_SCENARIO_STAGE["name"],
            "mean_reward": h1_mean,
            "std_reward": h1_std,
            # "mean_ep_len": h1_len,
        }]
    ).to_csv(out_dir / "heldout_scrrxo_metrics.csv", index=False)

    # =====================================================================
    # HELD-OUT 2: VaryingDynamicsEnv  (disabled)
    # =====================================================================
    # print("\n[HELD-OUT 2] Evaluating on VaryingDynamicsEnv (randomized dynamics)...")

    # # build varying dynamics env
    # vd_env = VaryingDynamicsEnv(VARYING_DYNAMICS_CONFIG)
    # vd_env = MetaDriveGymCompatibilityWrapper(vd_env)

    # if use_discrete:
    #     # same discrete mapping you used for training DQN
    #     discrete_mapping = [(-1.0, 0.0), (-1.0, 0.3), (0.0, 0.5), (1.0, 0.3), (1.0, 0.0)]
    #     vd_env = DiscreteActionWrapper(vd_env, discrete_mapping)

    # h2_mean, h2_std, h2_len = evaluate_env_episodes(
    #     model,
    #     vd_env,
    #     n_episodes=20,
    #     deterministic=True,
    # )

    # print(
    #     f"HELD-OUT 2 (VaryingDynamics): mean_reward={h2_mean:.2f} ± {h2_std:.2f}, "
    #     f"mean_ep_len={h2_len:.1f}"
    # )

    # pd.DataFrame(
    #     [{
    #         "algo": algo,
    #         "train_stage": stage["name"],
    #         "heldout_name": "VaryingDynamicsEnv",
    #         "mean_reward": h2_mean,
    #         "std_reward": h2_std,
    #         "mean_ep_len": h2_len,
    #     }]
    # ).to_csv(out_dir / "heldout_varying_metrics.csv", index=False)

    # vd_env.close()

    return out_dir


### 4.4. Visualization (plot metrics, learning curves, and display videos)

In [23]:
def plot_metrics(csv_path, title=None):
    """
    Plot mean reward, success rate, collision rate and average speed against
    training timesteps from a metrics CSV, as a 2x2 grid.

    The x axis is read from the 'total_timesteps' column when present,
    otherwise from 'timesteps' (the key used for the metrics rows written by
    the metrics callback in this notebook).

    Args:
        csv_path: path of the metrics CSV. If the file does not exist a
            message is printed and nothing is plotted.
        title: optional figure-level title.

    Raises:
        KeyError: if neither timestep column, or one of the plotted metric
            columns, is present in the CSV.
    """
    if not os.path.exists(csv_path):
        print('CSV not found:', csv_path)
        return
    df = pd.read_csv(csv_path)

    # Accept both spellings of the timestep column.
    x_col = next((c for c in ('total_timesteps', 'timesteps') if c in df.columns), None)
    if x_col is None:
        raise KeyError(
            f"no timestep column ('total_timesteps' or 'timesteps') in {csv_path}"
        )

    panels = [
        ('mean_reward', 'Mean reward'),
        ('success_rate', 'Success rate'),
        ('collision_rate', 'Collision rate'),
        ('avg_speed', 'Avg speed'),
    ]

    fig, axs = plt.subplots(2, 2, figsize=(12, 8))
    for ax, (column, label) in zip(axs.flatten(), panels):
        ax.plot(df[x_col], df[column], marker='o')
        ax.set_title(label)
        ax.set_xlabel(x_col)
    if title:
        fig.suptitle(title)
    plt.tight_layout()
    plt.show()

# Confirmation message so the cell output shows that plot_metrics was defined.
print('Plot helper ready')

Plot helper ready


## 5. Toy pilot run

Small pilot run to validate pipeline.


In [None]:
# # run one tiny non-curriculum SAC pilot for 2000 steps on STAGES[2]

# out = train_noncurriculum('SAC', STAGES[2], total_timesteps=2000, seed=0, n_envs=1)
# print('Pilot saved at:', out)

## 6. Full experiment

For each algorithm and each seed, train a separate model from scratch on every stage.


In [None]:
# Full experiment sweep: one independent training run per
# (algorithm, seed, stage) combination, executed sequentially.
# Algorithms are processed in this order; DQN runs use the discrete-action
# path inside train_noncurriculum.
algos = ["SAC", "DQN", "PPO"]

for algo in algos:
    for seed in SEEDS:
        # Banner printed once per (algo, seed) pair, before its stages.
        print("=" * 80)
        print(f"ALGO={algo}  SEED={seed}")

        for stage in STAGES:
            # Each stage is trained from scratch with its own budget.
            # NOTE(review): the notebook intro describes a matched total sample
            # budget per target map; here every run uses stage["budget"] --
            # confirm this is the intended budget.
            train_noncurriculum(
                algo=algo,
                stage=stage,
                # total_timesteps=5000,
                total_timesteps=stage["budget"],  # use this stage's budget
                seed=seed,
                n_envs=N_ENVS,
            )

ALGO=SAC  SEED=0
Training NON-CURRICULUM: SAC C0_Straight seed 0


------------------------------------------------------------

Eval num_timesteps=20000, episode_reward=-0.13 +/- 0.00
Episode length: 1000.00 +/- 0.00
New best mean reward!
Eval num_timesteps=40000, episode_reward=-3.24 +/- 0.00
Episode length: 1000.00 +/- 0.00
Eval num_timesteps=60000, episode_reward=1.46 +/- 0.00
Episode length: 1000.00 +/- 0.00
New best mean reward!
Eval num_timesteps=80000, episode_reward=-0.76 +/- 0.00
Episode length: 1000.00 +/- 0.00
Eval num_timesteps=100000, episode_reward=1.50 +/- 0.00
Episode length: 1000.00 +/- 0.00
New best mean reward!
Eval num_timesteps=120000, episode_reward=1.70 +/- 0.00
Episode length: 1000.00 +/- 0.00
New best mean reward!
Eval num_timesteps=140000, episode_reward=1.32 +/- 0.00
Episode length: 1000.00 +/- 0.00
Eval num_timesteps=160000, episode_reward=1.50 +/- 0.00
Episode length: 1000.00 +/- 0.00
Eval num_timesteps=180000, episode_reward=2.26 +/- 0.00
Episode length: 

You can use the `pwd` command to see your current working directory and `ls -F` to list its contents. The root directory in Colab is usually `/content/`.

In [1]:
%%bash
# Print the current working directory, then list its entries
# (-F appends a type marker such as "/" for directories).
pwd
ls -F

/content
sample_data/


In [2]:
import os

# Report whether the Colab-side 'experiments' folder exists and, if it does,
# show what it contains.
experiment_path = os.path.join('/content', 'experiments')
if not os.path.exists(experiment_path):
    print(f"The 'experiments' directory was NOT found at: {experiment_path}")
else:
    print(f"The 'experiments' directory exists at: {experiment_path}")
    print("Contents of 'experiments' directory:")
    print(os.listdir(experiment_path))


The 'experiments' directory was NOT found at: /content/experiments


In [None]:
# algos = ['PPO','SAC','DQN']
# for algo in algos:
#     # non-curriculum: for each target map train for the total curriculum budget (to match sample budget)
#     for seed in SEEDS:
#         for (_, map_name, traffic, _) in STAGES:
#             train_noncurriculum(algo, map_name, traffic, total_timesteps=TOTAL_CURRICULUM_BUDGET, seed=seed, n_envs=N_ENVS)

In [None]:
# import shutil

# shutil.make_archive("experiments_backup", "zip", "experiments")

# print("Zipped to experiments_backup.zip")