In [1]:
# System packages for headless rendering: Xvfb (virtual X display),
# OpenGL bindings, and ffmpeg (video encoding).
# `python3-opengl` is used because `python-opengl` is the Python 2 package name
# and is not expected to exist on the Ubuntu jammy image this cell's output shows;
# with the correct name one update/install pass is enough.
!apt-get update
!apt-get install -y xvfb python3-opengl ffmpeg
# Python packages. %pip (rather than !pip) installs into the running kernel's environment.
# NOTE(review): versions are unpinned — pin them once a known-good set is confirmed.
%pip install pyvirtualdisplay
%pip install "gymnasium[mujoco]"       # Gymnasium + MuJoCo support
%pip install stable-baselines3         # RL library

0% [Working]            Get:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,632 B]
0% [Connecting to archive.ubuntu.com (91.189.91.81)] [Connecting to security.ubuntu.com (185.125.190                                                                                                    Get:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease [1,581 B]
Hit:3 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:4 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
Get:5 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
Get:6 https://r2u.stat.illinois.edu/ubuntu jammy InRelease [6,555 B]
Get:7 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  Packages [1,315 kB]
Get:8 http://archive.ubuntu.com/ubuntu jammy-backports InRelease [127 kB]
Hit:9 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:10 https://ppa.launchpadcontent.net/graphics-drivers/ppa

In [2]:
import gymnasium as gym
import os
import pandas as pd
import matplotlib.pyplot as plt

In [3]:
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv, SubprocVecEnv, VecNormalize, VecVideoRecorder
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.utils import set_random_seed

from pyvirtualdisplay import Display
from IPython.display import HTML
from base64 import b64encode

In [4]:
# Start virtual display for video rendering in Colab
# (a headless 800x600 screen, so "rgb_array" rendering has a display to attach to).
# NOTE(review): the variable name `display` shadows IPython's built-in `display()`
# helper for the rest of the kernel session — confirm nothing later relies on it.
display = Display(visible=0, size=(800, 600))
display.start()

  and should_run_async(code)


<pyvirtualdisplay.display.Display at 0x7d6f6448ba10>

In [5]:
# Gymnasium environment id used by the training and evaluation factories.
env_id = "InvertedDoublePendulum-v5"

# Monitor CSV logs: training and evaluation are kept in separate folders.
train_log_dir = "logs/train_logs"
eval_log_dir = "logs/eval_logs"

# Create both folders up front; exist_ok makes re-running this cell harmless.
for _log_dir in (train_log_dir, eval_log_dir):
    os.makedirs(_log_dir, exist_ok=True)

In [7]:
def make_custom_env(seed=0, max_episode_steps=1000):
    """Build one monitored, seeded training environment.

    Args:
        seed: Seed passed to the initial ``reset`` and embedded in the Monitor
            log filename (``monitor_train_<seed>.csv`` under ``train_log_dir``).
        max_episode_steps: Episode length cap enforced by Gymnasium's
            ``TimeLimit`` wrapper. Defaults to 1000.

    Returns:
        The environment wrapped in a Stable-Baselines3 ``Monitor``.
    """
    # Ask gym.make for the TimeLimit directly. The previous approach stripped
    # every default wrapper via `.unwrapped`, mutated `env.spec`, and then
    # re-wrapped by hand to reach the same step cap.
    env = gym.make(
        env_id,
        render_mode="rgb_array",
        max_episode_steps=max_episode_steps,
    )

    # Per-seed Monitor file so each parallel worker logs its own episodes.
    # NOTE(review): Monitor may append its own ".monitor.csv" suffix to this
    # path — confirm the on-disk filename before parsing it elsewhere.
    monitor_file = os.path.join(train_log_dir, f"monitor_train_{seed}.csv")
    env = Monitor(env, filename=monitor_file)

    # Seed the environment's RNG once; later un-seeded resets continue from it.
    env.reset(seed=seed)
    return env


In [8]:
num_envs = 4  # Number of parallel training environments

def make_env_fn(rank, base_seed=123):
    """Return a zero-argument factory that builds the environment for one worker.

    Each worker gets the seed ``base_seed + rank`` so parallel environments
    produce different experience.

    Args:
        rank: Index of the worker within the vectorized environment.
        base_seed: Seed of worker 0. Defaults to 123.

    Returns:
        A callable taking no arguments that returns
        ``make_custom_env(seed=base_seed + rank)``.
    """
    def _init():
        # Construction is deferred so the vectorized env can build it itself.
        return make_custom_env(seed=base_seed + rank)
    return _init

# One factory per worker (ranks 0..num_envs-1), handed to SubprocVecEnv for
# parallel rollout collection.
env_fns = list(map(make_env_fn, range(num_envs)))
vec_env = SubprocVecEnv(env_fns)

In [9]:
# Add observation normalization on top of the vectorized environment.
# The guard keeps this cell idempotent: it rebinds `vec_env` to its own output,
# so re-running it unguarded would stack a second VecNormalize on the first.
if not isinstance(vec_env, VecNormalize):
    vec_env = VecNormalize(
        vec_env,
        norm_obs=True,      # Normalize observations
        norm_reward=False,  # Keep rewards in original scale
        clip_obs=10.0,
    )

In [10]:
# PPO agent on the normalized vectorized environment.
# `seed` makes policy initialization and action sampling repeatable; 123 matches
# the base seed used for the per-worker environments.
model = PPO(
    policy="MlpPolicy",
    env=vec_env,
    verbose=1,
    seed=123,
    # A good start for MuJoCo-like tasks:
    n_steps=2048,
    batch_size=256,
    n_epochs=10,
    gamma=0.99,
    gae_lambda=0.95,
    learning_rate=3e-4,
    ent_coef=0.0,      # Entropy cost coefficient (can help exploration if > 0)
    clip_range=0.2,
)

Using cuda device




In [15]:
# Train for 5 million environment steps in total (summed over all workers).
total_timesteps = 5_000_000
model.learn(total_timesteps=total_timesteps)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
|    loss                 | 40.2         |
|    n_updates            | 3840         |
|    policy_gradient_loss | 0.00144      |
|    std                  | 0.0268       |
|    value_loss           | 292          |
------------------------------------------
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 500          |
|    ep_rew_mean          | 4.67e+03     |
| time/                   |              |
|    fps                  | 1250         |
|    iterations           | 385          |
|    time_elapsed         | 2521         |
|    total_timesteps      | 3153920      |
| train/                  |              |
|    approx_kl            | 0.0031298134 |
|    clip_fraction        | 0.0224       |
|    clip_range           | 0.2          |
|    entropy_loss         | 2.2          |
|    explained_variance   | 0.969        |
|    learning_rate        | 0.00

<stable_baselines3.ppo.ppo.PPO at 0x7d6e38b04850>

In [16]:
# Persist the trained policy and the VecNormalize wrapper state;
# make_eval_env reloads "vec_normalize.pkl" to reuse the same statistics.
# NOTE(review): the "ppo_reacher" name looks carried over from another task —
# env_id in this notebook is InvertedDoublePendulum-v5; confirm before renaming.
model.save("ppo_reacher")
vec_env.save("vec_normalize.pkl")

In [17]:
def make_eval_env(seed=999, vecnormalize_path="vec_normalize.pkl", max_episode_steps=1000):
    """Build the single-environment evaluation VecEnv with frozen normalization.

    Args:
        seed: Seed passed to the initial ``reset`` of the evaluation environment.
        vecnormalize_path: File written by ``VecNormalize.save`` during training.
            It is unpickled on load, so only pass files you created yourself.
        max_episode_steps: Episode length cap enforced by Gymnasium's
            ``TimeLimit`` wrapper. Defaults to 1000.

    Returns:
        A ``VecNormalize``-wrapped ``DummyVecEnv`` holding one monitored
        environment, with ``training`` and ``norm_reward`` both set to False.
    """
    # Ask gym.make for the TimeLimit directly instead of unwrapping the env,
    # mutating its spec, and re-wrapping by hand.
    test_env = gym.make(
        env_id,
        render_mode="rgb_array",
        max_episode_steps=max_episode_steps,
    )

    # Evaluation episodes are logged separately from the training logs.
    monitor_file_eval = os.path.join(eval_log_dir, "monitor_eval.csv")
    test_env = Monitor(test_env, filename=monitor_file_eval)

    test_env.reset(seed=seed)

    # Reuse the saved normalization statistics. training=False stops them from
    # being updated, and rewards stay in their original scale.
    eval_vec_env = DummyVecEnv([lambda: test_env])
    eval_vec_env = VecNormalize.load(vecnormalize_path, eval_vec_env)
    eval_vec_env.training = False
    eval_vec_env.norm_reward = False

    return eval_vec_env


# Video output settings for the evaluation rollout.
video_folder = "videos"
video_length = 5000 #10000  # Number of timesteps for the video
os.makedirs(video_folder, exist_ok=True)

# Build the evaluation environment and wrap it so that recording starts at
# step 0 and covers `video_length` timesteps.
eval_env = VecVideoRecorder(
    make_eval_env(),
    video_folder=video_folder,
    record_video_trigger=lambda step: step == 0,
    video_length=video_length,
    name_prefix="reacher_test",
)

obs = eval_env.reset()

episode_count = 1
episode_timesteps = 0

# Because we are using a vectorized environment with only 1 environment inside,
# done will be an array of shape (1,), so we'll check dones[0].
try:
    for _step in range(video_length):
        action, _state = model.predict(obs, deterministic=True)
        obs, rewards, dones, infos = eval_env.step(action)
        episode_timesteps += 1

        if dones[0]:
            print(f"Episode {episode_count} ended after {episode_timesteps} timesteps.")
            episode_count += 1
            episode_timesteps = 0
            # No manual reset here: a VecEnv resets a finished environment by
            # itself, and `obs` already holds the first observation of the next
            # episode. Resetting again would throw that observation away.
finally:
    # Closing the recorder writes out any video still in progress and releases
    # the environment, even if the rollout is interrupted.
    eval_env.close()

Episode 1 ended after 54 timesteps.
Episode 2 ended after 1000 timesteps.
Episode 3 ended after 54 timesteps.
Episode 4 ended after 1000 timesteps.
Episode 5 ended after 1000 timesteps.
Episode 6 ended after 1000 timesteps.
Saving video to /content/videos/reacher_test-step-0-to-step-5000.mp4
Moviepy - Building video /content/videos/reacher_test-step-0-to-step-5000.mp4.
Moviepy - Writing video /content/videos/reacher_test-step-0-to-step-5000.mp4





Moviepy - Done !
Moviepy - video ready /content/videos/reacher_test-step-0-to-step-5000.mp4


In [18]:
import pandas as pd
import os
import re  # for parsing the seed from filenames

def export_training_rewards_to_excel(output_filename="training_rewards.xlsx", log_dir=None):
    """Collect per-episode training rewards from Monitor CSVs into one Excel sheet.

    Every file in ``log_dir`` whose name starts with ``monitor_train_`` and ends
    with ``.csv`` becomes one column named ``seed_<seed>``; rows are episode
    numbers starting at 1. Columns are ordered by numeric seed so the output
    does not depend on the arbitrary order returned by ``os.listdir``.

    Args:
        output_filename: Path of the Excel file to write.
        log_dir: Folder holding the Monitor CSVs. Defaults to the notebook-level
            ``train_log_dir``.

    Returns:
        None. Prints a message and writes nothing when no monitor files exist.
    """
    if log_dir is None:
        log_dir = train_log_dir

    # `search` (not a full match) is deliberate: it tolerates any extra suffix
    # after ".csv" in the on-disk name while still extracting the seed digits.
    seed_pattern = re.compile(r"monitor_train_(\d+)\.csv")

    labelled_files = []
    for filename in os.listdir(log_dir):
        if not (filename.startswith("monitor_train_") and filename.endswith(".csv")):
            continue
        match = seed_pattern.search(filename)
        if match:
            seed_str = match.group(1)  # e.g. "123"
            sort_key = (0, int(seed_str), filename)
        else:
            # Fallback if the filename doesn't match the pattern: label the
            # column with the whole filename and sort it after the seeded ones.
            seed_str = filename
            sort_key = (1, 0, filename)
        labelled_files.append((sort_key, seed_str, filename))

    if not labelled_files:
        print("No training monitor files found in", log_dir)
        return

    series_list = []
    for _sort_key, seed_str, filename in sorted(labelled_files):
        filepath = os.path.join(log_dir, filename)
        df = pd.read_csv(filepath, skiprows=1)  # skip the first metadata row

        # Only the 'r' column (total episode reward) is exported.
        rewards = df["r"].copy()
        rewards.name = f"seed_{seed_str}"

        # Index from 1..N episodes for clarity
        rewards.index = range(1, len(rewards) + 1)

        series_list.append(rewards)

    # Side-by-side columns; runs with fewer episodes are padded with NaN.
    combined_df = pd.concat(series_list, axis=1)

    combined_df.to_excel(output_filename, index_label="Episode")
    print(f"Training rewards exported to Excel: {output_filename}")


# Write the combined per-seed training rewards to the workbook.
export_training_rewards_to_excel(output_filename="training_rewards.xlsx")

Training rewards exported to Excel: training_rewards.xlsx
