<a href="https://colab.research.google.com/github/Rhodes-CS-comp377/comp377-colab-rl-training-example/blob/main/colab_training_overview.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This notebook provides an **Overview** and **Quick Start Guide** to help you begin applying "state-of-the-art" RL algorithms to solve complex environments (such as [Atari](https://gymnasium.farama.org/environments/atari/) and [Mujoco](https://gymnasium.farama.org/environments/mujoco/)).


We will be using the algorithm implementations provided in the [Stable Baselines 3](https://stable-baselines3.readthedocs.io/) library. These implementations allow us to train RL agents on any environment that conforms to the [Gymnasium](https://gymnasium.farama.org/) (formerly, "OpenAI Gym") environment specification.

This includes [custom environments](https://stable-baselines3.readthedocs.io/en/master/guide/custom_env.html) that you might create yourself.

## Sources of Additional Information


Gymnasium: https://gymnasium.farama.org/

Basic Usage: https://gymnasium.farama.org/content/basic_usage/

Stable Baselines3: https://stable-baselines3.readthedocs.io/

Stable Baselines3 RL Tutorial: https://github.com/araffin/rl-tutorial-jnrr19/tree/sb3

RL Tips-And-Tricks: https://stable-baselines3.readthedocs.io/en/master/guide/rl_tips.html


## Preliminaries

### Software Installation

The cell below uses `pip` to install the Gymnasium environments (including Atari environments) and Stable Baselines 3 onto our cloud-hosted machine, and `apt` to install the system packages needed for Box2D and for video rendering.

In [None]:
%%capture

# 1. Remove legacy gym (installed in Colab by default)
!pip uninstall -y gym

# environments
! pip install "gymnasium[box2d, atari, mujoco]"

! sudo apt install swig # needed for box2d

# RL algorithms
! pip install "stable-baselines3[extra]"

# For visualization
! apt-get update && apt-get install ffmpeg freeglut3-dev xvfb

### Imports

In [None]:
# primary package imports
import stable_baselines3
import gymnasium as gym
import numpy as np

import os
import re
import base64

from time import sleep
from pathlib import Path

# for pretty display in notebook
from IPython import display as ipythondisplay
from IPython.display import clear_output

from stable_baselines3.common import results_plotter
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.vec_env import VecVideoRecorder, DummyVecEnv
from stable_baselines3.common.callbacks import BaseCallback, CallbackList
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.results_plotter import load_results, ts2xy, plot_results

# suppresses deprecation warnings
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

# display library versions
print(f'stable_baselines3 version: {stable_baselines3.__version__}')
# `gym` is only the import alias; the package being reported is gymnasium
print(f'gymnasium version: {gym.__version__}')

### Checking Runtime Configuration (CPU or GPU)

In [None]:
import subprocess
import textwrap

# Detect a GPU by trying to run `nvidia-smi`. A missing executable (OSError)
# or a non-zero exit status (CalledProcessError) is reported as a CPU-only
# runtime.
try:
    smi_output = subprocess.check_output(
        ["nvidia-smi"], stderr=subprocess.STDOUT
    ).decode(errors="replace")
except (OSError, subprocess.CalledProcessError):
    print("Runtime hardware: CPU-only")
else:
    print("Runtime hardware: GPU")
    # Show the third line of the nvidia-smi output, but only if the output
    # actually has that many lines, so that an unexpected format cannot turn
    # a detected GPU into an error.
    smi_lines = smi_output.splitlines()
    if len(smi_lines) > 2:
        print(textwrap.shorten(smi_lines[2], width=120))

### Helper Code

In [None]:
def create_dirs(dirs):
  """Create each directory in `dirs` that does not exist yet.

  Missing parent directories are created as well. A message is printed for
  every directory that is created; existing paths are left untouched.
  """
  for path in dirs:
    if os.path.exists(path):
      continue
    os.makedirs(path)
    print(f"Created directory: {path}")

def create_env_instance(env_name, log_dir=None, wrap=True, **env_kwargs):
  """Create a Gymnasium environment, optionally wrapped in a `Monitor`.

  Args:
    env_name: Environment id passed to `gym.make`.
    log_dir: Passed to `Monitor` as its `filename` argument (only used when
      `wrap` is true).
    wrap: When true, the environment is wrapped in a `Monitor`.
    **env_kwargs: Extra keyword arguments forwarded to `gym.make`.

  Returns:
    The created environment (wrapped or bare).
  """
  raw_env = gym.make(env_name, **env_kwargs)

  # the Monitor wrapper provides additional functionality on top of the env
  return Monitor(raw_env, filename=log_dir) if wrap else raw_env

def show_env_info(env):
  """Print the name, spaces, step limit and determinism flag of `env`.

  The environment's spec is looked up again through `gym.spec` using the id
  of the unwrapped environment.
  """
  spec = gym.spec(env.unwrapped.spec.id)

  summary = (
    f'Environment Name: {spec.id}',
    f'Action Space: {env.action_space}',
    f'Observation Space: {env.observation_space}',
    f'Max Episode Steps: {spec.max_episode_steps}',
    f'Nondeterministic: {spec.nondeterministic}',
  )
  for line in summary:
    print(line)

def get_session_name(env, algorithm):
    """
    Build a filesystem-friendly session name from an environment and an
    SB3 algorithm.

    Args:
        env: Either an environment id string, a single (possibly wrapped)
            environment, or a vectorized environment exposing a non-empty
            `envs` list (in which case the first sub-environment is used).
        algorithm: An algorithm instance (e.g. `PPO(...)`), an algorithm
            class (e.g. `PPO`), or the algorithm name as a string.

    Returns:
        "<env name>_<algorithm name>" with every run of characters other
        than letters, digits, "_", "." and "-" replaced by a single "_".

    Example output: "CartPole-v1_PPO"
    """
    if isinstance(env, str):
        env_name = env
    else:
        # handle vectorized envs (DummyVecEnv, SubprocVecEnv, etc.)
        base_env = env
        if hasattr(env, "envs") and len(env.envs) > 0:
            base_env = env.envs[0]

        # unwrap if needed (Monitor, TimeLimit, etc.)
        base_env = getattr(base_env, "unwrapped", base_env)

        # try spec.id, fall back to class name
        spec = getattr(base_env, "spec", None)
        env_name = getattr(spec, "id", None) or type(base_env).__name__

    # Resolve the algorithm name. Strings and classes are handled explicitly
    # because type(...) of those would yield "str" or the metaclass name
    # instead of something like "PPO".
    if isinstance(algorithm, str):
        alg_name = algorithm
    elif isinstance(algorithm, type):
        alg_name = algorithm.__name__
    else:
        alg_name = type(algorithm).__name__   # e.g., "PPO", "TD3", "SAC"

    # combine and make filesystem-safe (e.g. "ALE/Pong-v5" -> "ALE_Pong-v5")
    raw_name = f"{env_name}_{alg_name}"
    return re.sub(r"[^A-Za-z0-9_.-]+", "_", raw_name)

class SaveOnBestTrainingRewardCallback(BaseCallback):
    """Save the model whenever the mean training reward reaches a new best.

    Every `check_freq` calls, the training results found in `log_dir` are
    loaded and the mean reward of the most recent (up to 100) episodes is
    compared with the best value seen so far. On improvement the model is
    saved to `save_dir/save_filename`.

    Args:
        check_freq: Number of callback calls between two checks.
        log_dir: Directory the training results are loaded from.
        save_dir: Directory the best model is saved into.
        save_filename: File name (within `save_dir`) used for the saved model.
        verbose: When greater than 0, progress is printed at every check.
    """

    def __init__(self, check_freq: int, log_dir: str, save_dir: str, save_filename: str, verbose=1):
        super().__init__(verbose)
        self.check_freq = check_freq
        self.log_dir = log_dir
        self.save_dir = save_dir
        self.save_path = os.path.join(save_dir, save_filename)
        # start at -inf so that the first measured mean always counts as best
        self.best_mean_reward = -np.inf

    def _on_step(self) -> bool:
        """Run the periodic check; always returns True."""
        # only do any work on every `check_freq`-th call
        if self.n_calls % self.check_freq != 0:
            return True

        # Retrieve training reward
        training_log = results_plotter.load_results(self.log_dir)
        timesteps, episode_rewards = results_plotter.ts2xy(training_log, "timesteps")
        if len(timesteps) == 0:
            # nothing has been logged yet
            return True

        # Mean training reward over the last 100 episodes
        recent_mean = np.mean(episode_rewards[-100:])
        chatty = self.verbose > 0

        if chatty:
            clear_output(wait=True)

            print(f"Num timesteps: {self.num_timesteps}")
            print(
                f"Best mean reward: {self.best_mean_reward:.2f} - Last mean reward per episode: {recent_mean:.2f}"
            )

        # New best model: remember the reward and save the agent
        if recent_mean > self.best_mean_reward:
            self.best_mean_reward = recent_mean
            if chatty:
                print(f"Saving new best model to {self.save_path}.zip")
            self.model.save(self.save_path)

        return True

def record_video(env_id, model, video_path, prefix="rl-video", video_length=500,
                 deterministic=False):
    """Record a video of `model` acting in a fresh instance of `env_id`.

    Args:
        env_id: Environment id passed to `gym.make`.
        model: Object whose `predict(obs, deterministic=...)` returns the
            action to take as the first element of its result.
        video_path: Folder the video is written to.
        prefix: Name prefix of the generated video file.
        video_length: Number of steps to record (and to run).
        deterministic: Forwarded to `model.predict`. The default (False)
            matches the previous behavior of calling `predict(obs)`.
    """
    eval_env = DummyVecEnv([lambda: gym.make(env_id, render_mode="rgb_array")])
    try:
        # start recording at the very first step
        eval_env = VecVideoRecorder(
            eval_env,
            video_folder=video_path,
            record_video_trigger=lambda step: step == 0,
            video_length=video_length,
            name_prefix=prefix
        )

        obs = eval_env.reset()
        for _ in range(video_length):
            action, _ = model.predict(obs, deterministic=deterministic)
            obs, _, _, _ = eval_env.step(action)
    finally:
        # always release the environment (and recorder), even when
        # predicting or stepping fails part-way through
        eval_env.close()

def show_videos(video_path, prefix="rl-video"):
    """Display every `<prefix>*.mp4` file found in `video_path` inline.

    Each video is embedded into the notebook output as a base64 data URI.
    Files are shown in sorted order so the output is reproducible. If no
    file matches, a short notice is printed instead of an empty display.

    Args:
        video_path: Folder to search for videos.
        prefix: Only files whose name starts with this prefix are shown.
    """
    # local import: the name `html` is commonly used for local variables
    from html import escape

    video_files = sorted(Path(video_path).glob(f"{prefix}*.mp4"))
    if not video_files:
        print(f"No videos matching '{prefix}*.mp4' found in {video_path}")
        return

    tags = []
    for mp4 in video_files:
        video_b64 = base64.b64encode(mp4.read_bytes()).decode("ascii")
        tags.append(
            """<video alt="{}" autoplay
                    loop controls style="height: 400px;">
                    <source src="data:video/mp4;base64,{}" type="video/mp4" />
                </video>""".format(
                # escape the path so that quotes or angle brackets in a file
                # name cannot break the surrounding markup
                escape(str(mp4)), video_b64
            )
        )
    ipythondisplay.display(ipythondisplay.HTML(data="<br>".join(tags)))

### Google Drive Configuration

Because training can take a LONG time, you will want to save the models you learn to Google Drive as you go.

If you have to take a break, you can load the model from Drive and start where you left off.

This will require you to grant the notebook access to your Google Drive and to create a directory within Google Drive in which to save your current model.

In [None]:
# mounts your Google Drive as a directory in your Colab virtual machine
# (the Drive contents become available under /content/drive)
from google.colab import drive
drive.mount("/content/drive")

In [None]:
# creates the top-level directory where all Google Drive content will be saved
# (exist_ok=True makes this cell safe to re-run)
drive_basedir='/content/drive/MyDrive/comp377/colab'
os.makedirs(drive_basedir, exist_ok=True)

In [None]:
! ls $drive_basedir

### Global Configuration

In [None]:
# contains temporary files created during training
train_dir = '/tmp/gym/model/train/'
tensorboard_dir = '/tmp/gym/tensorboard'

# directories (below drive_basedir) that will contain content generated
# during training
model_dir, video_dir, figure_dir = (
    os.path.join(drive_basedir, subdir)
    for subdir in ('models', 'videos', 'figures')
)

# make sure every directory exists before anything tries to write to it
for _path in (train_dir, tensorboard_dir, model_dir, video_dir, figure_dir):
    os.makedirs(_path, exist_ok=True)

In [None]:
# set up fake display; otherwise video rendering will fail
# (launches Xvfb as display :1 in the background, then points this process's
# DISPLAY environment variable at it)
os.system("Xvfb :1 -screen 0 1024x768x24 &")
os.environ['DISPLAY'] = ':1'

### Environments



A large number of pre-made environments are available to choose from. (See the [Gymnasium Docs](https://gymnasium.farama.org/).)

### Initializing the RL Algorithm



Stable Baselines 3 has a nice set of ready-to-use, modern [RL algorithms](https://stable-baselines3.readthedocs.io/en/master/guide/algos.html).

However, not all algorithms are suitable for every environment. Pay careful attention to the types of action spaces each algorithm supports (Box, Discrete, MultiDiscrete, or MultiBinary) and to whether it supports multiprocessing.

In [None]:
# import algorithms
# NOTE: Hindsight Experience Replay is not a standalone `HER` algorithm class
# in current Stable Baselines 3 releases; it is provided as the
# `HerReplayBuffer` replay buffer class, which is used together with the
# off-policy algorithms (e.g. SAC, TD3, DDPG, DQN).
from stable_baselines3 import PPO, TD3, A2C, SAC, DQN, DDPG, HerReplayBuffer

## Training with CPU Only

In [None]:
env_id = 'CartPole-v1'

In [None]:
# create an instance of the environment to use for training
# (train_dir is handed to the Monitor wrapper; render_mode is forwarded to
# gym.make)
env = create_env_instance(env_id, train_dir, render_mode='rgb_array')
show_env_info(env)

We'll be using the [Proximal Policy Optimization (PPO)](https://stable-baselines3.readthedocs.io/en/master/modules/ppo.html) algorithm for this demonstration. The first argument to the PPO initializer is the **type of neural network** it will use to approximate the policy function.

For this environment, we'll use a simple multilayer perceptron (`'MlpPolicy'`).

When learning directly from pixels, such as we might do when training on an Atari environment, it would likely be better to use a convolutional neural network (CNN) based policy `'CnnPolicy'`.

In [None]:
# create the PPO agent with an MLP policy for `env`; tensorboard_dir is passed
# as the location for TensorBoard logs
algorithm = PPO('MlpPolicy', env, verbose=0, tensorboard_log=tensorboard_dir)

In [None]:
# identifier for this training session based on environment and algorithm
# (used later as the file name for saved models and as the video prefix)
session_name = get_session_name(env, algorithm)

In [None]:
# identifier for this training session based on environment and algorithm
session_name = get_session_name(env, algorithm)

# change to true to load the model from disk
load_model = False
if load_model:
  algorithm.load(f'{model_dir}/{session_name}.zip')

### Evaluate Model (Before Training)

In [None]:
# evaluate the (still untrained) agent on a separate environment instance
eval_env = create_env_instance(env_id)
try:
  mean_reward, std_reward = evaluate_policy(algorithm, eval_env, n_eval_episodes=25)
finally:
  # release the evaluation environment once it is no longer needed
  eval_env.close()

print(f"mean_reward (before training):{mean_reward:.2f} +/- {std_reward:.2f}")

Show the result of the untrained agent's (essentially "random") behavior

In [None]:
# record 500 steps of the untrained agent into /tmp, then display every
# video in /tmp whose file name starts with the session name
record_video(env_id, algorithm, '/tmp', video_length=500, prefix=session_name)
show_videos('/tmp', session_name)

### Callbacks

"A callback is a set of functions that will be called at given stages of the training procedure. You can use callbacks to access internal state of the RL model during training. It allows one to do monitoring, auto saving, model manipulation, progress bars, etc." (Read more about callbacks [here](https://stable-baselines3.readthedocs.io/en/master/guide/callbacks.html).)

In this demonstration, we'll be using callbacks to monitor the average reward and save the learned `model`.

The term "model" is used here to refer to whatever parameters and functions the algorithm learned during training, which are used to construct a policy. In most modern RL algorithms, the learned models include some flavor of neural networks.



### Tensorboard

[Tensorboard](https://www.tensorflow.org/tensorboard) is a visualization tool that is broadly applicable to all types of machine learning. It is particularly useful for understanding what is going on during the training process and understanding how the optimization process affects the learned parameters (e.g., weights and biases in a neural network).

In [None]:
# Load the TensorBoard notebook extension
%load_ext tensorboard

In [None]:
# launches a tensorboard that reads data saved in $tensorboard_dir that was
# generated during the training session
%tensorboard --logdir $tensorboard_dir

### Training Loop

In [None]:
print(f'session_name: {session_name}')
print(f'saving models to {model_dir}')

In [None]:
n_training_steps = 50_000
n_steps_per_eval = 5_000

# callback that periodically checks the training reward and saves the best
# model seen so far into model_dir under the session name
best_model_callback = SaveOnBestTrainingRewardCallback(
  check_freq=n_steps_per_eval,
  log_dir=train_dir,
  save_dir=model_dir,
  save_filename=session_name,
)
callback_list = CallbackList([best_model_callback])

# start training loop (periodically invoking callbacks)
model = algorithm.learn(
  total_timesteps=n_training_steps,
  callback=callback_list,
)

### Plot Training Curve

In [None]:
# plot the training results logged in train_dir against the number of
# timesteps, titled with the environment id
results_plotter.plot_results([train_dir],
                             num_timesteps=n_training_steps,
                             x_axis=results_plotter.X_TIMESTEPS,
                             task_name=env.unwrapped.spec.id,
                             figsize=(8,6))

### Evaluate Model (After Training)

In [None]:
# evaluate the trained agent on a separate environment instance
eval_env = create_env_instance(env_id)
try:
  mean_reward, std_reward = evaluate_policy(model, eval_env, n_eval_episodes=25)
finally:
  # release the evaluation environment once it is no longer needed
  eval_env.close()

print(f"mean_reward (after training):{mean_reward:.2f} +/- {std_reward:.2f}")

### Visualizing Policy (After Training)

In [None]:
# filenames for videos will be prefixed with the session name
# (e.g., 'CartPole-v1_PPO')
record_video(env_id, model, video_dir, video_length=500, prefix=session_name)
show_videos(video_dir, session_name)

### Using the Trained Model

This environment uses the same Gymnasium interface that we used in lab3, so we
can use our trained model in exactly the same way! (**Yay for consistent APIs!**)

The environment has a `step()` method that takes an action and returns an `observation`, a `reward`, whether the episode has `terminated` or been `truncated`, and an `info` dictionary.

The `model` we trained earlier contains a policy, which we can use to select actions. In particular, `model.predict()` will return an action for the current environment `observation`.

In [None]:
def run(env_id, model, delay=0.1):
  """Run one episode of `env_id` using `model` to choose actions.

  After every step the step counter, observation, selected action, reward
  and termination flags are printed (replacing the previous step's output).

  Args:
    env_id: Environment id passed to `gym.make` (created with
      render_mode="human").
    model: Object whose `predict(obs, deterministic=True)` returns the action
      to take as the first element of its result.
    delay: Seconds to pause after each step; no pause when not positive.
  """
  eval_env = gym.make(env_id, render_mode="human")

  try:
    obs, info = eval_env.reset()

    terminated, truncated = False, False
    t = 1

    # the episode ends when the environment reports termination or truncation
    while not (terminated or truncated):
      action, _ = model.predict(obs, deterministic=True)
      obs, reward, terminated, truncated, info = eval_env.step(action)

      print(f't: {t}')
      print(f'observation: {obs}')
      print(f'selected action: {action}')
      print(f'reward: {reward}')
      print(f'terminated: {terminated}')
      print(f'truncated: {truncated}')

      # wait=True defers clearing until the next output arrives, so the
      # values printed above stay visible until the following step prints
      clear_output(wait=True)
      if delay > 0:
        sleep(delay)

      t += 1
  finally:
    # always release the environment, even if the episode is interrupted
    # or a step fails
    eval_env.close()

In [None]:
run(env_id, model)