In [None]:
%%capture
!pip install swig
!pip install gymnasium[box2d]
!pip install stable_baselines3

In [2]:
# NumPy
import numpy as np

# Gym
import gymnasium as gym

# Stable Baselines
import stable_baselines3
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv, VecNormalize

from stable_baselines3.common.utils import set_random_seed

In [3]:
# Define Environment Creation
def make_env(rank: int, seed: int = 50):
    def _init():
        env = gym.make("CarRacing-v2", continuous=True, render_mode="rgb_array")
        env = gym.wrappers.GrayScaleObservation(env)
        env = gym.wrappers.FrameStack(env, 4)
        env.reset(seed = np.random.randint(9999))
        return env
    set_random_seed(np.random.randint(9999))
    return _init

In [4]:
# Make Training Environment
num_cpu = 4
vec_env = DummyVecEnv([make_env(i) for i in range(num_cpu)])
vec_env = VecNormalize(vec_env, norm_reward=False)
_ = vec_env.reset()

In [5]:
# Train New Agent (!!!OVERWRITES EXISTING WEIGHTS!!!)
model = PPO("MlpPolicy", vec_env, verbose=1)
model.learn(total_timesteps=50000, progress_bar=True)

model.save("ppo_carracing")

Using cuda device


Output()

-----------------------------
| time/              |      |
|    fps             | 68   |
|    iterations      | 1    |
|    time_elapsed    | 119  |
|    total_timesteps | 8192 |
-----------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 66          |
|    iterations           | 2           |
|    time_elapsed         | 245         |
|    total_timesteps      | 16384       |
| train/                  |             |
|    approx_kl            | 0.008074686 |
|    clip_fraction        | 0.0815      |
|    clip_range           | 0.2         |
|    entropy_loss         | -4.22       |
|    explained_variance   | 0.133       |
|    learning_rate        | 0.0003      |
|    loss                 | 0.138       |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.00674    |
|    std                  | 0.985       |
|    value_loss           | 0.563       |
----------------------------------

In [6]:
# Load Agent & Perform Additional Training
model = PPO.load("ppo_carracing", env=vec_env)
model.learn(total_timesteps=50000, progress_bar=True)

model.save("ppo_carracing")

Output()

-----------------------------
| time/              |      |
|    fps             | 67   |
|    iterations      | 1    |
|    time_elapsed    | 120  |
|    total_timesteps | 8192 |
-----------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 64          |
|    iterations           | 2           |
|    time_elapsed         | 253         |
|    total_timesteps      | 16384       |
| train/                  |             |
|    approx_kl            | 0.015329387 |
|    clip_fraction        | 0.176       |
|    clip_range           | 0.2         |
|    entropy_loss         | -3.93       |
|    explained_variance   | 0.507       |
|    learning_rate        | 0.0003      |
|    loss                 | 0.32        |
|    n_updates            | 80          |
|    policy_gradient_loss | -0.00484    |
|    std                  | 0.892       |
|    value_loss           | 0.79        |
----------------------------------