In [1]:
import numpy as np
import torch

import gymnasium as gym

In [2]:
from stable_baselines3 import PPO
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.callbacks import EvalCallback

In [3]:
# Number of environment copies stepped together; passed as `n_envs` to make_vec_env.
NUM_CPU = 8
# Gymnasium id of the task used for training, evaluation and playback.
env_id = "MountainCarContinuous-v0"

In [5]:
# Vectorized training environments: NUM_CPU copies of `env_id` stepped together.
train_env = make_vec_env(env_id, n_envs=NUM_CPU)

# Evaluate on a dedicated environment. Handing `train_env` to EvalCallback would
# reset and step the training environments in the middle of rollout collection,
# corrupting the episodes PPO is learning from.
eval_env = make_vec_env(env_id, n_envs=1)
# NOTE: eval_freq counts callback calls (one per vectorized step), so with
# NUM_CPU training envs an evaluation runs every eval_freq * NUM_CPU timesteps.
eval_callback = EvalCallback(eval_env, best_model_save_path="./logs/",
                             log_path="./logs/", eval_freq=500,
                             deterministic=True, render=False)

# Algorithm hyperparameters are PPO constructor arguments; `learn()` only accepts
# run-control arguments (total_timesteps, callback, ...) and raises TypeError
# when given gae_lambda / gamma / n_epochs / n_steps.
# NOTE(review): n_steps=NUM_CPU gives a rollout of only NUM_CPU * NUM_CPU
# transitions per update -- kept as originally written, but confirm it is intended.
model = PPO("MlpPolicy", train_env,
            gae_lambda=0.98,
            gamma=0.99,
            n_epochs=4,
            n_steps=NUM_CPU,
            verbose=1,
            tensorboard_log="./logs/ppo_MountainCar_tensorboard/")
model.learn(total_timesteps=1_000_000, callback=eval_callback)
model.save("ppo_MountainCar")

del model # remove to demonstrate saving and loading

Using cuda device
Logging to ./logs/ppo_MountainCar_tensorboard/PPO_1
Eval num_timesteps=2000, episode_reward=-0.00 +/- 0.00
Episode length: 999.00 +/- 0.00
---------------------------------
| eval/              |          |
|    mean_ep_length  | 999      |
|    mean_reward     | -1.3e-05 |
| time/              |          |
|    total_timesteps | 2000     |
---------------------------------
New best mean reward!
Eval num_timesteps=4000, episode_reward=-0.00 +/- 0.00
Episode length: 999.00 +/- 0.00
---------------------------------
| eval/              |          |
|    mean_ep_length  | 999      |
|    mean_reward     | -1.3e-05 |
| time/              |          |
|    total_timesteps | 4000     |
---------------------------------
Eval num_timesteps=6000, episode_reward=-0.00 +/- 0.00
Episode length: 999.00 +/- 0.00
---------------------------------
| eval/              |          |
|    mean_ep_length  | 999      |
|    mean_reward     | -1.3e-05 |
| time/              |          |
|

In [8]:
# Single-environment VecEnv used only to watch the trained agent.
test_env = make_vec_env(env_id, n_envs=1)

# Reload the policy saved by the training cell.
model = PPO.load("ppo_MountainCar")

obs = test_env.reset()
try:
    for _ in range(1000):
        action, _states = model.predict(obs)
        # VecEnv API: step() returns batched (obs, rewards, dones, infos) and
        # resets finished environments automatically.
        obs, rewards, dones, info = test_env.step(action)
        # Request the "human" mode explicitly: a bare render() displays nothing
        # (it returns the frame as an array, or only warns when the env has no
        # render mode, depending on the SB3 version).
        test_env.render("human")
finally:
    # Release the environment and its viewer even if the loop is interrupted.
    test_env.close()
