In [1]:
import os
import numpy as np
import matplotlib.pyplot as plt

import gymnasium as gym

from stable_baselines3 import PPO
from stable_baselines3.common import results_plotter
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.callbacks import BaseCallback
from stable_baselines3.common.results_plotter import load_results, ts2xy
from stable_baselines3.common.monitor import Monitor


# PPO

## Create Callback

In [13]:
class SaveOnBestTrainingRewardCallback(BaseCallback):
    """
    Callback for saving a model (the check is done every ``check_freq`` steps)
    based on the training reward (in practice, we recommend using ``EvalCallback``).

    :param check_freq: (int)
    :param log_dir: (str) Path to the folder where the model will be saved.
      It must contains the file created by the ``Monitor`` wrapper.
    :param verbose: (int)
    """

    def __init__(self, check_freq: int, log_dir: str, verbose=1):
        super().__init__(verbose)
        self.check_freq = check_freq
        self.log_dir = log_dir
        self.save_path = os.path.join(log_dir, "best_model")
        self.best_mean_reward = -np.inf

    def _init_callback(self) -> None:
        # Create folder if needed
        if self.save_path is not None:
            os.makedirs(self.save_path, exist_ok=True)

    def _on_step(self) -> bool:
        if self.n_calls % self.check_freq == 0:

            # Retrieve training reward
            x, y = ts2xy(load_results(self.log_dir), "timesteps")
            if len(x) > 0:
                # Mean training reward over the last 100 episodes
                mean_reward = np.mean(y[-100:])
                if self.verbose > 0:
                    print(f"Num timesteps: {self.num_timesteps}")
                    print(
                        f"Best mean reward: {self.best_mean_reward:.2f} - Last mean reward per episode: {mean_reward:.2f}"
                    )

                # New best model, you could save the agent here
                if mean_reward > self.best_mean_reward:
                    self.best_mean_reward = mean_reward
                    # Example for saving best model
                    if self.verbose > 0:
                        print(f"Saving new best model to {self.save_path}.zip")
                    self.model.save(self.save_path)

        return True

## Plot Functions

In [3]:
def moving_average(values, window):
    """
    Smooth values by doing a moving average
    :param values: (numpy array)
    :param window: (int)
    :return: (numpy array)
    """
    weights = np.repeat(1.0, window) / window
    return np.convolve(values, weights, "valid")


def plot_results(log_folder, title="Learning Curve"):
    """
    plot the results

    :param log_folder: (str) the save location of the results to plot
    :param title: (str) the title of the task to plot
    """
    x, y = ts2xy(load_results(log_folder), "timesteps")
    #y = moving_average(y, window=50)
    # Truncate x
    #x = x[len(x) - len(y) :]

    fig = plt.figure(title)
    plt.plot(x, y)
    plt.xlabel("Number of Timesteps")
    plt.ylabel("Rewards")
    plt.title(title + " Smoothed")
    plt.show()

## Create Gif

In [4]:
import imageio
import numpy as np
from stable_baselines3 import A2C

def create_gif(env, model_path, path, name):
    model = A2C.load(model_path, env=env)
    images = []

    vec_env = model.get_env()
    obs = vec_env.reset()
    img = vec_env.render()
    for i in range(500):
        images.append(img)
        action, _ = model.predict(obs)
        obs, _, _ ,_ = vec_env.step(action)
        img = vec_env.render(mode='rgb_array')
    gif_name =  path + name + '.gif'
    imageio.mimsave(gif_name, [np.array(img) for i, img in enumerate(images) if i%2 == 0], fps=29)

## Normal parameters

In [6]:
# Create environment
env = gym.make("CarRacing-v2")

In [7]:
#setup callback

# Create log dir
log_dir = "tmp/"
os.makedirs(log_dir, exist_ok=True)

# Wrap the environment
env = Monitor(env, log_dir)

callback = SaveOnBestTrainingRewardCallback(check_freq=1000, log_dir=log_dir)

### Train agent

In [7]:
# Instantiate the agent
model = PPO("MlpPolicy", env, verbose=1)
# Train the agent and display a progress bar
timesteps = 90000
model.learn(total_timesteps=int(timesteps), callback=callback)


Using cpu device
Wrapping the env in a DummyVecEnv.
Wrapping the env in a VecTransposeImage.
Num timesteps: 1000
Best mean reward: -inf - Last mean reward per episode: -62.16
Saving new best model to tmp/best_model.zip
Num timesteps: 2000
Best mean reward: -62.16 - Last mean reward per episode: -59.84
Saving new best model to tmp/best_model.zip
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1e+03    |
|    ep_rew_mean     | -59.8    |
| time/              |          |
|    fps             | 38       |
|    iterations      | 1        |
|    time_elapsed    | 52       |
|    total_timesteps | 2048     |
---------------------------------
Num timesteps: 3000
Best mean reward: -59.84 - Last mean reward per episode: -58.92
Saving new best model to tmp/best_model.zip
Num timesteps: 4000
Best mean reward: -58.92 - Last mean reward per episode: -58.32
Saving new best model to tmp/best_model.zip
-----------------------------------------
| rollout/     

KeyboardInterrupt: 

: 

In [18]:
mean_reward, std_reward = evaluate_policy(model, model.get_env(), n_eval_episodes=5)

In [19]:
print(mean_reward, std_reward)

-49.3209038 29.719247237129636


In [15]:
# Helper from the library
results_plotter.plot_results(
    [log_dir], timesteps, results_plotter.X_TIMESTEPS, "PPO CarRacing-v2"
)

IndexError: index -1 is out of bounds for axis 0 with size 0

: 

: 

In [1]:
plot_results(log_dir)

NameError: name 'plot_results' is not defined

### Load pre-trained model

In [28]:
test_env = gym.make("CarRacing-v2", render_mode='human')
dir = os.path.join(log_dir, "best_model.zip")
model = PPO.load(dir, env=test_env)

Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Wrapping the env in a VecTransposeImage.


In [29]:
vec_env = model.get_env()
obs = vec_env.reset()
while True:
    action, _states = model.predict(obs)
    obs, rewards, done, info = vec_env.step(action)
    #obs, rewards, dones, info = test_env.step(action)
    vec_env.render()
    if done:
        break

vec_env.close()

In [42]:
test_env = gym.make("CarRacing-v2", render_mode='rgb_array')
dir = os.path.join(log_dir, "best_model")
create_gif(test_env, dir, log_dir, "ppo_car")

Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Wrapping the env in a VecTransposeImage.
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Wrapping the env in a VecTransposeImage.


## Discrete space

In [11]:
# Create environment
env = gym.make("CarRacing-v2", continuous=False)

### Train agent

In [12]:
# Instantiate the agent
model = PPO("MlpPolicy", env, verbose=1)
# Train the agent and display a progress bar
model.learn(total_timesteps=int(1000), callback=callback)

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Wrapping the env in a VecTransposeImage.
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1e+03    |
|    ep_rew_mean     | -58      |
| time/              |          |
|    fps             | 29       |
|    iterations      | 1        |
|    time_elapsed    | 69       |
|    total_timesteps | 2048     |
---------------------------------


In [13]:
mean_reward, std_reward = evaluate_policy(model, model.get_env(), n_eval_episodes=10)

In [14]:
print(mean_reward, std_reward)

-35.794415900000004 45.31521641763898


### Load pre-trained model

In [15]:
test_env = gym.make("CarRacing-v2", render_mode='human', continuous=False)
dir = os.path.join(log_dir, "best_model.zip")
model = PPO.load(dir, env=test_env)

Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Wrapping the env in a VecTransposeImage.


In [16]:
vec_env = model.get_env()
obs = vec_env.reset()
while True:
    action, _states = model.predict(obs)
    obs, rewards, done, info = vec_env.step(action)
    #obs, rewards, dones, info = test_env.step(action)
    vec_env.render()
    if done:
        break

vec_env.close()

In [None]:
test_env = gym.make("CarRacing-v2", render_mode='rgb_array', contunous=False)
dir = os.path.join(log_dir, "best_model")
create_gif(test_env, dir, log_dir, "ppo_car")

## Lap completion percentage change 

In [None]:
# Create environment
env = gym.make("CarRacing-v2", lap_complete_percentage=0.5)

### Train agent

In [None]:
# Instantiate the agent
model = PPO("MlpPolicy", env, verbose=1)
# Train the agent and display a progress bar
model.learn(total_timesteps=int(1000))

In [None]:
mean_reward, std_reward = evaluate_policy(model, model.get_env(), n_eval_episodes=10)

In [None]:
print(mean_reward, std_reward)

### Load pre-trained model

In [None]:
test_env = gym.make("CarRacing-v2", render_mode='human', continuous=False)
dir = os.path.join(log_dir, "best_model.zip")
model = PPO.load(dir, env=test_env)

In [None]:
vec_env = model.get_env()
obs = vec_env.reset()
while True:
    action, _states = model.predict(obs)
    obs, rewards, done, info = vec_env.step(action)
    #obs, rewards, dones, info = test_env.step(action)
    vec_env.render()
    if done:
        break

vec_env.close()

In [None]:
test_env = gym.make("CarRacing-v2", render_mode='rgb_array')
dir = os.path.join(log_dir, "best_model")
create_gif(test_env, dir, log_dir, "ppo_car")

## Train with CNN instead of MLP

In [23]:
# Instantiate the agent
env = gym.make("CarRacing-v2", render_mode = "rgb_array")
model = PPO("MlpLstmPolicy", env, verbose=1)
# Train the agent and display a progress bar
timesteps = 500000
model.learn(total_timesteps=int(timesteps), callback=callback)
model.save("ppo_cnn")


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Wrapping the env in a VecTransposeImage.
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1e+03    |
|    ep_rew_mean     | -55.2    |
| time/              |          |
|    fps             | 35       |
|    iterations      | 1        |
|    time_elapsed    | 58       |
|    total_timesteps | 2048     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 1e+03       |
|    ep_rew_mean          | -56.4       |
| time/                   |             |
|    fps                  | 23          |
|    iterations           | 2           |
|    time_elapsed         | 172         |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.008156224 |
|    clip_fraction        | 0.0817      |
|    clip_range      

In [24]:
mean_reward, std_reward = evaluate_policy(model, model.get_env(), n_eval_episodes=10)

print(mean_reward, std_reward)

-29.445576 25.004705077465335


In [35]:
test_env = gym.make("CarRacing-v2", render_mode='human')
dir = os.path.join('', "ppo_cnn.zip")
model = PPO.load(dir, env=test_env)

Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Wrapping the env in a VecTransposeImage.


In [36]:
vec_env = model.get_env()
obs = vec_env.reset()
while True:
    action, _states = model.predict(obs)
    obs, rewards, done, info = vec_env.step(action)
    #obs, rewards, dones, info = test_env.step(action)
    vec_env.render()
    if done:
        break

vec_env.close()

In [29]:
test_env = gym.make("CarRacing-v2", render_mode='rgb_array')
dir = os.path.join(log_dir, "best_model")
create_gif(test_env, dir, '', "ppo_car")

PermissionError: [Errno 13] Permission denied: 'tmp\\best_model'