In [2]:
import gymnasium as gym

from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy


# PPO

## Normal parameters

In [2]:
# Create environment
env = gym.make("CarRacing-v2")

### Train agent

In [3]:
# Instantiate the agent
model = PPO("MlpPolicy", env, verbose=1)
# Train the agent and display a progress bar
model.learn(total_timesteps=int(10000))
# Save the agent
model.save("ppo_car")


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Wrapping the env in a VecTransposeImage.
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1e+03    |
|    ep_rew_mean     | -61.4    |
| time/              |          |
|    fps             | 26       |
|    iterations      | 1        |
|    time_elapsed    | 77       |
|    total_timesteps | 2048     |
---------------------------------
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 1e+03      |
|    ep_rew_mean          | -56.4      |
| time/                   |            |
|    fps                  | 21         |
|    iterations           | 2          |
|    time_elapsed         | 193        |
|    total_timesteps      | 4096       |
| train/                  |            |
|    approx_kl            | 0.00836505 |
|    clip_fraction        | 0.0771     |
|    clip_range           | 0.2  

KeyboardInterrupt: 

In [None]:
mean_reward, std_reward = evaluate_policy(model, model.get_env(), n_eval_episodes=10)

### Load pre-trained model

In [4]:
test_env = gym.make("CarRacing-v2", render_mode='human')
model = PPO.load("ppo_car", env=test_env)

Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Wrapping the env in a VecTransposeImage.


In [6]:
vec_env = model.get_env()
obs = vec_env.reset()
while True:
    action, _states = model.predict(obs)
    obs, rewards, done, info = vec_env.step(action)
    #obs, rewards, dones, info = test_env.step(action)
    vec_env.render()
    if done:
        break

vec_env.close()

## Discrete space

In [3]:
# Create environment
env = gym.make("CarRacing-v2", continuous=False)

### Train agent

In [4]:
# Instantiate the agent
model = PPO("MlpPolicy", env, verbose=1)
# Train the agent and display a progress bar
model.learn(total_timesteps=int(1000))
# Save the agent
model.save("ppo_car_discrete")

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Wrapping the env in a VecTransposeImage.
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1e+03    |
|    ep_rew_mean     | -57.1    |
| time/              |          |
|    fps             | 29       |
|    iterations      | 1        |
|    time_elapsed    | 69       |
|    total_timesteps | 2048     |
---------------------------------


In [5]:
mean_reward, std_reward = evaluate_policy(model, model.get_env(), n_eval_episodes=10)

In [6]:
print(mean_reward, std_reward)

-18.244713 59.37324067176865


### Load pre-trained model

In [9]:
test_env = gym.make("CarRacing-v2", render_mode='human', continuous=False)
model = PPO.load("ppo_car_discrete", env=test_env)

Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Wrapping the env in a VecTransposeImage.


In [10]:
vec_env = model.get_env()
obs = vec_env.reset()
while True:
    action, _states = model.predict(obs)
    obs, rewards, done, info = vec_env.step(action)
    #obs, rewards, dones, info = test_env.step(action)
    vec_env.render()
    if done:
        break

vec_env.close()

## Randomized action space (colors)

In [None]:
# Create environment
env = gym.make("CarRacing-v2", domain_randomize=True)

### Train agent

In [None]:
# Instantiate the agent
model = PPO("MlpPolicy", env, verbose=1)
# Train the agent and display a progress bar
model.learn(total_timesteps=int(1000))
# Save the agent
model.save("ppo_car_discrete")