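# Proximal Policy Optimization (PPO)

Train a PPO agent from Stable-Baselines3 on the Gymnasium `CartPole-v1` environment, then evaluate the trained policy and render it acting in the environment.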

In [1]:
import gymnasium as gym
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy




### Create environment

In [2]:
# Build the CartPole-v1 environment, requesting the "rgb_array" render mode.
env = gym.make(
    "CartPole-v1",
    render_mode="rgb_array",
)

### Instantiate the agent

In [3]:
# Seed for the agent's pseudo-random generators so that repeated runs of the
# notebook are comparable; the value itself is arbitrary.
SEED = 0

# PPO agent using the "MlpPolicy" policy, bound to `env`.
# verbose=1 turns on the informational/training log output.
model = PPO("MlpPolicy", env, verbose=1, seed=SEED)

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


### Train the agent 

In [4]:
# Train the agent for 10,000 timesteps, with a progress bar while it runs.
model.learn(
    total_timesteps=10_000,
    progress_bar=True,
)

Output()

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 21.3     |
|    ep_rew_mean     | 21.3     |
| time/              |          |
|    fps             | 580      |
|    iterations      | 1        |
|    time_elapsed    | 3        |
|    total_timesteps | 2048     |
---------------------------------


------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 27.5         |
|    ep_rew_mean          | 27.5         |
| time/                   |              |
|    fps                  | 359          |
|    iterations           | 2            |
|    time_elapsed         | 11           |
|    total_timesteps      | 4096         |
| train/                  |              |
|    approx_kl            | 0.0076230243 |
|    clip_fraction        | 0.0922       |
|    clip_range           | 0.2          |
|    entropy_loss         | -0.687       |
|    explained_variance   | 0.00421      |
|    learning_rate        | 0.0003       |
|    loss                 | 6.46         |
|    n_updates            | 10           |
|    policy_gradient_loss | -0.0123      |
|    value_loss           | 50.6         |
------------------------------------------


-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 35.5        |
|    ep_rew_mean          | 35.5        |
| time/                   |             |
|    fps                  | 314         |
|    iterations           | 3           |
|    time_elapsed         | 19          |
|    total_timesteps      | 6144        |
| train/                  |             |
|    approx_kl            | 0.009718858 |
|    clip_fraction        | 0.0593      |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.665      |
|    explained_variance   | 0.0566      |
|    learning_rate        | 0.0003      |
|    loss                 | 13.8        |
|    n_updates            | 20          |
|    policy_gradient_loss | -0.0176     |
|    value_loss           | 38          |
-----------------------------------------


-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 50.5        |
|    ep_rew_mean          | 50.5        |
| time/                   |             |
|    fps                  | 303         |
|    iterations           | 4           |
|    time_elapsed         | 27          |
|    total_timesteps      | 8192        |
| train/                  |             |
|    approx_kl            | 0.010068482 |
|    clip_fraction        | 0.101       |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.631      |
|    explained_variance   | 0.239       |
|    learning_rate        | 0.0003      |
|    loss                 | 18.4        |
|    n_updates            | 30          |
|    policy_gradient_loss | -0.0204     |
|    value_loss           | 55.5        |
-----------------------------------------


-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 64          |
|    ep_rew_mean          | 64          |
| time/                   |             |
|    fps                  | 299         |
|    iterations           | 5           |
|    time_elapsed         | 34          |
|    total_timesteps      | 10240       |
| train/                  |             |
|    approx_kl            | 0.007854286 |
|    clip_fraction        | 0.0647      |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.612      |
|    explained_variance   | 0.376       |
|    learning_rate        | 0.0003      |
|    loss                 | 21.8        |
|    n_updates            | 40          |
|    policy_gradient_loss | -0.0141     |
|    value_loss           | 59.6        |
-----------------------------------------


<stable_baselines3.ppo.ppo.PPO at 0x18c4a2ba508>

### Test the agent

In [5]:
# Fetch the model's (vectorized) environment once and reuse it for both the
# evaluation and the rendered rollout below.
vec_env = model.get_env()

# Evaluate the trained model over 10 evaluation episodes and print the
# returned mean and standard deviation of the reward.
mean_reward, std_reward = evaluate_policy(model, vec_env, n_eval_episodes=10)
print(f"mean_reward = {mean_reward:.2f} +/- {std_reward:.2f}")

# Step the environment 1000 times with the deterministic policy action,
# rendering after every step.
obs = vec_env.reset()
for i in range(1000):
    action, _states = model.predict(obs, deterministic=True)
    obs, rewards, dones, info = vec_env.step(action)
    vec_env.render("human")