In [6]:
import gymnasium as gym
from gymnasium.wrappers import TimeLimit
import mani_skill.envs
from stable_baselines3 import PPO
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.env_checker import check_env
from mani_skill.utils import gym_utils
from mani_skill.utils.wrappers.record import RecordEpisode
from mani_skill.vector.wrappers.sb3 import ManiSkillSB3VectorEnv
import torch
import numpy as np

SEED = 17
NUM_ENVS = 8



In [15]:
env = gym.make("CartPole-v1", render_mode = "rgb_array")
states, _ = env.reset(seed = SEED, options = {"low": -0.1, "high": 0.1})
check_env(env, warn = True)

ppo_model = PPO(policy = "MlpPolicy", env = env, learning_rate = 3e-4, n_steps = 2048, batch_size = 64, n_epochs = 10, gamma = 0.99, gae_lambda = .95,
clip_range = .2, verbose = 1, seed = SEED)



Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [17]:
total_timesteps = 100000

ppo_model.learn(total_timesteps = total_timesteps, progress_bar = True)
ppo_model.save("ppo_cartpole")

Output()

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 175      |
|    ep_rew_mean     | 175      |
| time/              |          |
|    fps             | 941      |
|    iterations      | 1        |
|    time_elapsed    | 2        |
|    total_timesteps | 2048     |
---------------------------------


------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 204          |
|    ep_rew_mean          | 204          |
| time/                   |              |
|    fps                  | 749          |
|    iterations           | 2            |
|    time_elapsed         | 5            |
|    total_timesteps      | 4096         |
| train/                  |              |
|    approx_kl            | 0.0092073465 |
|    clip_fraction        | 0.061        |
|    clip_range           | 0.2          |
|    entropy_loss         | -0.585       |
|    explained_variance   | 0.7057566    |
|    learning_rate        | 0.0003       |
|    loss                 | 11.3         |
|    n_updates            | 60           |
|    policy_gradient_loss | -0.00513     |
|    value_loss           | 36.7         |
------------------------------------------


-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 220         |
|    ep_rew_mean          | 220         |
| time/                   |             |
|    fps                  | 702         |
|    iterations           | 3           |
|    time_elapsed         | 8           |
|    total_timesteps      | 6144        |
| train/                  |             |
|    approx_kl            | 0.005809396 |
|    clip_fraction        | 0.0337      |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.575      |
|    explained_variance   | 0.7294357   |
|    learning_rate        | 0.0003      |
|    loss                 | 10.2        |
|    n_updates            | 70          |
|    policy_gradient_loss | -0.00507    |
|    value_loss           | 43.1        |
-----------------------------------------


-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 228         |
|    ep_rew_mean          | 228         |
| time/                   |             |
|    fps                  | 682         |
|    iterations           | 4           |
|    time_elapsed         | 11          |
|    total_timesteps      | 8192        |
| train/                  |             |
|    approx_kl            | 0.004491276 |
|    clip_fraction        | 0.0319      |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.566      |
|    explained_variance   | 0.6930959   |
|    learning_rate        | 0.0003      |
|    loss                 | 8.97        |
|    n_updates            | 80          |
|    policy_gradient_loss | -0.00378    |
|    value_loss           | 45.5        |
-----------------------------------------


------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 254          |
|    ep_rew_mean          | 254          |
| time/                   |              |
|    fps                  | 671          |
|    iterations           | 5            |
|    time_elapsed         | 15           |
|    total_timesteps      | 10240        |
| train/                  |              |
|    approx_kl            | 0.0045262324 |
|    clip_fraction        | 0.0595       |
|    clip_range           | 0.2          |
|    entropy_loss         | -0.556       |
|    explained_variance   | 0.80357015   |
|    learning_rate        | 0.0003       |
|    loss                 | 12.2         |
|    n_updates            | 90           |
|    policy_gradient_loss | -0.00721     |
|    value_loss           | 39.4         |
------------------------------------------


------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 273          |
|    ep_rew_mean          | 273          |
| time/                   |              |
|    fps                  | 662          |
|    iterations           | 6            |
|    time_elapsed         | 18           |
|    total_timesteps      | 12288        |
| train/                  |              |
|    approx_kl            | 0.0053300816 |
|    clip_fraction        | 0.0309       |
|    clip_range           | 0.2          |
|    entropy_loss         | -0.561       |
|    explained_variance   | 0.88912964   |
|    learning_rate        | 0.0003       |
|    loss                 | 1.73         |
|    n_updates            | 100          |
|    policy_gradient_loss | -0.00432     |
|    value_loss           | 20.4         |
------------------------------------------


------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 288          |
|    ep_rew_mean          | 288          |
| time/                   |              |
|    fps                  | 657          |
|    iterations           | 7            |
|    time_elapsed         | 21           |
|    total_timesteps      | 14336        |
| train/                  |              |
|    approx_kl            | 0.0040760506 |
|    clip_fraction        | 0.0412       |
|    clip_range           | 0.2          |
|    entropy_loss         | -0.553       |
|    explained_variance   | 0.8885408    |
|    learning_rate        | 0.0003       |
|    loss                 | 0.365        |
|    n_updates            | 110          |
|    policy_gradient_loss | -0.00109     |
|    value_loss           | 9.2          |
------------------------------------------


------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 302          |
|    ep_rew_mean          | 302          |
| time/                   |              |
|    fps                  | 653          |
|    iterations           | 8            |
|    time_elapsed         | 25           |
|    total_timesteps      | 16384        |
| train/                  |              |
|    approx_kl            | 0.0043042726 |
|    clip_fraction        | 0.0235       |
|    clip_range           | 0.2          |
|    entropy_loss         | -0.561       |
|    explained_variance   | 0.008643448  |
|    learning_rate        | 0.0003       |
|    loss                 | 59           |
|    n_updates            | 120          |
|    policy_gradient_loss | -0.00176     |
|    value_loss           | 25.1         |
------------------------------------------


------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 305          |
|    ep_rew_mean          | 305          |
| time/                   |              |
|    fps                  | 650          |
|    iterations           | 9            |
|    time_elapsed         | 28           |
|    total_timesteps      | 18432        |
| train/                  |              |
|    approx_kl            | 0.0038824955 |
|    clip_fraction        | 0.0168       |
|    clip_range           | 0.2          |
|    entropy_loss         | -0.581       |
|    explained_variance   | 0.60730374   |
|    learning_rate        | 0.0003       |
|    loss                 | 13           |
|    n_updates            | 130          |
|    policy_gradient_loss | -0.00193     |
|    value_loss           | 26.2         |
------------------------------------------


-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 317         |
|    ep_rew_mean          | 317         |
| time/                   |             |
|    fps                  | 648         |
|    iterations           | 10          |
|    time_elapsed         | 31          |
|    total_timesteps      | 20480       |
| train/                  |             |
|    approx_kl            | 0.005915752 |
|    clip_fraction        | 0.0213      |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.561      |
|    explained_variance   | 0.6776563   |
|    learning_rate        | 0.0003      |
|    loss                 | 6.11        |
|    n_updates            | 140         |
|    policy_gradient_loss | -0.00482    |
|    value_loss           | 40.1        |
-----------------------------------------


-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 326         |
|    ep_rew_mean          | 326         |
| time/                   |             |
|    fps                  | 646         |
|    iterations           | 11          |
|    time_elapsed         | 34          |
|    total_timesteps      | 22528       |
| train/                  |             |
|    approx_kl            | 0.008935766 |
|    clip_fraction        | 0.0643      |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.534      |
|    explained_variance   | -0.40261602 |
|    learning_rate        | 0.0003      |
|    loss                 | 0.0128      |
|    n_updates            | 150         |
|    policy_gradient_loss | -0.00261    |
|    value_loss           | 1.35        |
-----------------------------------------


------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 336          |
|    ep_rew_mean          | 336          |
| time/                   |              |
|    fps                  | 644          |
|    iterations           | 12           |
|    time_elapsed         | 38           |
|    total_timesteps      | 24576        |
| train/                  |              |
|    approx_kl            | 0.0034804686 |
|    clip_fraction        | 0.011        |
|    clip_range           | 0.2          |
|    entropy_loss         | -0.535       |
|    explained_variance   | 0.0753836    |
|    learning_rate        | 0.0003       |
|    loss                 | 1.65         |
|    n_updates            | 160          |
|    policy_gradient_loss | -0.0037      |
|    value_loss           | 25.3         |
------------------------------------------


----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 345        |
|    ep_rew_mean          | 345        |
| time/                   |            |
|    fps                  | 643        |
|    iterations           | 13         |
|    time_elapsed         | 41         |
|    total_timesteps      | 26624      |
| train/                  |            |
|    approx_kl            | 0.00407054 |
|    clip_fraction        | 0.0102     |
|    clip_range           | 0.2        |
|    entropy_loss         | -0.536     |
|    explained_variance   | 0.95244193 |
|    learning_rate        | 0.0003     |
|    loss                 | 0.122      |
|    n_updates            | 170        |
|    policy_gradient_loss | -0.00399   |
|    value_loss           | 1.86       |
----------------------------------------


-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 352         |
|    ep_rew_mean          | 352         |
| time/                   |             |
|    fps                  | 642         |
|    iterations           | 14          |
|    time_elapsed         | 44          |
|    total_timesteps      | 28672       |
| train/                  |             |
|    approx_kl            | 0.010891408 |
|    clip_fraction        | 0.121       |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.527      |
|    explained_variance   | 0.82087654  |
|    learning_rate        | 0.0003      |
|    loss                 | 0.0141      |
|    n_updates            | 180         |
|    policy_gradient_loss | -0.0143     |
|    value_loss           | 0.327       |
-----------------------------------------


------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 361          |
|    ep_rew_mean          | 361          |
| time/                   |              |
|    fps                  | 641          |
|    iterations           | 15           |
|    time_elapsed         | 47           |
|    total_timesteps      | 30720        |
| train/                  |              |
|    approx_kl            | 0.0040750904 |
|    clip_fraction        | 0.033        |
|    clip_range           | 0.2          |
|    entropy_loss         | -0.516       |
|    explained_variance   | 0.39388204   |
|    learning_rate        | 0.0003       |
|    loss                 | 0.0114       |
|    n_updates            | 190          |
|    policy_gradient_loss | -0.0016      |
|    value_loss           | 0.171        |
------------------------------------------


------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 367          |
|    ep_rew_mean          | 367          |
| time/                   |              |
|    fps                  | 640          |
|    iterations           | 16           |
|    time_elapsed         | 51           |
|    total_timesteps      | 32768        |
| train/                  |              |
|    approx_kl            | 0.002644848  |
|    clip_fraction        | 0.0206       |
|    clip_range           | 0.2          |
|    entropy_loss         | -0.497       |
|    explained_variance   | -0.010939598 |
|    learning_rate        | 0.0003       |
|    loss                 | 0.00477      |
|    n_updates            | 200          |
|    policy_gradient_loss | -0.00151     |
|    value_loss           | 0.114        |
------------------------------------------


-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 373         |
|    ep_rew_mean          | 373         |
| time/                   |             |
|    fps                  | 640         |
|    iterations           | 17          |
|    time_elapsed         | 54          |
|    total_timesteps      | 34816       |
| train/                  |             |
|    approx_kl            | 0.006174568 |
|    clip_fraction        | 0.0573      |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.498      |
|    explained_variance   | 0.18576276  |
|    learning_rate        | 0.0003      |
|    loss                 | 0.0187      |
|    n_updates            | 210         |
|    policy_gradient_loss | -0.00354    |
|    value_loss           | 0.0738      |
-----------------------------------------


------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 378          |
|    ep_rew_mean          | 378          |
| time/                   |              |
|    fps                  | 638          |
|    iterations           | 18           |
|    time_elapsed         | 57           |
|    total_timesteps      | 36864        |
| train/                  |              |
|    approx_kl            | 0.0008494233 |
|    clip_fraction        | 0.00645      |
|    clip_range           | 0.2          |
|    entropy_loss         | -0.518       |
|    explained_variance   | -0.024662733 |
|    learning_rate        | 0.0003       |
|    loss                 | 0.0136       |
|    n_updates            | 220          |
|    policy_gradient_loss | 4.94e-05     |
|    value_loss           | 0.0448       |
------------------------------------------


-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 385         |
|    ep_rew_mean          | 385         |
| time/                   |             |
|    fps                  | 638         |
|    iterations           | 19          |
|    time_elapsed         | 60          |
|    total_timesteps      | 38912       |
| train/                  |             |
|    approx_kl            | 0.004190262 |
|    clip_fraction        | 0.0313      |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.512      |
|    explained_variance   | 0.02092284  |
|    learning_rate        | 0.0003      |
|    loss                 | 0.00627     |
|    n_updates            | 230         |
|    policy_gradient_loss | -0.00169    |
|    value_loss           | 0.0291      |
-----------------------------------------


------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 398          |
|    ep_rew_mean          | 398          |
| time/                   |              |
|    fps                  | 637          |
|    iterations           | 20           |
|    time_elapsed         | 64           |
|    total_timesteps      | 40960        |
| train/                  |              |
|    approx_kl            | 0.0031769006 |
|    clip_fraction        | 0.0208       |
|    clip_range           | 0.2          |
|    entropy_loss         | -0.514       |
|    explained_variance   | 0.042858362  |
|    learning_rate        | 0.0003       |
|    loss                 | -0.00535     |
|    n_updates            | 240          |
|    policy_gradient_loss | -0.00122     |
|    value_loss           | 0.0185       |
------------------------------------------


-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 412         |
|    ep_rew_mean          | 412         |
| time/                   |             |
|    fps                  | 637         |
|    iterations           | 21          |
|    time_elapsed         | 67          |
|    total_timesteps      | 43008       |
| train/                  |             |
|    approx_kl            | 0.004298759 |
|    clip_fraction        | 0.0283      |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.506      |
|    explained_variance   | 0.048983037 |
|    learning_rate        | 0.0003      |
|    loss                 | 0.00562     |
|    n_updates            | 250         |
|    policy_gradient_loss | -0.00254    |
|    value_loss           | 0.0119      |
-----------------------------------------


-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 425         |
|    ep_rew_mean          | 425         |
| time/                   |             |
|    fps                  | 636         |
|    iterations           | 22          |
|    time_elapsed         | 70          |
|    total_timesteps      | 45056       |
| train/                  |             |
|    approx_kl            | 0.002847682 |
|    clip_fraction        | 0.0176      |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.497      |
|    explained_variance   | -0.04234469 |
|    learning_rate        | 0.0003      |
|    loss                 | 0.0118      |
|    n_updates            | 260         |
|    policy_gradient_loss | -0.000611   |
|    value_loss           | 0.00791     |
-----------------------------------------


------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 433          |
|    ep_rew_mean          | 433          |
| time/                   |              |
|    fps                  | 636          |
|    iterations           | 23           |
|    time_elapsed         | 74           |
|    total_timesteps      | 47104        |
| train/                  |              |
|    approx_kl            | 0.0036245375 |
|    clip_fraction        | 0.0116       |
|    clip_range           | 0.2          |
|    entropy_loss         | -0.493       |
|    explained_variance   | -0.012575507 |
|    learning_rate        | 0.0003       |
|    loss                 | -0.00645     |
|    n_updates            | 270          |
|    policy_gradient_loss | 0.000223     |
|    value_loss           | 0.00542      |
------------------------------------------


------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 444          |
|    ep_rew_mean          | 444          |
| time/                   |              |
|    fps                  | 636          |
|    iterations           | 24           |
|    time_elapsed         | 77           |
|    total_timesteps      | 49152        |
| train/                  |              |
|    approx_kl            | 0.006437978  |
|    clip_fraction        | 0.0506       |
|    clip_range           | 0.2          |
|    entropy_loss         | -0.494       |
|    explained_variance   | -0.036315918 |
|    learning_rate        | 0.0003       |
|    loss                 | -0.0011      |
|    n_updates            | 280          |
|    policy_gradient_loss | -0.00244     |
|    value_loss           | 0.00364      |
------------------------------------------


------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 454          |
|    ep_rew_mean          | 454          |
| time/                   |              |
|    fps                  | 636          |
|    iterations           | 25           |
|    time_elapsed         | 80           |
|    total_timesteps      | 51200        |
| train/                  |              |
|    approx_kl            | 0.0063810474 |
|    clip_fraction        | 0.0694       |
|    clip_range           | 0.2          |
|    entropy_loss         | -0.5         |
|    explained_variance   | -0.037405133 |
|    learning_rate        | 0.0003       |
|    loss                 | 0.0125       |
|    n_updates            | 290          |
|    policy_gradient_loss | -0.00287     |
|    value_loss           | 0.00238      |
------------------------------------------


-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 464         |
|    ep_rew_mean          | 464         |
| time/                   |             |
|    fps                  | 635         |
|    iterations           | 26          |
|    time_elapsed         | 83          |
|    total_timesteps      | 53248       |
| train/                  |             |
|    approx_kl            | 0.0046428   |
|    clip_fraction        | 0.0431      |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.481      |
|    explained_variance   | 0.029098332 |
|    learning_rate        | 0.0003      |
|    loss                 | 0.00475     |
|    n_updates            | 300         |
|    policy_gradient_loss | -0.00173    |
|    value_loss           | 0.00174     |
-----------------------------------------


------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 475          |
|    ep_rew_mean          | 475          |
| time/                   |              |
|    fps                  | 635          |
|    iterations           | 27           |
|    time_elapsed         | 87           |
|    total_timesteps      | 55296        |
| train/                  |              |
|    approx_kl            | 0.0023334974 |
|    clip_fraction        | 0.0416       |
|    clip_range           | 0.2          |
|    entropy_loss         | -0.483       |
|    explained_variance   | -0.051526546 |
|    learning_rate        | 0.0003       |
|    loss                 | -0.0058      |
|    n_updates            | 310          |
|    policy_gradient_loss | -0.00112     |
|    value_loss           | 0.0011       |
------------------------------------------


------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 480          |
|    ep_rew_mean          | 480          |
| time/                   |              |
|    fps                  | 635          |
|    iterations           | 28           |
|    time_elapsed         | 90           |
|    total_timesteps      | 57344        |
| train/                  |              |
|    approx_kl            | 0.0019509331 |
|    clip_fraction        | 0.00835      |
|    clip_range           | 0.2          |
|    entropy_loss         | -0.484       |
|    explained_variance   | -0.06936324  |
|    learning_rate        | 0.0003       |
|    loss                 | 0.00449      |
|    n_updates            | 320          |
|    policy_gradient_loss | -0.00019     |
|    value_loss           | 0.000773     |
------------------------------------------


-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 480         |
|    ep_rew_mean          | 480         |
| time/                   |             |
|    fps                  | 634         |
|    iterations           | 29          |
|    time_elapsed         | 93          |
|    total_timesteps      | 59392       |
| train/                  |             |
|    approx_kl            | 0.005763336 |
|    clip_fraction        | 0.0598      |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.481      |
|    explained_variance   | 0.008199036 |
|    learning_rate        | 0.0003      |
|    loss                 | 0.000119    |
|    n_updates            | 330         |
|    policy_gradient_loss | -0.00421    |
|    value_loss           | 0.000521    |
-----------------------------------------


-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 486         |
|    ep_rew_mean          | 486         |
| time/                   |             |
|    fps                  | 634         |
|    iterations           | 30          |
|    time_elapsed         | 96          |
|    total_timesteps      | 61440       |
| train/                  |             |
|    approx_kl            | 0.006545791 |
|    clip_fraction        | 0.0514      |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.475      |
|    explained_variance   | -0.10390878 |
|    learning_rate        | 0.0003      |
|    loss                 | 0.0158      |
|    n_updates            | 340         |
|    policy_gradient_loss | -0.0019     |
|    value_loss           | 0.000383    |
-----------------------------------------


------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 486          |
|    ep_rew_mean          | 486          |
| time/                   |              |
|    fps                  | 634          |
|    iterations           | 31           |
|    time_elapsed         | 100          |
|    total_timesteps      | 63488        |
| train/                  |              |
|    approx_kl            | 0.0011501942 |
|    clip_fraction        | 0.00542      |
|    clip_range           | 0.2          |
|    entropy_loss         | -0.492       |
|    explained_variance   | -0.11170626  |
|    learning_rate        | 0.0003       |
|    loss                 | -0.00158     |
|    n_updates            | 350          |
|    policy_gradient_loss | 0.000305     |
|    value_loss           | 0.000266     |
------------------------------------------


-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 489         |
|    ep_rew_mean          | 489         |
| time/                   |             |
|    fps                  | 633         |
|    iterations           | 32          |
|    time_elapsed         | 103         |
|    total_timesteps      | 65536       |
| train/                  |             |
|    approx_kl            | 0.006859035 |
|    clip_fraction        | 0.0319      |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.483      |
|    explained_variance   | -0.16727316 |
|    learning_rate        | 0.0003      |
|    loss                 | 0.00489     |
|    n_updates            | 360         |
|    policy_gradient_loss | -0.00118    |
|    value_loss           | 0.000178    |
-----------------------------------------


-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 499         |
|    ep_rew_mean          | 499         |
| time/                   |             |
|    fps                  | 633         |
|    iterations           | 33          |
|    time_elapsed         | 106         |
|    total_timesteps      | 67584       |
| train/                  |             |
|    approx_kl            | 0.003779778 |
|    clip_fraction        | 0.0317      |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.459      |
|    explained_variance   | -0.16888964 |
|    learning_rate        | 0.0003      |
|    loss                 | 0.00126     |
|    n_updates            | 370         |
|    policy_gradient_loss | -0.00149    |
|    value_loss           | 0.00015     |
-----------------------------------------


------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 499          |
|    ep_rew_mean          | 499          |
| time/                   |              |
|    fps                  | 633          |
|    iterations           | 34           |
|    time_elapsed         | 109          |
|    total_timesteps      | 69632        |
| train/                  |              |
|    approx_kl            | 0.004489993  |
|    clip_fraction        | 0.0559       |
|    clip_range           | 0.2          |
|    entropy_loss         | -0.456       |
|    explained_variance   | 0.0002360344 |
|    learning_rate        | 0.0003       |
|    loss                 | -0.00589     |
|    n_updates            | 380          |
|    policy_gradient_loss | -0.00526     |
|    value_loss           | 0.0001       |
------------------------------------------


------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 500          |
|    ep_rew_mean          | 500          |
| time/                   |              |
|    fps                  | 633          |
|    iterations           | 35           |
|    time_elapsed         | 113          |
|    total_timesteps      | 71680        |
| train/                  |              |
|    approx_kl            | 0.0054258467 |
|    clip_fraction        | 0.0747       |
|    clip_range           | 0.2          |
|    entropy_loss         | -0.443       |
|    explained_variance   | -0.36722887  |
|    learning_rate        | 0.0003       |
|    loss                 | 0.00566      |
|    n_updates            | 390          |
|    policy_gradient_loss | -0.00399     |
|    value_loss           | 6.58e-05     |
------------------------------------------


------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 500          |
|    ep_rew_mean          | 500          |
| time/                   |              |
|    fps                  | 633          |
|    iterations           | 36           |
|    time_elapsed         | 116          |
|    total_timesteps      | 73728        |
| train/                  |              |
|    approx_kl            | 0.0050165905 |
|    clip_fraction        | 0.048        |
|    clip_range           | 0.2          |
|    entropy_loss         | -0.452       |
|    explained_variance   | -0.17900991  |
|    learning_rate        | 0.0003       |
|    loss                 | -0.00649     |
|    n_updates            | 400          |
|    policy_gradient_loss | -0.00159     |
|    value_loss           | 4.81e-05     |
------------------------------------------


------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 500          |
|    ep_rew_mean          | 500          |
| time/                   |              |
|    fps                  | 633          |
|    iterations           | 37           |
|    time_elapsed         | 119          |
|    total_timesteps      | 75776        |
| train/                  |              |
|    approx_kl            | 0.0029731875 |
|    clip_fraction        | 0.0229       |
|    clip_range           | 0.2          |
|    entropy_loss         | -0.423       |
|    explained_variance   | -0.08306801  |
|    learning_rate        | 0.0003       |
|    loss                 | 0.00101      |
|    n_updates            | 410          |
|    policy_gradient_loss | -0.00148     |
|    value_loss           | 3.32e-05     |
------------------------------------------


-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 500         |
|    ep_rew_mean          | 500         |
| time/                   |             |
|    fps                  | 633         |
|    iterations           | 38          |
|    time_elapsed         | 122         |
|    total_timesteps      | 77824       |
| train/                  |             |
|    approx_kl            | 0.006203338 |
|    clip_fraction        | 0.0636      |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.414      |
|    explained_variance   | -0.33942056 |
|    learning_rate        | 0.0003      |
|    loss                 | -0.00129    |
|    n_updates            | 420         |
|    policy_gradient_loss | -0.00329    |
|    value_loss           | 2.25e-05    |
-----------------------------------------


------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 500          |
|    ep_rew_mean          | 500          |
| time/                   |              |
|    fps                  | 633          |
|    iterations           | 39           |
|    time_elapsed         | 126          |
|    total_timesteps      | 79872        |
| train/                  |              |
|    approx_kl            | 0.0036859303 |
|    clip_fraction        | 0.0374       |
|    clip_range           | 0.2          |
|    entropy_loss         | -0.389       |
|    explained_variance   | -0.19909906  |
|    learning_rate        | 0.0003       |
|    loss                 | -0.0026      |
|    n_updates            | 430          |
|    policy_gradient_loss | -0.00345     |
|    value_loss           | 1.82e-05     |
------------------------------------------


-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 500         |
|    ep_rew_mean          | 500         |
| time/                   |             |
|    fps                  | 632         |
|    iterations           | 40          |
|    time_elapsed         | 129         |
|    total_timesteps      | 81920       |
| train/                  |             |
|    approx_kl            | 0.004161536 |
|    clip_fraction        | 0.0328      |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.377      |
|    explained_variance   | -0.4374317  |
|    learning_rate        | 0.0003      |
|    loss                 | -0.00237    |
|    n_updates            | 440         |
|    policy_gradient_loss | -0.00211    |
|    value_loss           | 1.46e-05    |
-----------------------------------------


------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 500          |
|    ep_rew_mean          | 500          |
| time/                   |              |
|    fps                  | 632          |
|    iterations           | 41           |
|    time_elapsed         | 132          |
|    total_timesteps      | 83968        |
| train/                  |              |
|    approx_kl            | 0.0036725542 |
|    clip_fraction        | 0.0432       |
|    clip_range           | 0.2          |
|    entropy_loss         | -0.385       |
|    explained_variance   | -0.2920233   |
|    learning_rate        | 0.0003       |
|    loss                 | -0.00242     |
|    n_updates            | 450          |
|    policy_gradient_loss | -0.00162     |
|    value_loss           | 1.03e-05     |
------------------------------------------


------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 500          |
|    ep_rew_mean          | 500          |
| time/                   |              |
|    fps                  | 632          |
|    iterations           | 42           |
|    time_elapsed         | 135          |
|    total_timesteps      | 86016        |
| train/                  |              |
|    approx_kl            | 0.0050641634 |
|    clip_fraction        | 0.0483       |
|    clip_range           | 0.2          |
|    entropy_loss         | -0.361       |
|    explained_variance   | -0.6206548   |
|    learning_rate        | 0.0003       |
|    loss                 | 0.0116       |
|    n_updates            | 460          |
|    policy_gradient_loss | -0.00351     |
|    value_loss           | 1.11e-05     |
------------------------------------------


------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 500          |
|    ep_rew_mean          | 500          |
| time/                   |              |
|    fps                  | 632          |
|    iterations           | 43           |
|    time_elapsed         | 139          |
|    total_timesteps      | 88064        |
| train/                  |              |
|    approx_kl            | 0.0024167488 |
|    clip_fraction        | 0.0305       |
|    clip_range           | 0.2          |
|    entropy_loss         | -0.341       |
|    explained_variance   | -0.6017729   |
|    learning_rate        | 0.0003       |
|    loss                 | -0.00264     |
|    n_updates            | 470          |
|    policy_gradient_loss | -0.00236     |
|    value_loss           | 7.3e-06      |
------------------------------------------


------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 500          |
|    ep_rew_mean          | 500          |
| time/                   |              |
|    fps                  | 632          |
|    iterations           | 44           |
|    time_elapsed         | 142          |
|    total_timesteps      | 90112        |
| train/                  |              |
|    approx_kl            | 0.0025287475 |
|    clip_fraction        | 0.023        |
|    clip_range           | 0.2          |
|    entropy_loss         | -0.337       |
|    explained_variance   | -0.2576791   |
|    learning_rate        | 0.0003       |
|    loss                 | -0.00194     |
|    n_updates            | 480          |
|    policy_gradient_loss | 0.000161     |
|    value_loss           | 6.58e-06     |
------------------------------------------


-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 500         |
|    ep_rew_mean          | 500         |
| time/                   |             |
|    fps                  | 632         |
|    iterations           | 45          |
|    time_elapsed         | 145         |
|    total_timesteps      | 92160       |
| train/                  |             |
|    approx_kl            | 0.003385966 |
|    clip_fraction        | 0.0458      |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.322      |
|    explained_variance   | -0.52983654 |
|    learning_rate        | 0.0003      |
|    loss                 | -0.00456    |
|    n_updates            | 490         |
|    policy_gradient_loss | -0.00206    |
|    value_loss           | 5.21e-06    |
-----------------------------------------


-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 500         |
|    ep_rew_mean          | 500         |
| time/                   |             |
|    fps                  | 632         |
|    iterations           | 46          |
|    time_elapsed         | 148         |
|    total_timesteps      | 94208       |
| train/                  |             |
|    approx_kl            | 0.004396654 |
|    clip_fraction        | 0.0404      |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.319      |
|    explained_variance   | -0.53680384 |
|    learning_rate        | 0.0003      |
|    loss                 | -0.0106     |
|    n_updates            | 500         |
|    policy_gradient_loss | -0.0016     |
|    value_loss           | 3.52e-06    |
-----------------------------------------


------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 500          |
|    ep_rew_mean          | 500          |
| time/                   |              |
|    fps                  | 632          |
|    iterations           | 47           |
|    time_elapsed         | 152          |
|    total_timesteps      | 96256        |
| train/                  |              |
|    approx_kl            | 0.0016928999 |
|    clip_fraction        | 0.0196       |
|    clip_range           | 0.2          |
|    entropy_loss         | -0.318       |
|    explained_variance   | -0.1793971   |
|    learning_rate        | 0.0003       |
|    loss                 | -0.00438     |
|    n_updates            | 510          |
|    policy_gradient_loss | -0.00119     |
|    value_loss           | 2.8e-06      |
------------------------------------------


------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 500          |
|    ep_rew_mean          | 500          |
| time/                   |              |
|    fps                  | 632          |
|    iterations           | 48           |
|    time_elapsed         | 155          |
|    total_timesteps      | 98304        |
| train/                  |              |
|    approx_kl            | 0.0038388374 |
|    clip_fraction        | 0.0534       |
|    clip_range           | 0.2          |
|    entropy_loss         | -0.322       |
|    explained_variance   | -0.4521613   |
|    learning_rate        | 0.0003       |
|    loss                 | -0.00405     |
|    n_updates            | 520          |
|    policy_gradient_loss | -0.00245     |
|    value_loss           | 2.28e-06     |
------------------------------------------


------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 500          |
|    ep_rew_mean          | 500          |
| time/                   |              |
|    fps                  | 632          |
|    iterations           | 49           |
|    time_elapsed         | 158          |
|    total_timesteps      | 100352       |
| train/                  |              |
|    approx_kl            | 0.0012079959 |
|    clip_fraction        | 0.0355       |
|    clip_range           | 0.2          |
|    entropy_loss         | -0.316       |
|    explained_variance   | -1.1099062   |
|    learning_rate        | 0.0003       |
|    loss                 | -0.0156      |
|    n_updates            | 530          |
|    policy_gradient_loss | -0.00247     |
|    value_loss           | 1.77e-06     |
------------------------------------------


In [13]:
num_envs = 4
max_episode_steps = 75
model = PPO.load('ppo_cartpole')
eval_env = gym.make("CartPole-v1", render_mode = "rgb_array")
obs, _ = eval_env.reset()
success = []

for i in range(max_episode_steps):
    action, _states = model.predict(obs, deterministic=True)
    # obs, rewards, dones, _, info = eval_env.step(action)
    output = eval_env.step(action)
    reward = output[1]
    
    print(f"output {i}: {output}")
    print(f"Reward {i}: {reward}\n")

    success.append(reward)


success_rate = np.mean(np.array(success))

print(f"Success Rate: {success_rate}")

output 0: (array([-0.01819461, -0.21334863,  0.01467597,  0.29898977], dtype=float32), 1.0, False, False, {})
Reward 0: 1.0

output 1: (array([-0.02246159, -0.40867665,  0.02065576,  0.5962649 ], dtype=float32), 1.0, False, False, {})
Reward 1: 1.0

output 2: (array([-0.03063512, -0.6040815 ,  0.03258106,  0.895382  ], dtype=float32), 1.0, False, False, {})
Reward 2: 1.0

output 3: (array([-0.04271675, -0.79962975,  0.0504887 ,  1.1981257 ], dtype=float32), 1.0, False, False, {})
Reward 3: 1.0

output 4: (array([-0.05870935, -0.99536735,  0.07445122,  1.5061954 ], dtype=float32), 1.0, False, False, {})
Reward 4: 1.0

output 5: (array([-0.07861669, -1.1913092 ,  0.10457513,  1.8211623 ], dtype=float32), 1.0, False, False, {})
Reward 5: 1.0

output 6: (array([-0.10244288, -1.3874259 ,  0.14099838,  2.1444194 ], dtype=float32), 1.0, False, False, {})
Reward 6: 1.0

output 7: (array([-0.13019139, -1.5836293 ,  0.18388677,  2.4771202 ], dtype=float32), 1.0, False, False, {})
Reward 7: 1.0

