In [12]:
import numpy as np
import gymnasium as gym
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.ppo import MlpPolicy
import torch

from imitation.algorithms.adversarial.gail import GAIL
from imitation.data import rollout
from imitation.data.wrappers import RolloutInfoWrapper
from imitation.policies.serialize import load_policy
from imitation.rewards.reward_nets import BasicRewardNet
from imitation.util.networks import RunningNorm
from imitation.util.util import make_vec_env

SEED = 42

In [2]:

env = make_vec_env(
    "seals:seals/CartPole-v0",
    rng=np.random.default_rng(SEED),
    n_envs=8,
    post_wrappers=[lambda env, _: RolloutInfoWrapper(env)],  # to compute rollouts
)
expert = load_policy(
    "ppo-huggingface",
    organization="HumanCompatibleAI",
    env_name="seals-CartPole-v0",
    venv=env,
)


In [3]:

rollouts = rollout.rollout(
    expert,
    env,
    rollout.make_sample_until(min_timesteps=None, min_episodes=60),
    rng=np.random.default_rng(SEED),
)


In [4]:

learner = PPO(
    env=env,
    policy=MlpPolicy,
    batch_size=64,
    ent_coef=0.0,
    learning_rate=0.0004,
    gamma=0.95,
    n_epochs=5,
    seed=SEED,
)
reward_net = BasicRewardNet(
    observation_space=env.observation_space,
    action_space=env.action_space,
    normalize_input_layer=RunningNorm,
)
gail_trainer = GAIL(
    demonstrations=rollouts,
    demo_batch_size=1024,
    gen_replay_buffer_capacity=512,
    n_disc_updates_per_round=8,
    venv=env,
    gen_algo=learner,
    reward_net=reward_net,
)


In [5]:

# evaluate the learner before training
env.seed(SEED)
learner_rewards_before_training, _ = evaluate_policy(
    learner, env, 100, return_episode_rewards=True,
)


In [7]:

# train the learner and evaluate again
gail_trainer.train(800000)  # Train for 800_000 steps to match expert.
env.seed(SEED)


round:   0%|          | 0/48 [00:00<?, ?it/s]

----------------------------------------------------
| raw/                               |             |
|    gen/rollout/ep_len_mean         | 500         |
|    gen/rollout/ep_rew_mean         | 35.3        |
|    gen/rollout/ep_rew_wrapped_mean | 270         |
|    gen/time/fps                    | 5854        |
|    gen/time/iterations             | 1           |
|    gen/time/time_elapsed           | 2           |
|    gen/time/total_timesteps        | 32768       |
|    gen/train/approx_kl             | 0.006985949 |
|    gen/train/clip_fraction         | 0.0338      |
|    gen/train/clip_range            | 0.2         |
|    gen/train/entropy_loss          | -0.687      |
|    gen/train/explained_variance    | 0.0555256   |
|    gen/train/learning_rate         | 0.0004      |
|    gen/train/loss                  | 0.0315      |
|    gen/train/n_updates             | 5           |
|    gen/train/policy_gradient_loss  | -0.00158    |
|    gen/train/value_loss            | 4.7    

round:   2%|▏         | 1/48 [00:05<04:40,  5.97s/it]

----------------------------------------------------
| raw/                               |             |
|    gen/rollout/ep_len_mean         | 500         |
|    gen/rollout/ep_rew_mean         | 33.3        |
|    gen/rollout/ep_rew_wrapped_mean | 284         |
|    gen/time/fps                    | 5796        |
|    gen/time/iterations             | 1           |
|    gen/time/time_elapsed           | 2           |
|    gen/time/total_timesteps        | 49152       |
|    gen/train/approx_kl             | 0.008539271 |
|    gen/train/clip_fraction         | 0.0651      |
|    gen/train/clip_range            | 0.2         |
|    gen/train/entropy_loss          | -0.677      |
|    gen/train/explained_variance    | 0.7282779   |
|    gen/train/learning_rate         | 0.0004      |
|    gen/train/loss                  | 0.0787      |
|    gen/train/n_updates             | 10          |
|    gen/train/policy_gradient_loss  | -0.00236    |
|    gen/train/value_loss            | 0.26   

round:   4%|▍         | 2/48 [00:12<04:38,  6.05s/it]

----------------------------------------------------
| raw/                               |             |
|    gen/rollout/ep_len_mean         | 500         |
|    gen/rollout/ep_rew_mean         | 37.2        |
|    gen/rollout/ep_rew_wrapped_mean | 278         |
|    gen/time/fps                    | 5743        |
|    gen/time/iterations             | 1           |
|    gen/time/time_elapsed           | 2           |
|    gen/time/total_timesteps        | 65536       |
|    gen/train/approx_kl             | 0.010250306 |
|    gen/train/clip_fraction         | 0.0683      |
|    gen/train/clip_range            | 0.2         |
|    gen/train/entropy_loss          | -0.678      |
|    gen/train/explained_variance    | 0.8897878   |
|    gen/train/learning_rate         | 0.0004      |
|    gen/train/loss                  | 0.0143      |
|    gen/train/n_updates             | 15          |
|    gen/train/policy_gradient_loss  | -0.00383    |
|    gen/train/value_loss            | 0.0453 

round:   6%|▋         | 3/48 [00:18<04:30,  6.02s/it]

----------------------------------------------------
| raw/                               |             |
|    gen/rollout/ep_len_mean         | 500         |
|    gen/rollout/ep_rew_mean         | 40.9        |
|    gen/rollout/ep_rew_wrapped_mean | 278         |
|    gen/time/fps                    | 5770        |
|    gen/time/iterations             | 1           |
|    gen/time/time_elapsed           | 2           |
|    gen/time/total_timesteps        | 81920       |
|    gen/train/approx_kl             | 0.012314845 |
|    gen/train/clip_fraction         | 0.137       |
|    gen/train/clip_range            | 0.2         |
|    gen/train/entropy_loss          | -0.664      |
|    gen/train/explained_variance    | 0.9172405   |
|    gen/train/learning_rate         | 0.0004      |
|    gen/train/loss                  | -0.00381    |
|    gen/train/n_updates             | 20          |
|    gen/train/policy_gradient_loss  | -0.0127     |
|    gen/train/value_loss            | 0.0192 

round:   8%|▊         | 4/48 [00:23<04:23,  5.98s/it]

----------------------------------------------------
| raw/                               |             |
|    gen/rollout/ep_len_mean         | 500         |
|    gen/rollout/ep_rew_mean         | 44.2        |
|    gen/rollout/ep_rew_wrapped_mean | 276         |
|    gen/time/fps                    | 6226        |
|    gen/time/iterations             | 1           |
|    gen/time/time_elapsed           | 2           |
|    gen/time/total_timesteps        | 98304       |
|    gen/train/approx_kl             | 0.012895412 |
|    gen/train/clip_fraction         | 0.147       |
|    gen/train/clip_range            | 0.2         |
|    gen/train/entropy_loss          | -0.649      |
|    gen/train/explained_variance    | 0.90165746  |
|    gen/train/learning_rate         | 0.0004      |
|    gen/train/loss                  | 0.0011      |
|    gen/train/n_updates             | 25          |
|    gen/train/policy_gradient_loss  | -0.0139     |
|    gen/train/value_loss            | 0.0171 

round:  10%|█         | 5/48 [00:29<04:13,  5.89s/it]

----------------------------------------------------
| raw/                               |             |
|    gen/rollout/ep_len_mean         | 500         |
|    gen/rollout/ep_rew_mean         | 46          |
|    gen/rollout/ep_rew_wrapped_mean | 282         |
|    gen/time/fps                    | 6188        |
|    gen/time/iterations             | 1           |
|    gen/time/time_elapsed           | 2           |
|    gen/time/total_timesteps        | 114688      |
|    gen/train/approx_kl             | 0.011982788 |
|    gen/train/clip_fraction         | 0.134       |
|    gen/train/clip_range            | 0.2         |
|    gen/train/entropy_loss          | -0.622      |
|    gen/train/explained_variance    | 0.92562973  |
|    gen/train/learning_rate         | 0.0004      |
|    gen/train/loss                  | 0.013       |
|    gen/train/n_updates             | 30          |
|    gen/train/policy_gradient_loss  | -0.00982    |
|    gen/train/value_loss            | 0.0153 

round:  12%|█▎        | 6/48 [00:35<04:03,  5.81s/it]

----------------------------------------------------
| raw/                               |             |
|    gen/rollout/ep_len_mean         | 500         |
|    gen/rollout/ep_rew_mean         | 52.8        |
|    gen/rollout/ep_rew_wrapped_mean | 284         |
|    gen/time/fps                    | 6260        |
|    gen/time/iterations             | 1           |
|    gen/time/time_elapsed           | 2           |
|    gen/time/total_timesteps        | 131072      |
|    gen/train/approx_kl             | 0.010335151 |
|    gen/train/clip_fraction         | 0.104       |
|    gen/train/clip_range            | 0.2         |
|    gen/train/entropy_loss          | -0.609      |
|    gen/train/explained_variance    | 0.94261414  |
|    gen/train/learning_rate         | 0.0004      |
|    gen/train/loss                  | -0.0168     |
|    gen/train/n_updates             | 35          |
|    gen/train/policy_gradient_loss  | -0.00843    |
|    gen/train/value_loss            | 0.0174 

round:  15%|█▍        | 7/48 [00:40<03:55,  5.75s/it]

----------------------------------------------------
| raw/                               |             |
|    gen/rollout/ep_len_mean         | 500         |
|    gen/rollout/ep_rew_mean         | 66.7        |
|    gen/rollout/ep_rew_wrapped_mean | 279         |
|    gen/time/fps                    | 6182        |
|    gen/time/iterations             | 1           |
|    gen/time/time_elapsed           | 2           |
|    gen/time/total_timesteps        | 147456      |
|    gen/train/approx_kl             | 0.006036477 |
|    gen/train/clip_fraction         | 0.0534      |
|    gen/train/clip_range            | 0.2         |
|    gen/train/entropy_loss          | -0.6        |
|    gen/train/explained_variance    | 0.94457066  |
|    gen/train/learning_rate         | 0.0004      |
|    gen/train/loss                  | -0.00626    |
|    gen/train/n_updates             | 40          |
|    gen/train/policy_gradient_loss  | -0.00288    |
|    gen/train/value_loss            | 0.0228 

round:  17%|█▋        | 8/48 [00:46<03:48,  5.70s/it]

----------------------------------------------------
| raw/                               |             |
|    gen/rollout/ep_len_mean         | 500         |
|    gen/rollout/ep_rew_mean         | 84          |
|    gen/rollout/ep_rew_wrapped_mean | 267         |
|    gen/time/fps                    | 6278        |
|    gen/time/iterations             | 1           |
|    gen/time/time_elapsed           | 2           |
|    gen/time/total_timesteps        | 163840      |
|    gen/train/approx_kl             | 0.005594599 |
|    gen/train/clip_fraction         | 0.0525      |
|    gen/train/clip_range            | 0.2         |
|    gen/train/entropy_loss          | -0.588      |
|    gen/train/explained_variance    | 0.9530987   |
|    gen/train/learning_rate         | 0.0004      |
|    gen/train/loss                  | 0.0194      |
|    gen/train/n_updates             | 45          |
|    gen/train/policy_gradient_loss  | -0.00227    |
|    gen/train/value_loss            | 0.0284 

round:  19%|█▉        | 9/48 [00:52<03:39,  5.63s/it]

----------------------------------------------------
| raw/                               |             |
|    gen/rollout/ep_len_mean         | 500         |
|    gen/rollout/ep_rew_mean         | 100         |
|    gen/rollout/ep_rew_wrapped_mean | 259         |
|    gen/time/fps                    | 6145        |
|    gen/time/iterations             | 1           |
|    gen/time/time_elapsed           | 2           |
|    gen/time/total_timesteps        | 180224      |
|    gen/train/approx_kl             | 0.008202201 |
|    gen/train/clip_fraction         | 0.0833      |
|    gen/train/clip_range            | 0.2         |
|    gen/train/entropy_loss          | -0.587      |
|    gen/train/explained_variance    | 0.9638382   |
|    gen/train/learning_rate         | 0.0004      |
|    gen/train/loss                  | -0.0186     |
|    gen/train/n_updates             | 50          |
|    gen/train/policy_gradient_loss  | -0.00359    |
|    gen/train/value_loss            | 0.0343 

round:  21%|██        | 10/48 [00:57<03:33,  5.62s/it]

----------------------------------------------------
| raw/                               |             |
|    gen/rollout/ep_len_mean         | 500         |
|    gen/rollout/ep_rew_mean         | 112         |
|    gen/rollout/ep_rew_wrapped_mean | 254         |
|    gen/time/fps                    | 6181        |
|    gen/time/iterations             | 1           |
|    gen/time/time_elapsed           | 2           |
|    gen/time/total_timesteps        | 196608      |
|    gen/train/approx_kl             | 0.007861432 |
|    gen/train/clip_fraction         | 0.0818      |
|    gen/train/clip_range            | 0.2         |
|    gen/train/entropy_loss          | -0.592      |
|    gen/train/explained_variance    | 0.969068    |
|    gen/train/learning_rate         | 0.0004      |
|    gen/train/loss                  | 0.014       |
|    gen/train/n_updates             | 55          |
|    gen/train/policy_gradient_loss  | -0.00368    |
|    gen/train/value_loss            | 0.039  

round:  23%|██▎       | 11/48 [01:03<03:27,  5.62s/it]

---------------------------------------------------
| raw/                               |            |
|    gen/rollout/ep_len_mean         | 500        |
|    gen/rollout/ep_rew_mean         | 126        |
|    gen/rollout/ep_rew_wrapped_mean | 250        |
|    gen/time/fps                    | 6226       |
|    gen/time/iterations             | 1          |
|    gen/time/time_elapsed           | 2          |
|    gen/time/total_timesteps        | 212992     |
|    gen/train/approx_kl             | 0.00809122 |
|    gen/train/clip_fraction         | 0.0936     |
|    gen/train/clip_range            | 0.2        |
|    gen/train/entropy_loss          | -0.583     |
|    gen/train/explained_variance    | 0.97057635 |
|    gen/train/learning_rate         | 0.0004     |
|    gen/train/loss                  | 0.0208     |
|    gen/train/n_updates             | 60         |
|    gen/train/policy_gradient_loss  | -0.0067    |
|    gen/train/value_loss            | 0.0468     |
------------

round:  25%|██▌       | 12/48 [01:08<03:21,  5.60s/it]

----------------------------------------------------
| raw/                               |             |
|    gen/rollout/ep_len_mean         | 500         |
|    gen/rollout/ep_rew_mean         | 152         |
|    gen/rollout/ep_rew_wrapped_mean | 243         |
|    gen/time/fps                    | 6185        |
|    gen/time/iterations             | 1           |
|    gen/time/time_elapsed           | 2           |
|    gen/time/total_timesteps        | 229376      |
|    gen/train/approx_kl             | 0.008411401 |
|    gen/train/clip_fraction         | 0.0865      |
|    gen/train/clip_range            | 0.2         |
|    gen/train/entropy_loss          | -0.567      |
|    gen/train/explained_variance    | 0.97749835  |
|    gen/train/learning_rate         | 0.0004      |
|    gen/train/loss                  | -0.0239     |
|    gen/train/n_updates             | 65          |
|    gen/train/policy_gradient_loss  | -0.00807    |
|    gen/train/value_loss            | 0.0634 

round:  27%|██▋       | 13/48 [01:14<03:16,  5.61s/it]

----------------------------------------------------
| raw/                               |             |
|    gen/rollout/ep_len_mean         | 500         |
|    gen/rollout/ep_rew_mean         | 183         |
|    gen/rollout/ep_rew_wrapped_mean | 234         |
|    gen/time/fps                    | 6231        |
|    gen/time/iterations             | 1           |
|    gen/time/time_elapsed           | 2           |
|    gen/time/total_timesteps        | 245760      |
|    gen/train/approx_kl             | 0.010468682 |
|    gen/train/clip_fraction         | 0.125       |
|    gen/train/clip_range            | 0.2         |
|    gen/train/entropy_loss          | -0.543      |
|    gen/train/explained_variance    | 0.9879351   |
|    gen/train/learning_rate         | 0.0004      |
|    gen/train/loss                  | 0.0382      |
|    gen/train/n_updates             | 70          |
|    gen/train/policy_gradient_loss  | -0.0103     |
|    gen/train/value_loss            | 0.0577 

round:  29%|██▉       | 14/48 [01:20<03:10,  5.60s/it]

----------------------------------------------------
| raw/                               |             |
|    gen/rollout/ep_len_mean         | 500         |
|    gen/rollout/ep_rew_mean         | 218         |
|    gen/rollout/ep_rew_wrapped_mean | 231         |
|    gen/time/fps                    | 6069        |
|    gen/time/iterations             | 1           |
|    gen/time/time_elapsed           | 2           |
|    gen/time/total_timesteps        | 262144      |
|    gen/train/approx_kl             | 0.010522071 |
|    gen/train/clip_fraction         | 0.129       |
|    gen/train/clip_range            | 0.2         |
|    gen/train/entropy_loss          | -0.513      |
|    gen/train/explained_variance    | 0.9905956   |
|    gen/train/learning_rate         | 0.0004      |
|    gen/train/loss                  | 0.0163      |
|    gen/train/n_updates             | 75          |
|    gen/train/policy_gradient_loss  | -0.00986    |
|    gen/train/value_loss            | 0.0387 

round:  31%|███▏      | 15/48 [01:25<03:04,  5.59s/it]

----------------------------------------------------
| raw/                               |             |
|    gen/rollout/ep_len_mean         | 500         |
|    gen/rollout/ep_rew_mean         | 257         |
|    gen/rollout/ep_rew_wrapped_mean | 241         |
|    gen/time/fps                    | 6149        |
|    gen/time/iterations             | 1           |
|    gen/time/time_elapsed           | 2           |
|    gen/time/total_timesteps        | 278528      |
|    gen/train/approx_kl             | 0.008849228 |
|    gen/train/clip_fraction         | 0.0932      |
|    gen/train/clip_range            | 0.2         |
|    gen/train/entropy_loss          | -0.485      |
|    gen/train/explained_variance    | 0.9899747   |
|    gen/train/learning_rate         | 0.0004      |
|    gen/train/loss                  | 0.000324    |
|    gen/train/n_updates             | 80          |
|    gen/train/policy_gradient_loss  | -0.00643    |
|    gen/train/value_loss            | 0.036  

round:  33%|███▎      | 16/48 [01:31<02:59,  5.61s/it]

----------------------------------------------------
| raw/                               |             |
|    gen/rollout/ep_len_mean         | 500         |
|    gen/rollout/ep_rew_mean         | 295         |
|    gen/rollout/ep_rew_wrapped_mean | 259         |
|    gen/time/fps                    | 6207        |
|    gen/time/iterations             | 1           |
|    gen/time/time_elapsed           | 2           |
|    gen/time/total_timesteps        | 294912      |
|    gen/train/approx_kl             | 0.008478321 |
|    gen/train/clip_fraction         | 0.096       |
|    gen/train/clip_range            | 0.2         |
|    gen/train/entropy_loss          | -0.458      |
|    gen/train/explained_variance    | 0.98989534  |
|    gen/train/learning_rate         | 0.0004      |
|    gen/train/loss                  | 0.0247      |
|    gen/train/n_updates             | 85          |
|    gen/train/policy_gradient_loss  | -0.00664    |
|    gen/train/value_loss            | 0.0341 

round:  35%|███▌      | 17/48 [01:36<02:53,  5.60s/it]

----------------------------------------------------
| raw/                               |             |
|    gen/rollout/ep_len_mean         | 500         |
|    gen/rollout/ep_rew_mean         | 331         |
|    gen/rollout/ep_rew_wrapped_mean | 275         |
|    gen/time/fps                    | 6203        |
|    gen/time/iterations             | 1           |
|    gen/time/time_elapsed           | 2           |
|    gen/time/total_timesteps        | 311296      |
|    gen/train/approx_kl             | 0.007077975 |
|    gen/train/clip_fraction         | 0.0768      |
|    gen/train/clip_range            | 0.2         |
|    gen/train/entropy_loss          | -0.433      |
|    gen/train/explained_variance    | 0.9871621   |
|    gen/train/learning_rate         | 0.0004      |
|    gen/train/loss                  | -0.00987    |
|    gen/train/n_updates             | 90          |
|    gen/train/policy_gradient_loss  | -0.00374    |
|    gen/train/value_loss            | 0.0359 

round:  38%|███▊      | 18/48 [01:42<02:47,  5.58s/it]

-----------------------------------------------------
| raw/                               |              |
|    gen/rollout/ep_len_mean         | 500          |
|    gen/rollout/ep_rew_mean         | 363          |
|    gen/rollout/ep_rew_wrapped_mean | 292          |
|    gen/time/fps                    | 6248         |
|    gen/time/iterations             | 1            |
|    gen/time/time_elapsed           | 2            |
|    gen/time/total_timesteps        | 327680       |
|    gen/train/approx_kl             | 0.0054968186 |
|    gen/train/clip_fraction         | 0.0594       |
|    gen/train/clip_range            | 0.2          |
|    gen/train/entropy_loss          | -0.405       |
|    gen/train/explained_variance    | 0.98111755   |
|    gen/train/learning_rate         | 0.0004       |
|    gen/train/loss                  | 0.0204       |
|    gen/train/n_updates             | 95           |
|    gen/train/policy_gradient_loss  | -0.00257     |
|    gen/train/value_loss   

round:  40%|███▉      | 19/48 [01:47<02:41,  5.57s/it]

----------------------------------------------------
| raw/                               |             |
|    gen/rollout/ep_len_mean         | 500         |
|    gen/rollout/ep_rew_mean         | 386         |
|    gen/rollout/ep_rew_wrapped_mean | 308         |
|    gen/time/fps                    | 6174        |
|    gen/time/iterations             | 1           |
|    gen/time/time_elapsed           | 2           |
|    gen/time/total_timesteps        | 344064      |
|    gen/train/approx_kl             | 0.006043961 |
|    gen/train/clip_fraction         | 0.0669      |
|    gen/train/clip_range            | 0.2         |
|    gen/train/entropy_loss          | -0.399      |
|    gen/train/explained_variance    | 0.9790024   |
|    gen/train/learning_rate         | 0.0004      |
|    gen/train/loss                  | 0.0179      |
|    gen/train/n_updates             | 100         |
|    gen/train/policy_gradient_loss  | -0.0026     |
|    gen/train/value_loss            | 0.0341 

round:  42%|████▏     | 20/48 [01:53<02:35,  5.56s/it]

----------------------------------------------------
| raw/                               |             |
|    gen/rollout/ep_len_mean         | 500         |
|    gen/rollout/ep_rew_mean         | 400         |
|    gen/rollout/ep_rew_wrapped_mean | 330         |
|    gen/time/fps                    | 6120        |
|    gen/time/iterations             | 1           |
|    gen/time/time_elapsed           | 2           |
|    gen/time/total_timesteps        | 360448      |
|    gen/train/approx_kl             | 0.011664008 |
|    gen/train/clip_fraction         | 0.112       |
|    gen/train/clip_range            | 0.2         |
|    gen/train/entropy_loss          | -0.381      |
|    gen/train/explained_variance    | 0.9935869   |
|    gen/train/learning_rate         | 0.0004      |
|    gen/train/loss                  | 0.0047      |
|    gen/train/n_updates             | 105         |
|    gen/train/policy_gradient_loss  | -0.00483    |
|    gen/train/value_loss            | 0.0228 

round:  44%|████▍     | 21/48 [01:59<02:30,  5.57s/it]

-----------------------------------------------------
| raw/                               |              |
|    gen/rollout/ep_len_mean         | 500          |
|    gen/rollout/ep_rew_mean         | 402          |
|    gen/rollout/ep_rew_wrapped_mean | 327          |
|    gen/time/fps                    | 6177         |
|    gen/time/iterations             | 1            |
|    gen/time/time_elapsed           | 2            |
|    gen/time/total_timesteps        | 376832       |
|    gen/train/approx_kl             | 0.0042705694 |
|    gen/train/clip_fraction         | 0.0499       |
|    gen/train/clip_range            | 0.2          |
|    gen/train/entropy_loss          | -0.346       |
|    gen/train/explained_variance    | 0.98318624   |
|    gen/train/learning_rate         | 0.0004       |
|    gen/train/loss                  | 0.0141       |
|    gen/train/n_updates             | 110          |
|    gen/train/policy_gradient_loss  | -0.000795    |
|    gen/train/value_loss   

round:  46%|████▌     | 22/48 [02:04<02:25,  5.58s/it]

----------------------------------------------------
| raw/                               |             |
|    gen/rollout/ep_len_mean         | 500         |
|    gen/rollout/ep_rew_mean         | 416         |
|    gen/rollout/ep_rew_wrapped_mean | 300         |
|    gen/time/fps                    | 6222        |
|    gen/time/iterations             | 1           |
|    gen/time/time_elapsed           | 2           |
|    gen/time/total_timesteps        | 393216      |
|    gen/train/approx_kl             | 0.007729129 |
|    gen/train/clip_fraction         | 0.0847      |
|    gen/train/clip_range            | 0.2         |
|    gen/train/entropy_loss          | -0.353      |
|    gen/train/explained_variance    | 0.97821736  |
|    gen/train/learning_rate         | 0.0004      |
|    gen/train/loss                  | -0.00819    |
|    gen/train/n_updates             | 115         |
|    gen/train/policy_gradient_loss  | -0.00256    |
|    gen/train/value_loss            | 0.0742 

round:  48%|████▊     | 23/48 [02:10<02:19,  5.59s/it]

----------------------------------------------------
| raw/                               |             |
|    gen/rollout/ep_len_mean         | 500         |
|    gen/rollout/ep_rew_mean         | 433         |
|    gen/rollout/ep_rew_wrapped_mean | 273         |
|    gen/time/fps                    | 6287        |
|    gen/time/iterations             | 1           |
|    gen/time/time_elapsed           | 2           |
|    gen/time/total_timesteps        | 409600      |
|    gen/train/approx_kl             | 0.010709075 |
|    gen/train/clip_fraction         | 0.116       |
|    gen/train/clip_range            | 0.2         |
|    gen/train/entropy_loss          | -0.34       |
|    gen/train/explained_variance    | 0.9929297   |
|    gen/train/learning_rate         | 0.0004      |
|    gen/train/loss                  | 0.0548      |
|    gen/train/n_updates             | 120         |
|    gen/train/policy_gradient_loss  | -0.00755    |
|    gen/train/value_loss            | 0.0327 

round:  50%|█████     | 24/48 [02:15<02:13,  5.57s/it]

----------------------------------------------------
| raw/                               |             |
|    gen/rollout/ep_len_mean         | 500         |
|    gen/rollout/ep_rew_mean         | 463         |
|    gen/rollout/ep_rew_wrapped_mean | 268         |
|    gen/time/fps                    | 6200        |
|    gen/time/iterations             | 1           |
|    gen/time/time_elapsed           | 2           |
|    gen/time/total_timesteps        | 425984      |
|    gen/train/approx_kl             | 0.017956175 |
|    gen/train/clip_fraction         | 0.178       |
|    gen/train/clip_range            | 0.2         |
|    gen/train/entropy_loss          | -0.327      |
|    gen/train/explained_variance    | 0.9964464   |
|    gen/train/learning_rate         | 0.0004      |
|    gen/train/loss                  | 0.0141      |
|    gen/train/n_updates             | 125         |
|    gen/train/policy_gradient_loss  | -0.0131     |
|    gen/train/value_loss            | 0.0171 

round:  52%|█████▏    | 25/48 [02:21<02:08,  5.59s/it]

-----------------------------------------------------
| raw/                               |              |
|    gen/rollout/ep_len_mean         | 500          |
|    gen/rollout/ep_rew_mean         | 484          |
|    gen/rollout/ep_rew_wrapped_mean | 298          |
|    gen/time/fps                    | 6236         |
|    gen/time/iterations             | 1            |
|    gen/time/time_elapsed           | 2            |
|    gen/time/total_timesteps        | 442368       |
|    gen/train/approx_kl             | 0.0109030465 |
|    gen/train/clip_fraction         | 0.0884       |
|    gen/train/clip_range            | 0.2          |
|    gen/train/entropy_loss          | -0.251       |
|    gen/train/explained_variance    | 0.92902386   |
|    gen/train/learning_rate         | 0.0004       |
|    gen/train/loss                  | -0.00872     |
|    gen/train/n_updates             | 130          |
|    gen/train/policy_gradient_loss  | -0.0123      |
|    gen/train/value_loss   

round:  54%|█████▍    | 26/48 [02:26<02:02,  5.58s/it]

-----------------------------------------------------
| raw/                               |              |
|    gen/rollout/ep_len_mean         | 500          |
|    gen/rollout/ep_rew_mean         | 499          |
|    gen/rollout/ep_rew_wrapped_mean | 328          |
|    gen/time/fps                    | 6219         |
|    gen/time/iterations             | 1            |
|    gen/time/time_elapsed           | 2            |
|    gen/time/total_timesteps        | 458752       |
|    gen/train/approx_kl             | 0.0069940872 |
|    gen/train/clip_fraction         | 0.0675       |
|    gen/train/clip_range            | 0.2          |
|    gen/train/entropy_loss          | -0.252       |
|    gen/train/explained_variance    | 0.9693673    |
|    gen/train/learning_rate         | 0.0004       |
|    gen/train/loss                  | -0.0156      |
|    gen/train/n_updates             | 135          |
|    gen/train/policy_gradient_loss  | -0.00896     |
|    gen/train/value_loss   

round:  56%|█████▋    | 27/48 [02:32<01:57,  5.59s/it]

----------------------------------------------------
| raw/                               |             |
|    gen/rollout/ep_len_mean         | 500         |
|    gen/rollout/ep_rew_mean         | 500         |
|    gen/rollout/ep_rew_wrapped_mean | 344         |
|    gen/time/fps                    | 6299        |
|    gen/time/iterations             | 1           |
|    gen/time/time_elapsed           | 2           |
|    gen/time/total_timesteps        | 475136      |
|    gen/train/approx_kl             | 0.004562853 |
|    gen/train/clip_fraction         | 0.0497      |
|    gen/train/clip_range            | 0.2         |
|    gen/train/entropy_loss          | -0.242      |
|    gen/train/explained_variance    | 0.949576    |
|    gen/train/learning_rate         | 0.0004      |
|    gen/train/loss                  | 0.00138     |
|    gen/train/n_updates             | 140         |
|    gen/train/policy_gradient_loss  | -0.00548    |
|    gen/train/value_loss            | 0.00374

round:  58%|█████▊    | 28/48 [02:38<01:51,  5.60s/it]

-----------------------------------------------------
| raw/                               |              |
|    gen/rollout/ep_len_mean         | 500          |
|    gen/rollout/ep_rew_mean         | 500          |
|    gen/rollout/ep_rew_wrapped_mean | 338          |
|    gen/time/fps                    | 6270         |
|    gen/time/iterations             | 1            |
|    gen/time/time_elapsed           | 2            |
|    gen/time/total_timesteps        | 491520       |
|    gen/train/approx_kl             | 0.0048105796 |
|    gen/train/clip_fraction         | 0.0621       |
|    gen/train/clip_range            | 0.2          |
|    gen/train/entropy_loss          | -0.24        |
|    gen/train/explained_variance    | 0.96562505   |
|    gen/train/learning_rate         | 0.0004       |
|    gen/train/loss                  | -0.00123     |
|    gen/train/n_updates             | 145          |
|    gen/train/policy_gradient_loss  | -0.00467     |
|    gen/train/value_loss   

round:  60%|██████    | 29/48 [02:43<01:47,  5.64s/it]

-----------------------------------------------------
| raw/                               |              |
|    gen/rollout/ep_len_mean         | 500          |
|    gen/rollout/ep_rew_mean         | 500          |
|    gen/rollout/ep_rew_wrapped_mean | 329          |
|    gen/time/fps                    | 6242         |
|    gen/time/iterations             | 1            |
|    gen/time/time_elapsed           | 2            |
|    gen/time/total_timesteps        | 507904       |
|    gen/train/approx_kl             | 0.0044105146 |
|    gen/train/clip_fraction         | 0.0435       |
|    gen/train/clip_range            | 0.2          |
|    gen/train/entropy_loss          | -0.237       |
|    gen/train/explained_variance    | 0.9862264    |
|    gen/train/learning_rate         | 0.0004       |
|    gen/train/loss                  | -2.92e-05    |
|    gen/train/n_updates             | 150          |
|    gen/train/policy_gradient_loss  | -0.0015      |
|    gen/train/value_loss   

round:  62%|██████▎   | 30/48 [02:49<01:41,  5.62s/it]

-----------------------------------------------------
| raw/                               |              |
|    gen/rollout/ep_len_mean         | 500          |
|    gen/rollout/ep_rew_mean         | 500          |
|    gen/rollout/ep_rew_wrapped_mean | 324          |
|    gen/time/fps                    | 6293         |
|    gen/time/iterations             | 1            |
|    gen/time/time_elapsed           | 2            |
|    gen/time/total_timesteps        | 524288       |
|    gen/train/approx_kl             | 0.0034112008 |
|    gen/train/clip_fraction         | 0.0316       |
|    gen/train/clip_range            | 0.2          |
|    gen/train/entropy_loss          | -0.235       |
|    gen/train/explained_variance    | 0.9868423    |
|    gen/train/learning_rate         | 0.0004       |
|    gen/train/loss                  | -0.00698     |
|    gen/train/n_updates             | 155          |
|    gen/train/policy_gradient_loss  | -0.00028     |
|    gen/train/value_loss   

round:  65%|██████▍   | 31/48 [02:55<01:35,  5.63s/it]

----------------------------------------------------
| raw/                               |             |
|    gen/rollout/ep_len_mean         | 500         |
|    gen/rollout/ep_rew_mean         | 500         |
|    gen/rollout/ep_rew_wrapped_mean | 324         |
|    gen/time/fps                    | 6201        |
|    gen/time/iterations             | 1           |
|    gen/time/time_elapsed           | 2           |
|    gen/time/total_timesteps        | 540672      |
|    gen/train/approx_kl             | 0.002037228 |
|    gen/train/clip_fraction         | 0.0231      |
|    gen/train/clip_range            | 0.2         |
|    gen/train/entropy_loss          | -0.232      |
|    gen/train/explained_variance    | 0.98133254  |
|    gen/train/learning_rate         | 0.0004      |
|    gen/train/loss                  | 0.00529     |
|    gen/train/n_updates             | 160         |
|    gen/train/policy_gradient_loss  | 0.000505    |
|    gen/train/value_loss            | 0.00062

round:  67%|██████▋   | 32/48 [03:00<01:30,  5.64s/it]

-----------------------------------------------------
| raw/                               |              |
|    gen/rollout/ep_len_mean         | 500          |
|    gen/rollout/ep_rew_mean         | 500          |
|    gen/rollout/ep_rew_wrapped_mean | 325          |
|    gen/time/fps                    | 6242         |
|    gen/time/iterations             | 1            |
|    gen/time/time_elapsed           | 2            |
|    gen/time/total_timesteps        | 557056       |
|    gen/train/approx_kl             | 0.0025858209 |
|    gen/train/clip_fraction         | 0.0311       |
|    gen/train/clip_range            | 0.2          |
|    gen/train/entropy_loss          | -0.238       |
|    gen/train/explained_variance    | 0.97793746   |
|    gen/train/learning_rate         | 0.0004       |
|    gen/train/loss                  | 0.00112      |
|    gen/train/n_updates             | 165          |
|    gen/train/policy_gradient_loss  | -0.000275    |
|    gen/train/value_loss   

round:  69%|██████▉   | 33/48 [03:06<01:23,  5.60s/it]

-----------------------------------------------------
| raw/                               |              |
|    gen/rollout/ep_len_mean         | 500          |
|    gen/rollout/ep_rew_mean         | 500          |
|    gen/rollout/ep_rew_wrapped_mean | 321          |
|    gen/time/fps                    | 6260         |
|    gen/time/iterations             | 1            |
|    gen/time/time_elapsed           | 2            |
|    gen/time/total_timesteps        | 573440       |
|    gen/train/approx_kl             | 0.0034417314 |
|    gen/train/clip_fraction         | 0.0472       |
|    gen/train/clip_range            | 0.2          |
|    gen/train/entropy_loss          | -0.238       |
|    gen/train/explained_variance    | 0.98351884   |
|    gen/train/learning_rate         | 0.0004       |
|    gen/train/loss                  | -0.0081      |
|    gen/train/n_updates             | 170          |
|    gen/train/policy_gradient_loss  | -0.00202     |
|    gen/train/value_loss   

round:  71%|███████   | 34/48 [03:11<01:18,  5.60s/it]

-----------------------------------------------------
| raw/                               |              |
|    gen/rollout/ep_len_mean         | 500          |
|    gen/rollout/ep_rew_mean         | 500          |
|    gen/rollout/ep_rew_wrapped_mean | 324          |
|    gen/time/fps                    | 6289         |
|    gen/time/iterations             | 1            |
|    gen/time/time_elapsed           | 2            |
|    gen/time/total_timesteps        | 589824       |
|    gen/train/approx_kl             | 0.0038509478 |
|    gen/train/clip_fraction         | 0.0517       |
|    gen/train/clip_range            | 0.2          |
|    gen/train/entropy_loss          | -0.24        |
|    gen/train/explained_variance    | 0.9761284    |
|    gen/train/learning_rate         | 0.0004       |
|    gen/train/loss                  | 0.00102      |
|    gen/train/n_updates             | 175          |
|    gen/train/policy_gradient_loss  | -0.00139     |
|    gen/train/value_loss   

round:  73%|███████▎  | 35/48 [03:17<01:12,  5.59s/it]

----------------------------------------------------
| raw/                               |             |
|    gen/rollout/ep_len_mean         | 500         |
|    gen/rollout/ep_rew_mean         | 500         |
|    gen/rollout/ep_rew_wrapped_mean | 322         |
|    gen/time/fps                    | 6284        |
|    gen/time/iterations             | 1           |
|    gen/time/time_elapsed           | 2           |
|    gen/time/total_timesteps        | 606208      |
|    gen/train/approx_kl             | 0.006584014 |
|    gen/train/clip_fraction         | 0.0858      |
|    gen/train/clip_range            | 0.2         |
|    gen/train/entropy_loss          | -0.245      |
|    gen/train/explained_variance    | 0.98460144  |
|    gen/train/learning_rate         | 0.0004      |
|    gen/train/loss                  | 0.00251     |
|    gen/train/n_updates             | 180         |
|    gen/train/policy_gradient_loss  | -0.00598    |
|    gen/train/value_loss            | 0.00189

round:  75%|███████▌  | 36/48 [03:22<01:06,  5.56s/it]

-----------------------------------------------------
| raw/                               |              |
|    gen/rollout/ep_len_mean         | 500          |
|    gen/rollout/ep_rew_mean         | 500          |
|    gen/rollout/ep_rew_wrapped_mean | 329          |
|    gen/time/fps                    | 6255         |
|    gen/time/iterations             | 1            |
|    gen/time/time_elapsed           | 2            |
|    gen/time/total_timesteps        | 622592       |
|    gen/train/approx_kl             | 0.0075204456 |
|    gen/train/clip_fraction         | 0.0919       |
|    gen/train/clip_range            | 0.2          |
|    gen/train/entropy_loss          | -0.249       |
|    gen/train/explained_variance    | 0.97248274   |
|    gen/train/learning_rate         | 0.0004       |
|    gen/train/loss                  | 0.00379      |
|    gen/train/n_updates             | 185          |
|    gen/train/policy_gradient_loss  | -0.005       |
|    gen/train/value_loss   

round:  77%|███████▋  | 37/48 [03:28<01:01,  5.56s/it]

---------------------------------------------------
| raw/                               |            |
|    gen/rollout/ep_len_mean         | 500        |
|    gen/rollout/ep_rew_mean         | 500        |
|    gen/rollout/ep_rew_wrapped_mean | 325        |
|    gen/time/fps                    | 6054       |
|    gen/time/iterations             | 1          |
|    gen/time/time_elapsed           | 2          |
|    gen/time/total_timesteps        | 638976     |
|    gen/train/approx_kl             | 0.01148787 |
|    gen/train/clip_fraction         | 0.15       |
|    gen/train/clip_range            | 0.2        |
|    gen/train/entropy_loss          | -0.253     |
|    gen/train/explained_variance    | 0.97678745 |
|    gen/train/learning_rate         | 0.0004     |
|    gen/train/loss                  | -0.0181    |
|    gen/train/n_updates             | 190        |
|    gen/train/policy_gradient_loss  | -0.0106    |
|    gen/train/value_loss            | 0.0117     |
------------

round:  79%|███████▉  | 38/48 [03:34<00:55,  5.56s/it]

----------------------------------------------------
| raw/                               |             |
|    gen/rollout/ep_len_mean         | 500         |
|    gen/rollout/ep_rew_mean         | 495         |
|    gen/rollout/ep_rew_wrapped_mean | 325         |
|    gen/time/fps                    | 6264        |
|    gen/time/iterations             | 1           |
|    gen/time/time_elapsed           | 2           |
|    gen/time/total_timesteps        | 655360      |
|    gen/train/approx_kl             | 0.055347748 |
|    gen/train/clip_fraction         | 0.226       |
|    gen/train/clip_range            | 0.2         |
|    gen/train/entropy_loss          | -0.314      |
|    gen/train/explained_variance    | 0.9881687   |
|    gen/train/learning_rate         | 0.0004      |
|    gen/train/loss                  | 0.247       |
|    gen/train/n_updates             | 195         |
|    gen/train/policy_gradient_loss  | -0.026      |
|    gen/train/value_loss            | 0.0031 

round:  81%|████████▏ | 39/48 [03:39<00:50,  5.58s/it]

----------------------------------------------------
| raw/                               |             |
|    gen/rollout/ep_len_mean         | 500         |
|    gen/rollout/ep_rew_mean         | 484         |
|    gen/rollout/ep_rew_wrapped_mean | 342         |
|    gen/time/fps                    | 6235        |
|    gen/time/iterations             | 1           |
|    gen/time/time_elapsed           | 2           |
|    gen/time/total_timesteps        | 671744      |
|    gen/train/approx_kl             | 0.01623483  |
|    gen/train/clip_fraction         | 0.135       |
|    gen/train/clip_range            | 0.2         |
|    gen/train/entropy_loss          | -0.285      |
|    gen/train/explained_variance    | 0.009290397 |
|    gen/train/learning_rate         | 0.0004      |
|    gen/train/loss                  | 0.0289      |
|    gen/train/n_updates             | 200         |
|    gen/train/policy_gradient_loss  | -0.00127    |
|    gen/train/value_loss            | 93.2   

round:  83%|████████▎ | 40/48 [03:45<00:45,  5.65s/it]

-----------------------------------------------------
| raw/                               |              |
|    gen/rollout/ep_len_mean         | 500          |
|    gen/rollout/ep_rew_mean         | 471          |
|    gen/rollout/ep_rew_wrapped_mean | 344          |
|    gen/time/fps                    | 6266         |
|    gen/time/iterations             | 1            |
|    gen/time/time_elapsed           | 2            |
|    gen/time/total_timesteps        | 688128       |
|    gen/train/approx_kl             | 0.0054224357 |
|    gen/train/clip_fraction         | 0.055        |
|    gen/train/clip_range            | 0.2          |
|    gen/train/entropy_loss          | -0.273       |
|    gen/train/explained_variance    | 0.2288267    |
|    gen/train/learning_rate         | 0.0004       |
|    gen/train/loss                  | 0.0611       |
|    gen/train/n_updates             | 205          |
|    gen/train/policy_gradient_loss  | 0.000909     |
|    gen/train/value_loss   

round:  85%|████████▌ | 41/48 [03:51<00:39,  5.64s/it]

----------------------------------------------------
| raw/                               |             |
|    gen/rollout/ep_len_mean         | 500         |
|    gen/rollout/ep_rew_mean         | 475         |
|    gen/rollout/ep_rew_wrapped_mean | 467         |
|    gen/time/fps                    | 6267        |
|    gen/time/iterations             | 1           |
|    gen/time/time_elapsed           | 2           |
|    gen/time/total_timesteps        | 704512      |
|    gen/train/approx_kl             | 0.016602855 |
|    gen/train/clip_fraction         | 0.142       |
|    gen/train/clip_range            | 0.2         |
|    gen/train/entropy_loss          | -0.276      |
|    gen/train/explained_variance    | 0.30046666  |
|    gen/train/learning_rate         | 0.0004      |
|    gen/train/loss                  | 0.0868      |
|    gen/train/n_updates             | 210         |
|    gen/train/policy_gradient_loss  | -0.000154   |
|    gen/train/value_loss            | 747    

round:  88%|████████▊ | 42/48 [03:56<00:33,  5.64s/it]

----------------------------------------------------
| raw/                               |             |
|    gen/rollout/ep_len_mean         | 500         |
|    gen/rollout/ep_rew_mean         | 487         |
|    gen/rollout/ep_rew_wrapped_mean | 459         |
|    gen/time/fps                    | 6157        |
|    gen/time/iterations             | 1           |
|    gen/time/time_elapsed           | 2           |
|    gen/time/total_timesteps        | 720896      |
|    gen/train/approx_kl             | 0.014087208 |
|    gen/train/clip_fraction         | 0.172       |
|    gen/train/clip_range            | 0.2         |
|    gen/train/entropy_loss          | -0.275      |
|    gen/train/explained_variance    | 0.9686014   |
|    gen/train/learning_rate         | 0.0004      |
|    gen/train/loss                  | -0.0169     |
|    gen/train/n_updates             | 215         |
|    gen/train/policy_gradient_loss  | -0.0136     |
|    gen/train/value_loss            | 0.0172 

round:  90%|████████▉ | 43/48 [04:02<00:28,  5.63s/it]

----------------------------------------------------
| raw/                               |             |
|    gen/rollout/ep_len_mean         | 500         |
|    gen/rollout/ep_rew_mean         | 500         |
|    gen/rollout/ep_rew_wrapped_mean | 489         |
|    gen/time/fps                    | 6274        |
|    gen/time/iterations             | 1           |
|    gen/time/time_elapsed           | 2           |
|    gen/time/total_timesteps        | 737280      |
|    gen/train/approx_kl             | 0.076581925 |
|    gen/train/clip_fraction         | 0.393       |
|    gen/train/clip_range            | 0.2         |
|    gen/train/entropy_loss          | -0.382      |
|    gen/train/explained_variance    | 0.98156023  |
|    gen/train/learning_rate         | 0.0004      |
|    gen/train/loss                  | -0.0278     |
|    gen/train/n_updates             | 220         |
|    gen/train/policy_gradient_loss  | -0.02       |
|    gen/train/value_loss            | 0.0117 

round:  92%|█████████▏| 44/48 [04:08<00:22,  5.62s/it]

---------------------------------------------------
| raw/                               |            |
|    gen/rollout/ep_len_mean         | 500        |
|    gen/rollout/ep_rew_mean         | 500        |
|    gen/rollout/ep_rew_wrapped_mean | 399        |
|    gen/time/fps                    | 6253       |
|    gen/time/iterations             | 1          |
|    gen/time/time_elapsed           | 2          |
|    gen/time/total_timesteps        | 753664     |
|    gen/train/approx_kl             | 0.00915049 |
|    gen/train/clip_fraction         | 0.111      |
|    gen/train/clip_range            | 0.2        |
|    gen/train/entropy_loss          | -0.333     |
|    gen/train/explained_variance    | 0.97632694 |
|    gen/train/learning_rate         | 0.0004     |
|    gen/train/loss                  | 0.00371    |
|    gen/train/n_updates             | 225        |
|    gen/train/policy_gradient_loss  | -0.00761   |
|    gen/train/value_loss            | 0.0176     |
------------

round:  94%|█████████▍| 45/48 [04:13<00:16,  5.61s/it]

----------------------------------------------------
| raw/                               |             |
|    gen/rollout/ep_len_mean         | 500         |
|    gen/rollout/ep_rew_mean         | 500         |
|    gen/rollout/ep_rew_wrapped_mean | 412         |
|    gen/time/fps                    | 6056        |
|    gen/time/iterations             | 1           |
|    gen/time/time_elapsed           | 2           |
|    gen/time/total_timesteps        | 770048      |
|    gen/train/approx_kl             | 0.011859164 |
|    gen/train/clip_fraction         | 0.124       |
|    gen/train/clip_range            | 0.2         |
|    gen/train/entropy_loss          | -0.324      |
|    gen/train/explained_variance    | 0.97653854  |
|    gen/train/learning_rate         | 0.0004      |
|    gen/train/loss                  | -0.00208    |
|    gen/train/n_updates             | 230         |
|    gen/train/policy_gradient_loss  | -0.00658    |
|    gen/train/value_loss            | 0.0111 

round:  96%|█████████▌| 46/48 [04:19<00:11,  5.61s/it]

----------------------------------------------------
| raw/                               |             |
|    gen/rollout/ep_len_mean         | 500         |
|    gen/rollout/ep_rew_mean         | 500         |
|    gen/rollout/ep_rew_wrapped_mean | 414         |
|    gen/time/fps                    | 6280        |
|    gen/time/iterations             | 1           |
|    gen/time/time_elapsed           | 2           |
|    gen/time/total_timesteps        | 786432      |
|    gen/train/approx_kl             | 0.009355076 |
|    gen/train/clip_fraction         | 0.106       |
|    gen/train/clip_range            | 0.2         |
|    gen/train/entropy_loss          | -0.295      |
|    gen/train/explained_variance    | 0.9822101   |
|    gen/train/learning_rate         | 0.0004      |
|    gen/train/loss                  | 0.00842     |
|    gen/train/n_updates             | 235         |
|    gen/train/policy_gradient_loss  | -0.00484    |
|    gen/train/value_loss            | 0.0067 

round:  98%|█████████▊| 47/48 [04:24<00:05,  5.59s/it]

-----------------------------------------------------
| raw/                               |              |
|    gen/rollout/ep_len_mean         | 500          |
|    gen/rollout/ep_rew_mean         | 500          |
|    gen/rollout/ep_rew_wrapped_mean | 408          |
|    gen/time/fps                    | 6258         |
|    gen/time/iterations             | 1            |
|    gen/time/time_elapsed           | 2            |
|    gen/time/total_timesteps        | 802816       |
|    gen/train/approx_kl             | 0.0058190124 |
|    gen/train/clip_fraction         | 0.0626       |
|    gen/train/clip_range            | 0.2          |
|    gen/train/entropy_loss          | -0.278       |
|    gen/train/explained_variance    | 0.9946737    |
|    gen/train/learning_rate         | 0.0004       |
|    gen/train/loss                  | -0.00358     |
|    gen/train/n_updates             | 240          |
|    gen/train/policy_gradient_loss  | -0.00311     |
|    gen/train/value_loss   

round: 100%|██████████| 48/48 [04:30<00:00,  5.63s/it]


[42, 43, 44, 45, 46, 47, 48, 49]

In [8]:

learner_rewards_after_training, _ = evaluate_policy(
    learner, env, 100, return_episode_rewards=True,
)


In [9]:

print("mean reward after training:", np.mean(learner_rewards_after_training))
print("mean reward before training:", np.mean(learner_rewards_before_training))

mean reward after training: 500.0
mean reward before training: 102.6


In [10]:
learner.save("models/gail_cartpole_learner")

In [31]:
expert = PPO.load("models/gail_cartpole_learner", env=env)

In [35]:
env  = gym.make("seals:seals/CartPole-v0", render_mode='human')
obs, _ = env.reset()
env.render()
done = False
truncated = False   
g_reward = 0
for _ in range(10):
    while not done and not truncated:
        # Predict action
        action, _states = expert.predict(obs, deterministic=True)

        # Step in environment
        obs, reward, done, truncated, info = env.step(action)
        g_reward += reward
        # Optional: render is handled automatically with render_mode="human"
        
        print(f"Reward: {g_reward}")
env.close()

Reward: 1.0
Reward: 2.0
Reward: 3.0
Reward: 4.0
Reward: 5.0
Reward: 6.0
Reward: 7.0
Reward: 8.0
Reward: 9.0
Reward: 10.0
Reward: 11.0
Reward: 12.0
Reward: 13.0
Reward: 14.0
Reward: 15.0
Reward: 16.0
Reward: 17.0
Reward: 18.0
Reward: 19.0
Reward: 20.0
Reward: 21.0
Reward: 22.0
Reward: 23.0
Reward: 24.0
Reward: 25.0
Reward: 26.0
Reward: 27.0
Reward: 28.0
Reward: 29.0
Reward: 30.0
Reward: 31.0
Reward: 32.0
Reward: 33.0
Reward: 34.0
Reward: 35.0
Reward: 36.0
Reward: 37.0
Reward: 38.0
Reward: 39.0
Reward: 40.0
Reward: 41.0
Reward: 42.0
Reward: 43.0
Reward: 44.0
Reward: 45.0
Reward: 46.0
Reward: 47.0
Reward: 48.0
Reward: 49.0
Reward: 50.0
Reward: 51.0
Reward: 52.0
Reward: 53.0
Reward: 54.0
Reward: 55.0
Reward: 56.0
Reward: 57.0
Reward: 58.0
Reward: 59.0
Reward: 60.0
Reward: 61.0
Reward: 62.0
Reward: 63.0
Reward: 64.0
Reward: 65.0
Reward: 66.0
Reward: 67.0
Reward: 68.0
Reward: 69.0
Reward: 70.0
Reward: 71.0
Reward: 72.0
Reward: 73.0
Reward: 74.0
Reward: 75.0
Reward: 76.0
Reward: 77.0
Reward: 