### Para o PPO a estrutura é muito parecida

In [1]:
import gymnasium as gym
import numpy as np
import tensorboard

%load_ext tensorboard

import stable_baselines3
stable_baselines3.__version__

'2.1.0'

In [2]:
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy

# para treinar com ambientes Atari
from stable_baselines3.common.env_util import make_atari_env
from stable_baselines3.common.vec_env import VecFrameStack

In [3]:
from stable_baselines3.ppo import CnnPolicy

In [4]:
# Defina os hiperparâmetros desejados
gamma = 0.99
learning_rate = 0.0003
n_epochs = 10
n_steps = 2046

ENV_NAME = 'ALE/SpaceInvaders-v5'
env = make_atari_env(ENV_NAME, n_envs=4, seed=12306488)
env = VecFrameStack(env, n_stack=4)

# Cria com uma rede convolucional como política e define os hiperparâmetros
model = PPO(CnnPolicy, env, gamma=gamma, learning_rate=learning_rate, n_epochs=n_epochs, tensorboard_log="log_dir", n_steps=n_steps, verbose=1)


Using cuda device
Wrapping the env in a VecTransposeImage.


We recommend using a `batch_size` that is a factor of `n_steps * n_envs`.
Info: (n_steps=2046 and n_envs=4)


In [13]:
%tensorboard --logdir log_dir

Reusing TensorBoard on port 6006 (pid 18444), started 1 day, 17:01:50 ago. (Use '!kill 18444' to kill it.)

In [32]:
# Treina por 200 mil passos -- pode demorar mais de 5 min!
model.learn(total_timesteps=100_000)

Logging to log_dir\PPO_1
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 550      |
|    ep_rew_mean     | 166      |
| time/              |          |
|    fps             | 231      |
|    iterations      | 1        |
|    time_elapsed    | 35       |
|    total_timesteps | 8184     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 494         |
|    ep_rew_mean          | 129         |
| time/                   |             |
|    fps                  | 191         |
|    iterations           | 2           |
|    time_elapsed         | 85          |
|    total_timesteps      | 16368       |
| train/                  |             |
|    approx_kl            | 0.009032685 |
|    clip_fraction        | 0.101       |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.78       |
|    explained_variance   | -0.00326    |
|    

<stable_baselines3.ppo.ppo.PPO at 0x1dafc9f3890>

### Para esse caso precisamos avaliar os valores utilizando a lib optuna

In [33]:
import optuna

Como sugerido vou manter o range de otimização das variáveis entre 50% e 150% do valor original 

In [34]:
def train_PPO(trial : optuna.Trial):
    # chama os métodos do "trial" (tentativa) para sugerir valores para os parâmetros
    n_steps    = trial.suggest_int('n_steps', 1023, 3069)
    gamma      = trial.suggest_float('gamma', 0.5, 1.50)
    lr         = trial.suggest_float('lr', 0.00015, 0.00045)
    n_epochs   = trial.suggest_int('n_epochs', 5, 15)
    # Defina os hiperparâmetros desejado

    ENV_NAME = 'ALE/SpaceInvaders-v5'
    env = make_atari_env(ENV_NAME, n_envs=4, seed=12306488)
    env = VecFrameStack(env, n_stack=4)

    # Crie o modelo com os hiperparâmetros sugeridos pelo Optuna
    model = PPO(CnnPolicy, env, gamma=gamma, learning_rate=lr, n_epochs=n_epochs, n_steps=n_steps, verbose=1)

    print(f"\nTRIAL NUMBER #{trial.number}: {trial.params}")

    # Treine o modelo por um número fixo de etapas
    model.learn(total_timesteps=100_000)

    mean_reward, _ = evaluate_policy(model, env, n_eval_episodes=10)

    return mean_reward

In [35]:
study_PPO = optuna.create_study(direction='maximize',
                        storage='sqlite:///stud_PPO2.db',
                        study_name='validation_PPO',
                        load_if_exists=True)

# maximiza o valor de retorno de train_exp_sarsa, rodando "n_trials" vezes
# o parâmetro "n_jobs" indica a quantidade de CPUs a serem usadas (-1 para usar todas)
study_PPO.optimize(train_PPO, n_trials=12, n_jobs=-1)


[I 2024-02-18 05:48:15,673] A new study created in RDB with name: validation_PPO


Using cuda device
Wrapping the env in a VecTransposeImage.


We recommend using a `batch_size` that is a factor of `n_steps * n_envs`.
Info: (n_steps=2422 and n_envs=4)


Using cuda device
Wrapping the env in a VecTransposeImage.

TRIAL NUMBER #2: {'n_steps': 2422, 'gamma': 0.850299704721077, 'lr': 0.0002281152050716582, 'n_epochs': 6}


We recommend using a `batch_size` that is a factor of `n_steps * n_envs`.
Info: (n_steps=2398 and n_envs=4)


Using cuda device
Wrapping the env in a VecTransposeImage.

TRIAL NUMBER #0: {'n_steps': 2398, 'gamma': 0.6908464665742965, 'lr': 0.0003518997075672511, 'n_epochs': 12}


We recommend using a `batch_size` that is a factor of `n_steps * n_envs`.
Info: (n_steps=1467 and n_envs=4)


Using cuda device
Wrapping the env in a VecTransposeImage.
Using cuda device
Wrapping the env in a VecTransposeImage.


We recommend using a `batch_size` that is a factor of `n_steps * n_envs`.
Info: (n_steps=2895 and n_envs=4)
We recommend using a `batch_size` that is a factor of `n_steps * n_envs`.
Info: (n_steps=2504 and n_envs=4)


Using cuda device
Wrapping the env in a VecTransposeImage.


We recommend using a `batch_size` that is a factor of `n_steps * n_envs`.
Info: (n_steps=1078 and n_envs=4)



TRIAL NUMBER #5: {'n_steps': 1078, 'gamma': 0.797139096633719, 'lr': 0.00041944564908379795, 'n_epochs': 12}

TRIAL NUMBER #4: {'n_steps': 2895, 'gamma': 0.6036412324990864, 'lr': 0.000375336135585277, 'n_epochs': 12}

TRIAL NUMBER #3: {'n_steps': 1467, 'gamma': 0.6447064145967081, 'lr': 0.00023030603034713325, 'n_epochs': 7}

TRIAL NUMBER #1: {'n_steps': 2504, 'gamma': 1.4688020233407568, 'lr': 0.00029710135406832834, 'n_epochs': 11}
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 506      |
|    ep_rew_mean     | 122      |
| time/              |          |
|    fps             | 42       |
|    iterations      | 1        |
|    time_elapsed    | 101      |
|    total_timesteps | 4312     |
---------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 490      |
|    ep_rew_mean     | 125      |
| time/              |          |
|    fps             | 50       |
|    iterations 

  x = um.multiply(x, x, out=x)
  ret = umr_sum(x, axis, dtype, out, keepdims=keepdims, where=where)
  return np.nan if var_y == 0 else 1 - np.var(y_true - y_pred) / var_y


-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 490         |
|    ep_rew_mean          | 125         |
| time/                   |             |
|    fps                  | 48          |
|    iterations           | 3           |
|    time_elapsed         | 361         |
|    total_timesteps      | 17604       |
| train/                  |             |
|    approx_kl            | 0.013265374 |
|    clip_fraction        | 0.144       |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.77       |
|    explained_variance   | 0.173       |
|    learning_rate        | 0.00023     |
|    loss                 | -0.0289     |
|    n_updates            | 14          |
|    policy_gradient_loss | -0.0249     |
|    value_loss           | 0.0727      |
-----------------------------------------
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 521     

[I 2024-02-18 06:28:14,609] Trial 2 finished with value: 333.0 and parameters: {'n_steps': 2422, 'gamma': 0.850299704721077, 'lr': 0.0002281152050716582, 'n_epochs': 6}. Best is trial 2 with value: 333.0.


Using cuda device
Wrapping the env in a VecTransposeImage.


We recommend using a `batch_size` that is a factor of `n_steps * n_envs`.
Info: (n_steps=2180 and n_envs=4)



TRIAL NUMBER #6: {'n_steps': 2180, 'gamma': 0.5695731586684158, 'lr': 0.000294970703864418, 'n_epochs': 8}


[I 2024-02-18 06:28:50,663] Trial 3 finished with value: 210.5 and parameters: {'n_steps': 1467, 'gamma': 0.6447064145967081, 'lr': 0.00023030603034713325, 'n_epochs': 7}. Best is trial 2 with value: 333.0.


Using cuda device
Wrapping the env in a VecTransposeImage.


We recommend using a `batch_size` that is a factor of `n_steps * n_envs`.
Info: (n_steps=2287 and n_envs=4)



TRIAL NUMBER #7: {'n_steps': 2287, 'gamma': 0.734780302742167, 'lr': 0.0003629340451380195, 'n_epochs': 11}
---------------------------------------
| rollout/                |           |
|    ep_len_mean          | 692       |
|    ep_rew_mean          | 0         |
| time/                   |           |
|    fps                  | 36        |
|    iterations           | 9         |
|    time_elapsed         | 2437      |
|    total_timesteps      | 90144     |
| train/                  |           |
|    approx_kl            | 0.0       |
|    clip_fraction        | 0         |
|    clip_range           | 0.2       |
|    entropy_loss         | 0         |
|    explained_variance   | 0         |
|    learning_rate        | 0.000297  |
|    loss                 | 9.2e+30   |
|    n_updates            | 88        |
|    policy_gradient_loss | -1.03e-09 |
|    value_loss           | 1.41e+33  |
---------------------------------------
----------------------------------------
| rollout/

[I 2024-02-18 06:36:23,595] Trial 1 finished with value: 0.5 and parameters: {'n_steps': 2504, 'gamma': 1.4688020233407568, 'lr': 0.00029710135406832834, 'n_epochs': 11}. Best is trial 2 with value: 333.0.


Using cuda device
Wrapping the env in a VecTransposeImage.


We recommend using a `batch_size` that is a factor of `n_steps * n_envs`.
Info: (n_steps=2426 and n_envs=4)



TRIAL NUMBER #8: {'n_steps': 2426, 'gamma': 0.7769360757451391, 'lr': 0.00019881786569299022, 'n_epochs': 13}
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 540         |
|    ep_rew_mean          | 156         |
| time/                   |             |
|    fps                  | 46          |
|    iterations           | 3           |
|    time_elapsed         | 561         |
|    total_timesteps      | 26160       |
| train/                  |             |
|    approx_kl            | 0.023517895 |
|    clip_fraction        | 0.242       |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.76       |
|    explained_variance   | 0.0618      |
|    learning_rate        | 0.000295    |
|    loss                 | -0.0352     |
|    n_updates            | 16          |
|    policy_gradient_loss | -0.0446     |
|    value_loss           | 0.0622      |
-----------------------------------------


[I 2024-02-18 06:37:53,024] Trial 0 finished with value: 201.5 and parameters: {'n_steps': 2398, 'gamma': 0.6908464665742965, 'lr': 0.0003518997075672511, 'n_epochs': 12}. Best is trial 2 with value: 333.0.


Using cuda device
Wrapping the env in a VecTransposeImage.


We recommend using a `batch_size` that is a factor of `n_steps * n_envs`.
Info: (n_steps=1633 and n_envs=4)



TRIAL NUMBER #9: {'n_steps': 1633, 'gamma': 1.2020778146262363, 'lr': 0.00028109118812255683, 'n_epochs': 8}
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 486      |
|    ep_rew_mean     | 113      |
| time/              |          |
|    fps             | 91       |
|    iterations      | 1        |
|    time_elapsed    | 106      |
|    total_timesteps | 9704     |
---------------------------------
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 565        |
|    ep_rew_mean          | 265        |
| time/                   |            |
|    fps                  | 29         |
|    iterations           | 21         |
|    time_elapsed         | 3024       |
|    total_timesteps      | 90552      |
| train/                  |            |
|    approx_kl            | 0.37107635 |
|    clip_fraction        | 0.674      |
|    clip_range           | 0.2        |
|    entropy_loss

[I 2024-02-18 06:39:31,971] Trial 4 finished with value: 263.5 and parameters: {'n_steps': 2895, 'gamma': 0.6036412324990864, 'lr': 0.000375336135585277, 'n_epochs': 12}. Best is trial 2 with value: 333.0.


Using cuda device
Wrapping the env in a VecTransposeImage.


We recommend using a `batch_size` that is a factor of `n_steps * n_envs`.
Info: (n_steps=2157 and n_envs=4)



TRIAL NUMBER #10: {'n_steps': 2157, 'gamma': 0.5671624686016697, 'lr': 0.00037206379938648426, 'n_epochs': 9}
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 567        |
|    ep_rew_mean          | 262        |
| time/                   |            |
|    fps                  | 29         |
|    iterations           | 22         |
|    time_elapsed         | 3174       |
|    total_timesteps      | 94864      |
| train/                  |            |
|    approx_kl            | 0.39374647 |
|    clip_fraction        | 0.675      |
|    clip_range           | 0.2        |
|    entropy_loss         | -1.07      |
|    explained_variance   | 0.492      |
|    learning_rate        | 0.000419   |
|    loss                 | -0.121     |
|    n_updates            | 252        |
|    policy_gradient_loss | -0.104     |
|    value_loss           | 0.0439     |
----------------------------------------
----------------------------

[I 2024-02-18 06:47:55,337] Trial 5 finished with value: 256.5 and parameters: {'n_steps': 1078, 'gamma': 0.797139096633719, 'lr': 0.00041944564908379795, 'n_epochs': 12}. Best is trial 2 with value: 333.0.


Using cuda device
Wrapping the env in a VecTransposeImage.


We recommend using a `batch_size` that is a factor of `n_steps * n_envs`.
Info: (n_steps=3007 and n_envs=4)



TRIAL NUMBER #11: {'n_steps': 3007, 'gamma': 0.7598685207617438, 'lr': 0.0001577475259813369, 'n_epochs': 15}
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 492         |
|    ep_rew_mean          | 145         |
| time/                   |             |
|    fps                  | 40          |
|    iterations           | 3           |
|    time_elapsed         | 718         |
|    total_timesteps      | 29112       |
| train/                  |             |
|    approx_kl            | 0.030408332 |
|    clip_fraction        | 0.298       |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.76       |
|    explained_variance   | 0.154       |
|    learning_rate        | 0.000199    |
|    loss                 | -0.0813     |
|    n_updates            | 26          |
|    policy_gradient_loss | -0.0621     |
|    value_loss           | 0.0682      |
-----------------------------------------
-------

[I 2024-02-18 07:13:11,461] Trial 6 finished with value: 244.5 and parameters: {'n_steps': 2180, 'gamma': 0.5695731586684158, 'lr': 0.000294970703864418, 'n_epochs': 8}. Best is trial 2 with value: 333.0.


-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 538         |
|    ep_rew_mean          | 160         |
| time/                   |             |
|    fps                  | 31          |
|    iterations           | 4           |
|    time_elapsed         | 1546        |
|    total_timesteps      | 48112       |
| train/                  |             |
|    approx_kl            | 0.038117345 |
|    clip_fraction        | 0.377       |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.72       |
|    explained_variance   | 0.209       |
|    learning_rate        | 0.000158    |
|    loss                 | -0.0844     |
|    n_updates            | 45          |
|    policy_gradient_loss | -0.0761     |
|    value_loss           | 0.0601      |
-----------------------------------------
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 564     

[I 2024-02-18 07:15:22,837] Trial 9 finished with value: 0.5 and parameters: {'n_steps': 1633, 'gamma': 1.2020778146262363, 'lr': 0.00028109118812255683, 'n_epochs': 8}. Best is trial 2 with value: 333.0.


----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 580        |
|    ep_rew_mean          | 214        |
| time/                   |            |
|    fps                  | 36         |
|    iterations           | 9          |
|    time_elapsed         | 2390       |
|    total_timesteps      | 87336      |
| train/                  |            |
|    approx_kl            | 0.09801621 |
|    clip_fraction        | 0.571      |
|    clip_range           | 0.2        |
|    entropy_loss         | -1.54      |
|    explained_variance   | 0.252      |
|    learning_rate        | 0.000199   |
|    loss                 | -0.117     |
|    n_updates            | 104        |
|    policy_gradient_loss | -0.0991    |
|    value_loss           | 0.0524     |
----------------------------------------


[I 2024-02-18 07:16:29,619] Trial 7 finished with value: 365.0 and parameters: {'n_steps': 2287, 'gamma': 0.734780302742167, 'lr': 0.0003629340451380195, 'n_epochs': 11}. Best is trial 7 with value: 365.0.


---------------------------------------
| rollout/                |           |
|    ep_len_mean          | 543       |
|    ep_rew_mean          | 206       |
| time/                   |           |
|    fps                  | 46        |
|    iterations           | 12        |
|    time_elapsed         | 2243      |
|    total_timesteps      | 103536    |
| train/                  |           |
|    approx_kl            | 0.1591251 |
|    clip_fraction        | 0.608     |
|    clip_range           | 0.2       |
|    entropy_loss         | -1.46     |
|    explained_variance   | 0.356     |
|    learning_rate        | 0.000372  |
|    loss                 | -0.125    |
|    n_updates            | 99        |
|    policy_gradient_loss | -0.092    |
|    value_loss           | 0.035     |
---------------------------------------


[I 2024-02-18 07:17:45,955] Trial 10 finished with value: 279.5 and parameters: {'n_steps': 2157, 'gamma': 0.5671624686016697, 'lr': 0.00037206379938648426, 'n_epochs': 9}. Best is trial 7 with value: 365.0.


-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 560         |
|    ep_rew_mean          | 181         |
| time/                   |             |
|    fps                  | 32          |
|    iterations           | 5           |
|    time_elapsed         | 1850        |
|    total_timesteps      | 60140       |
| train/                  |             |
|    approx_kl            | 0.050047863 |
|    clip_fraction        | 0.449       |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.69       |
|    explained_variance   | 0.248       |
|    learning_rate        | 0.000158    |
|    loss                 | -0.0882     |
|    n_updates            | 60          |
|    policy_gradient_loss | -0.0868     |
|    value_loss           | 0.0545      |
-----------------------------------------
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 604     

[I 2024-02-18 07:21:10,503] Trial 8 finished with value: 292.0 and parameters: {'n_steps': 2426, 'gamma': 0.7769360757451391, 'lr': 0.00019881786569299022, 'n_epochs': 13}. Best is trial 7 with value: 365.0.


----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 592        |
|    ep_rew_mean          | 212        |
| time/                   |            |
|    fps                  | 35         |
|    iterations           | 6          |
|    time_elapsed         | 2006       |
|    total_timesteps      | 72168      |
| train/                  |            |
|    approx_kl            | 0.06316606 |
|    clip_fraction        | 0.501      |
|    clip_range           | 0.2        |
|    entropy_loss         | -1.65      |
|    explained_variance   | 0.276      |
|    learning_rate        | 0.000158   |
|    loss                 | -0.105     |
|    n_updates            | 75         |
|    policy_gradient_loss | -0.0933    |
|    value_loss           | 0.0522     |
----------------------------------------
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 544        |
|    ep_rew_mean

[I 2024-02-18 07:26:13,218] Trial 11 finished with value: 205.0 and parameters: {'n_steps': 3007, 'gamma': 0.7598685207617438, 'lr': 0.0001577475259813369, 'n_epochs': 15}. Best is trial 7 with value: 365.0.


In [36]:
print("MELHORES PARÂMETROS:")
print(study_PPO.best_params)

MELHORES PARÂMETROS:
{'n_steps': 2287, 'gamma': 0.734780302742167, 'lr': 0.0003629340451380195, 'n_epochs': 11}


In [37]:
# Defina os hiperparâmetros desejados
gamma = study_PPO.best_params["gamma"]
learning_rate = study_PPO.best_params["lr"]
n_epochs = study_PPO.best_params["n_epochs"]
n_steps = study_PPO.best_params["n_steps"]

ENV_NAME = 'ALE/SpaceInvaders-v5'
env = make_atari_env(ENV_NAME, n_envs=4, seed=12306488)
env = VecFrameStack(env, n_stack=4)

# Cria com uma rede convolucional como política e define os hiperparâmetros
model = PPO(CnnPolicy, env, gamma=gamma, learning_rate=learning_rate, n_epochs=n_epochs, tensorboard_log="log_dir", n_steps=n_steps, verbose=1)

Using cuda device
Wrapping the env in a VecTransposeImage.


In [14]:
%tensorboard --logdir=log_dir

Reusing TensorBoard on port 6009 (pid 20732), started 1 day, 9:43:22 ago. (Use '!kill 20732' to kill it.)

In [39]:
model.learn(total_timesteps=1_200_000)

Logging to log_dir_1\PPO_1
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 512      |
|    ep_rew_mean     | 128      |
| time/              |          |
|    fps             | 219      |
|    iterations      | 1        |
|    time_elapsed    | 41       |
|    total_timesteps | 9148     |
---------------------------------
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 504        |
|    ep_rew_mean          | 140        |
| time/                   |            |
|    fps                  | 188        |
|    iterations           | 2          |
|    time_elapsed         | 97         |
|    total_timesteps      | 18296      |
| train/                  |            |
|    approx_kl            | 0.02612856 |
|    clip_fraction        | 0.196      |
|    clip_range           | 0.2        |
|    entropy_loss         | -1.78      |
|    explained_variance   | -0.0607    |
|    learning_rate

<stable_baselines3.ppo.ppo.PPO at 0x1db0849e590>

In [41]:
# Defina os hiperparâmetros desejados
gamma = study_PPO.best_params["gamma"]
learning_rate = study_PPO.best_params["lr"]
n_epochs = study_PPO.best_params["n_epochs"]
n_steps = study_PPO.best_params["n_steps"]

ENV_NAME = 'ALE/SpaceInvaders-v5'
env2 = make_atari_env(ENV_NAME, n_envs=4, seed=12306488)
env2 = VecFrameStack(env, n_stack=4)

# Cria com uma rede convolucional como política e define os hiperparâmetros
model2 = PPO(CnnPolicy, env2, gamma=gamma, learning_rate=learning_rate, n_epochs=n_epochs, tensorboard_log="log_dir", n_steps=n_steps, verbose=1)

Using cuda device
Wrapping the env in a VecTransposeImage.


We recommend using a `batch_size` that is a factor of `n_steps * n_envs`.
Info: (n_steps=2287 and n_envs=4)


In [42]:
model2.learn(total_timesteps=5_000_000)

Logging to log_dir\PPO_3
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 512      |
|    ep_rew_mean     | 124      |
| time/              |          |
|    fps             | 224      |
|    iterations      | 1        |
|    time_elapsed    | 40       |
|    total_timesteps | 9148     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 545         |
|    ep_rew_mean          | 143         |
| time/                   |             |
|    fps                  | 160         |
|    iterations           | 2           |
|    time_elapsed         | 114         |
|    total_timesteps      | 18296       |
| train/                  |             |
|    approx_kl            | 0.025526892 |
|    clip_fraction        | 0.186       |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.78       |
|    explained_variance   | -0.0223     |
|    

<stable_baselines3.ppo.ppo.PPO at 0x1db08476cd0>