In [2]:
import os
import gymnasium as gym
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.evaluation import evaluate_policy

In [4]:
# Train a baseline PPO agent on CartPole and save it to disk.
environment_name = 'CartPole-v1'

# Output locations: TensorBoard logs and the saved-model file.
log_path = os.path.join('Training', 'Logs')
PPO_path = os.path.join('Training', 'Models_Saved', 'PPO_Cartpole')

# Wrap the single environment in a DummyVecEnv before handing it to PPO.
env = DummyVecEnv([lambda: gym.make(environment_name)])

model = PPO(
    policy='MlpPolicy',
    env=env,
    verbose=1,
    tensorboard_log=log_path,
)
model.learn(total_timesteps=20000)
model.save(PPO_path)

Using cpu device
Logging to Training/Logs/PPO_9
-----------------------------
| time/              |      |
|    fps             | 3875 |
|    iterations      | 1    |
|    time_elapsed    | 0    |
|    total_timesteps | 2048 |
-----------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 2643        |
|    iterations           | 2           |
|    time_elapsed         | 1           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.009679826 |
|    clip_fraction        | 0.128       |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.685      |
|    explained_variance   | -0.00174    |
|    learning_rate        | 0.0003      |
|    loss                 | 8.19        |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.02       |
|    value_loss           | 48          |
----------------------------

In [8]:
# Relative directory that PPO is given as its `tensorboard_log` target.
log_path = os.path.join(*['Training', 'Logs'])


In [5]:
# Reload the saved agent from disk and evaluate it with on-screen rendering.
del model  # drop the in-memory agent so the evaluation uses the saved file

env = gym.make(environment_name, render_mode="human")
try:
    model = PPO.load(PPO_path, env=env)
    # evaluate_policy returns (mean_reward, std_reward); keep it so the result
    # is actually reported instead of being silently discarded.
    mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=4, render=True)
finally:
    # Close the render window even if loading or evaluation raises.
    env.close()

print('Mean reward: {:.2f} +/- {:.2f}'.format(mean_reward, std_reward))

Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.




In [None]:
# Roll the trained agent out by hand for a few rendered episodes and print
# the total reward collected in each one.
env = gym.make(environment_name, render_mode="human")

episodes = 5
try:
    for episode in range(1, episodes + 1):
        # Gymnasium's reset() returns (observation, info); only the
        # observation is fed to the policy.
        obs, _ = env.reset()
        done = False
        score = 0

        while not done:
            # With render_mode="human" the environment draws itself on every
            # step(), so no explicit env.render() call is needed.
            action, _ = model.predict(obs)
            obs, reward, terminated, truncated, info = env.step(action)
            # An episode ends either on failure (terminated) or when the
            # time limit is hit (truncated); both must stop the loop.
            done = terminated or truncated
            score += reward

        print('Episode:{} Score:{}'.format(episode, score))
finally:
    # Always release the render window, even if the loop is interrupted.
    env.close()

!tensorboard --logdir=(training_log_path)

Adding a callback to the training stage

In [3]:
from stable_baselines3.common.callbacks import EvalCallback, StopTrainingOnRewardThreshold

In [4]:
# Relative directory passed to EvalCallback as `best_model_save_path`.
save_path = os.path.join(*['Training', 'Models_Saved'])

In [6]:
# Training environment (kept under the name `env` for the cells that follow).
env = DummyVecEnv([lambda: gym.make('CartPole-v0')])

# Dedicated evaluation environment: periodic evaluation resets and steps its
# environment, so it must not share the instance the agent is collecting
# training rollouts from.
eval_env = DummyVecEnv([lambda: gym.make('CartPole-v0')])

# Stop training once the evaluated mean reward reaches the threshold.
stop_callback = StopTrainingOnRewardThreshold(reward_threshold=200, verbose=1)
# Evaluate every `eval_freq` steps, save the best model under `save_path`,
# and run `stop_callback` whenever a new best mean reward is found.
eval_callback = EvalCallback(
    eval_env,
    callback_on_new_best=stop_callback,
    eval_freq=10000,
    best_model_save_path=save_path,
    verbose=1,
)

  logger.deprecation(


In [9]:

# Build a fresh, untrained PPO agent on `env`, logging to `log_path`.
model = PPO(
    policy='MlpPolicy',
    env=env,
    verbose=1,
    tensorboard_log=log_path,
)

Using cpu device


In [10]:
# Train for 20,000 timesteps, passing eval_callback to learn() as its callback.
model.learn(total_timesteps=20000, callback=eval_callback)

Logging to Training/Logs/PPO_15
-----------------------------
| time/              |      |
|    fps             | 3745 |
|    iterations      | 1    |
|    time_elapsed    | 0    |
|    total_timesteps | 2048 |
-----------------------------
------------------------------------------
| time/                   |              |
|    fps                  | 2667         |
|    iterations           | 2            |
|    time_elapsed         | 1            |
|    total_timesteps      | 4096         |
| train/                  |              |
|    approx_kl            | 0.0077673215 |
|    clip_fraction        | 0.0866       |
|    clip_range           | 0.2          |
|    entropy_loss         | -0.687       |
|    explained_variance   | -0.00183     |
|    learning_rate        | 0.0003       |
|    loss                 | 5.74         |
|    n_updates            | 10           |
|    policy_gradient_loss | -0.0126      |
|    value_loss           | 50.9         |
---------------------------



New best mean reward!
------------------------------
| time/              |       |
|    fps             | 2258  |
|    iterations      | 5     |
|    time_elapsed    | 4     |
|    total_timesteps | 10240 |
------------------------------
------------------------------------------
| time/                   |              |
|    fps                  | 2241         |
|    iterations           | 6            |
|    time_elapsed         | 5            |
|    total_timesteps      | 12288        |
| train/                  |              |
|    approx_kl            | 0.0057770135 |
|    clip_fraction        | 0.0459       |
|    clip_range           | 0.2          |
|    entropy_loss         | -0.596       |
|    explained_variance   | 0.466        |
|    learning_rate        | 0.0003       |
|    loss                 | 13.1         |
|    n_updates            | 50           |
|    policy_gradient_loss | -0.0124      |
|    value_loss           | 53.9         |
------------------------------

<stable_baselines3.ppo.ppo.PPO at 0x29bebad60>

New Architecture

In [11]:
# Custom network: four 128-unit layers each for the policy (pi) and value (vf)
# networks. The dict is passed directly; wrapping it in a list is the older,
# deprecated form of this argument.
net_arch = dict(pi=[128, 128, 128, 128], vf=[128, 128, 128, 128])
model = PPO('MlpPolicy', env, verbose=1, tensorboard_log=log_path, policy_kwargs={"net_arch": net_arch})