In [2]:
import numpy as np
# Compatibility shim: recreate the `np.bool8` alias when the installed NumPy
# does not expose it. Presumably needed because a library imported below
# (gym) still references `np.bool8` -- TODO confirm against pinned versions.
if not hasattr(np, 'bool8'):
    np.bool8 = np.bool_

import os
import gym
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.evaluation import evaluate_policy


In [3]:
# Environment id; later cells reuse it when they build their own environments.
environment_name = 'CartPole-v1'
# Single (non-vectorized) environment used by the random-action baseline below.
env = gym.make(environment_name, render_mode="rgb_array")


In [4]:
# Baseline: play a handful of episodes with randomly sampled actions and
# print the total reward collected in each one.
episodes = 5
for episode in range(1, episodes + 1):
    state, _ = env.reset()
    score, done = 0, False

    while not done:
        env.render()
        action = env.action_space.sample()
        n_state, reward, terminated, truncated, info = env.step(action)
        score += reward
        # Either flag returned by step() ends the episode.
        done = terminated or truncated
    print('Episode:{} Score:{}'.format(episode, score))
env.close()


  from pkg_resources import resource_stream, resource_exists


Episode:1 Score:12.0
Episode:2 Score:21.0
Episode:3 Score:64.0
Episode:4 Score:11.0
Episode:5 Score:31.0


In [5]:
# Directory handed to SB3 as `tensorboard_log`; each training run logs into
# its own sub-folder underneath it.
log_path = os.path.join('Training', 'Logs')

# Create the environment inside the factory so the lambda does not close over
# the name `env`, which is rebound to the vectorized wrapper by this very
# assignment. Closing over `env` only works while the factory happens to be
# invoked before the rebinding takes effect.
env = DummyVecEnv([lambda: gym.make(environment_name)])

# PPO agent with the default MLP policy, logging training metrics to log_path.
model = PPO('MlpPolicy', env, verbose=1, tensorboard_log=log_path)


Using cpu device




In [6]:
# Train the PPO agent for a budget of 20,000 environment timesteps.
model.learn(total_timesteps=20000)


Logging to Training\Logs\PPO_5
-----------------------------
| time/              |      |
|    fps             | 1537 |
|    iterations      | 1    |
|    time_elapsed    | 1    |
|    total_timesteps | 2048 |
-----------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 659         |
|    iterations           | 2           |
|    time_elapsed         | 6           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.009597906 |
|    clip_fraction        | 0.104       |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.686      |
|    explained_variance   | 0.000715    |
|    learning_rate        | 0.0003      |
|    loss                 | 6.45        |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0166     |
|    value_loss           | 48.3        |
-----------------------------------------
---

<stable_baselines3.ppo.ppo.PPO at 0x297beeb5fd0>

In [7]:
# Persist the trained agent under Training/Saved Models.
PPO_Path = os.path.join('Training', 'Saved Models', 'PPO_Model_Cartpole')
model.save(PPO_Path)


In [8]:

# Drop the in-memory agent so the following cell really reloads it from disk.
del model


In [9]:
# Restore the agent from the saved file. No env is passed to load() here;
# later cells hand `env` explicitly to evaluate_policy() and env.step().
model = PPO.load(PPO_Path)


In [10]:
# Score the reloaded agent over 10 evaluation episodes; the result is the
# (mean, std) pair of episode rewards shown as this cell's output.
# NOTE(review): render=True is requested, but the vectorized `env` was built
# with gym.make(environment_name) without a render_mode -- confirm that
# anything is actually displayed.
evaluate_policy(model, env, n_eval_episodes=10, render=True)




(np.float64(453.7), np.float64(62.11932066595706))

In [11]:
# NOTE(review): `env` is closed here, yet the next cell resets and steps it
# again -- confirm close() leaves this environment usable, or move the call
# after the last use.
env.close()


In [12]:
# Roll out the trained policy for a few episodes and print each episode's
# total reward.
#
# `env` is a vectorized environment, so step() returns one entry per
# sub-environment. Only one sub-environment exists here; index it explicitly
# so the loop condition and the printed score are plain scalars (matching the
# random-action baseline) instead of relying on the truthiness and repr of a
# single-element array.
episodes = 5
for episode in range(1, episodes+1):
    obs = env.reset()
    done = False
    score = 0.0

    while not done:
        env.render()
        action, _ = model.predict(obs)
        obs, rewards, dones, infos = env.step(action)
        score += float(rewards[0])
        done = bool(dones[0])
    print('Episode:{} Score:{}'.format(episode, score))


Episode:1 Score:[363.]
Episode:2 Score:[500.]
Episode:3 Score:[486.]
Episode:4 Score:[251.]
Episode:5 Score:[206.]


In [13]:
# NOTE(review): `env` is closed here but is still passed to EvalCallback, PPO
# and DQN in later cells -- confirm it remains usable after close(), or
# rebuild it before training again.
env.close()


In [14]:
training_log_path = os.path.join(log_path, 'PPO_1')
!tensorboard --logdir={training_log_path}


  import pkg_resources
Traceback (most recent call last):
  File [35m"<frozen runpy>"[0m, line [35m198[0m, in [35m_run_module_as_main[0m
  File [35m"<frozen runpy>"[0m, line [35m88[0m, in [35m_run_code[0m
  File [35m"c:\Users\rahul\OneDrive\Desktop\Synaptic_RL\.venv\Scripts\tensorboard.exe\__main__.py"[0m, line [35m4[0m, in [35m<module>[0m
    from tensorboard.main import run_main
  File [35m"c:\Users\rahul\OneDrive\Desktop\Synaptic_RL\.venv\Lib\site-packages\tensorboard\main.py"[0m, line [35m27[0m, in [35m<module>[0m
    from tensorboard import default
  File [35m"c:\Users\rahul\OneDrive\Desktop\Synaptic_RL\.venv\Lib\site-packages\tensorboard\default.py"[0m, line [35m40[0m, in [35m<module>[0m
    from tensorboard.plugins.image import images_plugin
  File [35m"c:\Users\rahul\OneDrive\Desktop\Synaptic_RL\.venv\Lib\site-packages\tensorboard\plugins\image\images_plugin.py"[0m, line [35m18[0m, in [35m<module>[0m
    import imghdr
[1;35mModuleNotFoundErr

In [17]:
from stable_baselines3.common.callbacks import EvalCallback, StopTrainingOnRewardThreshold

# Directory where EvalCallback stores the best-scoring checkpoint.
save_path = os.path.join('Training', 'Saved Models')

# Evaluate on a dedicated environment rather than the training `env`:
# evaluation resets and steps the environment it is given, which would
# interfere with the episode the learner is currently collecting.
eval_env = DummyVecEnv([lambda: gym.make(environment_name)])

# Invoked whenever evaluation yields a new best mean reward; ends training
# once that reward reaches the threshold.
stop_callback = StopTrainingOnRewardThreshold(reward_threshold=500, verbose=1)

# Periodic evaluation (every `eval_freq` callback steps) that saves the best
# model seen so far to `save_path`.
eval_callback = EvalCallback(eval_env,
                             callback_on_new_best=stop_callback,
                             eval_freq=10000,
                             best_model_save_path=save_path,
                             verbose=1)


In [18]:
# Fresh, untrained PPO agent (default MlpPolicy) for the callback-driven run.
model = PPO('MlpPolicy', env, verbose=1, tensorboard_log=log_path)


Using cpu device


In [19]:
# Train with periodic evaluation; `eval_callback` may end the run before the
# 20,000-timestep budget is used up if the reward threshold is reached.
model.learn(total_timesteps=20000, callback=eval_callback)


Logging to Training\Logs\PPO_6
-----------------------------
| time/              |      |
|    fps             | 2387 |
|    iterations      | 1    |
|    time_elapsed    | 0    |
|    total_timesteps | 2048 |
-----------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 1296        |
|    iterations           | 2           |
|    time_elapsed         | 3           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.008445682 |
|    clip_fraction        | 0.0723      |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.687      |
|    explained_variance   | -0.00372    |
|    learning_rate        | 0.0003      |
|    loss                 | 4.54        |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0115     |
|    value_loss           | 48.1        |
-----------------------------------------
---



Eval num_timesteps=10000, episode_reward=395.60 +/- 95.82
Episode length: 395.60 +/- 95.82
------------------------------------------
| eval/                   |              |
|    mean_ep_length       | 396          |
|    mean_reward          | 396          |
| time/                   |              |
|    total_timesteps      | 10000        |
| train/                  |              |
|    approx_kl            | 0.0064241155 |
|    clip_fraction        | 0.0505       |
|    clip_range           | 0.2          |
|    entropy_loss         | -0.617       |
|    explained_variance   | 0.206        |
|    learning_rate        | 0.0003       |
|    loss                 | 31.1         |
|    n_updates            | 40           |
|    policy_gradient_loss | -0.0149      |
|    value_loss           | 61.8         |
------------------------------------------
New best mean reward!
------------------------------
| time/              |       |
|    fps             | 1040  |
|    iterations     

<stable_baselines3.ppo.ppo.PPO at 0x297c8215950>

In [21]:
# Custom network layout passed to the policy: four hidden layers of 128 units
# under each of the `pi` and `vf` keys.
net_arch = {'pi': [128] * 4, 'vf': [128] * 4}
model = PPO(
    'MlpPolicy',
    env,
    verbose=1,
    tensorboard_log=log_path,
    policy_kwargs=dict(net_arch=net_arch),
)



Using cpu device


In [22]:
# Build a fresh evaluation callback for this model. An EvalCallback instance
# keeps its best-mean-reward record between learn() calls, so reusing the one
# from the previous run would judge this model against the earlier model's
# best score: "new best" checkpoints and threshold-based early stopping would
# then only fire if this model beat that older record.
# NOTE: the best checkpoint is still written under `save_path`, the same
# directory the previous run used.
eval_callback = EvalCallback(DummyVecEnv([lambda: gym.make(environment_name)]),
                             callback_on_new_best=StopTrainingOnRewardThreshold(reward_threshold=500, verbose=1),
                             eval_freq=10000,
                             best_model_save_path=save_path,
                             verbose=1)
model.learn(total_timesteps=20000, callback=eval_callback)


Logging to Training\Logs\PPO_7
-----------------------------
| time/              |      |
|    fps             | 2444 |
|    iterations      | 1    |
|    time_elapsed    | 0    |
|    total_timesteps | 2048 |
-----------------------------
----------------------------------------
| time/                   |            |
|    fps                  | 1029       |
|    iterations           | 2          |
|    time_elapsed         | 3          |
|    total_timesteps      | 4096       |
| train/                  |            |
|    approx_kl            | 0.01457873 |
|    clip_fraction        | 0.216      |
|    clip_range           | 0.2        |
|    entropy_loss         | -0.682     |
|    explained_variance   | -0.00217   |
|    learning_rate        | 0.0003     |
|    loss                 | 5.02       |
|    n_updates            | 10         |
|    policy_gradient_loss | -0.0249    |
|    value_loss           | 20.8       |
----------------------------------------
---------------------



Eval num_timesteps=10000, episode_reward=456.00 +/- 67.40
Episode length: 456.00 +/- 67.40
------------------------------------------
| eval/                   |              |
|    mean_ep_length       | 456          |
|    mean_reward          | 456          |
| time/                   |              |
|    total_timesteps      | 10000        |
| train/                  |              |
|    approx_kl            | 0.0090819225 |
|    clip_fraction        | 0.127        |
|    clip_range           | 0.2          |
|    entropy_loss         | -0.557       |
|    explained_variance   | 0.519        |
|    learning_rate        | 0.0003       |
|    loss                 | 19.1         |
|    n_updates            | 40           |
|    policy_gradient_loss | -0.0192      |
|    value_loss           | 41.5         |
------------------------------------------
------------------------------
| time/              |       |
|    fps             | 655   |
|    iterations      | 5     |
|    time_e

<stable_baselines3.ppo.ppo.PPO at 0x297c8216d50>

In [24]:
from stable_baselines3 import DQN
# Swap the algorithm: a DQN agent on the same `env`, logging to the same
# TensorBoard directory as the PPO runs.
model = DQN('MlpPolicy', env, verbose=1, tensorboard_log=log_path)


Using cpu device


In [25]:
# Train the DQN agent for the same 20,000-timestep budget used for PPO above.
model.learn(total_timesteps=20000)



Logging to Training\Logs\DQN_1
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.972    |
| time/               |          |
|    episodes         | 4        |
|    fps              | 3584     |
|    time_elapsed     | 0        |
|    total_timesteps  | 59       |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.927    |
| time/               |          |
|    episodes         | 8        |
|    fps              | 1474     |
|    time_elapsed     | 0        |
|    total_timesteps  | 153      |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.517    |
|    n_updates        | 13       |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.9      |
| time/               |          |
|    episodes         | 12       |
|    fps              | 

<stable_baselines3.dqn.dqn.DQN at 0x297c8217b10>