In [1]:
import os
import gym
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.evaluation import evaluate_policy

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Gym environment id. NOTE: the variable name is misspelled ("enviornment") but is
# kept as-is because a later cell re-creates the environment from this same name.
enviornment_name = "CartPole-v0"
# Instantiate the environment used by the random-action rollout below.
env = gym.make(enviornment_name)

In [24]:
# Baseline: play a number of episodes with uniformly sampled actions and print
# the total reward collected in each one.
episodes = 10
try:
    for episode in range(1, episodes + 1):
        # Start a fresh episode. The initial observation is not needed here
        # because actions are sampled at random rather than chosen from it.
        env.reset()
        done = False
        score = 0

        while not done:
            env.render()
            action = env.action_space.sample()
            # Only the reward and the termination flag are used by this loop.
            _next_state, reward, done, _info = env.step(action)
            score += reward
        print("Episode:{} Score:{}".format(episode, score))
finally:
    # Close the environment even if the loop raises or is interrupted.
    env.close()

Episode:1 Score:19.0
Episode:2 Score:64.0
Episode:3 Score:11.0
Episode:4 Score:18.0
Episode:5 Score:23.0
Episode:6 Score:15.0
Episode:7 Score:11.0
Episode:8 Score:15.0
Episode:9 Score:33.0
Episode:10 Score:32.0


In [None]:
# Display the environment's action space.
env.action_space

In [None]:
# Draw one random action, the same call the random rollout uses to pick its actions.
env.action_space.sample()

In [None]:
# Display the environment's observation space.
env.observation_space

In [None]:
# Draw one random sample from the observation space.
env.observation_space.sample()

# Train RL Model #

In [9]:
# Relative path passed to the models as `tensorboard_log`.
# This only builds the path string; no directory is created by this cell.
log_path = os.path.join("Training", "Logs")


In [7]:
# Create the environment inside the factory handed to DummyVecEnv, so the lambda
# does not close over the global name `env`, which this very assignment rebinds
# to the vectorized wrapper.
env = DummyVecEnv([lambda: gym.make(enviornment_name)])
# PPO agent with the "MlpPolicy" policy; TensorBoard logs go under `log_path`.
model = PPO("MlpPolicy", env, verbose=1, tensorboard_log=log_path)

Using cuda device


In [8]:
# Training is disabled here: a later cell loads the model from PPO_PATH instead,
# which relies on a model file written by an earlier run. Re-enable to retrain.
#model.learn(total_timesteps=20000)

# Save and reload model #

In [3]:
# File path used when saving and loading the PPO CartPole model.
PPO_PATH = os.path.join(
    "Training",
    "Saved Models",
    "PPO_Model_Cartpole",
)

In [None]:
# Saving is disabled; re-enable it (together with the commented-out training call)
# to regenerate the file at PPO_PATH on a fresh run.
#model.save(PPO_PATH)

In [None]:
# Drop the in-memory model so that the next cell's PPO.load supplies `model`.
del model

In [4]:
# Restore the PPO model from disk and attach it to the current environment.
model = PPO.load(PPO_PATH, env=env)

Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [12]:
# Display the loaded model object.
model

<stable_baselines3.ppo.ppo.PPO at 0x7ff1b84f39d0>

# Evaluation #

In [5]:
# Evaluate the model over 5 episodes with rendering enabled. The cell displays the
# returned tuple (per the SB3 docs: mean episode reward, standard deviation).
evaluate_policy(model, env, n_eval_episodes=5, render=True)



(200.0, 0.0)

In [6]:
# Close the environment after the rendered evaluation.
env.close()

# Test Model #

In [5]:
# Play a few episodes with the trained model choosing the actions and print the
# total reward collected in each one.
episodes = 5
try:
    for episode in range(1, episodes + 1):
        # Get the initial observation for this episode.
        obs = env.reset()
        done = False
        score = 0

        while not done:
            # Render the environment.
            env.render()
            # Ask the model for an action given the current observation; the
            # second value returned by predict() is not used here.
            action, _ = model.predict(obs)
            obs, reward, done, _info = env.step(action)
            score += reward
        print("Episode:{} Score:{}".format(episode, score))
finally:
    # Close the environment even if the loop raises or is interrupted.
    env.close()

Episode:1 Score:200.0
Episode:2 Score:200.0
Episode:3 Score:200.0
Episode:4 Score:200.0
Episode:5 Score:200.0


In [22]:
# Extra close call; the test loop in the previous cell already ends with env.close().
env.close()

# Viewing logs #

In [24]:
# Log directory of one specific training run. The run folder "PPO_2" is hard-coded;
# change it to inspect another run (later runs in this notebook log to PPO_3 / PPO_4).
training_log_path = os.path.join(log_path,"PPO_2")

In [25]:
# Display the resolved log directory.
training_log_path

'Training/Logs/PPO_2'

In [None]:
# Launch TensorBoard on the selected run directory. The server runs in the
# foreground, so this cell keeps executing until it is interrupted.
!tensorboard --logdir={training_log_path}

2022-08-22 21:21:52.014535: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.

NOTE: Using experimental fast data loading logic. To disable, pass
    "--load_fast=false" and report issues on GitHub. More details:
    https://github.com/tensorflow/tensorboard/issues/4784

Serving TensorBoard on localhost; to expose to the network, use a proxy or pass --bind_all
TensorBoard 2.9.1 at http://localhost:6006/ (Press CTRL+C to quit)
X-Content-Type-Options is required to be "nosniff"
X-Content-Type-Options is required to be "nosniff"
Requires default-src for Content-Security-Policy
X-Content-Type-Options is required to be "nosniff"
W0822 21:24:54.138146 140150991550208 application.py:556] path /data/plugin/whatif/data/plugins_listing not found, sending 404
W0822 21:24:54.195803 140

# Adding a callback to the training stage #

In [None]:
from stable_baselines3.common.callbacks import EvalCallback, StopTrainingOnRewardThreshold

In [7]:
# Directory handed to EvalCallback as `best_model_save_path`.
save_path = os.path.join("Training", "Saved Models")

In [11]:
# Ends training once the evaluation mean reward reaches the threshold of 200.
stop_callback = StopTrainingOnRewardThreshold(reward_threshold=200, verbose=1)

# Evaluate on a dedicated environment instance instead of the training `env`:
# evaluation episodes reset and step the environment they are given, which would
# otherwise interfere with the rollout that training is collecting on `env`.
eval_env = gym.make(enviornment_name)

# Evaluation runs every `eval_freq` callback steps. On a new best mean reward the
# model is saved under `save_path` and `stop_callback` is consulted.
eval_callback = EvalCallback(eval_env,
                             callback_on_new_best=stop_callback,
                             eval_freq=10000,
                             best_model_save_path=save_path,
                             verbose=1
                             )

In [12]:
# Fresh PPO agent (default network) to be trained with the evaluation callback.
model = PPO("MlpPolicy", env, verbose=1, tensorboard_log=log_path)

Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [13]:
# Train for up to 20000 timesteps; eval_callback can end training earlier once the
# evaluation reward reaches the configured threshold.
model.learn(total_timesteps=20000, callback=eval_callback)

2022-08-25 02:52:06.743429: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.


Logging to Training/Logs/PPO_3
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 24.8     |
|    ep_rew_mean     | 24.8     |
| time/              |          |
|    fps             | 484      |
|    iterations      | 1        |
|    time_elapsed    | 4        |
|    total_timesteps | 2048     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 28.8        |
|    ep_rew_mean          | 28.8        |
| time/                   |             |
|    fps                  | 523         |
|    iterations           | 2           |
|    time_elapsed         | 7           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.007396089 |
|    clip_fraction        | 0.0792      |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.687      |
|    explained_variance   | -0.00107    |



Eval num_timesteps=10000, episode_reward=200.00 +/- 0.00
Episode length: 200.00 +/- 0.00
----------------------------------------
| eval/                   |            |
|    mean_ep_length       | 200        |
|    mean_reward          | 200        |
| time/                   |            |
|    total_timesteps      | 10000      |
| train/                  |            |
|    approx_kl            | 0.00863434 |
|    clip_fraction        | 0.069      |
|    clip_range           | 0.2        |
|    entropy_loss         | -0.606     |
|    explained_variance   | 0.206      |
|    learning_rate        | 0.0003     |
|    loss                 | 17.8       |
|    n_updates            | 40         |
|    policy_gradient_loss | -0.0151    |
|    value_loss           | 69.5       |
----------------------------------------
New best mean reward!
Stopping training because the mean reward 200.00  is above the threshold 200


<stable_baselines3.ppo.ppo.PPO at 0x7f840a5b4df0>

# Changing Policies #

In [16]:
# Custom network layout passed to PPO through `policy_kwargs`: four entries of 128
# under each of the "pi" and "vf" keys (presumably the hidden layer sizes of the
# policy and value networks; confirm against the SB3 custom-policy docs).
net_arch = [
    {
        "pi": [128] * 4,
        "vf": [128] * 4,
    }
]

In [17]:
# PPO agent whose policy is built with the custom layout stored in `net_arch`.
model = PPO(
    "MlpPolicy",
    env,
    verbose=1,
    tensorboard_log=log_path,
    policy_kwargs=dict(net_arch=net_arch),
)

Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [20]:
# `eval_callback` was already used by an earlier training run and keeps the best
# mean reward it recorded there, so its "new best" hook (and with it the reward
# threshold stop and best-model save) would not fire again for this model.
# Build a fresh callback for this run, evaluating on the same environment as the
# earlier callback, and give it its own folder so it cannot overwrite the best
# model saved by the earlier run.
custom_net_eval_callback = EvalCallback(
    eval_callback.eval_env,
    callback_on_new_best=StopTrainingOnRewardThreshold(reward_threshold=200, verbose=1),
    eval_freq=10000,
    best_model_save_path=os.path.join(save_path, "PPO_custom_net"),
    verbose=1,
)
model.learn(total_timesteps=20000, callback=custom_net_eval_callback)

Logging to Training/Logs/PPO_4
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 21.7     |
|    ep_rew_mean     | 21.7     |
| time/              |          |
|    fps             | 1097     |
|    iterations      | 1        |
|    time_elapsed    | 1        |
|    total_timesteps | 2048     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 28.6        |
|    ep_rew_mean          | 28.6        |
| time/                   |             |
|    fps                  | 654         |
|    iterations           | 2           |
|    time_elapsed         | 6           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.014456034 |
|    clip_fraction        | 0.227       |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.681      |
|    explained_variance   | -0.00962    |

<stable_baselines3.ppo.ppo.PPO at 0x7f83189e67f0>

# Using alternate Algorithm #

In [21]:
from stable_baselines3 import DQN

In [22]:
# Swap the algorithm: a DQN agent on the same environment and TensorBoard log directory.
model = DQN("MlpPolicy", env, verbose=1, tensorboard_log=log_path)

Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [23]:
# `eval_callback` was already used by earlier training runs and keeps the best
# mean reward it recorded there, so its "new best" hook (and with it the reward
# threshold stop and best-model save) would not fire again for this model.
# Build a fresh callback for the DQN run, evaluating on the same environment as
# the earlier callback, and give it its own folder so it cannot overwrite a best
# model saved by an earlier run.
dqn_eval_callback = EvalCallback(
    eval_callback.eval_env,
    callback_on_new_best=StopTrainingOnRewardThreshold(reward_threshold=200, verbose=1),
    eval_freq=10000,
    best_model_save_path=os.path.join(save_path, "DQN"),
    verbose=1,
)
model.learn(total_timesteps=20000, callback=dqn_eval_callback)

Logging to Training/Logs/DQN_1
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 20.2     |
|    ep_rew_mean      | 20.2     |
|    exploration_rate | 0.962    |
| time/               |          |
|    episodes         | 4        |
|    fps              | 2650     |
|    time_elapsed     | 0        |
|    total_timesteps  | 81       |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 20.4     |
|    ep_rew_mean      | 20.4     |
|    exploration_rate | 0.923    |
| time/               |          |
|    episodes         | 8        |
|    fps              | 3281     |
|    time_elapsed     | 0        |
|    total_timesteps  | 163      |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 19.7     |
|    ep_rew_mean      | 19.7     |
|    exploration_rate | 0.888    |
| time/               | 

<stable_baselines3.dqn.dqn.DQN at 0x7f83189e6a00>