# Importing Dependencies

In [1]:
import os 
import gym
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.callbacks import EvalCallback, StopTrainingOnRewardThreshold

# Load Environment

In [2]:
env = gym.make('CartPole-v1', render_mode='human')

In [3]:
episodes = 5
for ep in range(1, episodes + 1):
    obs = env.reset()
    done = False
    score = 0
    while not done:
        env.render()
        action = env.action_space.sample()
        obs, reward, terminated, truncated, info = env.step(action)
        done = terminated or truncated
        score += reward
    print(f'Episode: {ep} Score: {score}')
env.close()

  if not isinstance(terminated, (bool, np.bool8)):


Episode: 1 Score: 30.0
Episode: 2 Score: 28.0
Episode: 3 Score: 12.0
Episode: 4 Score: 12.0
Episode: 5 Score: 21.0


# Understanding Environment

In [4]:
print(f'Action Space: {env.action_space}')
print(f'Observation Space: {env.observation_space}')

Action Space: Discrete(2)
Observation Space: Box([-4.8000002e+00 -3.4028235e+38 -4.1887903e-01 -3.4028235e+38], [4.8000002e+00 3.4028235e+38 4.1887903e-01 3.4028235e+38], (4,), float32)


# Train RL Model

In [2]:
train_env = DummyVecEnv([lambda: gym.make('CartPole-v1', render_mode='human')])
eval_env = DummyVecEnv([lambda: gym.make('CartPole-v1', render_mode='human')])



In [6]:
save_path = os.path.join('Training', 'Saved_Models')
stop_callback = StopTrainingOnRewardThreshold(reward_threshold=480, verbose=1)
eval_callback = EvalCallback(
    eval_env, 
    callback_on_new_best=stop_callback, 
    eval_freq=10000,
    best_model_save_path=save_path,
    verbose=1
)

In [7]:
log_path = os.path.join('Training', 'Logs')
model = PPO('MlpPolicy', train_env, verbose=1, tensorboard_log=log_path)

model.learn(total_timesteps=50000, callback=eval_callback)


Using cpu device
Logging to Training/Logs/PPO_2


  if not isinstance(terminated, (bool, np.bool8)):


-----------------------------
| time/              |      |
|    fps             | 47   |
|    iterations      | 1    |
|    time_elapsed    | 43   |
|    total_timesteps | 2048 |
-----------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 47          |
|    iterations           | 2           |
|    time_elapsed         | 86          |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.009055142 |
|    clip_fraction        | 0.103       |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.686      |
|    explained_variance   | 0.00379     |
|    learning_rate        | 0.0003      |
|    loss                 | 8.2         |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0177     |
|    value_loss           | 55          |
-----------------------------------------
----------------------------------



Eval num_timesteps=10000, episode_reward=395.80 +/- 110.79
Episode length: 395.80 +/- 110.79
-----------------------------------------
| eval/                   |             |
|    mean_ep_length       | 396         |
|    mean_reward          | 396         |
| time/                   |             |
|    total_timesteps      | 10000       |
| train/                  |             |
|    approx_kl            | 0.008973369 |
|    clip_fraction        | 0.069       |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.598      |
|    explained_variance   | 0.254       |
|    learning_rate        | 0.0003      |
|    loss                 | 34.6        |
|    n_updates            | 40          |
|    policy_gradient_loss | -0.0172     |
|    value_loss           | 73.8        |
-----------------------------------------
New best mean reward!
------------------------------
| time/              |       |
|    fps             | 40    |
|    iterations      | 5     |
|    

<stable_baselines3.ppo.ppo.PPO at 0x7a0f8571eb90>

# Reload Model

In [6]:
PPO_Path = os.path.join('Training', 'Saved_Models', 'best_model')

# del model 
model = PPO.load(PPO_Path, env=train_env)

# Evaluation

In [9]:
evaluate_policy(model, eval_env, n_eval_episodes=2, render=True)
eval_env.close()

# Test Model 

In [10]:
env = gym.make('CartPole-v1', render_mode='human')

In [11]:
episodes = 5
for ep in range(1, episodes + 1):
    obs, _ = env.reset()
    done = False
    score = 0
    while not done:
        env.render()
        action, _ = model.predict(obs)
        obs, reward, terminated, truncated, info = env.step(action)
        done = terminated or truncated
        score += reward
    print(f'Episode: {ep} Score: {score}')
env.close()

  if not isinstance(terminated, (bool, np.bool8)):


Episode: 1 Score: 284.0
Episode: 2 Score: 490.0
Episode: 3 Score: 500.0
Episode: 4 Score: 500.0
Episode: 5 Score: 500.0


# Viewing Logs in Tensorboard

In [None]:
training_log_path = os.path.join(log_path, 'PPO_1')

In [None]:
!tensorboard --logdir={training_log_path}

TensorFlow installation not found - running with reduced feature set.

NOTE: Using experimental fast data loading logic. To disable, pass
    "--load_fast=false" and report issues on GitHub. More details:
    https://github.com/tensorflow/tensorboard/issues/4784

Serving TensorBoard on localhost; to expose to the network, use a proxy or pass --bind_all
TensorBoard 2.18.0 at http://localhost:6006/ (Press CTRL+C to quit)
^C
