# 1. Import Dependencies

In [1]:
import os
import gym
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.evaluation import evaluate_policy

# 2. Load Environment

In [2]:
# Identifier of the Gym environment; reused later when the training env is built.
env_name = 'CartPole-v0'
# Instantiate the environment by name for the random-action exploration below.
env = gym.make(env_name)

# 3. Render and Understand the Environment

In [3]:
# Baseline: play a batch of episodes with uniformly random actions and report
# the total reward collected in each one.
episodes = 30

for episode in range(1, episodes + 1):
    observation = env.reset()
    score = 0

    # Every episode takes at least one step, so loop until the env signals done.
    while True:
        env.render()
        random_action = env.action_space.sample()
        observation, reward, done, info = env.step(random_action)
        score += reward
        if done:
            break

    print('Episode:{}, Score:{}'.format(episode, score))

# Release the environment once all episodes have been played.
env.close()

Episode:1, Score:11.0
Episode:2, Score:46.0
Episode:3, Score:11.0
Episode:4, Score:10.0
Episode:5, Score:12.0
Episode:6, Score:17.0
Episode:7, Score:15.0
Episode:8, Score:13.0
Episode:9, Score:24.0
Episode:10, Score:26.0
Episode:11, Score:30.0
Episode:12, Score:56.0
Episode:13, Score:52.0
Episode:14, Score:16.0
Episode:15, Score:14.0
Episode:16, Score:37.0
Episode:17, Score:13.0
Episode:18, Score:53.0
Episode:19, Score:58.0
Episode:20, Score:21.0
Episode:21, Score:10.0
Episode:22, Score:10.0
Episode:23, Score:16.0
Episode:24, Score:19.0
Episode:25, Score:16.0
Episode:26, Score:22.0
Episode:27, Score:27.0
Episode:28, Score:19.0
Episode:29, Score:25.0
Episode:30, Score:48.0


In [4]:
# observation_space components and their ranges:
#   Cart Position     => (-4.8, 4.8)
#   Cart Velocity     => (-Inf, Inf)
#   Pole Angle        => (-0.418 rad [-24 deg], 0.418 rad [24 deg])
#   Pole Angular vel. => (-Inf, Inf)

# Print the space description, then show one randomly drawn observation
# as the cell's output.
print(env.observation_space)
env.observation_space.sample()

Box(-3.4028234663852886e+38, 3.4028234663852886e+38, (4,), float32)


array([-7.4922889e-01,  1.8315812e+37, -3.1337431e-01,  2.5627188e+38],
      dtype=float32)

In [5]:
# action_space values:
#   0 => push cart to the left
#   1 => push cart to the right

# Print the space description, then show one randomly drawn action
# as the cell's output.
print(env.action_space)
env.action_space.sample()

Discrete(2)


0

# 4.  RL Model Taxonomy

In [6]:
# NOTE(review): hard-coded, machine-specific working directory. Relative paths
# used from here on ('rl_img.png' in this cell and the 'Training' folders in
# later cells) are resolved against it, so the notebook is not portable as-is.
# Consider a configurable base directory instead of a global chdir.
os.chdir('G:\\Reinforcement Learning')
# NOTE(review): import placed mid-notebook; it belongs with the imports at the top.
from IPython.display import Image
# Render the image file as this cell's output.
Image(filename='rl_img.png')

<IPython.core.display.Image object>

# Training The RL Model

In [None]:
# Relative directory passed to PPO as `tensorboard_log` when the model is built.
log_path = os.path.join('Training', 'Logs')

In [4]:
# Build the vectorized training environment.
# The factory creates its own environment instead of closing over the name
# `env`: that name is rebound to the DummyVecEnv wrapper by this very statement,
# so a factory returning `env` is only correct if it is invoked before the
# rebinding happens. Constructing inside the factory stays valid whenever it runs.
env = DummyVecEnv([lambda: gym.make(env_name)])

# PPO agent with the 'MlpPolicy' policy; verbose=1 prints training progress and
# tensorboard_log directs the training logs to `log_path`.
model = PPO('MlpPolicy', env, verbose=1, tensorboard_log=log_path)

Using cpu device


In [5]:
# Train the agent for 20000 timesteps. The call is the last expression in the
# cell, so its return value (the PPO model object) is shown as the cell output.
model.learn(total_timesteps =  20000)

Logging to Training\Logs\PPO_5
-----------------------------
| time/              |      |
|    fps             | 286  |
|    iterations      | 1    |
|    time_elapsed    | 7    |
|    total_timesteps | 2048 |
-----------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 441         |
|    iterations           | 2           |
|    time_elapsed         | 9           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.008738721 |
|    clip_fraction        | 0.111       |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.685      |
|    explained_variance   | -0.00226    |
|    learning_rate        | 0.0003      |
|    loss                 | 7.23        |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0183     |
|    value_loss           | 55.9        |
-----------------------------------------
---

<stable_baselines3.ppo.ppo.PPO at 0x1b11094fdd8>

# 5. Saving and Loading Models

In [6]:
# Relative location for the saved agent; reused below when loading it back.
PPO_path = os.path.join('Training', 'Saved Models', 'PPO_Model_Cartpole')
# Persist the trained model to that path.
model.save(PPO_path)

In [24]:
# Drop the in-memory model so that the reload from disk below is a real round trip.
del model

In [25]:
# Display the path the model was saved under.
PPO_path

'Training\\Saved Models\\PPO_Model_Cartpole'

In [26]:
# Restore the saved agent from disk and attach the existing vectorized env to it.
model = PPO.load(PPO_path, env=env)

# 6. Evaluation

In [7]:
# Evaluate the agent over 10 episodes while rendering them. The cell output is
# a 2-tuple; per the Stable-Baselines3 documentation this is
# (mean episode reward, standard deviation of episode reward).
evaluate_policy(model, env, n_eval_episodes=10, render=True)



(200.0, 0.0)

In [8]:
# Close the environment after the rendered evaluation.
# NOTE(review): `env` is used again by the test loop in the next section after
# this close. The recorded run shows that working, but recreating the env
# before reuse would be more robust.
env.close()

# 7. Test Model

In [10]:
# Run the trained agent for a batch of episodes and report each episode's
# total reward. `env` is the DummyVecEnv wrapper here, so rewards arrive as
# one-element arrays and the printed score is an array as well.
episodes = 30

for episode in range(1, episodes + 1):
    obs = env.reset()
    score = 0

    # Every episode takes at least one step, so loop until the env signals done.
    while True:
        env.render()
        # predict() returns (action, state); only the action is needed here.
        action, _state = model.predict(obs)
        obs, reward, done, info = env.step(action)
        score += reward
        if done:
            break

    print('Episode:{}, Score:{}'.format(episode, score))

# Release the environment once all episodes have been played.
env.close()

Episode:1, Score:[200.]
Episode:2, Score:[200.]
Episode:3, Score:[200.]
Episode:4, Score:[200.]
Episode:5, Score:[200.]
Episode:6, Score:[200.]
Episode:7, Score:[200.]
Episode:8, Score:[184.]
Episode:9, Score:[200.]
Episode:10, Score:[183.]
Episode:11, Score:[150.]
Episode:12, Score:[200.]
Episode:13, Score:[200.]
Episode:14, Score:[200.]
Episode:15, Score:[200.]
Episode:16, Score:[200.]
Episode:17, Score:[200.]
Episode:18, Score:[200.]
Episode:19, Score:[200.]
Episode:20, Score:[168.]
Episode:21, Score:[200.]
Episode:22, Score:[164.]
Episode:23, Score:[200.]
Episode:24, Score:[200.]
Episode:25, Score:[200.]
Episode:26, Score:[193.]
Episode:27, Score:[200.]
Episode:28, Score:[116.]
Episode:29, Score:[200.]
Episode:30, Score:[200.]


# 8. Viewing Logs on Tensorboard

In [None]:
# run from cmd
# tensorboard --logdir Training/Logs

# 9. Adding Callback to Training Stage

# 10. Changing Policies

# 11. Using Alternate Algorithm