# Import dependencies

In [1]:
!pip install stable-baselines3



# Load Enviroment

In [5]:
import os
import gym
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.evaluation import evaluate_policy

2023-08-17 15:52:51.328043: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


# Environments

In [6]:
environment_name    = 'CartPole-v1'                 #case-sensitive
env                 = gym.make(environment_name, render_mode="rgb_array")    #making the env

# Understanding the environment
Episodes = Think of an episode as one full game within the environment.

In [8]:
episodes    =   5   # test five times 
for episode in range(1, episodes + 1):
    state   = env.reset()
    done    = False
    score   = 0
    while not done:
        env.render()
        action = env.action_space.sample()
        n_state, reward, done, info, info1 = env.step(action)
        score += reward
    print('Episode:{} Score:{}'.format(episode, score))
env.close()

Episode:1 Score:43.0
Episode:2 Score:43.0
Episode:3 Score:24.0
Episode:4 Score:22.0
Episode:5 Score:17.0


# Training 


In [9]:
!pip3 install torch torchvision torchaudio



In [12]:
log_path = os.path.join('Traning','Logs')
env = gym.make(environment_name)
env = DummyVecEnv([lambda: env])
model = PPO('MlpPolicy', env, verbose = 1, tensorboard_log = log_path)

Using cpu device




In [14]:
model.learn(total_timesteps = 20000)

Logging to Traning/Logs/PPO_1


  if not isinstance(terminated, (bool, np.bool8)):


-----------------------------
| time/              |      |
|    fps             | 1768 |
|    iterations      | 1    |
|    time_elapsed    | 1    |
|    total_timesteps | 2048 |
-----------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 1282        |
|    iterations           | 2           |
|    time_elapsed         | 3           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.009237312 |
|    clip_fraction        | 0.12        |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.686      |
|    explained_variance   | -0.0155     |
|    learning_rate        | 0.0003      |
|    loss                 | 6.16        |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0197     |
|    value_loss           | 51.9        |
-----------------------------------------
----------------------------------

<stable_baselines3.ppo.ppo.PPO at 0x1347361d0>

# Save and reload

In [None]:
PPO_Path = os.path.join('Traning', 'Saved Models', 'PPO_Model_Cartpole')

#saving model
model.save(PPO_Path)

del model #delete model

#reloading 
model = PPO.load(PPO_Path, env = env)

model.learn(total_timesteps=1000)


# Evaluation


In [15]:
evaluate_policy(model, env, n_eval_episodes=10, render=False)




(475.2, 38.26173022747403)

# Test Model

In [16]:
episode = 5
for episode in range(1, episodes):
    obs = env.reset()
    done = False
    score = 0
    
    while not done:
        env.render()
        action, _ = model.predict(obs) #Using model here
        obs, reward, done, info = env.step(action)
        score += reward
    print('Episode:{} Score:{}'.format(episode, score))
env.close()    




Episode:1 Score:[435.]
Episode:2 Score:[500.]
Episode:3 Score:[500.]
Episode:4 Score:[319.]


# View logs

In [19]:
traning_log_path = os.path.join(log_path,'PPO_1')

!tensorboard --logdir={traning_log_path}

2023-08-18 11:48:48.467644: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.

NOTE: Using experimental fast data loading logic. To disable, pass
    "--load_fast=false" and report issues on GitHub. More details:
    https://github.com/tensorflow/tensorboard/issues/4784

Serving TensorBoard on localhost; to expose to the network, use a proxy or pass --bind_all
TensorBoard 2.13.0 at http://localhost:6006/ (Press CTRL+C to quit)
^C


# Adding a callback to traning Stage


In [30]:
from stable_baselines3.common.callbacks import EvalCallback, StopTrainingOnRewardThreshold
stop_callback = StopTrainingOnRewardThreshold(reward_threshold = 200, verbose = 1)
eval_callback = EvalCallback(env,
                            callback_on_new_best = stop_callback,
                            eval_freq = 1000,
                            best_model_save_path = 'Traning/Saved Models',
                            verbose = 1)
model = PPO('MlpPolicy', env, verbose = 1, tensorboard_log = log_path)
model.learn(total_timesteps=2000, callback=eval_callback)

Using cpu device
Logging to Traning/Logs/PPO_2
Eval num_timesteps=1000, episode_reward=31.20 +/- 10.46
Episode length: 31.20 +/- 10.46
---------------------------------
| eval/              |          |
|    mean_ep_length  | 31.2     |
|    mean_reward     | 31.2     |
| time/              |          |
|    total_timesteps | 1000     |
---------------------------------




New best mean reward!
Eval num_timesteps=2000, episode_reward=31.60 +/- 4.67
Episode length: 31.60 +/- 4.67
---------------------------------
| eval/              |          |
|    mean_ep_length  | 31.6     |
|    mean_reward     | 31.6     |
| time/              |          |
|    total_timesteps | 2000     |
---------------------------------
New best mean reward!
-----------------------------
| time/              |      |
|    fps             | 1548 |
|    iterations      | 1    |
|    time_elapsed    | 1    |
|    total_timesteps | 2048 |
-----------------------------


<stable_baselines3.ppo.ppo.PPO at 0x13b6404d0>

# Changing Policies

In [33]:
#New neural network
new_arch = [dict(pi=[128,128,128,128], vf = [128,128,128,128])] #4 layers with 128 elements
model = PPO('MlpPolicy', env, verbose=1, tensorboard_log=log_path, policy_kwargs={'net_arch':new_arch})
model.learn(total_timesteps=20000, callback = eval_callback)


Using cpu device
Logging to Traning/Logs/PPO_3




Eval num_timesteps=952, episode_reward=8.60 +/- 0.49
Episode length: 8.60 +/- 0.49
---------------------------------
| eval/              |          |
|    mean_ep_length  | 8.6      |
|    mean_reward     | 8.6      |
| time/              |          |
|    total_timesteps | 952      |
---------------------------------




Eval num_timesteps=1952, episode_reward=8.80 +/- 0.40
Episode length: 8.80 +/- 0.40
---------------------------------
| eval/              |          |
|    mean_ep_length  | 8.8      |
|    mean_reward     | 8.8      |
| time/              |          |
|    total_timesteps | 1952     |
---------------------------------
-----------------------------
| time/              |      |
|    fps             | 1550 |
|    iterations      | 1    |
|    time_elapsed    | 1    |
|    total_timesteps | 2048 |
-----------------------------
Eval num_timesteps=2952, episode_reward=375.80 +/- 109.29
Episode length: 375.80 +/- 109.29
-----------------------------------------
| eval/                   |             |
|    mean_ep_length       | 376         |
|    mean_reward          | 376         |
| time/                   |             |
|    total_timesteps      | 2952        |
| train/                  |             |
|    approx_kl            | 0.015256573 |
|    clip_fraction        | 0.263       

<stable_baselines3.ppo.ppo.PPO at 0x13b611710>

In [35]:
#Using an Alternate algh
from stable_baselines3 import DQN
model = DQN('MlpPolicy', env, verbose = 1, tensorboard_log=log_path)
model.learn(total_timesteps=20000)

Using cpu device
Logging to Traning/Logs/DQN_1


----------------------------------
| rollout/            |          |
|    exploration_rate | 0.965    |
| time/               |          |
|    episodes         | 4        |
|    fps              | 3077     |
|    time_elapsed     | 0        |
|    total_timesteps  | 74       |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.902    |
| time/               |          |
|    episodes         | 8        |
|    fps              | 5089     |
|    time_elapsed     | 0        |
|    total_timesteps  | 207      |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.856    |
| time/               |          |
|    episodes         | 12       |
|    fps              | 5477     |
|    time_elapsed     | 0        |
|    total_timesteps  | 304      |
----------------------------------
----------------------------------
| rollout/          

<stable_baselines3.dqn.dqn.DQN at 0x137b6da10>