In [16]:
#!pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu116

In [3]:
#!pip install pyopengl
#!pip install pyglet==1.5.27
#!pip install stable_baselines3[extra]

In [4]:
import os
import gym
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv # vectorized-environment wrapper; the gym env is wrapped in it before training, after which step() returns batched arrays
from stable_baselines3.common.evaluation import evaluate_policy # runs a model for several episodes and reports the mean and std of the episode reward

## Load Environment

In [6]:
# Environment id is kept in a variable because later cells build new
# instances from it.
env_name = 'CartPole-v0'
# Plain (non-vectorized) gym environment, used for the random-action demo.
env = gym.make(env_name)

In [7]:
# Baseline: play a few episodes with uniformly random actions and print the
# total reward of each one, to see how an untrained agent performs.
# The episode count has its own name so it is not overwritten by the loop
# index (the original reused `episode` for both).
n_episodes = 5
for episode in range(1, n_episodes + 1):
    state = env.reset()
    done = False
    score = 0

    # Step until the environment reports that the episode has ended.
    while not done:
        env.render()
        action = env.action_space.sample()  # random action, no policy involved
        n_state, reward, done, info = env.step(action)
        score += reward
    print('Episode: {} Score: {}'.format(episode, score))
env.close()

Episode: 1 Score: 10.0
Episode: 2 Score: 29.0
Episode: 3 Score: 16.0
Episode: 4 Score: 26.0
Episode: 5 Score: 15.0


## Understanding Environment

In [9]:
# Show the environment's action space.
env.action_space

Discrete(2)

In [10]:
# Draw one random action from the action space.
env.action_space.sample()

1

In [11]:
# Show the environment's observation space (bounds, shape and dtype).
env.observation_space

Box([-4.8000002e+00 -3.4028235e+38 -4.1887903e-01 -3.4028235e+38], [4.8000002e+00 3.4028235e+38 4.1887903e-01 3.4028235e+38], (4,), float32)

In [12]:
# Draw one random sample from the observation space.
env.observation_space.sample()

array([-2.9009771e+00,  5.6577138e+37,  1.7452313e-01, -3.3650037e+38],
      dtype=float32)

## Training

In [14]:
# Directory for training logs; passed as `tensorboard_log` when the models
# are created.
log_path = os.path.join('Training', 'Logs')

In [15]:
# Echo the path to check how it was assembled on this platform.
log_path

'Training\\Logs'

In [16]:
# Build the vectorized training environment and a fresh PPO agent.
# The factory creates its own CartPole instance instead of closing over the
# notebook-level name `env`, which this same statement rebinds to the
# wrapper (a late-binding trap if the factory were ever called again).
env = DummyVecEnv([lambda: gym.make(env_name)])
# verbose=1 prints the training tables; metrics are also written under
# log_path so they can be inspected in TensorBoard.
model = PPO('MlpPolicy', env, verbose=1, tensorboard_log=log_path)

Using cuda device


In [14]:
# IPython-only introspection (`??` displays the object's source); a development aid that can be dropped from a finished notebook.
PPO?? #To understand parameters of PPO

In [49]:
# Train the PPO agent for 20,000 environment steps; progress tables are
# printed because the model was created with verbose=1.
model.learn(total_timesteps=20000)

Logging to Training\Logs\PPO_1
-----------------------------
| time/              |      |
|    fps             | 565  |
|    iterations      | 1    |
|    time_elapsed    | 3    |
|    total_timesteps | 2048 |
-----------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 408         |
|    iterations           | 2           |
|    time_elapsed         | 10          |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.009368294 |
|    clip_fraction        | 0.105       |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.686      |
|    explained_variance   | 0.000894    |
|    learning_rate        | 0.0003      |
|    loss                 | 6.28        |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0163     |
|    value_loss           | 51          |
-----------------------------------------
---

<stable_baselines3.ppo.ppo.PPO at 0x2839533b970>

## Save Model

In [17]:
# Path (given without a file extension) that the trained PPO model is saved
# to and later reloaded from.
PPO_path = os.path.join('Training', 'Saved_Models', 'PPO_Model_Cartpole')

In [30]:
# Persist the trained model to PPO_path.
model.save(PPO_path)
# del model  # optional: drop the in-memory model to prove the reload below really restores it

## Reload Model

In [18]:
# Restore the saved PPO model and attach it to the current environment.
model = PPO.load(PPO_path, env = env)

## Evaluation
## Solved if the average reward is 200 or higher; the lower the standard deviation, the better

In [20]:
# Run the model for 10 evaluation episodes with rendering on; the displayed
# result is a (mean reward, standard deviation) pair.
evaluate_policy(model, env, n_eval_episodes=10, render=True)



(200.0, 0.0)

In [21]:
# Release the environment's resources (e.g. the window opened for rendering).
env.close()

## Testing our model (agent) in the environment

In [27]:
# Reset to obtain an initial observation, then ask the model for an action.
# predict() returns a 2-tuple; its second element is None in the output below.
obs = env.reset()
model.predict(obs)

(array([1], dtype=int64), None)

In [33]:
# Keep only the action from the tuple returned by predict(); the second
# element is discarded.
action, _ = model.predict(obs)
action

array([0], dtype=int64)

In [35]:
# Apply the action. The vectorized env returns (observations, rewards,
# dones, infos), each batched with one entry per wrapped environment.
env.step(action)

(array([[ 0.02701586, -0.370135  ,  0.01160023,  0.633029  ]],
       dtype=float32),
 array([1.], dtype=float32),
 array([False]),
 [{}])

In [13]:
# Watch the trained agent: the same rollout loop as the random baseline, but
# each action now comes from the model.
# The episode count has its own name so it is not overwritten by the loop
# index (the original reused `episode` for both).
n_episodes = 5
for episode in range(1, n_episodes + 1):
    obs = env.reset()
    done = False
    score = 0

    # `env` is the vectorized wrapper here, so reward and done come back as
    # length-1 arrays; `not done` works because exactly one env is wrapped,
    # and the accumulated score therefore prints as e.g. `[200.]`.
    while not done:
        env.render()
        action, _ = model.predict(obs)  # the model, not random sampling, picks the action
        obs, reward, done, info = env.step(action)
        score += reward
    print('Episode: {} Score: {}'.format(episode, score))
env.close()

Episode: 1 Score: [200.]
Episode: 2 Score: [200.]
Episode: 3 Score: [200.]
Episode: 4 Score: [200.]
Episode: 5 Score: [200.]


## View Logs in Tensorboard

In [41]:
# Folder of one specific training run inside the log directory. Run folders
# are named <ALGO>_<n>, as in the "Logging to ..." lines printed by
# model.learn(); adjust the suffix to the run you want to inspect.
training_log_path = os.path.join(log_path, 'PPO_6')
# Echo the variable defined above (the original echoed an undefined `path`,
# which raises NameError on a fresh kernel).
training_log_path

'Training\\Logs\\PPO6'

In [1]:
#!tensorboard --logdir={training_log_path}

## Adding a Callback (Early Stopping) to Keep the Best Model

In [22]:
from stable_baselines3.common.callbacks import EvalCallback, StopTrainingOnRewardThreshold

In [23]:
# Directory handed to EvalCallback as `best_model_save_path`.
save_path = os.path.join('Training', 'Saved_Models')

In [24]:
# Stop training as soon as an evaluation reaches a mean reward of 200.
stop_callback = StopTrainingOnRewardThreshold(reward_threshold=200, verbose=1)

# Evaluate on a dedicated environment. Evaluating on the training env would
# reset and step it in the middle of a rollout, leaving the learner with a
# stale observation and broken episode boundaries.
eval_env = DummyVecEnv([lambda: gym.make(env_name)])

# Every `eval_freq` callback steps the model is evaluated; on a new best
# result `stop_callback` is triggered and the model is saved under save_path.
eval_callback = EvalCallback(eval_env,
                             callback_on_new_best=stop_callback,
                             eval_freq=10000,
                             best_model_save_path=save_path,
                             verbose=1)

In [25]:
# Fresh PPO agent, so the callback-driven run starts from an untrained model.
model = PPO('MlpPolicy', env, verbose=1, tensorboard_log = log_path)

Using cuda device


In [24]:
# Train for up to 20,000 steps; eval_callback evaluates periodically and can
# end training early through its reward-threshold stop callback.
model.learn(total_timesteps=20000, callback=eval_callback)

Logging to Training\Logs\PPO_3
-----------------------------
| time/              |      |
|    fps             | 517  |
|    iterations      | 1    |
|    time_elapsed    | 3    |
|    total_timesteps | 2048 |
-----------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 380         |
|    iterations           | 2           |
|    time_elapsed         | 10          |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.008011863 |
|    clip_fraction        | 0.0875      |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.687      |
|    explained_variance   | -0.00473    |
|    learning_rate        | 0.0003      |
|    loss                 | 6.09        |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0142     |
|    value_loss           | 50          |
-----------------------------------------
---

<stable_baselines3.ppo.ppo.PPO at 0x250afb8eaf0>

## Change Policy (New NN Architecture) 

In [26]:
# Custom network architecture: "pi" configures the actor (policy) network and
# "vf" the value-function network, each with 4 hidden layers of 128 units.
net_arch = {'pi': [128] * 4, 'vf': [128] * 4}

In [27]:
# PPO agent whose networks are built from `net_arch`, forwarded to the
# policy through `policy_kwargs`.
model = PPO('MlpPolicy', env, verbose=1, tensorboard_log = log_path, policy_kwargs = {'net_arch' : net_arch})

Using cuda device


In [28]:
# Train the larger network for up to 30,000 steps with the same evaluation /
# early-stopping callback.
model.learn(total_timesteps=30000, callback=eval_callback)

Logging to Training\Logs\PPO_7
-----------------------------
| time/              |      |
|    fps             | 436  |
|    iterations      | 1    |
|    time_elapsed    | 4    |
|    total_timesteps | 2048 |
-----------------------------
----------------------------------------
| time/                   |            |
|    fps                  | 310        |
|    iterations           | 2          |
|    time_elapsed         | 13         |
|    total_timesteps      | 4096       |
| train/                  |            |
|    approx_kl            | 0.01379874 |
|    clip_fraction        | 0.175      |
|    clip_range           | 0.2        |
|    entropy_loss         | -0.682     |
|    explained_variance   | -0.0128    |
|    learning_rate        | 0.0003     |
|    loss                 | 2.18       |
|    n_updates            | 10         |
|    policy_gradient_loss | -0.0202    |
|    value_loss           | 18.4       |
----------------------------------------
---------------------

<stable_baselines3.ppo.ppo.PPO at 0x2641d8970a0>

In [29]:
# Reload the PPO model stored under Training/Saved_Models. PPO_path is
# declared again here, so this cell does not rely on the earlier
# "Save Model" cell having been run in this session.
PPO_path = os.path.join('Training', 'Saved_Models', 'PPO_Model_Cartpole')
model = PPO.load(PPO_path, env = env)

## Alternative Algorithm

In [30]:
from stable_baselines3 import DQN

In [31]:
# Swap the algorithm: a DQN agent on the same environment, logging to the
# same directory. Note that this rebinds `model`, replacing the PPO agent.
model = DQN('MlpPolicy', env, verbose=1, tensorboard_log = log_path)

Using cuda device


In [36]:
# Train the DQN agent for 20,000 environment steps.
model.learn(total_timesteps=20000)

Logging to Training\Logs\DQN_2
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.965    |
| time/               |          |
|    episodes         | 4        |
|    fps              | 6337     |
|    time_elapsed     | 0        |
|    total_timesteps  | 73       |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.924    |
| time/               |          |
|    episodes         | 8        |
|    fps              | 7960     |
|    time_elapsed     | 0        |
|    total_timesteps  | 161      |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.879    |
| time/               |          |
|    episodes         | 12       |
|    fps              | 8571     |
|    time_elapsed     | 0        |
|    total_timesteps  | 255      |
----------------------------------
------------------------

----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 108      |
|    fps              | 10485    |
|    time_elapsed     | 0        |
|    total_timesteps  | 2474     |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 112      |
|    fps              | 10552    |
|    time_elapsed     | 0        |
|    total_timesteps  | 2617     |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 116      |
|    fps              | 10530    |
|    time_elapsed     | 0        |
|    total_timesteps  | 2676     |
----------------------------------
----------------------------------
| rollout/          

----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 216      |
|    fps              | 10649    |
|    time_elapsed     | 0        |
|    total_timesteps  | 4845     |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 220      |
|    fps              | 10663    |
|    time_elapsed     | 0        |
|    total_timesteps  | 4926     |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 224      |
|    fps              | 10649    |
|    time_elapsed     | 0        |
|    total_timesteps  | 5000     |
----------------------------------
----------------------------------
| rollout/          

----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 324      |
|    fps              | 10664    |
|    time_elapsed     | 0        |
|    total_timesteps  | 7037     |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 328      |
|    fps              | 10664    |
|    time_elapsed     | 0        |
|    total_timesteps  | 7117     |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 332      |
|    fps              | 10655    |
|    time_elapsed     | 0        |
|    total_timesteps  | 7181     |
----------------------------------
----------------------------------
| rollout/          

----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 432      |
|    fps              | 10726    |
|    time_elapsed     | 0        |
|    total_timesteps  | 9442     |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 436      |
|    fps              | 10732    |
|    time_elapsed     | 0        |
|    total_timesteps  | 9506     |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 440      |
|    fps              | 10737    |
|    time_elapsed     | 0        |
|    total_timesteps  | 9616     |
----------------------------------
----------------------------------
| rollout/          

----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 540      |
|    fps              | 10658    |
|    time_elapsed     | 1        |
|    total_timesteps  | 11820    |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 544      |
|    fps              | 10655    |
|    time_elapsed     | 1        |
|    total_timesteps  | 11913    |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 548      |
|    fps              | 10657    |
|    time_elapsed     | 1        |
|    total_timesteps  | 11979    |
----------------------------------
----------------------------------
| rollout/          

----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 648      |
|    fps              | 10688    |
|    time_elapsed     | 1        |
|    total_timesteps  | 14149    |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 652      |
|    fps              | 10690    |
|    time_elapsed     | 1        |
|    total_timesteps  | 14237    |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 656      |
|    fps              | 10691    |
|    time_elapsed     | 1        |
|    total_timesteps  | 14328    |
----------------------------------
----------------------------------
| rollout/          

----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 756      |
|    fps              | 10679    |
|    time_elapsed     | 1        |
|    total_timesteps  | 16637    |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 760      |
|    fps              | 10676    |
|    time_elapsed     | 1        |
|    total_timesteps  | 16707    |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 764      |
|    fps              | 10675    |
|    time_elapsed     | 1        |
|    total_timesteps  | 16792    |
----------------------------------
----------------------------------
| rollout/          

----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 864      |
|    fps              | 10716    |
|    time_elapsed     | 1        |
|    total_timesteps  | 19254    |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 868      |
|    fps              | 10716    |
|    time_elapsed     | 1        |
|    total_timesteps  | 19352    |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 872      |
|    fps              | 10717    |
|    time_elapsed     | 1        |
|    total_timesteps  | 19450    |
----------------------------------
----------------------------------
| rollout/          

<stable_baselines3.dqn.dqn.DQN at 0x2641d89aa60>

In [38]:
# Watch the DQN agent with the same rollout loop used for PPO.
# The episode count has its own name so it is not overwritten by the loop
# index (the original reused `episode` for both).
n_episodes = 5
for episode in range(1, n_episodes + 1):
    obs = env.reset()
    done = False
    score = 0

    # `env` is the vectorized wrapper, so reward and done are length-1
    # arrays and the accumulated score prints as e.g. `[9.]`.
    while not done:
        env.render()
        action, _ = model.predict(obs)  # the model picks the action
        obs, reward, done, info = env.step(action)
        score += reward
    print('Episode: {} Score: {}'.format(episode, score))
env.close()

Episode: 1 Score: [9.]
Episode: 2 Score: [8.]
Episode: 3 Score: [10.]
Episode: 4 Score: [8.]
Episode: 5 Score: [9.]


In [35]:
#DQN.load(PPO_path, env = env)