In [2]:
import os 
import gym 
from stable_baselines3 import PPO 
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.evaluation import evaluate_policy 

In [3]:
#Enviroment 
environment_name = 'CartPole-v0'
env = gym.make(environment_name)

In [4]:
#understanding the Environment
episodes = 5
for episodes in range(1, episodes+1):
    state = env.reset()
    done = False
    score = 0
    
    while not done:
        env.render()
        action = env.action_space.sample()
        n_state, reward, done , info = env.step(action)
        score += reward
    print('Episode:() Score:{}'.format(episodes,score))
env.close()

Episode:() Score:1
Episode:() Score:2
Episode:() Score:3
Episode:() Score:4
Episode:() Score:5


In [5]:
env.action_space

Discrete(2)

In [6]:
env.action_space.sample()

0

In [7]:
env.observation_space

Box([-4.8000002e+00 -3.4028235e+38 -4.1887903e-01 -3.4028235e+38], [4.8000002e+00 3.4028235e+38 4.1887903e-01 3.4028235e+38], (4,), float32)

In [8]:
env.observation_space.sample()

array([-3.7236712e+00, -2.9373523e+38,  1.4868098e-01, -9.5686040e+37],
      dtype=float32)

In [9]:
#Training an Reinforcement Learning model 

#Makre your Directories first 
log_path = os.path.join('Training', 'Logs')

In [10]:
log_path

'Training\\Logs'

In [11]:
env = gym.make(environment_name)
env = DummyVecEnv([lambda: env])
model = PPO('MlpPolicy', env, verbose=1,tensorboard_log=log_path)

Using cpu device


In [12]:
model.learn(total_timesteps=20000)

Logging to Training\Logs\PPO_3
-----------------------------
| time/              |      |
|    fps             | 359  |
|    iterations      | 1    |
|    time_elapsed    | 5    |
|    total_timesteps | 2048 |
-----------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 538         |
|    iterations           | 2           |
|    time_elapsed         | 7           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.008795997 |
|    clip_fraction        | 0.0816      |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.687      |
|    explained_variance   | 0.0023      |
|    learning_rate        | 0.0003      |
|    loss                 | 8.47        |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0118     |
|    value_loss           | 48.8        |
-----------------------------------------
---

<stable_baselines3.ppo.ppo.PPO at 0x25f9bea07c0>

In [13]:
#Saving and reloading the model 
PPO_Path = os.path.join('Training','Saved Models','PPO_Model_Cartpole')

In [14]:
model.save(PPO_Path)

In [15]:
del model

In [16]:
PPO_Path

'Training\\Saved Models\\PPO_Model_Cartpole'

In [17]:
model = PPO.load(PPO_Path,env=env)

In [18]:
#Evaluation 
evaluate_policy(model, env,n_eval_episodes=10, render=True)



(200.0, 0.0)

In [19]:
env.close()

In [20]:
#Testing 

episodes = 5
for episodes in range(1, episodes+1):
    obs = env.reset()
    done = False
    score = 0
    
    while not done:
        env.render()
        action,_ = model.predict(obs)
        obs, reward, done , info = env.step(action)
        score += reward
    print('Episode:() Score:{}'.format(episodes,score))
env.close()

Episode:() Score:1
Episode:() Score:2
Episode:() Score:3
Episode:() Score:4
Episode:() Score:5


In [21]:
obs = env.reset()
action,_= model.predict(obs)

In [22]:
obs

array([[ 0.03568954,  0.03370797, -0.01301825,  0.02979722]],
      dtype=float32)

In [23]:
env.action_space.sample()

0

In [24]:
env.step(action)

(array([[ 0.0363637 ,  0.22901416, -0.01242231, -0.2669645 ]],
       dtype=float32),
 array([1.], dtype=float32),
 array([False]),
 [{}])

In [25]:
training_log_path = os.path.join (log_path,'PPO_2')

In [26]:
training_log_path

'Training\\Logs\\PPO_2'

In [28]:
#Adding Callback to the Training Stage 
from stable_baselines3.common.callbacks import EvalCallback, StopTrainingOnRewardThreshold

In [32]:
save_path = os.path.join('Training','Saved Models')

In [35]:
stop_callback = StopTrainingOnRewardThreshold(reward_threshold=200 , verbose=1)
eval_callback = EvalCallback(env,
                            callback_on_new_best=stop_callback,
                            eval_freq=10000,
                            best_model_save_path= save_path,
                            verbose=1)

In [36]:
model = PPO('MlpPolicy', env,verbose=1,tensorboard_log=log_path)

Using cpu device


In [37]:
model.learn(total_timesteps=20000,callback=eval_callback)

Logging to Training\Logs\PPO_4
-----------------------------
| time/              |      |
|    fps             | 2371 |
|    iterations      | 1    |
|    time_elapsed    | 0    |
|    total_timesteps | 2048 |
-----------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 1496        |
|    iterations           | 2           |
|    time_elapsed         | 2           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.007910954 |
|    clip_fraction        | 0.0956      |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.686      |
|    explained_variance   | -5.52e-05   |
|    learning_rate        | 0.0003      |
|    loss                 | 8.02        |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.015      |
|    value_loss           | 53.5        |
-----------------------------------------
---

<stable_baselines3.ppo.ppo.PPO at 0x25fabcf6ee0>

In [40]:
#Change Policies 

net_arch = [dict(pi=[128,128,128,128], vf=[128,128,128,128])] 

In [45]:
model = PPO('MlpPolicy', env,verbose=1,tensorboard_log=log_path, policy_kwargs={'net_arch':net_arch})

Using cpu device


In [46]:
model.learn(total_timesteps=20000,callback=eval_callback)

Logging to Training\Logs\PPO_5
-----------------------------
| time/              |      |
|    fps             | 1781 |
|    iterations      | 1    |
|    time_elapsed    | 1    |
|    total_timesteps | 2048 |
-----------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 992         |
|    iterations           | 2           |
|    time_elapsed         | 4           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.014430705 |
|    clip_fraction        | 0.215       |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.682      |
|    explained_variance   | 7.72e-05    |
|    learning_rate        | 0.0003      |
|    loss                 | 3.44        |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0263     |
|    value_loss           | 21.9        |
-----------------------------------------
---



Eval num_timesteps=10000, episode_reward=200.00 +/- 0.00
Episode length: 200.00 +/- 0.00
------------------------------------------
| eval/                   |              |
|    mean_ep_length       | 200          |
|    mean_reward          | 200          |
| time/                   |              |
|    total_timesteps      | 10000        |
| train/                  |              |
|    approx_kl            | 0.0078855455 |
|    clip_fraction        | 0.0989       |
|    clip_range           | 0.2          |
|    entropy_loss         | -0.565       |
|    explained_variance   | 0.599        |
|    learning_rate        | 0.0003       |
|    loss                 | 13.2         |
|    n_updates            | 40           |
|    policy_gradient_loss | -0.0147      |
|    value_loss           | 35.3         |
------------------------------------------
------------------------------
| time/              |       |
|    fps             | 761   |
|    iterations      | 5     |
|    time_ela

<stable_baselines3.ppo.ppo.PPO at 0x25fadeaf1c0>

In [48]:
#using alternative algorithm 
from stable_baselines3 import DQN

In [49]:
model = DQN('MlpPolicy', env,verbose=1,tensorboard_log=log_path)

Using cpu device


In [51]:
model.learn(total_timesteps=20000)

Logging to Training\Logs\DQN_1
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.954    |
| time/               |          |
|    episodes         | 4        |
|    fps              | 8729     |
|    time_elapsed     | 0        |
|    total_timesteps  | 96       |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.916    |
| time/               |          |
|    episodes         | 8        |
|    fps              | 9313     |
|    time_elapsed     | 0        |
|    total_timesteps  | 177      |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.857    |
| time/               |          |
|    episodes         | 12       |
|    fps              | 10710    |
|    time_elapsed     | 0        |
|    total_timesteps  | 300      |
----------------------------------
------------------------

----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 108      |
|    fps              | 11578    |
|    time_elapsed     | 0        |
|    total_timesteps  | 2270     |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 112      |
|    fps              | 11642    |
|    time_elapsed     | 0        |
|    total_timesteps  | 2364     |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 116      |
|    fps              | 11691    |
|    time_elapsed     | 0        |
|    total_timesteps  | 2444     |
----------------------------------
----------------------------------
| rollout/          

----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 216      |
|    fps              | 11802    |
|    time_elapsed     | 0        |
|    total_timesteps  | 4557     |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 220      |
|    fps              | 11801    |
|    time_elapsed     | 0        |
|    total_timesteps  | 4639     |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 224      |
|    fps              | 11807    |
|    time_elapsed     | 0        |
|    total_timesteps  | 4724     |
----------------------------------
----------------------------------
| rollout/          

----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 324      |
|    fps              | 11988    |
|    time_elapsed     | 0        |
|    total_timesteps  | 6979     |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 328      |
|    fps              | 11999    |
|    time_elapsed     | 0        |
|    total_timesteps  | 7081     |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 332      |
|    fps              | 11994    |
|    time_elapsed     | 0        |
|    total_timesteps  | 7150     |
----------------------------------
----------------------------------
| rollout/          

----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 432      |
|    fps              | 12107    |
|    time_elapsed     | 0        |
|    total_timesteps  | 9349     |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 436      |
|    fps              | 12105    |
|    time_elapsed     | 0        |
|    total_timesteps  | 9420     |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 440      |
|    fps              | 12108    |
|    time_elapsed     | 0        |
|    total_timesteps  | 9507     |
----------------------------------
----------------------------------
| rollout/          

----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 540      |
|    fps              | 12127    |
|    time_elapsed     | 0        |
|    total_timesteps  | 11863    |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 544      |
|    fps              | 12134    |
|    time_elapsed     | 0        |
|    total_timesteps  | 11955    |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 548      |
|    fps              | 12121    |
|    time_elapsed     | 0        |
|    total_timesteps  | 12039    |
----------------------------------
----------------------------------
| rollout/          

----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 648      |
|    fps              | 12169    |
|    time_elapsed     | 1        |
|    total_timesteps  | 14363    |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 652      |
|    fps              | 12178    |
|    time_elapsed     | 1        |
|    total_timesteps  | 14484    |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 656      |
|    fps              | 12170    |
|    time_elapsed     | 1        |
|    total_timesteps  | 14572    |
----------------------------------
----------------------------------
| rollout/          

----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 756      |
|    fps              | 12159    |
|    time_elapsed     | 1        |
|    total_timesteps  | 16614    |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 760      |
|    fps              | 12177    |
|    time_elapsed     | 1        |
|    total_timesteps  | 16784    |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 764      |
|    fps              | 12184    |
|    time_elapsed     | 1        |
|    total_timesteps  | 16891    |
----------------------------------
----------------------------------
| rollout/          

----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 864      |
|    fps              | 12231    |
|    time_elapsed     | 1        |
|    total_timesteps  | 19294    |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 868      |
|    fps              | 12228    |
|    time_elapsed     | 1        |
|    total_timesteps  | 19349    |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 872      |
|    fps              | 12232    |
|    time_elapsed     | 1        |
|    total_timesteps  | 19441    |
----------------------------------
----------------------------------
| rollout/          

<stable_baselines3.dqn.dqn.DQN at 0x25fadee93a0>

In [53]:
model.save(save_path)

In [54]:
DQN.load

<bound method BaseAlgorithm.load of <class 'stable_baselines3.dqn.dqn.DQN'>>