In [1]:
#!pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu116

In [2]:
#!pip install pyopengl
#!pip install pyglet==1.5.27
#!pip install stable_baselines3[extra]

In [29]:
import os
import gym
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv #multiple agents at a time, Wraping in dummy enviroment i think
from stable_baselines3.common.evaluation import evaluate_policy #Evaluate success

In [30]:
#Load Enviroment

In [31]:
env_name = 'MountainCar-v0'
env = gym.make(env_name)

In [32]:
episode = 5
for episode in range(1,episode+1):
    state = env.reset()
    done = False
    score = 0
    
    while not done:
        env.render()
        action = env.action_space.sample()
        n_state, reward, done, info = env.step(action)
        score += reward
    print('Episode: {} Score: {}'.format(episode,score))
env.close()

Episode: 1 Score: -200.0
Episode: 2 Score: -200.0
Episode: 3 Score: -200.0
Episode: 4 Score: -200.0
Episode: 5 Score: -200.0


In [10]:
#Understanding Environment

In [6]:
env.action_space

Discrete(3)

In [7]:
env.action_space.sample()

1

In [8]:
env.observation_space

Box([-1.2  -0.07], [0.6  0.07], (2,), float32)

In [9]:
env.observation_space.sample()

array([0.32944986, 0.0634734 ], dtype=float32)

In [10]:
#Training

In [11]:
log_path = os.path.join('Training', 'Logs')

In [12]:
log_path

'Training\\Logs'

In [33]:
env = gym.make(env_name)
env = DummyVecEnv([lambda : env])
model = PPO('MlpPolicy', env, verbose=1, tensorboard_log = log_path)

Using cuda device


In [14]:
PPO?? #To understand parameters of PPO

In [34]:
model.learn(total_timesteps=30000)

Logging to Training\Logs\PPO_6
-----------------------------
| time/              |      |
|    fps             | 495  |
|    iterations      | 1    |
|    time_elapsed    | 4    |
|    total_timesteps | 2048 |
-----------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 370         |
|    iterations           | 2           |
|    time_elapsed         | 11          |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.003848855 |
|    clip_fraction        | 0           |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.1        |
|    explained_variance   | 3.8e-05     |
|    learning_rate        | 0.0003      |
|    loss                 | 13.8        |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.000904   |
|    value_loss           | 133         |
-----------------------------------------
---

------------------------------------------
| time/                   |              |
|    fps                  | 290          |
|    iterations           | 13           |
|    time_elapsed         | 91           |
|    total_timesteps      | 26624        |
| train/                  |              |
|    approx_kl            | 0.0041539213 |
|    clip_fraction        | 0            |
|    clip_range           | 0.2          |
|    entropy_loss         | -1.06        |
|    explained_variance   | -0.00161     |
|    learning_rate        | 0.0003       |
|    loss                 | 0.22         |
|    n_updates            | 120          |
|    policy_gradient_loss | 0.000218     |
|    value_loss           | 3.62         |
------------------------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 290         |
|    iterations           | 14          |
|    time_elapsed         | 98          |
|    total_times

-----------------------------------------
| time/                   |             |
|    fps                  | 290         |
|    iterations           | 24          |
|    time_elapsed         | 168         |
|    total_timesteps      | 49152       |
| train/                  |             |
|    approx_kl            | 0.004853756 |
|    clip_fraction        | 0.00874     |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.891      |
|    explained_variance   | -0.0047     |
|    learning_rate        | 0.0003      |
|    loss                 | -0.00299    |
|    n_updates            | 230         |
|    policy_gradient_loss | -0.000352   |
|    value_loss           | 0.0282      |
-----------------------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 290         |
|    iterations           | 25          |
|    time_elapsed         | 176         |
|    total_timesteps      | 51200 

<stable_baselines3.ppo.ppo.PPO at 0x1fa75e4e880>

In [18]:
#Save Model

In [35]:
PPO_path = os.path.join('Training', 'Saved_Models', 'PPO_Model_Mountaincar')

In [36]:
model.save(PPO_path)
#del model To delete a model

In [37]:
#Reload Model

In [38]:
model = PPO.load(PPO_path, env = env)

In [39]:
# Evaluation
# Solved if avg Reward is 200 or higher, lower the standard deviation the better

In [40]:
evaluate_policy(model, env, n_eval_episodes=10, render=True)



(-200.0, 0.0)

In [41]:
env.close()

In [26]:
#Testing our model(agent) in the environment

In [16]:
obs = env.reset()
model.predict(obs)

(array([2], dtype=int64), None)

In [17]:
action, _ = model.predict(obs)
action

array([0], dtype=int64)

In [26]:
env.step(action)

(array([[-4.2180881e-01,  2.5017478e-04]], dtype=float32),
 array([-1.], dtype=float32),
 array([False]),
 [{}])

In [25]:
episode = 5
for episode in range(1,episode+1):
    obs = env.reset()
    done = False
    score = 0
    
    while not done:
        env.render()
        action,_ = model.predict(obs) #Now using our model to take decision
        obs, reward, done, info = env.step([0])
        score += reward
    print('Episode: {} Score: {}'.format(episode,score))
env.close()

Episode: 1 Score: [-200.]
Episode: 2 Score: [-200.]
Episode: 3 Score: [-200.]
Episode: 4 Score: [-200.]
Episode: 5 Score: [-200.]


In [None]:
# View Logs in Tensorboard

In [41]:
training_log_path = os.path.join(log_path, 'PPO_6')
path

'Training\\Logs\\PPO6'

In [42]:
#!tensorboard --logdir={training_log_path}

^C


In [28]:
from stable_baselines3.common.callbacks import EvalCallback, StopTrainingOnRewardThreshold

In [29]:
save_path = os.path.join('Training', 'Saved_Models')

In [30]:
stop_callback = StopTrainingOnRewardThreshold(reward_threshold = 200, verbose = 1)
eval_callback = EvalCallback(env, 
                             callback_on_new_best = stop_callback,
                             eval_freq = 5000,
                             best_model_save_path = save_path,
                             verbose = 1)

In [31]:
model = PPO('MlpPolicy', env, verbose=1, tensorboard_log = log_path)

Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [32]:
model.learn(total_timesteps=20000, callback=eval_callback)

Logging to Training\Logs\PPO_4
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 200      |
|    ep_rew_mean     | -200     |
| time/              |          |
|    fps             | 498      |
|    iterations      | 1        |
|    time_elapsed    | 4        |
|    total_timesteps | 2048     |
---------------------------------
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 200          |
|    ep_rew_mean          | -200         |
| time/                   |              |
|    fps                  | 374          |
|    iterations           | 2            |
|    time_elapsed         | 10           |
|    total_timesteps      | 4096         |
| train/                  |              |
|    approx_kl            | 0.0020716218 |
|    clip_fraction        | 0            |
|    clip_range           | 0.2          |
|    entropy_loss         | -1.1         |
|    explained_variance   |

KeyboardInterrupt: 