## Cartpole Control

### Author: Priyanka Dani

- Project 1 in RL
- Date: 23/11/2022
- Reference: [https://www.youtube.com/watch?v=Mut_u40Sqz4&t=1154s]
- Cartpole Documentation: [https://www.gymlibrary.dev/environments/classic_control/cart_pole/]
- Further references: 
    - [https://spinningup.openai.com/en/latest/user/introduction.html#what-this-is]
    - [https://stable-baselines3.readthedocs.io/en/master/guide/algos.html]

In [3]:
## Importing the dependencies ##

import os #useful in saving our model and logout fast
import gym #environment
from stable_baselines3 import PPO #imported one of the RL algorithms PPO
from stable_baselines3.common.vec_env import DummyVecEnv #its a wrapper around the environment
from stable_baselines3.common.evaluation import evaluate_policy #makes it easier to evaluate the policy

In [4]:
## Loading the environment ##

environment_name = "CartPole-v1"
env = gym.make(environment_name)

In [5]:
## Running the environment ##

#had to install pyglet

episodes = 5 #episode length is 500 frames for v1 and 200 frames for v0
for episode in range(1, episodes + 1):
    state = env.reset() #this gives the observations of the environment
    done = False
    score = 0
    
    while not done:
        env.render() #allows us to view the environment
        #env.action_space #will give you the nature of the action space
        action = env.action_space.sample() #generates a random action
        #env.observation_space #will give you the nature of the action space
        n_state, reward, done, info = env.step(action) #observations, rewards, whether or not the episode is done
        score += reward
    print('Episode: {} Score: {}'.format(episode, score))
env.close() #shuts the window down

Episode: 1 Score: 12.0
Episode: 2 Score: 30.0
Episode: 3 Score: 18.0
Episode: 4 Score: 14.0
Episode: 5 Score: 22.0


In [6]:
## Understanding the environment ##

env.action_space #discrete(2,)
env.observation_space #box(4,) like an ndarray

Box([-4.8000002e+00 -3.4028235e+38 -4.1887903e-01 -3.4028235e+38], [4.8000002e+00 3.4028235e+38 4.1887903e-01 3.4028235e+38], (4,), float32)

In [7]:
## To save the logs of the training ##
log_path = os.path.join('Training', 'Logs') #manually created directories
log_path

'Training/Logs'

In [8]:
## Creating the agent ## 

env = gym.make(environment_name) #recreate the environment
env = DummyVecEnv([lambda: env]) #wrapped the environment in dummy
agent = PPO('MlpPolicy', env, verbose = 1, tensorboard_log = log_path) #define the agent
#MlpPolicy: Multilayer Perceptron policy, environment handle, we want to log the result, where to log

#PPO?? #learn more about the policy arguments

Using cpu device


In [9]:
## Training the agent ##

agent.learn(total_timesteps = 20000) #recommended 20000

Logging to Training/Logs/PPO_1
-----------------------------
| time/              |      |
|    fps             | 928  |
|    iterations      | 1    |
|    time_elapsed    | 2    |
|    total_timesteps | 2048 |
-----------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 998         |
|    iterations           | 2           |
|    time_elapsed         | 4           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.008147578 |
|    clip_fraction        | 0.0909      |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.686      |
|    explained_variance   | -0.00338    |
|    learning_rate        | 0.0003      |
|    loss                 | 7.66        |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0137     |
|    value_loss           | 54.9        |
-----------------------------------------
---

<stable_baselines3.ppo.ppo.PPO at 0x7f9537794760>

In [10]:
## Save the agent ##

PPO_path = os.path.join('Training', 'Saved_agents', 'PPO_Cartpole_Agent')
agent.save(PPO_path)

In [None]:
## Delete an agent ##

#del agent

In [None]:
## Load a model ##

#agent = PPO.load(PPO_path, env=env) #this will load and start the learning process

In [11]:
## Evaluating the trained agent ##
evaluate_policy(agent, env, n_eval_episodes = 10, render =True)

#env.close()



(500.0, 0.0)

In [12]:
## Testing the agent ##

episodes = 5
for episode in range(1, episodes + 1):
    obs = env.reset()
    done = False
    score = 0
    
    while not done:
        env.render()
        action, _ = agent.predict(obs) #model now determines the action
        obs, reward, done, info = env.step(action) #model's chosen action passed to env
        score += reward
    print('Episode: {} Score: {}'.format(episode, score))

Episode: 1 Score: [467.]
Episode: 2 Score: [144.]
Episode: 3 Score: [413.]
Episode: 4 Score: [141.]
Episode: 5 Score: [500.]


In [13]:
#env.close() #shuts the window down

In [None]:
## Tensorboard viewing training logs ##

#training_log_path = os.path.join('Training', 'Logs','PPO_1')

#Run this in the terminal better 
#cd Documents/Training/Logs/PPO_1 --> #tensorboard --logdir=.

#Run in jupyter, need to close later
#!tensorboard --logdir=training_log_path

In [15]:
## CALLBACK Functions ##

# Especially useful while training large models. It automatically stops training and saves the model when the reward
# threshold is achieved.

from stable_baselines3.common.callbacks import EvalCallback, StopTrainingOnRewardThreshold

save_path = os.path.join('Training', 'Saved_agents')

stop_callback = StopTrainingOnRewardThreshold(reward_threshold = 475, verbose = 1)

eval_callback = EvalCallback(env, 
                             callback_on_new_best = stop_callback, 
                             eval_freq = 10000, 
                             best_model_save_path = save_path, 
                             verbose = 1)

In [17]:
## Creating new agent for callback ##

agent_callback = PPO('MlpPolicy', env, verbose = 1, tensorboard_log = log_path)
agent_callback.learn(total_timesteps = 20000, callback  = eval_callback)

evaluate_policy(agent_callback, env, n_eval_episodes = 10, render =True) #evaluate agent
env.close()

Using cpu device
Logging to Training/Logs/PPO_2
-----------------------------
| time/              |      |
|    fps             | 1503 |
|    iterations      | 1    |
|    time_elapsed    | 1    |
|    total_timesteps | 2048 |
-----------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 1214        |
|    iterations           | 2           |
|    time_elapsed         | 3           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.008082055 |
|    clip_fraction        | 0.0942      |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.686      |
|    explained_variance   | 0.00692     |
|    learning_rate        | 0.0003      |
|    loss                 | 7.71        |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0135     |
|    value_loss           | 54.4        |
----------------------------

(487.4, 37.8)

In [20]:
## Specifying a different architecture of the neural network of the policy ##

net_arch = [dict(pi = [128, 128, 128, 128], vf = [128, 128, 128, 128])] 
#here we have a 4 layered 128 neuron each NN for policy and value function of PPO

agent_netarch = PPO('MlpPolicy', env, verbose = 1, tensorboard_log = log_path, policy_kwargs = {'net_arch':net_arch})

agent_netarch.learn(total_timesteps = 20000, callback = eval_callback)

evaluate_policy(agent_callback, env, n_eval_episodes = 5, render =True) #evaluate agent
env.close()

Using cpu device
Logging to Training/Logs/PPO_2
-----------------------------
| time/              |      |
|    fps             | 1349 |
|    iterations      | 1    |
|    time_elapsed    | 1    |
|    total_timesteps | 2048 |
-----------------------------
------------------------------------------
| time/                   |              |
|    fps                  | 781          |
|    iterations           | 2            |
|    time_elapsed         | 5            |
|    total_timesteps      | 4096         |
| train/                  |              |
|    approx_kl            | 0.0149679445 |
|    clip_fraction        | 0.224        |
|    clip_range           | 0.2          |
|    entropy_loss         | -0.681       |
|    explained_variance   | 0.000136     |
|    learning_rate        | 0.0003       |
|    loss                 | 3.57         |
|    n_updates            | 10           |
|    policy_gradient_loss | -0.0232      |
|    value_loss           | 20           |
-----------