In [3]:
#Import dependencies

!pip install stable-baselines3[extra]
!pip install gymnasium
!pip install pyglet





In [4]:
import os
import gymnasium as gym
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.evaluation import evaluate_policy
import matplotlib.pyplot as plt
%matplotlib inline
from IPython import display

In [5]:
# load environment

# Id of the Gymnasium environment to create.
environment_name = 'CartPole-v1'
# render_mode="human" asks Gymnasium to render for a human viewer.
env = gym.make(environment_name, render_mode="human")


In [6]:
# Echo the environment id selected above.
environment_name

'CartPole-v1'

In [7]:
# Roll out a purely random agent for a few episodes to see the baseline score.
n_episodes = 5

for episode in range(n_episodes):
    score = 0
    # Gymnasium's reset() returns (observation, info).
    n_state, info = env.reset()
    done = False

    while not done:
        # No explicit env.render() call: the environment was created with
        # render_mode="human", in which case Gymnasium renders on every step.
        action = env.action_space.sample()  # random action, no policy involved
        # Gymnasium's step() returns five values. An episode is over when it
        # either terminates (failure state) or is truncated (time limit).
        n_state, reward, terminated, truncated, info = env.step(action)
        done = terminated or truncated
        score += reward

    print('Episode:{} Score:{}'.format(episode, score))

# Close the environment when done
env.close()

Episode:0 Score:28.0
Episode:1 Score:12.0
Episode:2 Score:38.0
Episode:3 Score:11.0
Episode:4 Score:13.0


In [10]:
# Understanding the environment

# Draw one random action from the environment's action space.
env.action_space.sample()

0

In [11]:
# Draw one random point from the observation space (a 4-element float32
# array in the output below).
env.observation_space.sample()

array([-8.9640146e-01,  1.6710913e+38,  1.9785228e-01, -2.5452518e+38],
      dtype=float32)

In [12]:
# Train an RL model

# Directory passed to the models below as their tensorboard_log location.
log_path = os.path.join('Training','Logs')

In [13]:
# Echo the log directory built above.
log_path

'Training\\Logs'

In [14]:
# Build the training environment. The factory passed to DummyVecEnv creates
# the Gym environment itself, so `env` only ever names the vectorised wrapper
# that Stable Baselines works with.
# NOTE(review): this uses "CartPole-v0" while the earlier demo used
# environment_name ('CartPole-v1') - confirm which version is intended.
env = DummyVecEnv([lambda: gym.make("CartPole-v0", render_mode="human")])

# PPO agent with the 'MlpPolicy' network policy. verbose=1 prints training
# progress; TensorBoard event files are written beneath log_path.
model = PPO(
    'MlpPolicy',
    env,
    verbose=1,
    tensorboard_log=log_path,
)

  logger.deprecation(


Using cpu device


In [15]:
# Train the PPO model for a total of 20,000 timesteps
# (progress tables are printed because the model was built with verbose=1).
model.learn(total_timesteps=20000)

Logging to Training\Logs\PPO_24
-----------------------------
| time/              |      |
|    fps             | 46   |
|    iterations      | 1    |
|    time_elapsed    | 44   |
|    total_timesteps | 2048 |
-----------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 45          |
|    iterations           | 2           |
|    time_elapsed         | 90          |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.008467187 |
|    clip_fraction        | 0.0836      |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.686      |
|    explained_variance   | -0.00438    |
|    learning_rate        | 0.0003      |
|    loss                 | 6.66        |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0124     |
|    value_loss           | 54.2        |
-----------------------------------------
--

<stable_baselines3.ppo.ppo.PPO at 0x2abddbd53f0>

In [16]:
# Save and reload the model

# Define the path where the trained PPO model for CartPole will be saved
PPO_Path = os.path.join('Training', 'Saved Models', 'PPO_Model_Cartpole')

In [17]:
# Persist the trained model at PPO_Path.
model.save(PPO_Path)

In [18]:
#del model

In [19]:
# Echo the path the model was saved under.
PPO_Path

'Training\\Saved Models\\PPO_Model_Cartpole'

In [20]:
# Continue training the in-memory model.
# NOTE(review): the log below reports 2048 timesteps although 1000 were
# requested, and the next cell replaces `model` with the copy saved earlier,
# so this extra training is discarded - confirm this cell is still needed.
model.learn(total_timesteps = 1000)

Logging to Training\Logs\PPO_25


-----------------------------
| time/              |      |
|    fps             | 48   |
|    iterations      | 1    |
|    time_elapsed    | 42   |
|    total_timesteps | 2048 |
-----------------------------


<stable_baselines3.ppo.ppo.PPO at 0x2abddbd53f0>

In [21]:
# Restore the model saved at PPO_Path and attach the current environment to it.
model = PPO.load(PPO_Path, env = env)

In [22]:
# Evaluation

# Run the policy for 10 evaluation episodes with rendering enabled.
# The result is a pair - presumably (mean reward, reward std); confirm
# against the Stable Baselines3 evaluate_policy documentation.
evaluate_policy(model, env, n_eval_episodes = 10, render = True)




(200.0, 0.0)

In [23]:
#env.close()

In [27]:
# NOTE(review): `action` is not assigned in this cell; it displays whatever an
# earlier cell last left in the kernel. Leftover inspection - consider removing.
action

1

In [29]:
# Test model: roll out the trained policy and report each episode's score.

n_episodes = 5

for episode in range(1, n_episodes+1):
    # `env` is the DummyVecEnv wrapper here: reset() returns only the batched
    # observation and step() returns batched (obs, rewards, dones, infos).
    obs = env.reset()
    score = 0.0
    done = False

    while not done:
        env.render()
        action, _ = model.predict(obs)  # Using model here
        obs, rewards, dones, infos = env.step(action)
        # The wrapper holds a single environment, so index 0 is its result.
        # Taking scalars keeps `score` a plain float (printed as 200.0 rather
        # than [200.]) and avoids relying on the truth value of an array.
        score += float(rewards[0])
        done = bool(dones[0])

    print('Episode:{} Score:{}'.format(episode, score))

# Close the environment when done
#env.close()

Episode:1 Score:[200.]
Episode:2 Score:[200.]
Episode:3 Score:[200.]
Episode:4 Score:[200.]
Episode:5 Score:[200.]


In [292]:
# NOTE(review): this cell's execution count (292) is out of sequence, and the
# cells below (EvalCallback, PPO, DQN) still use `env`. On a top-to-bottom
# run the environment would be closed before they execute - confirm whether
# this call belongs at the end of the notebook.
env.close()

In [30]:
# View logs in TensorBoard
# NOTE(review): 'PPO_2' is hard-coded, while the training runs shown in this
# notebook log to PPO_24 .. PPO_27 - confirm this points at the intended run.
training_log_path = os.path.join(log_path, 'PPO_2')

In [31]:
# Echo the run directory that will be handed to TensorBoard.
training_log_path

'Training\\Logs\\PPO_2'

In [297]:
# Launch TensorBoard on the selected run directory; {training_log_path} is
# interpolated from the Python variable of that name.
# NOTE(review): presumably blocks the cell until interrupted (output shows ^C).
!tensorboard --logdir={training_log_path} 

^C


In [32]:
#Adding Callback To the Training Set

from stable_baselines3.common.callbacks import EvalCallback, StopTrainingOnRewardThreshold

In [33]:
# Directory handed to EvalCallback below as best_model_save_path.
save_path = os.path.join('Training', 'Saved Models')

In [34]:
# Evaluate on a dedicated environment rather than the one being trained on.
# Sharing a single environment lets each periodic evaluation reset and step
# it in the middle of a training rollout, disturbing the data PPO collects.
# No render_mode is requested, so evaluation does not open a window.
eval_env = DummyVecEnv([lambda: gym.make("CartPole-v0")])

# Stop training once an evaluation reaches a mean reward of 200.
stop_callback = StopTrainingOnRewardThreshold(reward_threshold=200, verbose=1)

# Every 10,000 steps: evaluate, save a new best model under save_path, and
# invoke stop_callback whenever a new best mean reward is recorded.
eval_callback = EvalCallback(eval_env,
                            callback_on_new_best=stop_callback,
                            eval_freq=10000,
                            best_model_save_path=save_path,
                            verbose=1)

In [35]:
# Construct a new PPO model (replacing the one loaded earlier) for the
# callback-driven training run below.
model = PPO('MlpPolicy', env,verbose=1, tensorboard_log=log_path)

Using cpu device


In [36]:
# Train for up to 20,000 timesteps with eval_callback attached.
model.learn(total_timesteps = 20000, callback = eval_callback)

Logging to Training\Logs\PPO_26


-----------------------------
| time/              |      |
|    fps             | 46   |
|    iterations      | 1    |
|    time_elapsed    | 44   |
|    total_timesteps | 2048 |
-----------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 45          |
|    iterations           | 2           |
|    time_elapsed         | 89          |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.008473435 |
|    clip_fraction        | 0.0982      |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.686      |
|    explained_variance   | 0.00169     |
|    learning_rate        | 0.0003      |
|    loss                 | 8.93        |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0146     |
|    value_loss           | 56.9        |
-----------------------------------------
----------------------------------

<stable_baselines3.ppo.ppo.PPO at 0x2abddbfb7c0>

In [37]:
# Changing policies

# Custom network architecture: separate stacks of four 128-unit layers for
# the policy network (pi) and the value network (vf).
# Passed as a plain dict. The older list-wrapped form [dict(pi=..., vf=...)]
# is deprecated since Stable Baselines3 1.8 and triggers a warning.
net_arch = dict(pi=[128, 128, 128, 128], vf=[128, 128, 128, 128])

In [38]:
# Construct a new PPO model whose policy is built with the custom net_arch.
model = PPO('MlpPolicy', env, verbose=1, tensorboard_log=log_path, policy_kwargs={'net_arch': net_arch})

Using cpu device




In [39]:
# Train the custom-architecture model for up to 10,000 timesteps.
# NOTE(review): eval_callback is the same object used for the previous model;
# presumably its best-reward bookkeeping carries over between runs - confirm
# whether a fresh callback is wanted here.
model.learn(total_timesteps=10000, callback=eval_callback)

Logging to Training\Logs\PPO_27


-----------------------------
| time/              |      |
|    fps             | 46   |
|    iterations      | 1    |
|    time_elapsed    | 44   |
|    total_timesteps | 2048 |
-----------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 43          |
|    iterations           | 2           |
|    time_elapsed         | 93          |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.015569285 |
|    clip_fraction        | 0.209       |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.681      |
|    explained_variance   | -0.00255    |
|    learning_rate        | 0.0003      |
|    loss                 | 2.94        |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0229     |
|    value_loss           | 18.7        |
-----------------------------------------
----------------------------------

<stable_baselines3.ppo.ppo.PPO at 0x2abddc61ea0>

In [40]:
#Using an alternative algorithm

from stable_baselines3 import DQN

In [41]:
# Replace `model` with a DQN agent on the same environment and log directory.
model = DQN('MlpPolicy', env, verbose=1, tensorboard_log=log_path)

Using cpu device


In [42]:
# Train the DQN agent for 20,000 timesteps (no evaluation callback attached).
model.learn(total_timesteps =20000)

Logging to Training\Logs\DQN_2
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.96     |
| time/               |          |
|    episodes         | 4        |
|    fps              | 46       |
|    time_elapsed     | 1        |
|    total_timesteps  | 84       |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.925    |
| time/               |          |
|    episodes         | 8        |
|    fps              | 46       |
|    time_elapsed     | 3        |
|    total_timesteps  | 158      |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.89     |
| time/               |          |
|    episodes         | 12       |
|    fps              | 46       |
|    time_elapsed     | 5        |
|    total_timesteps  | 232      |
----------------------------------
------------------------

<stable_baselines3.dqn.dqn.DQN at 0x2abddbf9480>