In [270]:
#Import dependencies

!pip install stable-baselines3[extra]
!pip install gymnasium
!pip install pyglet







In [271]:
import os
import gymnasium as gym
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.evaluation import evaluate_policy
import matplotlib.pyplot as plt
%matplotlib inline
from IPython import display

In [306]:
# load environment

environment_name = 'CartPole-v1'
env = gym.make(environment_name, render_mode="human")


In [273]:
environment_name

'CartPole-v1'

In [307]:
n_episodes = 5

for episode in range(n_episodes):
    score = 0
    n_state = env.reset()  # Reset the environment to start a new episode
    done = False
    
    while not done:
        env.render()
        action = env.action_space.sample()
        step_result = env.step(action)
        n_state, reward, done, info = step_result[:4]  # Unpack the first four values
        score += reward
        
    
    print('Episode:{} Score:{}'.format(episode, score))

# Close the environment when done
env.close()

Episode:0 Score:10.0
Episode:1 Score:17.0
Episode:2 Score:19.0
Episode:3 Score:24.0
Episode:4 Score:17.0


In [275]:
env.step(action)

error: display Surface quit

In [310]:
#understanding the environment

env.action_space.sample()

0

In [311]:
env.observation_space.sample()

array([ 1.7115535e+00, -1.2841006e+38, -3.2496206e-02, -3.0847348e+38],
      dtype=float32)

In [312]:
#Train an Rl Model

log_path = os.path.join('Training','Logs')

In [279]:
log_path

'Training\\Logs'

In [313]:
# Create the Gym environment
env = gym.make("CartPole-v0", render_mode="human")

# Wrap the environment using DummyVecEnv to make it compatible with Stable Baselines
env = DummyVecEnv([lambda: env])

# Define the PPO model using the 'MlpPolicy'
# 'MlpPolicy' is a type of neural network policy for reinforcement learning
model = PPO('MlpPolicy', env, verbose=1, tensorboard_log=log_path)

# Here, 'log_path' should be replaced with the path to the directory where you want to store tensorboard logs
# This log will be useful for visualizing and analyzing training progress

Using cpu device


In [314]:
# Train the PPO model for a total of 20,000 timesteps
model.learn(total_timesteps=20000)

Logging to Training\Logs\PPO_20


KeyboardInterrupt: 

In [282]:
#Save and Relode the model

# Define the path where the trained PPO model for CartPole will be saved
PPO_Path = os.path.join('Training', 'Saved Models', 'PPO_Model_Cartpole')

In [283]:
model.save(PPO_Path)

In [None]:
del model

In [284]:
PPO_Path

'Training\\Saved Models\\PPO_Model_Cartpole'

In [316]:
model.learn(total_timesteps = 1000)

Logging to Training\Logs\PPO_21


KeyboardInterrupt: 

In [286]:
model = PPO.load(PPO_Path, env = env)

In [315]:
#Evaluation

evaluate_policy(model, env, n_eval_episodes = 10, render = True)


KeyboardInterrupt: 

In [288]:
env.close()

In [289]:
action, _ = model.predict(obs)

In [290]:
action

array([0], dtype=int64)

In [293]:
#Test model

n_episodes = 5

for episode in range(1, n_episodes+1):
    obs = env.reset()
    score = 0
    done = False
    
    while not done:
        env.render()
        action, _ = model.predict(obs) #Using model here
        step_result = env.step(action)
        obs, reward, done, info = step_result[:4]  # Unpack the first four values
        score += reward
    
    print('Episode:{} Score:{}'.format(episode, score))

# Close the environment when done
#env.close()

error: display Surface quit

In [292]:
env.close()

In [295]:
#View logs in Tensorboard
training_log_path = os.path.join(log_path, 'PPO_2')

In [296]:
training_log_path

'Training\\Logs\\PPO_2'

In [297]:
!tensorboard --logdir={training_log_path} 

^C


In [299]:
#Adding Callback To the Training Set

from stable_baselines3.common.callbacks import EvalCallback, StopTrainingOnRewardThreshold

In [300]:
save_path = os.path.join('Training', 'Saved Models')

In [317]:
stop_callback = StopTrainingOnRewardThreshold(reward_threshold = 200, verbose = 1)
eval_callback = EvalCallback(env,
                            callback_on_new_best=stop_callback,
                            eval_freq=10000,
                            best_model_save_path=save_path,
                            verbose=1)

In [318]:
model = PPO('MlpPolicy', env,verbose=1, tensorboard_log=log_path)

Using cpu device


In [319]:
model.learn(total_timesteps = 20000, callback = eval_callback)

Logging to Training\Logs\PPO_22
-----------------------------
| time/              |      |
|    fps             | 46   |
|    iterations      | 1    |
|    time_elapsed    | 44   |
|    total_timesteps | 2048 |
-----------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 45          |
|    iterations           | 2           |
|    time_elapsed         | 89          |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.008402481 |
|    clip_fraction        | 0.112       |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.686      |
|    explained_variance   | 0.000489    |
|    learning_rate        | 0.0003      |
|    loss                 | 7.44        |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0186     |
|    value_loss           | 60.5        |
-----------------------------------------
--

<stable_baselines3.ppo.ppo.PPO at 0x1a597f68d00>

In [321]:
#Changing Policies

net_arch = [dict(pi=[128,128,128,128], vf=[128,128,128,128])]

In [322]:
model = PPO('MlpPolicy', env, verbose=1, tensorboard_log=log_path, policy_kwargs={'net_arch': net_arch})

Using cpu device




In [323]:
model.learn(total_timesteps=10000, callback=eval_callback)

Logging to Training\Logs\PPO_23
-----------------------------
| time/              |      |
|    fps             | 46   |
|    iterations      | 1    |
|    time_elapsed    | 44   |
|    total_timesteps | 2048 |
-----------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 45          |
|    iterations           | 2           |
|    time_elapsed         | 90          |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.013865634 |
|    clip_fraction        | 0.191       |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.682      |
|    explained_variance   | -0.00231    |
|    learning_rate        | 0.0003      |
|    loss                 | 4.33        |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0218     |
|    value_loss           | 20.1        |
-----------------------------------------
--

<stable_baselines3.ppo.ppo.PPO at 0x1a597f68e20>

In [324]:
#Using an alternative algorithm

from stable_baselines3 import DQN

In [325]:
model = DQN('MlpPolicy', env, verbose=1, tensorboard_log=log_path)

Using cpu device


In [326]:
model.learn(total_timesteps =20000)

Logging to Training\Logs\DQN_1
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.952    |
| time/               |          |
|    episodes         | 4        |
|    fps              | 46       |
|    time_elapsed     | 2        |
|    total_timesteps  | 101      |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.916    |
| time/               |          |
|    episodes         | 8        |
|    fps              | 46       |
|    time_elapsed     | 3        |
|    total_timesteps  | 177      |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.886    |
| time/               |          |
|    episodes         | 12       |
|    fps              | 46       |
|    time_elapsed     | 5        |
|    total_timesteps  | 239      |
----------------------------------
------------------------

----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 108      |
|    fps              | 45       |
|    time_elapsed     | 51       |
|    total_timesteps  | 2351     |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 112      |
|    fps              | 45       |
|    time_elapsed     | 54       |
|    total_timesteps  | 2517     |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 116      |
|    fps              | 45       |
|    time_elapsed     | 57       |
|    total_timesteps  | 2656     |
----------------------------------
----------------------------------
| rollout/          

----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 216      |
|    fps              | 46       |
|    time_elapsed     | 109      |
|    total_timesteps  | 5051     |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 220      |
|    fps              | 46       |
|    time_elapsed     | 111      |
|    total_timesteps  | 5167     |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 224      |
|    fps              | 46       |
|    time_elapsed     | 113      |
|    total_timesteps  | 5252     |
----------------------------------
----------------------------------
| rollout/          

----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 324      |
|    fps              | 46       |
|    time_elapsed     | 157      |
|    total_timesteps  | 7266     |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 328      |
|    fps              | 46       |
|    time_elapsed     | 158      |
|    total_timesteps  | 7345     |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 332      |
|    fps              | 46       |
|    time_elapsed     | 160      |
|    total_timesteps  | 7435     |
----------------------------------
----------------------------------
| rollout/          

----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 432      |
|    fps              | 46       |
|    time_elapsed     | 212      |
|    total_timesteps  | 9828     |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 436      |
|    fps              | 46       |
|    time_elapsed     | 213      |
|    total_timesteps  | 9888     |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 440      |
|    fps              | 46       |
|    time_elapsed     | 215      |
|    total_timesteps  | 9976     |
----------------------------------
----------------------------------
| rollout/          

----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 540      |
|    fps              | 46       |
|    time_elapsed     | 268      |
|    total_timesteps  | 12454    |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 544      |
|    fps              | 46       |
|    time_elapsed     | 270      |
|    total_timesteps  | 12524    |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 548      |
|    fps              | 46       |
|    time_elapsed     | 272      |
|    total_timesteps  | 12597    |
----------------------------------
----------------------------------
| rollout/          

----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 648      |
|    fps              | 46       |
|    time_elapsed     | 319      |
|    total_timesteps  | 14776    |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 652      |
|    fps              | 46       |
|    time_elapsed     | 320      |
|    total_timesteps  | 14836    |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 656      |
|    fps              | 46       |
|    time_elapsed     | 322      |
|    total_timesteps  | 14918    |
----------------------------------
----------------------------------
| rollout/          

----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 756      |
|    fps              | 46       |
|    time_elapsed     | 367      |
|    total_timesteps  | 16997    |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 760      |
|    fps              | 46       |
|    time_elapsed     | 368      |
|    total_timesteps  | 17054    |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 764      |
|    fps              | 46       |
|    time_elapsed     | 369      |
|    total_timesteps  | 17123    |
----------------------------------
----------------------------------
| rollout/          

----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 864      |
|    fps              | 46       |
|    time_elapsed     | 419      |
|    total_timesteps  | 19416    |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 868      |
|    fps              | 46       |
|    time_elapsed     | 421      |
|    total_timesteps  | 19503    |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 872      |
|    fps              | 46       |
|    time_elapsed     | 423      |
|    total_timesteps  | 19604    |
----------------------------------
----------------------------------
| rollout/          

<stable_baselines3.dqn.dqn.DQN at 0x1a597df40a0>

In [3]:
git remote add origin https://github.com/Soumyajit-7/Reinforced-Learning-Basic-Model-1.git
git branch -M main
git push -u origin main

SyntaxError: invalid decimal literal (1726920618.py, line 1)