# 1 Import dependencies

In [1]:
!pip install stable-baselines3[extra]
!pip install gymnasium

Collecting opencv-python (from stable-baselines3[extra])
  Obtaining dependency information for opencv-python from https://files.pythonhosted.org/packages/c7/ec/9dabb6a9abfdebb3c45b0cc52dec901caafef2b2c7e7d6a839ed86d81e91/opencv_python-4.9.0.80-cp37-abi3-win_amd64.whl.metadata
  Using cached opencv_python-4.9.0.80-cp37-abi3-win_amd64.whl.metadata (20 kB)
Collecting pygame (from stable-baselines3[extra])
  Obtaining dependency information for pygame from https://files.pythonhosted.org/packages/82/61/93ae7afbd931a70510cfdf0a7bb0007540020b8d80bc1d8762ebdc46479b/pygame-2.5.2-cp311-cp311-win_amd64.whl.metadata
  Using cached pygame-2.5.2-cp311-cp311-win_amd64.whl.metadata (13 kB)
Collecting rich (from stable-baselines3[extra])
  Obtaining dependency information for rich from https://files.pythonhosted.org/packages/be/be/1520178fa01eabe014b16e72a952b9f900631142ccd03dc36cf93e30c1ce/rich-13.7.0-py3-none-any.whl.metadata
  Using cached rich-13.7.0-py3-none-any.whl.metadata (18 kB)
Collecting sh

In [22]:
import os
import gymnasium as gym
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.evaluation import evaluate_policy

In [23]:
environment_name = "CartPole-v1"
env = gym.make(environment_name, render_mode="human")

In [24]:
episodes = 5
for episode in range(1, episodes + 1):
    state = env.reset()
    done = False
    score = 0
    
    while not done:
        env.render()
        action = env.action_space.sample()
        n_state, reward, done, truncated, info = env.step(action)
        score += reward
    print("Episode: {}, Score: {}".format(episode, score))
env.close()

Episode: 1, Score: 23.0
Episode: 2, Score: 31.0
Episode: 3, Score: 25.0
Episode: 4, Score: 52.0
Episode: 5, Score: 47.0


### Shows random actions

In [5]:
env.action_space.sample()

1

### Displays observation of the space

In [6]:
env.observation_space.sample()

array([-4.7542429e+00, -1.6298754e+38, -3.7188858e-01, -2.6541647e+37],
      dtype=float32)

### Creating a path to store the tensorboard log of the training

In [14]:
log_path = os.path.join('C://', 'Users', 'vyshn', 'Documents System', '6th Sem', 'RL', 'RL_Projects', 'Training', 'Logs')

In [15]:
log_path

'C:Users\\vyshn\\Documents System\\6th Sem\\RL\\RL_Projects\\Training\\Logs'

### Setting up dummy vector environment

In [19]:
env = gym.make(environment_name, render_mode = "human")
env = DummyVecEnv([lambda: env])
model = PPO('MlpPolicy', env, verbose = 1, tensorboard_log = log_path)

Using cpu device


### Training the model

In [10]:
model.learn(total_timesteps = 20000)

Logging to C:Users\vyshn\Documents System\6th Sem\RL\RL_Projects\Training\Logs\PPO_1
-----------------------------
| time/              |      |
|    fps             | 521  |
|    iterations      | 1    |
|    time_elapsed    | 3    |
|    total_timesteps | 2048 |
-----------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 381         |
|    iterations           | 2           |
|    time_elapsed         | 10          |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.009878541 |
|    clip_fraction        | 0.121       |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.686      |
|    explained_variance   | 0.00231     |
|    learning_rate        | 0.0003      |
|    loss                 | 5.08        |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0186     |
|    value_loss           | 51.4 

<stable_baselines3.ppo.ppo.PPO at 0x1bb14ccf350>

### Setting up path for storing the model

In [16]:
PPO_path = os.path.join('C:', 'Users', 'vyshn', 'Documents System', '6th Sem', 'RL', 'RL_Projects', 'Training', 'Saved Models', 'PPO_Model_Cartpole.zip')

In [25]:
model.save(PPO_path)



In [93]:
del model

### Reloading the model from saved path

In [20]:
model = PPO.load(PPO_path, env = env)

### Evaluating the model's performance

In [21]:
#Evaluation
evaluate_policy(model, env, n_eval_episodes = 10, render = True) 



(500.0, 0.0)

### Closing the env

In [93]:
env.close()

In [16]:
# Observation space
env.reset()[0]

array([ 0.0273257 ,  0.04699913,  0.01946591, -0.02997928], dtype=float32)

### Gameplay by the PPO Model

In [40]:
obs = env.reset()
while True:
    action, _states = model.predict(obs)
    obs, rewards, done, info = env.step(action)
    env.render()
    if done: 
        print('info', info)
        break

info [{'TimeLimit.truncated': True, 'terminal_observation': array([-1.4361699 , -0.03888802,  0.03800402, -0.05264218], dtype=float32)}]


### Playing for 5 episodes

In [18]:
episodes = 5
for episode in range(1, episodes + 1):
    obs  = env.reset()
    done = False
    score = 0
    
    while not done:
        env.render()
        action, _states = model.predict(obs)
        obs, reward, done, info = env.step(action)
        score += reward
    print("Episode: {}, Score: {}".format(episode, score))
env.close()

ValueError: You have passed a tuple to the predict() function instead of a Numpy array or a Dict. You are probably mixing Gym API with SB3 VecEnv API: `obs, info = env.reset()` (Gym) vs `obs = vec_env.reset()` (SB3 VecEnv). See related issue https://github.com/DLR-RM/stable-baselines3/issues/1694 and documentation for more information: https://stable-baselines3.readthedocs.io/en/master/guide/vec_envs.html#vecenv-api-vs-gym-api

In [29]:
!pip install tensorboard



# Viewing logs in Tensorboard

In [16]:
training_log_path = os.path.join(log_path, 'PPO_5')

In [17]:
training_log_path

'C:Users\\vyshn\\Documents System\\6th Sem\\RL\\RL_Projects\\Training\\Logs\\PPO_5'

In [18]:
!tensorboard --logdir={training_log_path}

2024-02-14 12:30:34.321865: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.

usage: tensorboard [-h] [--helpfull] [--logdir PATH] [--logdir_spec PATH_SPEC]
                   [--host ADDR] [--bind_all] [--port PORT]
                   [--reuse_port BOOL] [--load_fast {false,auto,true}]
                   [--extra_data_server_flags EXTRA_DATA_SERVER_FLAGS]
                   [--grpc_creds_type {local,ssl,ssl_dev}]
                   [--grpc_data_provider PORT] [--purge_orphaned_data BOOL]
                   [--db URI] [--db_import] [--inspect] [--version_tb]
                   [--tag TAG] [--event_file PATH] [--path_prefix PATH]
                   [--window_title TEXT] [--max_reload_threads COUNT]
                   [--reload_interval SECONDS] [--reload_task TYPE]
        

In [33]:
!pip install tensorflow

Collecting tensorflow
  Obtaining dependency information for tensorflow from https://files.pythonhosted.org/packages/93/21/9b035a4f823d6aee2917c75415be9a95861ff3d73a0a65e48edbf210cec1/tensorflow-2.15.0-cp311-cp311-win_amd64.whl.metadata
  Downloading tensorflow-2.15.0-cp311-cp311-win_amd64.whl.metadata (3.6 kB)
Collecting tensorflow-intel==2.15.0 (from tensorflow)
  Obtaining dependency information for tensorflow-intel==2.15.0 from https://files.pythonhosted.org/packages/4c/48/1a5a15517f18eaa4ff8d598b1c000300b20c1bb0e624539d702117a0c369/tensorflow_intel-2.15.0-cp311-cp311-win_amd64.whl.metadata
  Downloading tensorflow_intel-2.15.0-cp311-cp311-win_amd64.whl.metadata (5.1 kB)
Collecting astunparse>=1.6.0 (from tensorflow-intel==2.15.0->tensorflow)
  Using cached astunparse-1.6.3-py2.py3-none-any.whl (12 kB)
Collecting flatbuffers>=23.5.26 (from tensorflow-intel==2.15.0->tensorflow)
  Obtaining dependency information for flatbuffers>=23.5.26 from https://files.pythonhosted.org/packages/6

# Adding a Callback to the training stage

In [38]:
from stable_baselines3.common.callbacks import EvalCallback, StopTrainingOnRewardThreshold

In [39]:
save_path = os.path.join('D:', 'RL', 'Training', 'Saved Models')

In [66]:
stop_callback = StopTrainingOnRewardThreshold(reward_threshold = 600, verbose = 1)
eval_callback = EvalCallback(env,
                            callback_on_new_best = stop_callback,
                            eval_freq = 10000,
                            best_model_save_path = save_path,
                            verbose = 1)

In [67]:
model = PPO('MlpPolicy', env, verbose = 1, tensorboard_log = log_path)

Using cpu device


MlpPolicy is a type of policy network that uses a multi-layer perceptron (MLP) to extract features from the observations and output actions and values

In [68]:
model.learn(total_timesteps = 20000, callback = eval_callback)

Logging to D:RL\Training\Logs\PPO_7
-----------------------------
| time/              |      |
|    fps             | 556  |
|    iterations      | 1    |
|    time_elapsed    | 3    |
|    total_timesteps | 2048 |
-----------------------------
----------------------------------------
| time/                   |            |
|    fps                  | 381        |
|    iterations           | 2          |
|    time_elapsed         | 10         |
|    total_timesteps      | 4096       |
| train/                  |            |
|    approx_kl            | 0.00831252 |
|    clip_fraction        | 0.113      |
|    clip_range           | 0.2        |
|    entropy_loss         | -0.686     |
|    explained_variance   | 0.0069     |
|    learning_rate        | 0.0003     |
|    loss                 | 7.26       |
|    n_updates            | 10         |
|    policy_gradient_loss | -0.0193    |
|    value_loss           | 53.4       |
----------------------------------------
----------------

<stable_baselines3.ppo.ppo.PPO at 0x1962d596c50>

Verbose controls the amount of info about the training displayed.

In [99]:
model_path = os.path.join('D:', 'RL', 'Training', 'Saved Models','best_model')
model = PPO.load(model_path, env = env)

Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [121]:
evaluate_policy(model, env, n_eval_episodes = 10, render = True)

(9.6, 0.4898979485566356)

In [128]:
env.close()

# Changing Policies

In [114]:
net_arch = [dict(pi = [128, 128, 128, 128], vf = [128, 128, 128, 128])]

In [115]:
model = PPO('MlpPolicy', env, verbose = 1, policy_kwargs = {'net_arch' : net_arch})

Using cpu device


In [129]:
model.learn(total_timesteps = 20000, callback = eval_callback)

Logging to D:RL\Training\Logs\DQN_4
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.956    |
| time/               |          |
|    episodes         | 4        |
|    fps              | 3423     |
|    time_elapsed     | 0        |
|    total_timesteps  | 92       |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.917    |
| time/               |          |
|    episodes         | 8        |
|    fps              | 3974     |
|    time_elapsed     | 0        |
|    total_timesteps  | 174      |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.857    |
| time/               |          |
|    episodes         | 12       |
|    fps              | 4672     |
|    time_elapsed     | 0        |
|    total_timesteps  | 300      |
----------------------------------
-------------------

<stable_baselines3.dqn.dqn.DQN at 0x196318a09d0>

# Using an alternative Algorithm (DQN)

In [7]:
from stable_baselines3 import DQN

In [11]:
model = DQN('MlpPolicy', env, verbose = 1, tensorboard_log = log_path)

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [132]:
model.learn(total_timesteps = 20000)

Logging to D:RL\Training\Logs\PPO_9
-----------------------------
| time/              |      |
|    fps             | 557  |
|    iterations      | 1    |
|    time_elapsed    | 3    |
|    total_timesteps | 2048 |
-----------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 389         |
|    iterations           | 2           |
|    time_elapsed         | 10          |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.008692675 |
|    clip_fraction        | 0.107       |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.686      |
|    explained_variance   | -0.00132    |
|    learning_rate        | 0.0003      |
|    loss                 | 6.48        |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.017      |
|    value_loss           | 49.3        |
----------------------------------------

<stable_baselines3.ppo.ppo.PPO at 0x19635917990>

In [134]:
evaluate_policy(model, env, n_eval_episodes = 10, render = True)

(500.0, 0.0)

In [3]:
dqn_path = os.path.join('D:', 'RL', 'Training', 'Saved Models', 'DQN_model')

In [4]:
model.save(dqn_path)

NameError: name 'model' is not defined

In [139]:
env.close()