# 1. Import Dependencies

In [None]:
!pip install stable-baselines3[extra]

Collecting stable-baselines3[extra]
  Downloading stable_baselines3-2.7.0-py3-none-any.whl.metadata (4.8 kB)
Downloading stable_baselines3-2.7.0-py3-none-any.whl (187 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m187.2/187.2 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: stable-baselines3
Successfully installed stable-baselines3-2.7.0


In [None]:
import gymnasium as gym
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.evaluation import evaluate_policy

# 2. Load Environment

In [None]:
# Gymnasium environment ID passed to the gym.make() calls below.
environment_name = "CartPole-v1"

In [None]:
# Create the environment with on-screen rendering requested up front; the rollout loop
# below therefore does not call env.render() itself.
env = gym.make(environment_name, render_mode="human")

  logger.deprecation(


In [None]:
# Baseline: play a handful of episodes with uniformly random actions and print each return.
episodes = 5
for episode in range(1, episodes + 1):
    env.reset()
    score = 0
    terminated = truncated = False

    # No explicit render call is needed: the render mode was chosen in gym.make.
    while not (terminated or truncated):
        action = env.action_space.sample()
        n_state, reward, terminated, truncated, info = env.step(action)
        score += reward
    print('Episode:{} Score:{}'.format(episode, score))
env.close()

Episode:1 Score:40.0
Episode:2 Score:16.0
Episode:3 Score:21.0
Episode:4 Score:22.0
Episode:5 Score:16.0


# Understanding The Environment

In [None]:
# Draw one random action: 0 pushes the cart to the left, 1 pushes it to the right.
env.action_space.sample()

np.int64(1)

In [None]:
# Draw one random observation, laid out as
# [cart position, cart velocity, pole angle, pole angular velocity].
env.observation_space.sample()

array([ 0.09414016, -0.55836326, -0.24045606,  0.8105944 ], dtype=float32)

# 3. Train an RL Model

In [None]:
import os

# TensorBoard event files for every run are written beneath this directory.
log_path = os.path.join('Training', 'Logs')

# Build the single-environment vectorized wrapper; the factory creates the env itself.
env = DummyVecEnv([lambda: gym.make(environment_name)])

# Train a PPO agent with the default MLP policy for 20k environment steps.
model = PPO('MlpPolicy', env, verbose=1, tensorboard_log=log_path)
model.learn(total_timesteps=20000)

Using cpu device
Logging to Training/Logs/PPO_1


  logger.deprecation(


-----------------------------
| time/              |      |
|    fps             | 1140 |
|    iterations      | 1    |
|    time_elapsed    | 1    |
|    total_timesteps | 2048 |
-----------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 870         |
|    iterations           | 2           |
|    time_elapsed         | 4           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.008567128 |
|    clip_fraction        | 0.0812      |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.687      |
|    explained_variance   | 0.000365    |
|    learning_rate        | 0.0003      |
|    loss                 | 6.17        |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0112     |
|    value_loss           | 50.8        |
-----------------------------------------
----------------------------------

<stable_baselines3.ppo.ppo.PPO at 0x7a926c830fb0>

# 4. Save and Reload Model

In [None]:
import os
# Destination for the trained PPO model, handed to model.save() on the next line.
PPO_path = os.path.join('Training', 'Saved Models', 'PPO_model')
model.save(PPO_path)


In [None]:
# Drop the in-memory model so the load below is a genuine round trip through disk.
del model
PPO_path = os.path.join('Training', 'Saved Models', 'PPO_model')
# Reload the saved model and attach the current vectorized environment to it.
model = PPO.load(PPO_path, env=env)

# 4. Evaluation

In [None]:
from stable_baselines3.common.evaluation import evaluate_policy
# Evaluate the reloaded policy over 10 episodes; the cell displays the value returned
# by evaluate_policy.
# NOTE(review): `env` was built without a render_mode, so render=True may not display
# anything — confirm against the VecEnv.render documentation.
evaluate_policy(model, env, n_eval_episodes=10, render=True)



(np.float64(200.0), np.float64(0.0))

In [None]:
# Release the environment's resources once evaluation is finished.
env.close()

# 5. Test Model

In [None]:
# Roll out the trained policy for a few rendered episodes and print each episode's return.
# A dedicated vectorized env is built here with render_mode="human" (as in the random-agent
# demo): the render mode has to be requested at construction time, and `env` was created
# without one.
test_env = DummyVecEnv([lambda: gym.make(environment_name, render_mode="human")])

episodes = 5
for episode in range(1, episodes + 1):
    obs = test_env.reset()  # a VecEnv reset returns only the batched observation
    done = False
    score = 0.0

    while not done:
        action, _states = model.predict(obs)
        # A VecEnv step returns four batched values (obs, rewards, dones, infos);
        # there is no separate terminated/truncated pair at this level.
        obs, rewards, dones, infos = test_env.step(action)
        # Only one sub-environment is wrapped, so index 0 holds this episode's values.
        done = bool(dones[0])
        score += float(rewards[0])
    print('Episode:{} Score:{}'.format(episode, score))
test_env.close()

In [None]:
# Make sure the environment is closed before moving on to the logging section.
env.close()

# 6. Viewing Logs in TensorBoard

In [None]:
# Point at the first PPO run's log directory (the first training cell reported
# "Logging to Training/Logs/PPO_1").
training_log_path = os.path.join(log_path, 'PPO_1')

In [None]:
!tensorboard --logdir={training_log_path}

  "[`\000-\040\177-\240\s]+",
  style = re.compile('url\s*\(\s*[^\s)]+?\s*\)\s*').sub(' ', style)
  if not re.match("^\s*([-\w]+\s*:[^:;]*(;\s*|$))*$", style):
  for prop, value in re.findall('([-\w]+)\s*:\s*([^:;]*)', style):
2025-09-28 05:33:30.841761: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1759037610.867955    2774 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1759037610.875517    2774 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1759037610.894901    2774 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1759037610.894945    27

# 7. Adding a Callback to the Training Stage

In [None]:
from stable_baselines3.common.callbacks import EvalCallback, StopTrainingOnRewardThreshold
import os
# save_path is given to EvalCallback as best_model_save_path below;
# log_path is the TensorBoard log directory passed to the model constructors.
save_path = os.path.join('Training', 'Saved Models')
log_path = os.path.join('Training', 'Logs')

In [None]:
# Fresh single-environment vectorized wrapper; the factory constructs the env itself.
env = DummyVecEnv([lambda: gym.make(environment_name)])

In [None]:
# Callback configured with a reward threshold of 190; it is attached below so that it
# runs whenever the evaluation callback records a new best model.
stop_callback = StopTrainingOnRewardThreshold(verbose=1, reward_threshold=190)

# Evaluation callback: evaluates on `env` with eval_freq=10000 and saves the best
# model under save_path.
eval_callback = EvalCallback(
    env,
    eval_freq=10000,
    best_model_save_path=save_path,
    callback_on_new_best=stop_callback,
    verbose=1,
)

In [None]:
# Train a new PPO agent, this time with the evaluation callback attached to learn().
model = PPO(policy='MlpPolicy', env=env, tensorboard_log=log_path, verbose=1)
model.learn(callback=eval_callback, total_timesteps=20000)

Using cpu device
Logging to Training/Logs/PPO_2
-----------------------------
| time/              |      |
|    fps             | 1147 |
|    iterations      | 1    |
|    time_elapsed    | 1    |
|    total_timesteps | 2048 |
-----------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 872         |
|    iterations           | 2           |
|    time_elapsed         | 4           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.008874618 |
|    clip_fraction        | 0.107       |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.686      |
|    explained_variance   | 0.0116      |
|    learning_rate        | 0.0003      |
|    loss                 | 7.03        |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0152     |
|    value_loss           | 54.2        |
----------------------------

<stable_baselines3.ppo.ppo.PPO at 0x7a926c9261e0>

In [None]:
# Load the checkpoint named 'best_model' from the same directory that was given to
# EvalCallback as best_model_save_path (presumably the file that callback wrote — confirm).
model_path = os.path.join('Training', 'Saved Models', 'best_model')
model = PPO.load(model_path, env=env)

In [None]:
# Evaluate the loaded best model over 10 episodes.
# NOTE(review): `env` was built without a render_mode, so render=True may have no visible effect — confirm.
evaluate_policy(model, env, n_eval_episodes=10, render=True)

(np.float64(200.0), np.float64(0.0))

In [None]:
# Release the environment's resources after evaluating the best model.
env.close()

# 8. Changing Policies

In [None]:
# Custom network: four hidden layers of 128 units each for the policy (pi) and, separately,
# for the value function (vf). The dict is passed directly: the list-wrapped form
# [dict(pi=..., vf=...)] has been deprecated since SB3 v1.8.0, when shared layers were removed.
net_arch = dict(pi=[128, 128, 128, 128], vf=[128, 128, 128, 128])
model = PPO('MlpPolicy', env, verbose=1, policy_kwargs={'net_arch': net_arch})
model.learn(total_timesteps=20000, callback=eval_callback)

Using cpu device




-----------------------------
| time/              |      |
|    fps             | 1082 |
|    iterations      | 1    |
|    time_elapsed    | 1    |
|    total_timesteps | 2048 |
-----------------------------
----------------------------------------
| time/                   |            |
|    fps                  | 648        |
|    iterations           | 2          |
|    time_elapsed         | 6          |
|    total_timesteps      | 4096       |
| train/                  |            |
|    approx_kl            | 0.01434309 |
|    clip_fraction        | 0.21       |
|    clip_range           | 0.2        |
|    entropy_loss         | -0.681     |
|    explained_variance   | 0.00335    |
|    learning_rate        | 0.0003     |
|    loss                 | 3.95       |
|    n_updates            | 10         |
|    policy_gradient_loss | -0.0271    |
|    value_loss           | 21.3       |
----------------------------------------
-----------------------------------------
| time/   

<stable_baselines3.ppo.ppo.PPO at 0x7a926c925d60>

# 9. Using an Alternate Algorithm

In [53]:
from stable_baselines3 import DQN

# Swap the algorithm: same environment, log directory and evaluation callback, but DQN
# instead of PPO.
model = DQN(policy='MlpPolicy', env=env, tensorboard_log=log_path, verbose=1)
model.learn(callback=eval_callback, total_timesteps=20000)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
|    learning_rate    | 0.0001   |
|    loss             | 0.000889 |
|    n_updates        | 1228     |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 432      |
|    fps              | 850      |
|    time_elapsed     | 5        |
|    total_timesteps  | 5059     |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.000273 |
|    n_updates        | 1239     |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 436      |
|    fps              | 832      |
|    time_elapsed     | 6        |
|    total_timesteps  | 5103     |
| train/              |          |
|    learning_rate    | 0

<stable_baselines3.dqn.dqn.DQN at 0x7a926cb16e70>

In [54]:
# Save the trained DQN model next to the PPO checkpoints.
dqn_path = os.path.join('Training', 'Saved Models', 'DQN_model')
model.save(dqn_path)

In [55]:
# Reload the saved DQN model and evaluate it over 10 episodes.
# NOTE(review): the recorded output shows a mean reward of 9.4, far below the PPO runs
# (200.0) — presumably DQN needs more timesteps or tuning here; confirm.
model = DQN.load(dqn_path, env=env)
evaluate_policy(model, env, n_eval_episodes=10, render=True)

(np.float64(9.4), np.float64(0.8))

In [56]:
# Final cleanup: close the environment at the end of the notebook.
env.close()