In [4]:
!apt-get install ffmpeg freeglut3-dev xvfb
!pip install "stable-baselines3[extra]>=2.0.0a4"

E: Could not open lock file /var/lib/dpkg/lock-frontend - open (13: Permission denied)
E: Unable to acquire the dpkg frontend lock (/var/lib/dpkg/lock-frontend), are you root?
Collecting stable-baselines3>=2.0.0a4 (from stable-baselines3[extra]>=2.0.0a4)
  Downloading stable_baselines3-2.3.0-py3-none-any.whl.metadata (5.1 kB)
Collecting opencv-python (from stable-baselines3[extra]>=2.0.0a4)
  Downloading opencv_python-4.9.0.80-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (20 kB)
Collecting pygame (from stable-baselines3[extra]>=2.0.0a4)
  Downloading pygame-2.5.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Collecting tensorboard>=2.9.1 (from stable-baselines3[extra]>=2.0.0a4)
  Downloading tensorboard-2.16.2-py3-none-any.whl.metadata (1.6 kB)
Collecting shimmy~=1.3.0 (from shimmy[atari]~=1.3.0; extra == "extra"->stable-baselines3[extra]>=2.0.0a4)
  Downloading Shimmy-1.3.0-py3-none-any.whl.metadata (3.7 kB)
Collecting autorom~=0.6.1 

#### Libraries

In [1]:
# Import libraries
import environ as en
import gymnasium as gym
import numpy as np

#### Setup Model

In [2]:
# Import model (PPO)
from stable_baselines3 import PPO
# MlpPolicy, since CartPole observations are a feature vector, not an image (CnnPolicy would be used for images)
from stable_baselines3.ppo.policies import MlpPolicy


# Fixed seed so that training and evaluation results can be reproduced across runs
SEED = 42

# Create an environment for the model
env = gym.make("CartPole-v1")
# Make the (still untrained) PPO model; `seed` fixes the pseudo-random generators it uses
model = PPO(MlpPolicy, env, verbose=0, seed=SEED)

#### Custom Evaluate function

In [8]:
# Base RL algorithm 
from stable_baselines3.common.base_class import BaseAlgorithm


def evaluate(
    model: BaseAlgorithm,
    num_episodes: int = 100,
    deterministic: bool = True,
) -> float:
    """
    Evaluate an RL agent for `num_episodes`.

    The rollouts are run on the environment attached to the model
    (``model.get_env()``), so no separate environment argument is needed.
    The mean episode reward is also printed.

    :param model: the RL Agent; its own (vectorized) environment is used
    :param num_episodes: number of episodes to evaluate it
    :param deterministic: Whether to use deterministic or stochastic actions
    :return: Mean reward for the last `num_episodes`
    :raises ValueError: if the model's vectorized environment wraps more than
        one environment
    """
    vec_env = model.get_env()  # the vectorized environment attached to the model

    # The loop below tracks a single `done` flag, so it only makes sense for one
    # environment. Fail fast with a clear message instead of crashing mid-rollout.
    num_envs = getattr(vec_env, "num_envs", 1)
    if num_envs != 1:
        raise ValueError(
            f"evaluate() only supports a single environment, but the model's env has {num_envs}"
        )

    obs = vec_env.reset()
    all_episode_rewards = []
    for _ in range(num_episodes):
        episode_reward = 0.0
        done = False
        # Note: SB3 VecEnv resets automatically at the end of an episode, so
        # there is no explicit reset between episodes:
        # https://stable-baselines3.readthedocs.io/en/master/guide/vec_envs.html#vecenv-api-vs-gym-api
        while not done:
            # _states are only useful when using LSTM policies
            action, _states = model.predict(obs, deterministic=deterministic)
            # rewards and dones are arrays because the env is vectorized;
            # index 0 is the single environment being evaluated
            obs, rewards, dones, _infos = vec_env.step(action)
            episode_reward += float(rewards[0])
            done = bool(dones[0])

        all_episode_rewards.append(episode_reward)

    mean_episode_reward = float(np.mean(all_episode_rewards))
    print(f"Mean reward: {mean_episode_reward:.2f} - Num episodes: {num_episodes}")

    return mean_episode_reward

#### Stable Baselines Evaluate

In [10]:
# Baseline: score the agent with the custom evaluation loop before any training
mean_reward_before_train = evaluate(
    model,
    deterministic=True,
    num_episodes=100,
)

Mean reward: 9.17 - Num episodes: 100


In [4]:
# Stable-Baselines Evaluate Policy
from stable_baselines3.common.evaluation import evaluate_policy

# Score the current policy over 100 episodes with SB3's built-in helper,
# which reports the mean and standard deviation of the episode reward
mean_reward, std_reward = evaluate_policy(
    model,
    env,
    warn=False,
    n_eval_episodes=100,
)

print(f"mean_reward: {mean_reward:.2f} +/- {std_reward:.2f}")

mean_reward: 412.55 +/- 95.54


#### Training the Model and Evaluate

In [5]:
# Run 10_000 training timesteps on the agent, displaying a progress bar
# NOTE(review): tb_log_name presumably only has an effect when the model was
# created with `tensorboard_log=...` — confirm against the model setup cell
model.learn(
    total_timesteps=10_000,
    progress_bar=True,
    log_interval=100,
    tb_log_name="test1",
)

# Score the agent again now that it has been trained
mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=100)

print(f"mean_reward:{mean_reward:.2f} +/- {std_reward:.2f}")



mean_reward:500.00 +/- 0.00
