In [3]:
import stable_baselines3

# Record the installed Stable-Baselines3 version next to the results.
# The trailing `=` inside the f-string echoes the expression text followed by the repr of its value.
print(f"{stable_baselines3.__version__=}")

stable_baselines3.__version__='2.5.0'


In [4]:
import gymnasium as gym
import numpy as np

# Record the installed Gymnasium version next to the results.
# The trailing `=` inside the f-string echoes the expression text followed by the repr of its value.
print(f"{gym.__version__=}")

gym.__version__='1.0.0'


### RL algorithm

In [8]:
from stable_baselines3 import PPO

The next thing you need to import is the policy class that will be used to create the networks (for the policy/value functions).
This step is optional as you can directly use strings in the constructor: 

```PPO('MlpPolicy', env)``` instead of ```PPO(MlpPolicy, env)```

Note that some algorithms, like `SAC`, have their own `MlpPolicy`; that is why using a string for the policy is the recommended option.

In [18]:
from stable_baselines3.ppo import MlpPolicy

### Environment

In [19]:
# Fixed seed handed to PPO so that a fresh "Restart & Run All" repeats the same
# training run. Exact numbers may still differ across platforms or library versions.
SEED = 0

# Training environment: CartPole-v1, with rendering configured to return RGB arrays.
env = gym.make("CartPole-v1", render_mode="rgb_array")

# PPO agent using the imported MlpPolicy class; `verbose=0` requests no training logs.
model = PPO(MlpPolicy, env, verbose=0, seed=SEED)

In [20]:
# Stable-Baselines3 ships a ready-made helper for evaluating a policy,
# so there is no need to write an evaluation loop by hand:
from stable_baselines3.common.evaluation import evaluate_policy

In [21]:
from stable_baselines3.common.monitor import Monitor

# Use a separate environment for evaluation, wrapped in `Monitor`.
# `evaluate_policy` warns when the evaluation env is not Monitor-wrapped, because
# other wrappers could otherwise alter the episode rewards/lengths it reports.
eval_env = Monitor(gym.make("CartPole-v1", render_mode="rgb_array"))

# Baseline: the freshly constructed (untrained) agent, averaged over 100 episodes.
# The helper returns the mean and standard deviation of the per-episode reward.
mean_reward, std_reward = evaluate_policy(model, eval_env, n_eval_episodes=100)

print(f"mean_reward:{mean_reward:.2f} +/- {std_reward:.2f}")

mean_reward:9.14 +/- 0.72


## Train the agent and evaluate it

In [22]:
# Training budget, expressed in environment steps.
TRAINING_STEPS = 10_000

# Run PPO training for that budget. The call stays the last expression of the
# cell, so the notebook echoes the object that `learn` returns.
model.learn(total_timesteps=TRAINING_STEPS)

<stable_baselines3.ppo.ppo.PPO at 0x321f2e790>

In [23]:
# Score the agent again now that it has been trained, using the same
# evaluation environment and the same number of episodes as before.
mean_reward, std_reward = evaluate_policy(
    model,
    eval_env,
    n_eval_episodes=100,
)

# Report mean and standard deviation of the per-episode reward.
print(f"mean_reward:{mean_reward:.2f} +/- {std_reward:.2f}")

mean_reward:449.37 +/- 77.40
