In [3]:
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.evaluation import evaluate_policy
import gymnasium as gym
import optuna
import torch.nn as nn

In [4]:
# Training environment: a DummyVecEnv holding 32 BipedalWalker-v3 instances.
# The same factory callable is listed 32 times; each invocation of it calls
# gym.make again, so every slot gets its own environment object.
env = DummyVecEnv([(lambda: gym.make("BipedalWalker-v3"))] * 32)

In [9]:


# Training budget and RNG seed, named so they are easy to find and tune.
TOTAL_TIMESTEPS = 5_000_000
SEED = 0  # fixed so that a Restart & Run All reproduces the same training run

# NOTE(review): the log captured under this cell reports total_timesteps=2048
# after the first iteration and prints the Monitor/DummyVecEnv wrapping
# messages, which looks like a single, un-vectorised env was bound to `env`
# when it was last run (not the 32-env DummyVecEnv). Confirm by re-running
# from a fresh kernel.

# Initialize the PPO agent with the environment
model = PPO(
    policy="MlpPolicy",   # Multi-layer perceptron policy
    env=env,              # Environment
    verbose=1,            # Logging level
    learning_rate=3e-4,   # Learning rate
    gamma=0.999,          # Discount factor
    n_steps=2048,         # Number of steps to run for each environment per update
    batch_size=64,        # Mini-batch size
    gae_lambda=0.95,      # GAE lambda
    ent_coef=0.0,         # Entropy coefficient in the loss
    clip_range=0.18,      # PPO clipping parameter
    n_epochs=10,          # Optimisation epochs per update
    seed=SEED,            # Seed for the pseudo-random generators
)

# Train the agent for TOTAL_TIMESTEPS (5,000,000) environment steps
model.learn(total_timesteps=TOTAL_TIMESTEPS)

# Save the model
model.save("ppo_bipedal_best")


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 84.4     |
|    ep_rew_mean     | -109     |
| time/              |          |
|    fps             | 3670     |
|    iterations      | 1        |
|    time_elapsed    | 0        |
|    total_timesteps | 2048     |
---------------------------------
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 211          |
|    ep_rew_mean          | -109         |
| time/                   |              |
|    fps                  | 2507         |
|    iterations           | 2            |
|    time_elapsed         | 1            |
|    total_timesteps      | 4096         |
| train/                  |              |
|    approx_kl            | 0.0051386817 |
|    clip_fraction        | 0.0567       |
|    clip_range           | 0.18         |
|    en

In [10]:
# Evaluate the saved policy over 30 episodes on a fresh BipedalWalker-v3 env.
# A dedicated name is used so the training `env` defined earlier is not
# overwritten when this cell runs.
eval_env = gym.make("BipedalWalker-v3")
model = PPO.load("ppo_bipedal_best", env=eval_env)

# PPO.load wraps the env it is given (see the "Wrapping the env ..." messages
# it prints); evaluate on that wrapped env via model.get_env().
# No manual reset is needed: evaluate_policy resets the env itself.
mean_reward, std_reward = evaluate_policy(model, model.get_env(), n_eval_episodes=30)
print(f"Mean Reward = {mean_reward}, Std Reward = {std_reward}")

# Release the environment once evaluation is finished.
eval_env.close()


Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Mean Reward = 235.44227944483484, Std Reward = 95.43356145832836


In [12]:
from gymnasium.wrappers import RecordVideo

# Record one deterministic rollout of the saved policy to disk.
video_folder = "videosbipedal"  # Directory to save the video

# A dedicated name is used so the training `env` is not overwritten.
video_env = gym.make("BipedalWalker-v3", render_mode="rgb_array")
video_env = RecordVideo(
    video_env,
    video_folder=video_folder,
    episode_trigger=lambda episode_index: True,  # record every episode
)

model = PPO.load("ppo_bipedal_best", env=video_env)

# Test the trained agent
try:
    obs, info = video_env.reset()
    episode_over = False
    while not episode_over:
        action, _states = model.predict(obs, deterministic=True)
        obs, reward, terminated, truncated, info = video_env.step(action)
        # step() returns separate `terminated` and `truncated` flags; the
        # episode is finished when either is set. Checking only the first
        # keeps stepping a finished episode and may never leave the loop.
        episode_over = terminated or truncated
finally:
    # Always close the env, even if the rollout raises or is interrupted.
    video_env.close()

Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
