# PPO for Lunar Lander with Stable Baselines3

This notebook trains an agent with the Proximal Policy Optimization (PPO) implementation from Stable Baselines3 to solve the Lunar Lander environment from Gymnasium (the maintained successor to OpenAI Gym).

## 1. Install Required Packages

Run this cell first to install all dependencies.

In [None]:
!pip install swig
!pip install gymnasium[box2d]
!pip install stable-baselines3[extra]
!pip install tensorboard
import gymnasium as gym
import numpy as np
import matplotlib.pyplot as plt
from stable_baselines3 import PPO
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.callbacks import EvalCallback, CheckpointCallback
import os
from matplotlib import animation
from IPython.display import HTML

In [None]:
# Sanity-check the environment by driving it with random actions for 1000 steps.
# NOTE: render_mode="human" renders to an on-screen window; on a headless
# machine (e.g. a hosted notebook) this may fail -- use "rgb_array" there.
env = gym.make("LunarLander-v3", render_mode="human")

try:
    # Reset the environment to generate the first observation
    observation, info = env.reset(seed=42)
    for _ in range(1000):
        # Placeholder policy: sample a random action from the action space
        action = env.action_space.sample()

        # Step (transition) through the environment with the action, receiving
        # the next observation, the reward and the two episode-end flags
        observation, reward, terminated, truncated, info = env.step(action)

        # If the episode has ended, reset to start a new episode
        if terminated or truncated:
            observation, info = env.reset()
finally:
    # Release the environment even if reset/step raised part-way through
    env.close()

## 3. Create the Environment

In [None]:
# Create a single environment for testing.
# With render_mode='rgb_array', env.render() hands back the frame as an image
# array (this is how frames are collected in the visualization cell below).
env = gym.make('LunarLander-v3', render_mode='rgb_array')
# reset() yields the initial observation together with an info dict
obs, info = env.reset()

## 4. Create Vectorized Environment for Training

Using multiple parallel environments speeds up training.

In [None]:
# Create a vectorized environment made of 10 copies of LunarLander-v3,
# seeded with 42
vec_env = make_vec_env('LunarLander-v3', n_envs=10, seed=42)

## 5. Initialize PPO Agent

In [None]:
# Create directories for logs and models
os.makedirs("logs", exist_ok=True)
os.makedirs("models", exist_ok=True)

# Initialize the PPO agent on the vectorized environment so the environments
# created for training are actually used (previously the single test `env`
# was passed and `vec_env` was never referenced).
# NOTE: with a vectorized env each rollout collects n_steps transitions from
# every sub-environment, and callback frequencies are counted per env.step()
# call rather than per total timestep.
model = PPO(
    "MlpPolicy",
    vec_env,
    learning_rate=0.0003,
    n_steps=2048,
    batch_size=64,
    n_epochs=10,
    gamma=0.99,
    gae_lambda=0.95,
    clip_range=0.2,
    clip_range_vf=None,
    normalize_advantage=True,
    ent_coef=0.0,
    vf_coef=0.5,
    max_grad_norm=0.5,
    use_sde=False,
    sde_sample_freq=-1,
    rollout_buffer_class=None,
    rollout_buffer_kwargs=None,
    target_kl=None,
    stats_window_size=100,
    tensorboard_log=None,
    policy_kwargs=None,
    verbose=0,
    seed=None,
    device='auto',
    _init_setup_model=True,
)

## 6. Set Up Callbacks for Evaluation and Checkpointing

In [None]:
# Create evaluation environment
eval_env = gym.make('LunarLander-v3')

# Callback frequencies are counted in calls to env.step() on the training
# environment. When the model trains on several environments at once, one call
# advances all of them, so divide by the number of training environments to
# keep the frequencies expressed in total timesteps.
n_train_envs = model.n_envs

# Evaluation callback - evaluates the model roughly every 10000 timesteps and
# keeps the best-scoring model under ./models/
eval_callback = EvalCallback(
    eval_env,
    best_model_save_path="./models/",
    log_path="./logs/",
    eval_freq=max(10000 // n_train_envs, 1),
    deterministic=True,
    render=False
)

# Checkpoint callback - saves the model roughly every 50000 timesteps
checkpoint_callback = CheckpointCallback(
    save_freq=max(50000 // n_train_envs, 1),
    save_path="./models/",
    name_prefix="ppo_model"
)

## 7. Train the Agent

This will train for 500,000 timesteps. Adjust as needed.

In [None]:
# Training budget, in environment timesteps
total_timesteps = 500_000

# Callbacks invoked during training: periodic evaluation plus checkpointing
training_callbacks = [eval_callback, checkpoint_callback]

print(f"Starting training for {total_timesteps} timesteps...")
model.learn(
    total_timesteps=total_timesteps,
    callback=training_callbacks,
    progress_bar=False,
)
print("\nTraining completed!")

# Persist the policy as it stands at the end of training
final_model_path = "models/ppo_lunar_lander_final"
model.save(final_model_path)
print("Final model saved!")

## 8. Evaluate the Trained Agent

In [None]:
# Load the final model saved at the end of training.
# (This is not necessarily the best-scoring one: EvalCallback stores its best
# model separately under ./models/.)
model = PPO.load("models/ppo_lunar_lander_final.zip")

# Evaluate the agent over 100 episodes on a fresh environment
eval_env = gym.make('LunarLander-v3')
try:
    mean_reward, std_reward = evaluate_policy(model, eval_env, n_eval_episodes=100)
finally:
    # Release the environment even if evaluation fails
    eval_env.close()

print(f"Mean reward over 100 episodes: {mean_reward:.2f} +/- {std_reward:.2f}")

In [None]:
def create_episode_animation(model, env_name='LunarLander-v3', deterministic=True):
    """
    Create an animated visualization of one episode.

    Runs ``model`` for a single episode in a freshly created environment,
    recording one rendered frame before every step, prints the episode length
    and total reward, and wraps the frames in a matplotlib animation.

    Args:
        model: Object exposing ``predict(obs, deterministic=...)`` that
            returns ``(action, state)``, e.g. the trained PPO model.
        env_name: Gymnasium environment id to roll the episode out in.
        deterministic: Passed through to ``model.predict``.

    Returns:
        A ``matplotlib.animation.FuncAnimation`` with one frame per step.
    """
    env = gym.make(env_name, render_mode='rgb_array')

    # Collect frames from one episode
    frames = []
    total_reward = 0.0
    step_count = 0

    try:
        obs, info = env.reset()
        done = False

        while not done:
            # Render and store the frame for the state the agent is about to act on
            frames.append(env.render())

            # Get action from trained model
            action, _ = model.predict(obs, deterministic=deterministic)
            obs, reward, terminated, truncated, info = env.step(action)
            done = terminated or truncated
            total_reward += reward
            step_count += 1
    finally:
        # Release the environment even if predict/step raised mid-episode
        env.close()

    print(f"Episode completed in {step_count} steps with total reward: {total_reward:.2f}")

    # Create animation
    fig, ax = plt.subplots(figsize=(8, 6))
    ax.axis('off')
    img = ax.imshow(frames[0])

    def animate(i):
        """Show frame ``i``; the title carries the episode's final total reward."""
        img.set_array(frames[i])
        ax.set_title(f'Step: {i+1}/{len(frames)} | Reward: {total_reward:.2f}', fontsize=14)
        return [img]

    anim = animation.FuncAnimation(
        fig, animate, frames=len(frames), interval=50, blit=True, repeat=True
    )

    # Close this specific figure (not whichever figure happens to be current)
    # so the notebook does not also show a static copy of it
    plt.close(fig)
    return anim

# Create and display the animation
print("Creating animation of trained agent...\n")
anim = create_episode_animation(model)
# Convert the animation to its JavaScript/HTML representation; this must stay
# the last expression in the cell so the notebook displays the HTML object.
HTML(anim.to_jshtml())

## Summary

This notebook demonstrates:
1. Setting up the Lunar Lander environment
2. Training a PPO agent with Stable Baselines3
3. Evaluating the trained agent
4. Visualizing performance
5. Running a random-action rollout as a baseline sanity check

The PPO algorithm should achieve an average reward above 200 (considered solved) after sufficient training.