In [1]:
import gymnasium as gym
import torch
import torch.nn as nn
import random
import math
import numpy as np
from collections import deque
from ale_py import ALEInterface
import optuna
import time

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.evaluation import evaluate_policy

In [14]:
def objective(trial):
    """Optuna objective: train PPO on LunarLander-v3 and score it by mean reward.

    Samples one PPO hyperparameter configuration from ``trial``, trains a
    model on four LunarLander-v3 environments for 50,000 timesteps, and
    evaluates it for 10 episodes on the same vectorized environment.

    Args:
        trial: The Optuna trial used to sample hyperparameters and to report
            the evaluation result.

    Returns:
        The mean reward returned by ``evaluate_policy`` (the study maximizes it).

    Raises:
        optuna.exceptions.TrialPruned: If the study's pruner rejects the
            reported value.
    """
    # Suggest hyperparameters for PPO
    learning_rate = trial.suggest_float("learning_rate", 1e-5, 1e-2, log=True)
    n_steps = trial.suggest_int("n_steps", 256, 4096, step=256)
    batch_size = trial.suggest_categorical("batch_size", [32, 64, 128, 256, 512])
    gamma = trial.suggest_float("gamma", 0.95, 0.999, step=0.005)
    gae_lambda = trial.suggest_float("gae_lambda", 0.85, 0.99, step=0.01)
    clip_range = trial.suggest_float("clip_range", 0.1, 0.3, step=0.05)
    ent_coef = trial.suggest_float("ent_coef", 1e-4, 0.1, log=True)
    n_epochs = trial.suggest_int("n_epochs", 3, 10)

    # Create the environment (four copies stepped inside one DummyVecEnv)
    env = DummyVecEnv([lambda: gym.make("LunarLander-v3") for _ in range(4)])

    try:
        # Build the model inside the try block so the environments are closed
        # even if PPO rejects the sampled configuration during construction.
        model = PPO(
            policy="MlpPolicy",
            env=env,
            learning_rate=learning_rate,
            n_steps=n_steps,
            batch_size=batch_size,
            gamma=gamma,
            gae_lambda=gae_lambda,
            clip_range=clip_range,
            ent_coef=ent_coef,
            verbose=0,  # Suppress training logs for faster optimization
            n_epochs=n_epochs,
        )

        # Train the model
        model.learn(total_timesteps=50000)

        # Evaluate the model on the same vectorized environment it trained on
        mean_reward, _ = evaluate_policy(model, env, n_eval_episodes=10)

        # Report the result to Optuna.
        # NOTE(review): this is the only report and it happens after training
        # has finished, so a prune here cannot save any compute — confirm
        # whether intermediate reports were intended.
        trial.report(mean_reward, step=0)

        # Check for pruning
        if trial.should_prune():
            raise optuna.exceptions.TrialPruned()

    finally:
        # Ensure the environment is properly closed on success and on failure
        env.close()

    return mean_reward

In [15]:
# Run the optimization: 200 trials, keeping the configuration with the
# highest objective value.
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=200)

# Best hyperparameters. The objective returns the mean evaluation reward,
# so the best value is labelled as a reward rather than an accuracy.
print("Best hyperparameters:", study.best_params)
print("Best mean reward:", study.best_value)

[I 2024-12-14 10:54:30,709] A new study created in memory with name: no-name-3fb966be-5876-4fc9-88fd-7cfff9b832b5
[I 2024-12-14 10:55:06,575] Trial 0 finished with value: -110.04965961286435 and parameters: {'learning_rate': 0.005181324715204164, 'n_steps': 2048, 'batch_size': 32, 'gamma': 0.98, 'gae_lambda': 0.9099999999999999, 'clip_range': 0.25, 'ent_coef': 0.03580541456076328, 'n_epochs': 5}. Best is trial 0 with value: -110.04965961286435.
[I 2024-12-14 10:55:39,886] Trial 1 finished with value: -221.14861094379847 and parameters: {'learning_rate': 0.005639269257580862, 'n_steps': 2048, 'batch_size': 64, 'gamma': 0.99, 'gae_lambda': 0.86, 'clip_range': 0.2, 'ent_coef': 0.005422933506001748, 'n_epochs': 10}. Best is trial 0 with value: -110.04965961286435.
[I 2024-12-14 10:56:23,201] Trial 2 finished with value: -2157.822654043806 and parameters: {'learning_rate': 9.962030763316052e-05, 'n_steps': 512, 'batch_size': 32, 'gamma': 0.955, 'gae_lambda': 0.9299999999999999, 'clip_range'

Best hyperparameters: {'learning_rate': 0.002164447901112235, 'n_steps': 512, 'batch_size': 32, 'gamma': 0.995, 'gae_lambda': 0.89, 'clip_range': 0.1, 'ent_coef': 0.0001652124469384415, 'n_epochs': 9}
Best accuracy: 238.51914107394677


In [5]:
# Vectorized environment: four LunarLander-v3 instances wrapped in one DummyVecEnv.
N_ENVS = 4
env_factories = [lambda: gym.make("LunarLander-v3") for _ in range(N_ENVS)]
env = DummyVecEnv(env_factories)

In [25]:
# Keep the best hyperparameter set found by the Optuna study; the final
# training cell reads its values by key.
best_params = study.best_params

In [26]:
# Show the selected hyperparameters as this cell's output.
best_params

{'learning_rate': 0.002164447901112235,
 'n_steps': 512,
 'batch_size': 32,
 'gamma': 0.995,
 'gae_lambda': 0.89,
 'clip_range': 0.1,
 'ent_coef': 0.0001652124469384415,
 'n_epochs': 9}

In [27]:

# Final training run with the tuned hyperparameters.
# NOTE: `env` and `best_params` come from other cells and must already exist
# in the kernel when this cell runs.
tuned_names = (
    "learning_rate",  # Learning rate
    "gamma",          # Discount factor
    "n_steps",        # Number of steps to run for each environment per update
    "batch_size",     # Mini-batch size
    "gae_lambda",     # GAE lambda
    "ent_coef",
    "clip_range",
    "n_epochs",
)
tuned_kwargs = {name: best_params[name] for name in tuned_names}

# Initialize the PPO agent (multi-layer perceptron policy, logging disabled)
model = PPO(policy="MlpPolicy", env=env, verbose=0, **tuned_kwargs)

# Train for 1,000,000 steps, then save the model to disk
model.learn(total_timesteps=1_000_000)
model.save("ppo_lunarlander_best")



In [6]:
# Reload the agent saved as "ppo_lunarlander_best" and bind it to the current `env`.
model = PPO.load("ppo_lunarlander_best", env = env)

In [7]:

# Score the reloaded agent over 10 evaluation episodes and print mean/std reward.
env.reset()
eval_stats = evaluate_policy(model, env, n_eval_episodes=10)
mean_reward, std_reward = eval_stats
print(f"Mean Reward = {mean_reward}, Std Reward = {std_reward}")




Mean Reward = 267.9496877070079, Std Reward = 21.550040399440356


In [8]:
from gymnasium.wrappers import RecordVideo

video_folder = "videostest"  # Directory to save the video

# Single (non-vectorized) environment that renders RGB frames, wrapped so that
# every episode is recorded (the trigger returns True for any episode index).
env = gym.make("LunarLander-v3", render_mode="rgb_array")
env = RecordVideo(env, video_folder=video_folder, episode_trigger=lambda x: True)

try:
    model = PPO.load("ppo_lunarlander_best", env=env)

    # Test the trained agent for one episode
    obs, info = env.reset()
    done = False

    while not done:
        action, _states = model.predict(obs, deterministic=True)
        obs, reward, terminated, truncated, info = env.step(action)
        # step() returns separate `terminated` and `truncated` flags; the
        # episode is over when either is set. Checking only the first one
        # would keep stepping an episode that has already been truncated.
        done = terminated or truncated
finally:
    # Always close the wrapper so the recording is finalized, even on error
    env.close()