In [25]:
import gym
from stable_baselines3 import PPO

# Create the Lunar Lander environment with render_mode='human'
env = gym.make('LunarLander-v2', render_mode='human')

# Create the PPO agent with optimal hyperparameters
model = PPO('MlpPolicy', env, 
            learning_rate=0.0003,  # Learning rate
            gamma=0.99,            # Discount factor
            clip_range=0.2,        # Clip range
            ent_coef=0.01,         # Entropy coefficient
            vf_coef=0.5,           # Value function coefficient
            n_steps=2048,          # Number of steps per update
            batch_size=64,         # Batch size
            n_epochs=10,           # Number of epochs per update
            verbose=1)

# Train the model
model.learn(total_timesteps=100000)  # Train for 100,000 timesteps

# Save the trained model (optional)
model.save("ppo_lunar_lander_optimized")

# Evaluate the trained agent
obs, info = env.reset()  # Reset the environment
done = False
score = 0

while not done:
    env.render()  # This shows the pop-up visualization of the lunar lander
    action, _states = model.predict(obs)  # Get action from the trained model
    obs, reward, done, truncated, info = env.step(action)  # Take the action
    score += reward

print(f"Episode Score: {score}")  # Print the final score of the episode
env.close()  # Close the environment when done


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 100      |
|    ep_rew_mean     | -182     |
| time/              |          |
|    fps             | 47       |
|    iterations      | 1        |
|    time_elapsed    | 43       |
|    total_timesteps | 2048     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 99.8        |
|    ep_rew_mean          | -183        |
| time/                   |             |
|    fps                  | 46          |
|    iterations           | 2           |
|    time_elapsed         | 87          |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.006535309 |
|    clip_fraction        | 0.0116      |
|    clip_range           | 0.2         |
|    entropy_loss   