In [None]:
import gymnasium as gym

1. Always create the environment in the same cell where it is rendered. Because of the way pygame works, once an environment has been closed it must be re-created before it can be rendered again.

2. The environment below is the base environment. It shows what happens when there is no trained model: the lander simply takes random actions.

In [None]:
# Baseline run: drive the lander with randomly sampled actions so the rendered
# window shows what an untrained agent looks like.
env = gym.make("LunarLander-v3", continuous=False, gravity=-10.0,
               enable_wind=False, wind_power=15.0, turbulence_power=1.5, render_mode='human')

try:
    observation, info = env.reset()
    for _ in range(1000):
        action = env.action_space.sample()
        observation, reward, terminated, truncated, info = env.step(action)

        # Start a fresh episode as soon as the current one ends.
        if terminated or truncated:
            observation, info = env.reset()
finally:
    # Close the environment even if the loop raises or the cell is
    # interrupted, so the render window is not left dangling.
    env.close()

In [None]:
from stable_baselines3 import PPO
from gymnasium import Wrapper


We create this wrapper so that, during training, the model learns that landing between the flagpoles yields a bigger reward, and that landing close to them is preferable to landing farther away.

In [None]:
class PrecisionLandingWrapper(Wrapper):
    """Reward-shaping wrapper that rewards landing close to the pad centre.

    On the step that ends an episode with a safe landing, the reward is
    adjusted according to the horizontal distance ``abs(obs[0])`` from the
    centre (x = 0):

    * ``< 0.05``  -> ``+100``
    * ``< 0.1``   -> ``+50``
    * ``< 0.2``   -> ``+5``
    * otherwise   -> ``-50``

    Episodes that terminate for any other reason (crash, leaving the
    viewport) keep the environment's own reward unchanged; previously they
    also received the bonus, so crashing onto the centre of the pad cancelled
    out the crash penalty.

    NOTE(review): the thresholds are expressed in the environment's
    normalised x units. Confirm they match the real helipad width -- in
    Gymnasium's LunarLander the pad appears to be wider than +/-0.1.
    """

    # (exclusive upper bound on |x|, reward adjustment), checked in order.
    BONUS_TIERS = ((0.05, 100), (0.1, 50), (0.2, 5))
    # Adjustment applied when a safe landing is outside every tier above.
    FAR_PENALTY = -50

    def __init__(self, env):
        """Wrap ``env`` (expected to be a LunarLander environment)."""
        super().__init__(env)

    @classmethod
    def precision_bonus(cls, x_pos):
        """Return the reward adjustment for a landing at horizontal position ``x_pos``."""
        distance = abs(x_pos)
        for limit, bonus in cls.BONUS_TIERS:
            if distance < limit:
                return bonus
        return cls.FAR_PENALTY

    def step(self, action):
        """Step the wrapped env and add the precision bonus on a safe landing.

        Returns the same ``(obs, reward, terminated, truncated, info)`` tuple
        as the wrapped environment, with only ``reward`` possibly modified.
        """
        obs, reward, terminated, truncated, info = self.env.step(action)

        # `terminated` alone does not mean "landed": the episode also
        # terminates on a crash or when the lander leaves the viewport.
        # NOTE(review): this relies on LunarLander giving a positive terminal
        # reward only for a safe landing and a negative one for a crash or
        # out-of-bounds -- confirm against the installed Gymnasium version.
        if terminated and reward > 0:
            reward += self.precision_bonus(obs[0])  # obs[0] is the horizontal position

        return obs, reward, terminated, truncated, info

In [None]:
# Training environment: the same LunarLander settings used by the rendered
# cells, but with no render mode and with rewards routed through
# PrecisionLandingWrapper.
train_env = PrecisionLandingWrapper(
    gym.make(
        "LunarLander-v3",
        continuous=False,
        gravity=-10.0,
        enable_wind=False,
        wind_power=15.0,
        turbulence_power=1.5,
    )
)

You can change the number of timesteps (`total_timesteps`) to whatever you want: it is the number of environment steps the agent trains for, so more timesteps means a longer training run.

In [None]:
# Training budget and logging cadence, named so they are easy to find and tune.
TOTAL_TIMESTEPS = 200_000
LOG_INTERVAL = 50

# Build a PPO agent with an MLP policy on train_env, train it, then write the
# result to disk under "ppo_lunar_lander".
model = PPO("MlpPolicy", train_env, verbose=1)
model.learn(total_timesteps=TOTAL_TIMESTEPS, log_interval=LOG_INTERVAL)
model.save("ppo_lunar_lander")

In [None]:
# Reload the agent from the "ppo_lunar_lander" file written by model.save(...)
# in the training cell; this rebinds `model` to the loaded copy.
model = PPO.load("ppo_lunar_lander")

In [15]:
# Watch the trained agent: same LunarLander settings as training, rendered on
# screen and without the reward wrapper.
test_env = gym.make("LunarLander-v3", continuous=False, gravity=-10.0,
                     enable_wind=False, wind_power=15.0, turbulence_power=1.5, render_mode='human')

try:
    obs, info = test_env.reset()
    for _ in range(5000):
        # deterministic=True requests the policy's deterministic action
        # instead of sampling one.
        action, _states = model.predict(obs, deterministic=True)
        obs, reward, terminated, truncated, info = test_env.step(action)
        # Start a fresh episode as soon as the current one ends.
        if terminated or truncated:
            obs, info = test_env.reset()
finally:
    # Close the environment even if the loop raises or the cell is
    # interrupted, so the render window is not left dangling.
    test_env.close()