# Setup

In [1]:
!pip install swig

In [2]:
!pip3 install box2d-py==2.3.8

In [3]:
!pip install stable-baselines3[extra] gymnasium -q

In [4]:
import numpy as np
import gymnasium as gym
from stable_baselines3 import DDPG
from stable_baselines3.common.noise import OrnsteinUhlenbeckActionNoise

In [16]:
# Seed handed to the agent so that repeated runs of this notebook start from
# the same pseudo-random state.
SEED = 42

# Continuous-action Lunar Lander environment.
env = gym.make("LunarLanderContinuous-v2", render_mode="rgb_array")

# Size of the action vector; the exploration noise needs one component per action.
NUM_ACTIONS = env.action_space.shape[-1]
noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(NUM_ACTIONS),
                                     sigma=0.1 * np.ones(NUM_ACTIONS))

# DDPG agent with an MLP policy; `noise` is added to its actions during training.
model = DDPG(policy="MlpPolicy",
             env=env,
             learning_rate=0.005,
             buffer_size=2048,
             batch_size=32,
             train_freq=(5, "episode"),  # update the networks once every 5 episodes
             action_noise=noise,
             seed=SEED,
             # NOTE(review): assumes a CUDA device is present — confirm, or use "auto".
             device="cuda",
             verbose=1)

# Training and testing

In [17]:
# Train the agent, then write the trained model to disk.
model.learn(total_timesteps=10_000, log_interval=100)
model.save("ddpg_lunar_lander")

Load a previously saved model instead of retraining (uncomment the cell below if needed).

In [None]:
#model = DDPG.load("ddpg_lunar_lander")

In [18]:
def test_model(model, env, num_eps=100):
  total_rewards = []
  for episode in range(num_eps):
    state, _ = env.reset()
    done = False
    episode_reward = 0

    while not done:
      action, _ = model.predict(state, deterministic=True)
      new_state, reward, done, _, _ = env.step(action)
      state = new_state
      episode_reward += reward
      # env.render()
    print(f"Episode {episode+1} reward: {episode_reward}")
    total_rewards.append(episode_reward)
  # env.close()
  return total_rewards

In [19]:
# Evaluate the trained agent in two separate runs: 50 episodes, then 100 episodes.
reward_50 = test_model(model, env, num_eps=50)
reward_100 = test_model(model, env, num_eps=100)

In [20]:
# Mean episode reward of each evaluation run, one line per run.
print(
    f"Average reward for 50 eps: {np.mean(reward_50)}",
    f"Average reward for 100 eps: {np.mean(reward_100)}",
    sep="\n",
)