# Double Deep Q-Networks (DDQN)

source: https://medium.com/@coldstart_coder/dqn-algorithm-training-an-ai-to-land-on-the-moon-1a1307748ed9

In [1]:
import time
import wandb
import random
import numpy as np
from tqdm import tqdm
import multiprocessing
import gymnasium as gym
from src.env import CustomLunarLander
from src.models import DDQN_Agent, ReplayBuffer
from src import util

import torch
import torch.nn as nn

wandb.login()
np.seterr(all='raise'); # raise exceptions on errors
print(f"Number of cores available: {multiprocessing.cpu_count()}")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}") # device for pytorch
gym.register(id="CustomLunarLander-v0", entry_point=CustomLunarLander)

[34m[1mwandb[0m: Currently logged in as: [33mthomasvroom[0m ([33mthomasvroom-maastricht-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Number of cores available: 12
Using device: cuda


In [4]:
def train(config, run_name):
    run = wandb.init(
        project="RL",
        entity="thomasvroom-maastricht-university",
        config=config,
        name=run_name
    )

    # seeding
    random.seed(config["random_seed"])
    np.random.seed(config["random_seed"])
    torch.manual_seed(config["random_seed"])
    torch.backends.cudnn.deterministic = config["deterministic"]

    # create environment (only 1, since bottleneck isn't experience gathering)
    env = gym.make(
        id="CustomLunarLander-v0",
        gravity=config["gravity"],
        enable_wind=config["enable_wind"],
        wind_power=config["wind_power"],
        turbulence_power=config["turbulence_power"],
        max_episode_steps=config["max_env_steps"]
    )

    agent = DDQN_Agent(env.observation_space.shape[0], env.action_space.n).to(device)
    optimizer = torch.optim.AdamW(agent.parameters(), weight_decay=config["weight_decay"], lr=config["learning_rate"])
    loss_fn = nn.MSELoss()

    # create target network and replay buffer
    target_network = type(agent)(env.observation_space.shape[0], env.action_space.n).to(device)
    target_network.load_state_dict(agent.state_dict())
    replay_buffer = ReplayBuffer(config["buffer_size"])

    epsilon = 1
    learning_steps = 0

    for episode in tqdm(range(config["train_episodes"])):
        # reset environment
        state, _ = env.reset(seed=None if episode > 0 else config["random_seed"])
        done = False
        truncated = False
        total_reward = 0

        # run environment until done
        while not (done or truncated):
            # epsilon-greedy action selection
            if np.random.random() > epsilon:
                with torch.no_grad():
                    observation = torch.tensor(state, dtype=torch.float).to(device)
                    action = agent.get_action(observation).item()
            else:
                action = env.action_space.sample()

            # execute action
            new_state, reward, done, truncated, _ = env.step(action)

            # add sample to replay buffer
            replay_buffer.add_new_sample(state, action, reward, new_state, done)

            state = new_state
            total_reward += reward

            # only update weights if there are enough samples
            if len(replay_buffer) > config["batch_size"]:
                # replace target network
                if learning_steps % config["target_replace_steps"] == 0:
                    target_network.load_state_dict(agent.state_dict())

                optimizer.zero_grad()

                # sample from replay buffer
                samples = replay_buffer.get_pytorch_training_samples(device, config["batch_size"])
                states, actions, rewards, new_states, was_terminals = samples
                indices = np.arange(config["batch_size"])

                # get the predicted q-values
                q_pred = agent.forward(states)[indices, actions]

                # get the estimated next q-values
                q_next = target_network.forward(new_states).max(dim=1)[0]
                q_next[was_terminals] = 0.0

                # target values
                q_label = rewards + config["gamma"] * q_next

                # calculate and backpropegate loss
                loss = loss_fn(q_label, q_pred).to(device)
                loss.backward()
                optimizer.step()

                # decay epsilon
                epsilon = max(epsilon - config["epsilon_decay"], config["epsilon_min"])

                # record data
                run.log({"loss": loss, "epsilon": epsilon}, learning_steps)
                learning_steps += 1

        run.log({"total_reward": total_reward}, max(learning_steps, episode))

    env.close()
    run.finish(0)
    torch.save(agent.state_dict(), f"models/{run_name}")

config = { # see: https://gymnasium.farama.org/environments/box2d/lunar_lander/
    "gravity": -10.0,
    "wind_power": 15.0,
    "turbulence_power": 1.5,

    "random_seed": 123,
    "deterministic": True, # toggles torch.backends.cudnn.deterministic
    "train_episodes": 2000,
    "buffer_size": 100_000, # size of the replay buffer
    "batch_size": 64,
    "target_replace_steps": 500, # after how many steps the target network gets replaced
    "max_env_steps": 1000, # number of steps before truncation

    "gamma": 0.99,
    "learning_rate": 1e-4,
    "weight_decay": 0.01,
    "epsilon_min": 0.01,
    "epsilon_decay": 5e-6
}

### Training without wind

In [None]:
config["enable_wind"] = False
run_name = f"DDQN-NoWind-{time.time()}"
train(config, run_name)

### Training with wind

In [5]:
config["enable_wind"] = True
run_name = f"DDQN-Wind-{time.time()}"
train(config, run_name)

100%|██████████| 2000/2000 [38:07<00:00,  1.14s/it]


0,1
epsilon,██▆▆▆▅▅▅▅▃▃▃▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
loss,▇▆█▂▅▂▂▇▁▁▁▁▁▁▅▁▁█▂▁▂▄▂▁▁▁▁▂▂▁▃▁▁▂▁▁▁▃▁▁
total_reward,▄▄▂▃▄▂▁▄▂▂▁▂▄▃▃▄▄▃▂▄▄▂▃▃▄▃▃▃▇▅▇▇▇▅▄█▇██▇

0,1
epsilon,0.01
loss,15.41412
total_reward,248.45203


### Visualize Episode

In [6]:
# run_name = 

# load agent
agent = DDQN_Agent(8, 4).to(device)
agent.load_state_dict(torch.load(f"models/{run_name}"))

util.visualize_episode(
    env_id="CustomLunarLander-v0",
    gravity=config["gravity"],
    enable_wind=False,
    wind_power=config["wind_power"],
    turbulence_power=config["turbulence_power"],
    agent=agent,
    device=device,
    max_time=30,
    video_name=None
)

Collected a total reward of: 255.88928824183392
