# Adversary Double Deep Q-Networks (DDQN)

In [1]:
import time
import wandb
import random
import numpy as np
from tqdm.notebook import trange
import multiprocessing
import gymnasium as gym
from src.env import CustomLunarLander, AdversarialLunarLander
from src.models import DDQN_Agent, ReplayBuffer
from src import util

import torch
import torch.nn as nn

wandb.login()
np.seterr(all='raise'); # raise exceptions on errors
print(f"Number of cores available: {multiprocessing.cpu_count()}")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}") # device for pytorch
gym.register(id="CustomLunarLander-v0", entry_point=CustomLunarLander)
gym.register(id="AdversarialLunarLander-v0", entry_point=AdversarialLunarLander)

[34m[1mwandb[0m: Currently logged in as: [33mthomasvroom[0m ([33mthomasvroom-maastricht-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Number of cores available: 12
Using device: cuda


In [2]:
def train(config, run_name):
    run = wandb.init(
        project="RL",
        entity="thomasvroom-maastricht-university",
        config=config,
        name=run_name
    )

    # seeding
    random.seed(config["random_seed"])
    np.random.seed(config["random_seed"])
    torch.manual_seed(config["random_seed"])
    torch.backends.cudnn.deterministic = config["deterministic"]

    # create environment (only 1, since bottleneck isn't experience gathering)
    env = gym.make(
        id="AdversarialLunarLander-v0",
        gravity=config["gravity"],
        wind_power=config["wind_power"],
        turbulence_power=config["turbulence_power"],
        max_episode_steps=config["max_env_steps"]
    )

    # create both protagonist and adversary
    protagonist = DDQN_Agent(env.observation_space.shape[0], env.action_space[0].n).to(device)
    adversary = DDQN_Agent(env.observation_space.shape[0], env.action_space[1].n).to(device) # see AdversarialLunarLander.py
    optimizer_protagonist = torch.optim.AdamW(protagonist.parameters(), weight_decay=config["weight_decay"], lr=config["learning_rate"])
    optimizer_adversary = torch.optim.AdamW(adversary.parameters(), weight_decay=config["weight_decay"], lr=config["learning_rate"])
    loss_fn = nn.MSELoss()
    
    # create target network and replay buffer for both players
    target_network_protagonist = type(protagonist)(env.observation_space.shape[0], env.action_space[0].n).to(device)
    target_network_protagonist.load_state_dict(protagonist.state_dict())
    replay_buffer_protagonist = ReplayBuffer(config["buffer_size"])
    target_network_adversary = type(adversary)(env.observation_space.shape[0], env.action_space[1].n).to(device)
    target_network_adversary.load_state_dict(adversary.state_dict())
    replay_buffer_adversary = ReplayBuffer(config["buffer_size"])

    epsilon_protagonist = 1
    epsilon_adversary = 1
    global_learning_steps = 0
    protagonist_learning_steps = 0
    adversary_learning_steps = 0

    def train_episode(training_protagonist, epsilon, global_steps, local_steps, episode):
        # reset environment
        state, _ = env.reset(seed=None if episode > 0 else config["random_seed"])
        done = False
        truncated = False
        total_reward = 0

        # run environment until done
        while not (done or truncated):
            # epsilon-greedy action selection (protagonist)
            if not training_protagonist or np.random.random() > epsilon:
                with torch.no_grad():
                    observation = torch.tensor(state, dtype=torch.float).to(device)
                    protagonist_action = protagonist.get_action(observation).item()
            else:
                protagonist_action = env.action_space[0].sample()
            # epsilon-greedy action selection (adversary)
            if training_protagonist or np.random.random() > epsilon:
                with torch.no_grad():
                    observation = torch.tensor(state, dtype=torch.float).to(device)
                    adversary_action = adversary.get_action(observation).item()
            else:
                adversary_action = env.action_space[1].sample()

            # execute action
            new_state, reward, done, truncated, _ = env.step([protagonist_action, adversary_action])

            # add sample to replay buffer
            if training_protagonist:
                replay_buffer_protagonist.add_new_sample(state, protagonist_action, reward, new_state, done)
            else: # zero-sum game, so reward is inversed
                replay_buffer_adversary.add_new_sample(state, adversary_action, -reward, new_state, done)

            state = new_state
            total_reward += reward

            # only update one player at a time
            agent = protagonist if training_protagonist else adversary
            replay_buffer = replay_buffer_protagonist if training_protagonist else replay_buffer_adversary
            target_network = target_network_protagonist if training_protagonist else target_network_adversary
            optimizer = optimizer_protagonist if training_protagonist else optimizer_adversary

            # only update weights if there are enough samples
            if len(replay_buffer) > config["batch_size"]:
                # replace target network
                if local_steps % config["target_replace_steps"] == 0:
                    target_network.load_state_dict(agent.state_dict())

                optimizer.zero_grad()

                # sample from replay buffer
                samples = replay_buffer.get_pytorch_training_samples(device, config["batch_size"])
                states, actions, rewards, new_states, was_terminals = samples
                indices = np.arange(config["batch_size"])

                # get the predicted q-values
                q_pred = agent.forward(states)[indices, actions]

                # get the estimated next q-values
                q_next = target_network.forward(new_states).max(dim=1)[0]
                q_next[was_terminals] = 0.0

                # target values
                q_label = rewards + config["gamma"] * q_next

                # calculate and backpropegate loss
                loss = loss_fn(q_label, q_pred).to(device)
                loss.backward()
                optimizer.step()

                # decay epsilon and record data
                epsilon = max(epsilon - config["epsilon_decay"], config["epsilon_min"])
                if training_protagonist:
                    run.log({"loss (protagonist)": loss, "epsilon (protagonist)": epsilon}, global_steps)
                else:
                    run.log({"loss (adversary)": loss, "epsilon (adversary)": epsilon}, global_steps)

                local_steps += 1
                global_steps += 1

        run.log({"total_reward": total_reward}, max(global_steps, episode))
        return epsilon, global_steps, local_steps

    # let protagonist and adversary take turns
    for episode in trange(config["training_cycles"]):
        for _ in trange(config["protagonist_episodes"]):
            epsilon_protagonist, global_learning_steps, protagonist_learning_steps = train_episode(
                True, epsilon_protagonist, global_learning_steps, protagonist_learning_steps, episode
            )
        for _ in trange(config["adversary_episodes"]):
            epsilon_adversary, global_learning_steps, adversary_learning_steps = train_episode(
                False, epsilon_adversary, global_learning_steps, adversary_learning_steps, episode
            )

    env.close()
    run.finish(0)
    torch.save(protagonist.state_dict(), f"models/protagonist-{run_name}")
    torch.save(adversary.state_dict(), f"models/adversary-{run_name}")

config = { # see: https://gymnasium.farama.org/environments/box2d/lunar_lander/
    "gravity": -10.0,
    "wind_power": 10.0,
    "turbulence_power": 1.0,

    "random_seed": 123,
    "deterministic": True, # toggles torch.backends.cudnn.deterministic
    "training_cycles": 4, # how often the protagonist and adversary switch position
    "protagonist_episodes": 600,
    "adversary_episodes": 200,
    "buffer_size": 100_000, # size of the replay buffer
    "batch_size": 64,
    "target_replace_steps": 500, # after how many steps the target network gets replaced
    "max_env_steps": 1000, # number of steps before truncation

    "gamma": 0.99,
    "learning_rate": 1e-4,
    "weight_decay": 0.01,
    "epsilon_min": 0.01,
    "epsilon_decay": 5e-6
}

In [3]:
run_name = f"Adversary-DDQN-{time.time()}"
train(config, run_name)

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/600 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/600 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/600 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/600 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

0,1
epsilon (adversary),████▇▇▇▇▆▆▆▄▄▄▄▃▃▃▃▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
epsilon (protagonist),█▆▅▅▄▃▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
loss (adversary),▁▁▁▁▁▄▁█▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▆▁▁▁▁▁▁▁▁▁▁▁▁▁▁
loss (protagonist),▅▃▃▂▂█▃▂▂▂▁▁▁▁▁▁▁▁▁▁▁▅▃▁▁▁▁▁▂▁▁▁▁▃▁▁▁▂▁▁
total_reward,▁▁▁▁▁▁▁▃▁▂▄▃▅▄▁▅▅▅▅▅▅▅▅▅▄▇▅▇▆▇▆█▆▇█████▆

0,1
epsilon (adversary),0.01
epsilon (protagonist),0.01
loss (adversary),2.01521
loss (protagonist),2.52194
total_reward,-157.65986


In [5]:
# run_name = "Adversary-DDQN-1748037372.4373443"

# load agent
protagonist = DDQN_Agent(8, 4).to(device)
protagonist.load_state_dict(torch.load(f"models/protagonist-{run_name}"))
adversary = DDQN_Agent(8, 4).to(device)
adversary.load_state_dict(torch.load(f"models/adversary-{run_name}"))
video_name = None
max_time = 30

env = gym.make(
    "AdversarialLunarLander-v0",
    gravity=config["gravity"],
    wind_power=config["wind_power"],
    turbulence_power=config["turbulence_power"],
    render_mode="rgb_array" if video_name else "human"
)
if video_name: 
    env = gym.wrappers.RecordVideo(env, f"videos/{video_name}")

# reset environment
obs, _ = env.reset()
done = False
target_time = time.time() + max_time
reward_sum = 0
while not (done or time.time() > target_time):
    # convert observation to tensor
    obs = torch.tensor(obs, dtype=torch.float32).to(device)

    # sample action from agents
    with torch.no_grad():
        protagonist_action = protagonist.get_action(obs).item()
        adversary_action = adversary.get_action(obs).item()

    # execute action
    obs, reward, terminated, truncated, __ = env.step([protagonist_action, adversary_action])
    done = terminated or truncated
    reward_sum += reward
    env.render()

# close ui
env.close()
print(f"Collected a total reward of: {reward_sum}")

Collected a total reward of: -713.5486136769899


In [4]:
# run_name = "Adversary-DDQN-1748037372.4373443"

# load agent
agent = DDQN_Agent(8, 4).to(device)
agent.load_state_dict(torch.load(f"models/protagonist-{run_name}"))

util.visualize_episode(
    env_id="CustomLunarLander-v0",
    gravity=config["gravity"],
    enable_wind=False,
    wind_power=config["wind_power"],
    turbulence_power=config["turbulence_power"],
    agent=agent,
    device=device,
    max_time=30,
    video_name=None
)

Collected a total reward of: 272.92410187049927
