# DQN Atari Paper Upgrade using Rainbow
Here we will upgrade the DQN Atari paper using the Rainbow algorithm.
From the collection of improvements in the Rainbow algorithm, we will implement the following:
- Dueling Network Architecture
- Prioritized Experience Replay
- N-Step Returns
- Noisy Networks

In [57]:
# ! pip install gymnasium[atari,accept-rom-license] torch numpy opencv-python matplotlib

In [58]:
import gymnasium as gym
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from tqdm import tqdm

from copy import deepcopy

In [59]:
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"

## Hyperparameters
As per the paper, we use certain hyperparameters that were tuned across various Atari games.

In [60]:
LEARNING_RATE = 0.0002
DISCOUNT_FACTOR = 0.99
REPLAY_MEMORY_SIZE = 150_000
MINI_BATCH_SIZE = 32
TARGET_UPDATE_FREQ = 1_200
FRAME_SKIP = 4
MIN_EPSILON = 0.1
MAX_EPSILON = 1.0
EPSILON_PHASE = 0.1
MAX_STEPS = 1_500_001
REPLAY_START_SIZE = 75_000
SAVE_FREQUENCY = 500_000

N_STEP = 3  # For N-Step Returns
ALPHA = 0.6  # Prioritization exponent
BETA_START = 0.4  # Initial value of beta for importance sampling
BETA_FRAMES = MAX_STEPS - REPLAY_START_SIZE  # Schedule for beta

RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

<torch._C.Generator at 0x13390b3b0>

## Prioritized Replay Buffer
The paper introduces Prioritized Replay Buffer to sample important transitions more frequently. We use a SumTree data structure to store priorities and sample transitions based on the priorities.

In [61]:
class PrioritizedReplayBuffer:
    def __init__(self, capacity, alpha, obs_shape):
        self.capacity = capacity
        self.alpha = alpha

        self.pos = 0
        self.size = 0

        # Pre-allocate memory for buffer components
        self.states = np.empty((capacity, *obs_shape), dtype=np.uint8)
        self.next_states = np.empty((capacity, *obs_shape), dtype=np.uint8)
        self.actions = np.empty((capacity,), dtype=np.int32)
        self.rewards = np.empty((capacity,), dtype=np.float32)
        self.dones = np.empty((capacity,), dtype=np.bool_)

        self.priorities = np.zeros((capacity,), dtype=np.float32)
        self.max_priority = 1.0

    def add(self, error, state, next_state, action, reward, done):
        self.states[self.pos] = state
        self.next_states[self.pos] = next_state
        self.actions[self.pos] = action
        self.rewards[self.pos] = reward
        self.dones[self.pos] = done

        self.priorities[self.pos] = self.max_priority ** self.alpha

        self.pos = (self.pos + 1) % self.capacity
        self.size = min(self.size + 1, self.capacity)

    def sample(self, batch_size, beta):
        if self.size == self.capacity:
            prios = self.priorities
        else:
            prios = self.priorities[:self.size]

        probs = prios / prios.sum()
        indices = np.random.choice(self.size, batch_size, p=probs)

        # Retrieve samples directly without using zip()
        states = self.states[indices]
        next_states = self.next_states[indices]
        actions = self.actions[indices]
        rewards = self.rewards[indices]
        dones = self.dones[indices]

        total = self.size
        weights = (total * probs[indices]) ** (-beta)
        weights /= weights.max()
        weights = weights.astype(np.float32)

        return states, next_states, actions, rewards, dones, indices, weights

    def update(self, idxs, errors):
        errors = np.abs(errors) + 1e-6
        self.max_priority = max(self.max_priority, errors.max())
        self.priorities[idxs] = errors ** self.alpha

## Noisy Linear Layers
The paper introduces Noisy Linear Layers to add noise to the weights of the linear layers. We use a NoisyLinear layer to add noise to the weights of the linear layers.

In [62]:
class NoisyLinear(nn.Module):
    def __init__(self, in_features, out_features, std_init=0.5):
        super(NoisyLinear, self).__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.std_init = std_init

        self.weight_mu = nn.Parameter(torch.FloatTensor(out_features, in_features))
        self.weight_sigma = nn.Parameter(torch.FloatTensor(out_features, in_features))
        self.register_buffer(
            "weight_epsilon", torch.FloatTensor(out_features, in_features)
        )

        self.bias_mu = nn.Parameter(torch.FloatTensor(out_features))
        self.bias_sigma = nn.Parameter(torch.FloatTensor(out_features))
        self.register_buffer("bias_epsilon", torch.FloatTensor(out_features))

        self.reset_parameters()
        self.reset_noise()

    def reset_parameters(self):
        mu_range = 1 / np.sqrt(self.weight_mu.size(1))
        self.weight_mu.data.uniform_(-mu_range, mu_range)
        self.weight_sigma.data.fill_(
            self.std_init / np.sqrt(self.weight_sigma.size(1))
        )

        self.bias_mu.data.uniform_(-mu_range, mu_range)
        self.bias_sigma.data.fill_(self.std_init / np.sqrt(self.bias_sigma.size(0)))

    def reset_noise(self):
        epsilon_in = self._scale_noise(self.in_features).to(device)
        epsilon_out = self._scale_noise(self.out_features).to(device)

        self.weight_epsilon.copy_(epsilon_out.ger(epsilon_in))
        self.bias_epsilon.copy_(epsilon_out)

    def forward(self, input):
        if self.training:
            weight = (
                self.weight_mu + self.weight_sigma * self.weight_epsilon
            )
            bias = self.bias_mu + self.bias_sigma * self.bias_epsilon
        else:
            weight = self.weight_mu
            bias = self.bias_mu

        return F.linear(input, weight, bias)

    def _scale_noise(self, size):
        x = torch.randn(size)
        return x.sign() * x.abs().sqrt()

## Deep Q-Network Architecture
Dueling Network Architecture is used in the Rainbow algorithm. The architecture consists of two streams, one for the state value and the other for the advantage values. The two streams are combined to produce the Q-values.

We will also use the previously implemented NoisyLinear layer to add noise to the weights of the linear layers.

In [63]:
class DeepQNetwork(nn.Module):
    def __init__(self, n_actions):
        super(DeepQNetwork, self).__init__()
        self.conv = nn.Sequential(
            nn.Conv2d(4, 32, kernel_size=8, stride=4),
            nn.ReLU(),
            nn.Conv2d(32, 64, kernel_size=4, stride=2),
            nn.ReLU(),
            nn.Conv2d(64, 64, kernel_size=3, stride=1),
            nn.ReLU(),
            nn.Flatten(),
        )

        self.n_actions = n_actions
        self.fc_input_dim = 7 * 7 * 64

        # Dueling DQN streams with NoisyLinear layers
        self.value_stream = nn.Sequential(
            NoisyLinear(self.fc_input_dim, 512),
            nn.ReLU(),
            NoisyLinear(512, 1),
        )

        self.advantage_stream = nn.Sequential(
            NoisyLinear(self.fc_input_dim, 512),
            nn.ReLU(),
            NoisyLinear(512, n_actions),
        )

    def forward(self, x):
        x = self.conv(x / 255.0)
        value = self.value_stream(x)
        advantage = self.advantage_stream(x)
        q_vals = value + (advantage - advantage.mean(dim=1, keepdim=True))
        return q_vals

    def reset_noise(self):
        for m in self.modules():
            if isinstance(m, NoisyLinear):
                m.reset_noise()

## Action Selection
As in the paper, we use an epsilon-greedy policy to select actions during training. We start with a high epsilon value and decay it over time. In addition to the NoisyLinear layers.

## Gymnasium Environment Setup

Here we set up the gym environment, by selecting the Breakout game. We specify RMSProp as the optimizer just like in the paper training details.

In [64]:
def make_env(env_id, render_mode=None, frame_skip=4):
    """Create environment with preprocessing wrappers."""
    env = gym.make(env_id, render_mode=render_mode, frameskip=1)
    env = gym.wrappers.AtariPreprocessing(env, frame_skip=4)
    env = gym.wrappers.RecordEpisodeStatistics(env)
    env = gym.wrappers.FrameStack(env, 4)
    env = gym.wrappers.AutoResetWrapper(env)
    return env


env = make_env("ALE/Breakout-v5")

n_actions = env.action_space.n
dqn = DeepQNetwork(n_actions).to(device)
optimizer = torch.optim.Adam(dqn.parameters(), lr=LEARNING_RATE)
dqn_prime = DeepQNetwork(n_actions).to(device)

buffer = PrioritizedReplayBuffer(
    REPLAY_MEMORY_SIZE, alpha=ALPHA, obs_shape=(4, 84, 84)
)

In [65]:
import torch

def count_trainable_parameters(model):
    total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    return total_params

print(f"Trainable parameters: {count_trainable_parameters(dqn):,}")

Trainable parameters: 6,507,690


In [66]:
dqn.to(device)

DeepQNetwork(
  (conv): Sequential(
    (0): Conv2d(4, 32, kernel_size=(8, 8), stride=(4, 4))
    (1): ReLU()
    (2): Conv2d(32, 64, kernel_size=(4, 4), stride=(2, 2))
    (3): ReLU()
    (4): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1))
    (5): ReLU()
    (6): Flatten(start_dim=1, end_dim=-1)
  )
  (value_stream): Sequential(
    (0): NoisyLinear()
    (1): ReLU()
    (2): NoisyLinear()
  )
  (advantage_stream): Sequential(
    (0): NoisyLinear()
    (1): ReLU()
    (2): NoisyLinear()
  )
)

## Training Loop
Here, we follow the methodology from the paper, by putting all the above components together to train the DQN agent on the Atari game.

In [67]:
training_history = {
    "loss": [0],
    "mean_q_value": [0],
    "episode_rewards": [0],
    "steps": [0],
}

t_observation, _ = env.reset()
episode_reward = 0

progress_bar = tqdm(range(MAX_STEPS), desc="Training Progress")

episode_steps = 0
episode_loss = 0
episode_q_values = 0

n_step_buffer = []

for t in progress_bar:
    # Epsilon with linear decay
    eps = max(
        MIN_EPSILON,
        MIN_EPSILON
        + (MAX_EPSILON - MIN_EPSILON) * (1 - t / (EPSILON_PHASE * MAX_STEPS)),
    )

    # Epsilon-greedy policy with Noisy Networks
    if np.random.rand() < eps:
        action = env.action_space.sample()
    else:
        with torch.no_grad():
            dqn.reset_noise()
            obs_tensor = torch.tensor(
                np.array(t_observation), device=device, dtype=torch.float32
            ).unsqueeze(0)
            q_values = dqn(obs_tensor)
            action = torch.argmax(q_values, dim=1).item()
            episode_q_values += q_values.mean().item()

    # Step environment
    t1_observation, reward, done, _, info = env.step(action)
    episode_reward += reward

    # Add to n-step buffer
    n_step_buffer.append((t_observation, action, reward, done, t1_observation))

    if len(n_step_buffer) >= N_STEP:
        # Compute N-step return
        R = sum(
            [
                n_step_buffer[i][2] * (DISCOUNT_FACTOR ** i)
                for i in range(N_STEP)
            ]
        )
        state = n_step_buffer[0][0]
        action_n = n_step_buffer[0][1]
        next_state = n_step_buffer[-1][4]
        done_n = n_step_buffer[-1][3]

        buffer.add(
            error=buffer.max_priority,
            state=state,
            next_state=next_state,
            action=action_n,
            reward=R,
            done=done_n,
        )

        n_step_buffer.pop(0)

    if done:
        # Add remaining transitions in n-step buffer
        while len(n_step_buffer) > 0:
            len_buffer = len(n_step_buffer)
            R = sum(
                [
                    n_step_buffer[i][2] * (DISCOUNT_FACTOR ** i)
                    for i in range(len_buffer)
                ]
            )
            state = n_step_buffer[0][0]
            action_n = n_step_buffer[0][1]
            next_state = n_step_buffer[-1][4]
            done_n = n_step_buffer[-1][3]

            buffer.add(
                error=buffer.max_priority,
                state=state,
                next_state=next_state,
                action=action_n,
                reward=R,
                done=done_n,
            )

            n_step_buffer.pop(0)

        training_history["steps"].append(t)
        training_history["episode_rewards"].append(episode_reward)
        training_history["mean_q_value"].append(
            episode_q_values / episode_steps if episode_steps > 0 else 0
        )
        training_history["loss"].append(
            episode_loss / episode_steps if episode_steps > 0 else 0
        )
        episode_reward = 0
        progress_bar.set_description(
            f"R: {training_history['episode_rewards'][-1]:.2f}, l: {training_history['loss'][-1]:.2f}, Mean Q: {training_history['mean_q_value'][-1]:.2f}, e: {eps:.2f}"
        )
        episode_steps = 0
        episode_loss = 0
        episode_q_values = 0

        t_observation, _ = env.reset()
    else:
        t_observation = t1_observation

    # Checkpoint every SAVE_FREQUENCY steps
    if t > 0 and t % SAVE_FREQUENCY == 0:
        torch.save(dqn.state_dict(), f"checkpoint{t}.pt")

    if t > REPLAY_START_SIZE:
        if t % 4 == 0:
            beta = min(
                1.0,
                BETA_START + (t - REPLAY_START_SIZE) * (1.0 - BETA_START) / BETA_FRAMES,
            )

            (
                states,
                next_states,
                actions,
                rewards,
                dones,
                idxs,
                is_weights,
            ) = buffer.sample(MINI_BATCH_SIZE, beta)

            # Convert to tensors
            states = torch.tensor(states, device=device, dtype=torch.float32)
            next_states = torch.tensor(next_states, device=device, dtype=torch.float32)
            actions = torch.tensor(actions, device=device, dtype=torch.long)
            rewards = torch.tensor(rewards, device=device, dtype=torch.float32)
            dones = torch.tensor(dones.astype(np.float32), device=device)

            is_weights = torch.tensor(is_weights, device=device, dtype=torch.float32)

            # Reset noise in the networks
            dqn.reset_noise()
            dqn_prime.reset_noise()

            with torch.no_grad():
                not_done = 1.0 - dones
                # Double DQN with N-Step Returns
                next_q_values = dqn_prime(next_states)
                next_actions = dqn(next_states).argmax(dim=1)
                next_q_values = next_q_values.gather(
                    1, next_actions.unsqueeze(1)
                ).squeeze(1)
                y_j = rewards + (DISCOUNT_FACTOR ** N_STEP) * next_q_values * not_done

            optimizer.zero_grad()

            q_values = dqn(states)
            q_values = q_values.gather(1, actions.unsqueeze(1)).squeeze(1)

            td_errors = y_j - q_values
            loss = (is_weights * td_errors.pow(2)).mean()

            loss.backward()
            optimizer.step()
            episode_loss += loss.item()

            # Update priorities in the buffer
            errors = td_errors.detach().cpu().numpy()
            buffer.update(idxs, errors)

        if t % TARGET_UPDATE_FREQ == 0:
            dqn_prime = deepcopy(dqn)

    episode_steps += 1

env.close()

Training Progress:   0%|          | 0/1500001 [00:00<?, ?it/s]

R: 2.00, l: 0.00, Mean Q: -0.01, e: 0.80:   2%|▏         | 32623/1500001 [00:38<29:11, 837.85it/s] 


KeyboardInterrupt: 

*The trainings were run on Kaggle, so the training logs are not included in this notebook.*

In [14]:
import pandas as pd

# Convert the dictionary to a DataFrame
df_plot_infos = pd.DataFrame(training_history)

# Save the DataFrame to a CSV file
df_plot_infos.to_csv("../data/rainbow_dqn_training_history.csv", index=False)

## Result
*The tests were run on Kaggle, so the tests logs are not included in this notebook.*

In [17]:
# Function to load model weights from checkpoint file
def load_checkpoint(model, checkpoint_file):
    model.load_state_dict(torch.load(checkpoint_file, map_location=device))
    model.eval()  # Set the model to evaluation mode (important for inference)


# Function to play a single episode and return the total reward
def play_episode(env, model):
    obs, info = env.reset()
    state = (
        torch.tensor(obs, dtype=torch.float32).unsqueeze(0).to(device)
    )  # Move to correct device
    total_reward = 0

    done = False
    while not done:
        if np.random.rand() < 0.05:
            action = env.action_space.sample()  # Random action with 5% probability
        else:
            with torch.no_grad():
                action = (
                    model(state).argmax(dim=1).item()
                )  # Choose action with highest Q-value
        next_obs, reward, done, truncated, info = env.step(action)
        next_state = torch.tensor(next_obs, dtype=torch.float32).unsqueeze(0).to(device)

        total_reward += reward
        state = next_state

        if done or truncated:
            break

    return total_reward


# Function to evaluate the model by playing 50 games
def evaluate_model(checkpoint_file, num_games=50):
    # Create the environment
    env = make_env("ALE/Breakout-v5", frame_skip=4)

    # Initialize model
    action_space = env.action_space.n
    model = DeepQNetwork(action_space).to(device)

    # Load the best checkpoint
    load_checkpoint(model, checkpoint_file)

    total_rewards = []
    for game in range(num_games):
        total_reward = play_episode(env, model)
        total_rewards.append(total_reward)
        print(f"Game {game + 1}, Reward: {total_reward}")

    # Calculate average reward
    avg_reward = np.mean(total_rewards)
    std_reward = np.std(total_rewards)
    max_reward = np.max(total_rewards)
    min_reward = np.min(total_rewards)

    print(f"Average Reward: {avg_reward}")
    print(f"Standard Deviation: {std_reward}")
    print(f"Max Reward: {max_reward}")
    print(f"Min Reward: {min_reward}")

    env.close()


# Call the function to evaluate the model
best_checkpoint_path = "checkpoint1500000.pth"
evaluate_model(best_checkpoint_path)

  model.load_state_dict(torch.load(checkpoint_file, map_location=device))
  torch.tensor(obs, dtype=torch.float32).unsqueeze(0).to(device)


Game 1, Reward: 22.0
Game 2, Reward: 13.0
Game 3, Reward: 23.0
Game 4, Reward: 11.0
Game 5, Reward: 15.0
Game 6, Reward: 17.0
Game 7, Reward: 15.0
Game 8, Reward: 23.0
Game 9, Reward: 23.0
Game 10, Reward: 20.0
Game 11, Reward: 15.0
Game 12, Reward: 12.0
Game 13, Reward: 25.0
Game 14, Reward: 22.0
Game 15, Reward: 23.0
Game 16, Reward: 38.0
Game 17, Reward: 30.0
Game 18, Reward: 30.0
Game 19, Reward: 37.0
Game 20, Reward: 21.0
Game 21, Reward: 21.0
Game 22, Reward: 19.0
Game 23, Reward: 12.0
Game 24, Reward: 33.0
Game 25, Reward: 27.0
Game 26, Reward: 11.0
Game 27, Reward: 12.0
Game 28, Reward: 16.0
Game 29, Reward: 22.0
Game 30, Reward: 10.0
Game 31, Reward: 35.0
Game 32, Reward: 16.0
Game 33, Reward: 15.0
Game 34, Reward: 29.0
Game 35, Reward: 25.0
Game 36, Reward: 31.0
Game 37, Reward: 9.0
Game 38, Reward: 19.0
Game 39, Reward: 17.0
Game 40, Reward: 12.0
Game 41, Reward: 23.0
Game 42, Reward: 11.0
Game 43, Reward: 21.0
Game 44, Reward: 23.0
Game 45, Reward: 32.0
Game 46, Reward: 16.