### Authors
Cathal Crowe - 21320128 \
Robert Flanagan - 21311625 \
Steven Lavelle - 21316945

### Execution
The code executes to the end without an error.

### References
https://keras.io/examples/rl/deep_q_network_breakout - Adapted network structure and hyperparameters
https://github.com/KJ-Waller/DQN-PyTorch-Breakout/blob/master/Breakout/DQN_model.py - Adapted network structure

In [None]:
# Import our dependencies
import os
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import gymnasium as gym
from gymnasium.wrappers import (
    AtariPreprocessing,
    FrameStackObservation,
    RecordVideo,
)
import ale_py
import matplotlib.pyplot as plt

In [None]:
# Configure our parameters

# Seed passed to the environment's first reset
seed = 42

# Optimisation settings
learning_rate = 1e-4
weight_decay = 1e-4
batch_size = 64
gamma = 0.99  # discount factor applied to the bootstrapped future value

# Length of the run
max_episodes = 1500
max_steps_per_episode = 1000

# Epsilon-greedy schedule. The per-episode multiplier is chosen so that
# epsilon_max * epsilon_decay ** max_episodes equals epsilon_min.
epsilon_max = 1
epsilon_min = 0.01
epsilon_decay = np.exp(np.log(epsilon_min / epsilon_max) / max_episodes)

# Replay memory capacity (in transitions) and update cadence (in frames)
replay_buffer_size = 100_000
target_update_frequency = 1000
start_training_after = 1000
update_after_actions = 4

# Directory that receives the recorded episode videos
video_folder = "recorded_episodes"

In [None]:
# Track highest reward and episode
highest_reward = 0
highest_reward_episode = 0

# Seed the global NumPy and PyTorch generators as well as the environment.
# Exploration and replay sampling draw from np.random and the network weights
# are initialised from torch's generator, so seeding only the environment
# would leave most of the run's randomness unseeded.
np.random.seed(seed)
torch.manual_seed(seed)

# Environment setup
os.makedirs(video_folder, exist_ok=True)

gym.register_envs(ale_py)
env = gym.make("ALE/Breakout-v5", render_mode="rgb_array")
# RecordVideo is applied before the preprocessing wrappers; it saves every
# 100th episode (episode indices 0, 100, 200, ...) into video_folder.
env = RecordVideo(
    env,
    video_folder=video_folder,
    episode_trigger=lambda x: x % 100 == 0,
)
# frame_skip=1 so the wrapper does not add frame skipping of its own on top of
# the environment's built-in skipping.
env = AtariPreprocessing(env, frame_skip=1)
# Stack the 4 most recent preprocessed frames into a single observation
env = FrameStackObservation(env, 4)

# The first reset seeds the environment's own random number generator
env.reset(seed=seed)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

In [None]:
# Define our model
# Take the number of actions from the environment instead of hard-coding it.
# ALE Breakout exposes four discrete actions (NOOP, FIRE, RIGHT, LEFT); with a
# hard-coded 3 the agent could only ever pick indices 0-2 and so could never
# choose the last action.
num_actions = int(env.action_space.n)
# Four stacked 84x84 frames, channels first
input_dim = (4, 84, 84)
# One Q-value per action
output_dim = num_actions

class DuelingDQN(nn.Module):
    """Convolutional Q-network with separate value and advantage heads.

    A shared stack of three convolutions feeds two fully connected heads: one
    producing a single state value and one producing an advantage per action.
    ``forward`` merges them into one Q-value per action.

    Args:
        input_dim: ``(channels, height, width)`` of a single observation.
        output_dim: Number of actions, i.e. the width of the advantage head.
    """

    def __init__(self, input_dim, output_dim):
        super().__init__()
        self.input_dim = input_dim
        in_channels, _height, _width = input_dim

        # Shared convolutional feature extractor
        conv_stack = [
            nn.Conv2d(in_channels, 32, kernel_size=8, stride=4, padding=2),
            nn.ReLU(),
            nn.Conv2d(32, 64, kernel_size=4, stride=2, padding=1),
            nn.ReLU(),
            nn.Conv2d(64, 64, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
        ]
        self.feature_layer = nn.Sequential(*conv_stack)

        # Both heads consume the flattened convolutional output
        flat_features = self.conv_output_dim()
        hidden_size = 512

        def build_head(out_features):
            # Two-layer MLP: flattened features -> hidden -> out_features
            return nn.Sequential(
                nn.Linear(flat_features, hidden_size),
                nn.ReLU(),
                nn.Linear(hidden_size, out_features),
            )

        # State-value head V(s): one scalar per observation
        self.value_stream = build_head(1)
        # Advantage head A(s, a): one entry per action
        self.advantage_stream = build_head(output_dim)

    def conv_output_dim(self):
        """Return the number of elements the conv stack emits for one input.

        Measured by pushing a single all-zero observation of shape
        ``input_dim`` through ``feature_layer``.
        """
        probe = torch.zeros(1, *self.input_dim)
        return self.feature_layer(probe).numel()

    def forward(self, x):
        """Compute Q-values for a batch of observations.

        Args:
            x: Batch of observations; the first dimension is the batch.

        Returns:
            Tensor with one row per observation and one column per action.
        """
        features = self.feature_layer(x)
        flat = features.view(features.shape[0], -1)

        state_value = self.value_stream(flat)
        advantages = self.advantage_stream(flat)

        # Subtract the per-row mean advantage before adding the state value
        centred_advantages = advantages - advantages.mean(dim=1, keepdim=True)
        return state_value + centred_advantages

# Initialize our model and target model
# The online network is trained directly; the target network starts as an
# exact copy of its weights.
model = DuelingDQN(input_dim, output_dim).to(device)
model_target = DuelingDQN(input_dim, output_dim).to(device)
initial_weights = model.state_dict()
model_target.load_state_dict(initial_weights)

# Only the online network's parameters are handed to the optimizer
optimizer = optim.Adam(
    model.parameters(),
    lr=learning_rate,
    weight_decay=weight_decay,
)

In [None]:
# Experience replay buffers: five parallel lists, each starting empty
state_history = []
action_history = []
rewards_history = []
state_next_history = []
done_history = []

# Run-wide bookkeeping
episode_reward_history = []
running_reward = 0
episode_count = 0
frame_count = 0

In [None]:
# Plot training performance
def plot_training_performance(episode_rewards, running_rewards):
    """Draw the episode rewards and the running rewards on one figure.

    Args:
        episode_rewards: Sequence of values plotted as 'Episode Reward'.
        running_rewards: Sequence of values plotted as 'Running Reward'.
    """
    plt.figure(figsize=(12, 6))
    curves = (
        (episode_rewards, 'Episode Reward'),
        (running_rewards, 'Running Reward'),
    )
    for values, curve_label in curves:
        plt.plot(values, label=curve_label)
    plt.title('Training Performance')
    plt.xlabel('Episode')
    plt.ylabel('Reward')
    plt.legend()
    plt.show()
    
# Plot epsilon decay
def plot_epsilon_decay(epsilon_values):
    """Draw the recorded epsilon values as a single line chart.

    Args:
        epsilon_values: Sequence of epsilon values, plotted in order.
    """
    figure_size = (12, 6)
    plt.figure(figsize=figure_size)
    plt.plot(epsilon_values, label="Epsilon")
    plt.title("Epsilon Decay Over Episodes")
    plt.ylabel("Epsilon Value")
    plt.xlabel("Episode")
    plt.legend()
    plt.show()

In [None]:
# Collect episode rewards, running rewards, and epsilon values across episodes
episode_rewards = []
running_rewards = []
epsilon_values = []

# Set epsilon to 1 for exploration
epsilon = epsilon_max

# Train the model
try:
    while True:
        observation, _ = env.reset()
        state = np.array(observation)
        episode_reward = 0

        # range(N) rather than range(1, N) so the per-episode cap really is
        # max_steps_per_episode steps.
        for timestep in range(max_steps_per_episode):
            frame_count += 1

            # Epsilon-greedy exploration. Actions are always random for the
            # first start_training_after frames, regardless of epsilon.
            if frame_count < start_training_after or np.random.rand(1)[0] < epsilon:
                action = np.random.choice(num_actions)
            else:
                with torch.no_grad():
                    # unsqueeze(0) adds the batch dimension the network expects
                    state_tensor = (
                        torch.tensor(state, dtype=torch.float32).unsqueeze(0).to(device)
                    )
                    action_probs = model(state_tensor)
                    action = action_probs.argmax().cpu().item()

            # Environment step. Gymnasium reports a natural end of episode
            # (terminated) separately from a cut-off (truncated); either one
            # means the environment must be reset before stepping again.
            state_next, reward, terminated, truncated, _ = env.step(action)
            state_next = np.array(state_next)
            done = terminated or truncated

            episode_reward += reward

            # Save experiences. The five lists are appended to and trimmed
            # together, so index i refers to the same transition in each.
            action_history.append(action)
            state_history.append(state)
            state_next_history.append(state_next)
            # Store only true termination: a truncated episode did not reach
            # a terminal state, so its target should still bootstrap.
            done_history.append(terminated)
            rewards_history.append(reward)
            state = state_next

            # Update network every update_after_actions frames, but only once
            # the random warm-up of start_training_after frames is over and
            # the buffer holds more than one batch.
            ready_to_train = (
                frame_count >= start_training_after
                and frame_count % update_after_actions == 0
                and len(done_history) > batch_size
            )
            if ready_to_train:
                # Sample batch (uniformly, with replacement). Passing the
                # length directly avoids materialising range(len(...)).
                indices = np.random.choice(len(done_history), size=batch_size)

                # Prepare batch tensors
                state_sample = torch.tensor(
                    np.array([state_history[i] for i in indices]), dtype=torch.float32
                ).to(device)
                state_next_sample = torch.tensor(
                    np.array([state_next_history[i] for i in indices]), dtype=torch.float32
                ).to(device)
                rewards_sample = torch.tensor(
                    [rewards_history[i] for i in indices], dtype=torch.float32
                ).to(device)
                action_sample = torch.tensor(
                    [action_history[i] for i in indices], dtype=torch.long
                ).to(device)
                done_sample = torch.tensor(
                    [float(done_history[i]) for i in indices], dtype=torch.float32
                ).to(device)

                # Double DQN logic - select action using the main network
                with torch.no_grad():
                    # Get actions from the main network
                    action_next = model(state_next_sample).argmax(1)
                    # Evaluate actions using the target network
                    target_q_values = (
                        model_target(state_next_sample)
                        .gather(1, action_next.unsqueeze(1))
                        .squeeze(1)
                    )
                    # (1 - done) removes the future term for terminal transitions
                    updated_q_values = rewards_sample + gamma * target_q_values * (
                        1 - done_sample
                    )

                # Compute Q-values of the actions that were actually taken
                q_values = model(state_sample)
                q_action = q_values.gather(1, action_sample.unsqueeze(1)).squeeze(1)

                # Compute loss
                loss = F.smooth_l1_loss(q_action, updated_q_values)

                # Optimize
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

            # Update target network
            if frame_count % target_update_frequency == 0:
                model_target.load_state_dict(model.state_dict())

            # Trim memory: drop the oldest transition from every list
            if len(rewards_history) > replay_buffer_size:
                for history in [
                    rewards_history,
                    state_history,
                    state_next_history,
                    action_history,
                    done_history,
                ]:
                    del history[:1]

            if done:
                break
        # Decay exploration (epsilon) after each episode
        epsilon = max(epsilon_min, epsilon * epsilon_decay)

        episode_count += 1

        # Update highest reward and episode
        if episode_reward > highest_reward:
            highest_reward = episode_reward
            highest_reward_episode = episode_count

        # Collect rewards for plotting; the running reward is the mean of the
        # most recent 100 episode rewards.
        episode_rewards.append(episode_reward)
        running_reward = np.mean(episode_rewards[-100:])
        running_rewards.append(running_reward)

        # Collect epsilon values for plotting
        epsilon_values.append(epsilon)

        print(
            f"Episode {episode_count} - Reward: {episode_reward:.3f}, "
            f"Running Reward: {running_reward:.3f}, Epsilon: {epsilon:.3f}, "
            f"Highest Reward: {highest_reward:.3f} (Episode {highest_reward_episode})"
        )

        # Termination condition
        if episode_count >= max_episodes:
            print(f"Stopped at episode {episode_count}!")
            # Save the trained model
            torch.save(model.state_dict(), "dqn_trained_model.pth")
            print("Model saved to dqn_trained_model.pth")
            break
finally:
    # Always close the environment, even if training is interrupted
    env.close()

# Plot the training performance
plot_training_performance(episode_rewards, running_rewards)

# Plot the epsilon decay
plot_epsilon_decay(epsilon_values)

# Why Reinforcement Learning is the machine learning paradigm of choice for this task
The goal of this project is to train a machine learning agent to play an Atari game. The game we have chosen is Breakout due to its simplicity and easily trackable reward. Reinforcement learning is a type of machine learning where the agent learns to make choices by interacting with its environment with the goal of maximizing its reward. There are a number of reasons why reinforcement learning is the most suitable choice. Reinforcement learning's ability to learn through interaction, optimize rewards and handle sequential decisions makes it the obvious choice for this task.
## Sequential Decision Making
Breakout is a game comprised of sequential decisions where, at every frame, the agent must decide to move left, move right or stay still. Each decision changes the game state and affects the next decision. Because of this, decisions need to be made with long-term strategy in mind instead of just focusing on short-term reward. This makes reinforcement learning the ideal choice, as it focuses on maximising long-term reward.
## Deep Learning
Reinforcement learning can leverage deep learning techniques in the form of a DQN. This allows it to handle the high dimensional input of a game like breakout.
## Exploration and Exploitation
To maximise reward, it is important not to converge on sub optimal strategies. Reinforcement learning utilises epsilon-greedy exploration to ensure there is a balance between exploiting (choosing the action the agent knows will get it more reward immediately) and exploring (making a decision that improves the agent's knowledge and might lead to better long-term reward).
## Unlabelled Data
Machine learning paradigms such as supervised learning that rely on labelled data would be unsuitable for this task, as games like Breakout have no explicit labels mapping states to the correct actions. This means the chosen paradigm must discover strategy through trial and error, which is exactly how reinforcement learning works.
# The Gym Environment
Gymnasium is a project that provides an API for all single agent reinforcement learning environments. For this project we use the "ALE/Breakout-v5" gym environment provided by The Arcade Learning Environment (ALE) which provides a simulation of Breakout allowing our agent to interact with and learn from the game through an interface. It outputs images that represent the game state which includes the important elements like the ball, the paddle and the bricks. The agent analyses these images to determine its actions. The agent has 3 actions to choose from on each frame these being moving left, moving right or staying still. The reward system is central to how our agent learns. In this game the agent earns rewards by breaking blocks and keeping the ball in play. By interacting with the environment, the agent learns the optimal strategies to achieve the highest reward.
# Implementation
## Capture and pre-processing of the data
### Pre-processing
Our data is pre-processed using the AtariPreprocessing wrapper imported from Gymnasium. All parameters are left on the default ones except that we specify our frame_skip parameter to 1, as we already skip 4 frames based on the environment’s default internal frame skipping with gym.make("ALE/Breakout-v5"), leaving this field blank would be stacking frame skipping leading to undesirable results. The screen size of the processed screen is set to 84px. We convert our frames to grayscale to reduce complexity as colour is of no interest to us. This reduces the input dimensionality into a single channel as opposed to 3 with RGB. Overall, our pre-processing is setup to reduce the complexity of our input, while preserving essential information for learning. This provides us with more efficient data to train off.
### Capture
We store episode rewards and running rewards in lists. Episode rewards are a list of the total reward earned in each episode. Running rewards are a list of the mean episode reward over the most recent 100 episodes, recorded after every episode, and are used to indicate the current state of the model. The environment is reset at the start of each episode, and the variable episode_reward tracks the total reward for the current episode. If exploration is selected (determined by the epsilon-greedy strategy), the agent randomly chooses an action. Otherwise, the agent uses the model to select the action with the highest value (exploitation). After taking an action, the environment returns the next state, the reward, and a flag stating whether the episode has finished. The agent stores experiences in lists, and once enough experiences are obtained, the agent samples a random batch of experiences for training. We do this to ensure that our model learns from diverse experiences. We then apply our double DQN update by computing target Q-values based on the target network’s predictions, which are then compared with the Q-values from the main network to calculate the loss. The network weights are then updated using backpropagation along with the Adam optimizer. The target network’s weights are updated periodically to match the main network. This allows us to stabilize training by using a fixed target for the Q-value updates. After each episode the epsilon decays to reduce the amount of exploration over time. We record every 100th episode as an mp4 video file to view our model in action and to be able to observe the results.
## The Network Structure 
Our network structure is built in accordance with a Dueling DQN architecture. Our network is built to handle an input of 4 frames of 84x84 pixels stacked on each other. Our output dimensions are set as the number of actions in Atari Breakout, which is 3 (move left, move right, don’t move). The number of channels is set to 4, one for each frame.
```python
num_actions = 3
input_dim = (4, 84, 84)
output_dim = num_actions

class DuelingDQN(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(DuelingDQN, self).__init__()
        self.input_dim = input_dim
        channels, _, _ = input_dim
```
For feature extraction, we have a sequence of 3 convolutional layers to extract spatial features in the input images. The first layer takes our 4 input channels and applies 32 filters with a kernel size of 8x8, a stride of 4 and a padding of 2. The second layer applies 64 filters with a kernel size of 4x4, a stride of 2 and a padding of 1. Finally, the third layer applies 64 filters with a kernel size 3, a stride of 1 and a padding of 1. We apply the ReLU activation  function after each convolution to help the network learn complex patterns. After our input passes through these layers, we are left with our feature map.
```python
self.feature_layer = nn.Sequential(
    nn.Conv2d(channels, 32, kernel_size=8, stride=4, padding=2),
    nn.ReLU(),
    nn.Conv2d(32, 64, kernel_size=4, stride=2, padding=1),
    nn.ReLU(),
    nn.Conv2d(64, 64, kernel_size=3, stride=1, padding=1),
    nn.ReLU(),
)
```
We have two separate streams in our network. The value stream is used to estimate the value of the state, which is the expected value returned by that state. To achieve this we use a fully connected layer. The first linear layer maps the output of our convolutional layers to a hidden layer of size 512. We then apply a ReLU activation function to introduce non-linearity. The second layer reduces this to a scalar output representing the state value.
```python
conv_output_size = self.conv_output_dim()
hidden_size = 512

self.value_stream = nn.Sequential(
    nn.Linear(conv_output_size, hidden_size),
    nn.ReLU(),
    nn.Linear(hidden_size, 1),
)
```
The advantage stream is used to estimate the advantage or relative benefit of each action in a given state. This is very similar to our value stream, however, instead of outputting a singular scalar value, we output a vector of size 3, where each element in this vector corresponds to each action.
```python
self.advantage_stream = nn.Sequential(
    nn.Linear(conv_output_size, hidden_size),
    nn.ReLU(),
    nn.Linear(hidden_size, output_dim),
)
```
The forward pass passes our input first through the feature layer to extract spatial features as stated previously. We then flatten the output to a 1D vector to make it suitable for our fully connected layers. We then compute the value and advantage using our streams. The final q value is then calculated by combining the value and advantage. This is the final Q-value for each action in a given state.
```python
def forward(self, x):
    x = self.feature_layer(x)
    x = x.view(x.shape[0], -1)
    value = self.value_stream(x)
    advantage = self.advantage_stream(x)
    q_values = value + (advantage - advantage.mean(dim=1, keepdim=True))
    return q_values
```
Our conv_output_dim function computes the output size of our convolutional layers. As we are decreasing the spatial dimensions of the input after each layer, we use this function to calculate the size of the feature map after passing through all convolutional layers.
```python
def conv_output_dim(self):
    x = torch.zeros(1, *self.input_dim)
    x = self.feature_layer(x)
    return int(np.prod(x.shape))
```
We initialize both our models of the Dueling DQN to support our Double DQN setup. We use the main network to select the next action, and the target network to evaluate the action selected by the main network. We then compute the updated Q-value based on this evaluation and change the Q-value to the updated one. We also initialize our Adam optimizer here with our learning rate and weight decay variables.
```python
model = DuelingDQN(input_dim, output_dim).to(device)
model_target = DuelingDQN(input_dim, output_dim).to(device)
model_target.load_state_dict(model.state_dict())
optimizer = optim.Adam(model.parameters(), learning_rate, weight_decay=weight_decay)
```
## Hyperparameters
We used various hyperparameters throughout creating this DQN for Atari Breakout. The hyperparameters had a large effect on the output of rewards and how successful the DQN was. We changed the hyperparameters multiple times with varying success, either increasing running rewards or significantly decreasing running rewards, causing our DQN to struggle to get a score above 4 in some cases.
```python
gamma = 0.99
learning_rate = 0.0001
max_episodes = 1500
epsilon_max = 1
epsilon_min = 0.01
epsilon_decay = np.exp(np.log(epsilon_min / epsilon_max) / max_episodes)
weight_decay = 0.0001
batch_size = 64
max_steps_per_episode = 1000
replay_buffer_size = 100000
target_update_frequency = 1000
start_training_after = 1000
update_after_actions = 4
```
### Gamma
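Gamma was set to 0.99 so that future rewards are discounted only slightly. This encourages the agent to maximize future rewards rather than only immediate ones, as the higher the score obtained, the greater the rewards received.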
### Learning Rate
We found that a learning rate of 0.0001 was sufficient as this allowed us to accurately narrow down on consistent running rewards. We used various other learning rates like 0.00025 and larger learning rates, but they gave us diminished rewards in comparison, converging earlier. We also tried a decreasing and increasing learning rate but a constant learning rate worked best with the decaying epsilon, when fine tuning. 
### Number of Episodes
We ran 1500 episodes, this gave us a convergence, we only ran 1500 episodes due to computational limitations. After 1500 episodes we got reduced rewards, with little overall change in episode rewards.
### Epsilon
We use a varied epsilon value that decayed exponentially over time, we start off running an epsilon value of 1 initially and this decays quickly in the beginning then slowing down to the minimum value of 0.01. This means the DQN will only make a random move 1% of the time, which means not much learning occurs here, and the DQN is more or less trained, as good as it is going to get. Our epsilon decays to 0.01 over the number of episodes ran so if we increase the number of episodes the decay is slower.
### Weight Decay
We use a weight decay value of 0.0001 through the Adam optimizer. This value appears to be optimal in nudging our weights toward zero to prevent them from growing excessively large. This value allowed us to stabilize our training at no additional overhead.
### Batch Size
We use a batch size of 64, this reduces variance in episode rewards in comparison with 32 that we used prior, and we also looked at 128 which didn’t have a noticeable difference, other than increasing computation needed. 64 provides a nice balance in providing previous experiences for training, while being computationally feasible.
### Max steps per episode
The max number of steps that can be taken per episode is set at 1000. This allows the DQN to take a variety of actions without getting stuck in a loop with no further reward. This allows for experimentation and increases learning.
### Replay Buffer Size
The replay buffer size was set to 100,000 this lets the DQN recall on past experiences, replacing old ones with newer ones once the buffer size has been reached. We had our DQN update every 1000 frames, this allowed us to vary the weights constantly while also training the DQN. If the target update frequency was increased by too much, it takes longer to converge overall running rewards.
### Target Update Frequency
We update our target network after every 1000 steps, as it balances stability and responsiveness. When updated too frequently, the target network became too similar to the main network, reducing stability. Infrequent updates caused the target network to become too outdated, leading to inaccurate Q-value targets
### Start Training After
We do not start training initially until after 1000 steps, allowing the replay buffer to have some information stored. The network then updates after the first 1000 steps occur, so until then random exploration takes place. This is quite slow as we train our model very early as epsilon is decaying over time and dictates our random movements.
### Update after actions
Updates occur after every 4 actions using batches of 64 that we mentioned before. This stops the network from making constant updates letting the network learn.
## Where the Q learning update is applied to the weights
The Q-learning update is applied inside the training loop, so the Q-value estimates are refined throughout the training of our agent. Each time the while loop begins, the environment is reset and the episode reward returns to 0. A for loop then keeps track of the timestep so that each episode runs for at most the configured number of steps, ensuring the agent doesn’t get stuck in a loop making no progress. The agent takes a random action while the frame count is still below start_training_after, or whenever a random number is less than epsilon, which will more than likely be true for as long as epsilon is still over 0.5. If both conditions are false, the agent goes with exploitation over exploration and the model predicts the action with the highest Q-value for the current state.
The else branch disables gradient computation, reducing overall computation and memory use. The state is converted to a PyTorch tensor, and unsqueeze adds a leading batch dimension so that the input matches the shape the network expects. The action is then selected based on which has the highest predicted Q-value.
```python
try:
    while True:
        observation, _ = env.reset()
        state = np.array(observation)
        episode_reward = 0
        for timestep in range(1, max_steps_per_episode):
            frame_count += 1
            if frame_count < start_training_after or np.random.rand(1)[0] < epsilon:
                action = np.random.choice(num_actions)
            else:
                with torch.no_grad():
                    state_tensor = (
                        torch.tensor(state, dtype=torch.float32).unsqueeze(0).to(device)
                    )
                    action_probs = model(state_tensor)
                    action = action_probs.argmax().cpu().item()
```
We store experiences containing information on state, action, reward, next state, done in a replay buffer. This allows our model to sample experiences from the replay buffer when training.
```python
action_history.append(action)
state_history.append(state)
state_next_history.append(state_next)
done_history.append(done)
rewards_history.append(reward)
state = state_next
```
Gradient computation is disabled so that the code inside does not affect the gradients, and a batch of samples is taken from our replay buffer. The main network computes the Q-values for the next states and selects the action with the highest Q-value. The target network then calculates the Q-value for that chosen action; gather is used to pick out the Q-value corresponding to each selected action. The updated Q-value then combines the observed reward with the target Q-value multiplied by the discount factor gamma, and this future term is zeroed out when the episode is done. Basically, the model selects the best actions and the target network evaluates those actions, which keeps the Q-values stable. The discounted target Q-values added to the rewards form the Bellman target.
```python
with torch.no_grad():
    action_next = model(state_next_sample).argmax(1)
    target_q_values = (
        model_target(state_next_sample)
        .gather(1, action_next.unsqueeze(1))
        .squeeze(1)
    )
    updated_q_values = rewards_sample + gamma * target_q_values * (
        1 - done_sample
    )
```
After the targets are computed, the Q-values are calculated by passing state_sample through the model. The model then outputs the Q-values, which represent the expected future rewards of each potential action in the current state. We use the gather function in order to retrieve the Q-value of the action that was actually taken, which is stored as q_action. Next, we calculate the loss by calculating the difference between the predicted Q-values and the updated Q-values with the F.smooth_l1_loss function. This function uses a squared error for small errors, which helps the agent make small adjustments for better accuracy. However, for large errors it takes the absolute error to reduce the impact of outliers. This loss value shows how far the model's predictions are from the target values. Next, we update the model by first resetting the optimizer with optimizer.zero_grad(), which clears the gradients from the previous steps. Then we call loss.backward() to compute the new gradients based on the current loss. Finally, optimizer.step() updates the model's parameters by applying the calculated gradients.
```python
q_values = model(state_sample)
q_action = q_values.gather(1, action_sample.unsqueeze(1)).squeeze(1)
loss = F.smooth_l1_loss(q_action, updated_q_values)
optimizer.zero_grad()
loss.backward()
optimizer.step()
```
## Independently researched concepts
### Random Seed Initialization
We use random seed initialization to ensure reproducibility in our models training and performance. To achieve this, we set a variable seed = 42 and set the gym environment accordingly. By setting the seed to a constant value, we can achieve consistent behaviour across our runs, allowing us to tune our hyperparameters effectively without considering the degree of variance in different seeds. Without using a seed, our results would vary due to the stochastic nature of the model training process, this would make it very difficult for us to evaluate the model’s performance.
```python
env.reset(seed=seed)
```
### Impact of Regularizers on Scores
We use regularization techniques to reduce the overfitting of the model and improve generalization. In our code, we implement L2 regularization through the weight_decay parameter passed to our Adam optimizer. The purpose of this is to penalize large weights during training by the addition of a term proportional to the sum of squared weights to the loss function, this effectively constrains our model’s complexity and improves stability. L2 regularization also encourages simplicity in our model to ensure a smoother convergence.
```python
optimizer = optim.Adam(model.parameters(), learning_rate, weight_decay=weight_decay)
```
### Techniques to counter Catastrophic Forgetting
We implemented techniques to counter catastrophic forgetting (the tendency of neural networks to forget / lose previously learned information when trained on new data). To accomplish this, we created an experience replay to store a buffer of past experiences that we can sample from throughout the training process. This method prevents the agent from solely focusing on recent experiences. We also have a second network (the target network) that updates less frequently, and we use this to calculate more stable Q-values.
```python
action_history, state_history, state_next_history = [], [], []
rewards_history, done_history = [], []
if frame_count % target_update_frequency == 0:
    model_target.load_state_dict(model.state_dict())
```
### Techniques to counter Maximization Bias (Double DQN)
Maximization Bias arises in Q-Learning when we use the same network for both action selection and Q-value estimates. This approach can often overestimate Q-values. To counter this problem, we implemented a double q-learning implementation that resolves this bias by separating the action selection and Q-value estimation. In other words, the main network selects our action, and the target network evaluates it, reducing the overestimation.
```python
with torch.no_grad():
    action_next = model(state_next_sample).argmax(1)
    target_q_values = (
        model_target(state_next_sample)
        .gather(1, action_next.unsqueeze(1))
        .squeeze(1)
    )
    updated_q_values = rewards_sample + gamma * target_q_values * (
        1 - done_sample
    )
```
### Dueling DQN
A Dueling DQN is an enhanced DQN used in reinforcement learning; it is better since it has increased stability and performance. This type of DQN uses two streams instead of the usual single stream to calculate the value function and advantage function. The two streams are then combined to create the final Q-values for each action an agent may take. Dueling DQNs allow for more efficient learning and overall achieve better results than basic single stream DQNs. In our implementation of the DQN model, we implemented a Dueling DQN. This gives our network the ability to differentiate between the quality of a state and the specific benefits attached to each action.
```python
self.value_stream = nn.Sequential(
    nn.Linear(conv_output_size, hidden_size),
    nn.ReLU(),
    nn.Linear(hidden_size, 1),
)
self.advantage_stream = nn.Sequential(
    nn.Linear(conv_output_size, hidden_size),
    nn.ReLU(),
    nn.Linear(hidden_size, output_dim),
)
```
### Loss
We made use of Smooth L1 Loss to stabilize the training. It does this in making our training more resilient to outliers and noisy transitions, thus “smoothing” out our data. This loss function is widely used in reinforcement learning, especially in Deep Q-Networks because it balances stability and adaptability.
```python
loss = F.smooth_l1_loss(q_action, updated_q_values)
```
# Plots
# Videos
# Evaluation of the results
Overall, our DQN achieves a reasonable average running reward of 16, frequently gets scores over 25 and on occasion can get a score of over 30 in later episodes. We are happy enough with this result, as there are so many hyperparameters that can be varied and many attempts received little to no increase in reward. We originally were stuck on an average running reward of 4 or 5 due to some attempts yielding little to no reward. We believe a higher score is achievable with an increase in episodes run and small changes to the hyperparameters to accommodate a slower learning rate. Our DQN trains quickly once training begins after 1000 steps. This is due to the agent's Q-value estimates improving constantly and epsilon (which affects randomness) becoming lower over time. Running reward then slows after reaching 500 episodes before picking up and climbing to a running reward of 16 and plateauing, indicating little change in policy. Plateauing occurs due to epsilon reducing to 0.01, which means the DQN only makes random moves 1% of the time. It is clear from initial episodes and videos that the agent figures out the ball drops to the far left upon start.

# References
https://keras.io/examples/rl/deep_q_network_breakout - Adapted network structure and hyperparameters<br>
https://github.com/KJ-Waller/DQN-PyTorch-Breakout/blob/master/Breakout/DQN_model.py - Adapted network structure<br>
https://gymnasium.farama.org<br>
https://ale.farama.org<br>
https://pytorch.org/docs<br>
Mnih, V., et al. 2013. Playing Atari with Deep Reinforcement Learning<br>
Mousavi, S.S., Schukat, M. and Howley, E. 2018. Deep Reinforcement Learning: An Overview