# Playing Atari with Deep Q Networks (DQN)

In this notebook, we'll train a Deep Q-Network (DQN) agent on an **Atari game** using a Gym environment (here `ALE/Breakout-v5`).

We'll:
- Understand how to preprocess Atari frames.
- Build a convolutional DQN architecture.
- Train and evaluate the agent.
- Discuss the improvements for stable learning.

## 1. Setup and Imports

In [None]:
!pip install gym gym[atari] gym[accept-rom-license] torch torchvision numpy matplotlib opencv-python --quiet

In [None]:
import gym
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import random
from collections import deque
import cv2
import matplotlib.pyplot as plt

# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print('Using device:', device)

## 2. Preprocessing Atari Frames

In [None]:
def preprocess_frame(frame, normalize=False):
    """Convert an RGB Atari frame to an 84x84 grayscale image.

    By default the pixel values are left unscaled (``uint8`` in ``[0, 255]``
    for ALE observations). ``ConvDQN.forward`` already divides its input by
    255, so scaling here as well would squash the network input into
    ``[0, 1/255]``. Keeping bytes also makes every frame stored in the replay
    buffer 8x smaller than a ``float64`` copy.

    Args:
        frame: RGB image array (height x width x 3), e.g. an environment
            observation.
        normalize: If True, return ``float32`` values scaled to ``[0, 1]``.
            Only use this with a consumer that does not rescale its input.

    Returns:
        An ``(84, 84)`` array; unscaled by default, ``float32`` in ``[0, 1]``
        when ``normalize`` is True.
    """
    gray = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY)
    # INTER_AREA is the recommended interpolation when shrinking an image.
    small = cv2.resize(gray, (84, 84), interpolation=cv2.INTER_AREA)
    if normalize:
        return small.astype(np.float32) / 255.0
    return small

## 3. Define the Convolutional DQN

In [None]:
class ConvDQN(nn.Module):
    """Convolutional Q-network: stacked frames in, one Q-value per action out.

    The network is three convolutional layers followed by two fully connected
    layers. ``forward`` divides its input by 255, so callers are expected to
    pass raw pixel intensities in ``[0, 255]`` (not pre-normalized frames).

    Args:
        action_size: Number of discrete actions (width of the output layer).
        input_shape: ``(channels, height, width)`` of a single observation.
            The default ``(4, 84, 84)`` gives 3136 flattened features.
    """

    def __init__(self, action_size, input_shape=(4, 84, 84)):
        super().__init__()
        feature_layers = [
            nn.Conv2d(input_shape[0], 32, kernel_size=8, stride=4),
            nn.ReLU(),
            nn.Conv2d(32, 64, kernel_size=4, stride=2),
            nn.ReLU(),
            nn.Conv2d(64, 64, kernel_size=3, stride=1),
            nn.ReLU(),
            nn.Flatten(),
        ]
        # Derive the flattened feature count from a dummy pass rather than
        # hard-coding 3136, so other frame sizes / stack depths also work.
        with torch.no_grad():
            probe = torch.zeros(1, *input_shape)
            n_features = nn.Sequential(*feature_layers)(probe).shape[1]

        # Layer positions match the original layout, so state_dict keys
        # (net.0, net.2, net.4, net.7, net.9) are unchanged.
        self.net = nn.Sequential(
            *feature_layers,
            nn.Linear(n_features, 512),
            nn.ReLU(),
            nn.Linear(512, action_size),
        )

    def forward(self, x):
        """Return Q-values of shape ``(batch, action_size)`` for input ``x``."""
        # Scale raw pixel intensities into [0, 1] before the conv stack.
        x = x / 255.0
        return self.net(x)

## 4. Replay Buffer for Experience Replay

In [None]:
class ReplayBuffer:
    """Bounded store of transitions with uniform random sampling.

    Transitions are kept in a plain list used as a ring buffer: once
    ``capacity`` items are stored, each new transition overwrites the oldest
    one. A list gives O(1) random access, whereas indexing into the middle of
    a large ``deque`` (as ``random.sample`` does) is not constant time.

    Args:
        capacity: Maximum number of transitions to keep. ``None`` means
            unbounded; ``0`` discards everything pushed.

    Raises:
        ValueError: If ``capacity`` is negative.
    """

    def __init__(self, capacity=100000):
        if capacity is not None and capacity < 0:
            raise ValueError("capacity must be non-negative")
        self.capacity = capacity
        self.buffer = []
        self._oldest = 0  # index of the next slot to overwrite once full

    def push(self, transition):
        """Store ``transition``, evicting the oldest entry when full."""
        limit = self.capacity
        if limit is None or len(self.buffer) < limit:
            self.buffer.append(transition)
        elif limit > 0:
            self.buffer[self._oldest] = transition
            self._oldest = (self._oldest + 1) % limit

    def sample(self, batch_size):
        """Draw ``batch_size`` distinct transitions uniformly at random.

        Each transition is unpacked as
        ``(state, action, reward, next_state, done)``.

        Returns:
            ``(states, actions, rewards, next_states, dones)`` where
            ``states`` and ``next_states`` are arrays stacked along a new
            leading batch axis and the other three are tuples.

        Raises:
            ValueError: If ``batch_size`` exceeds the number of stored
                transitions (raised by ``random.sample``).
        """
        batch = random.sample(self.buffer, batch_size)
        states, actions, rewards, next_states, dones = zip(*batch)
        return np.stack(states), actions, rewards, np.stack(next_states), dones

    def __len__(self):
        """Return the number of transitions currently stored."""
        return len(self.buffer)

## 5. Environment and Hyperparameters

In [None]:
# --- Hyperparameters ---------------------------------------------------
batch_size = 32          # transitions drawn from replay memory per update
gamma = 0.99             # discount factor applied to the bootstrapped target
epsilon = 1.0            # starting probability of taking a random action
epsilon_min = 0.1        # floor for epsilon
epsilon_decay = 0.9995   # multiplicative epsilon decay
target_update = 1000     # presumably the target-network sync interval; confirm against the training loop
episodes = 200           # number of training episodes
reward_history = []      # total reward of each finished episode

# --- Environment ---------------------------------------------------------
env = gym.make('ALE/Breakout-v5', render_mode=None)
action_size = env.action_space.n

# --- Networks, optimizer and replay memory -------------------------------
# The target network starts out as an exact copy of the online network.
q_net = ConvDQN(action_size).to(device)
target_net = ConvDQN(action_size).to(device)
target_net.load_state_dict(q_net.state_dict())

optimizer = optim.Adam(q_net.parameters(), lr=1e-4)
memory = ReplayBuffer(100000)

## 6. Stack Frames for Temporal Context

In [None]:
from collections import deque
def stack_frames(state, stack):
    """Preprocess ``state``, add it to ``stack``, and return the stacked array.

    ``stack`` is modified in place. If it holds fewer than four frames after
    the new one is added (e.g. right after a reset), the most recent frame is
    repeated until four are present.

    Args:
        state: Raw frame, passed to ``preprocess_frame``.
        stack: Mutable sequence of already-processed frames.

    Returns:
        ``(stacked_state, stack)``: the frames stacked along a new leading
        axis, and the same ``stack`` object that was passed in.
    """
    processed = preprocess_frame(state)
    stack.append(processed)

    # Pad a short history by duplicating the newest frame.
    while len(stack) < 4:
        stack.append(stack[-1])

    return np.stack(stack, axis=0), stack

## 7. Training Loop (Simplified)

In [None]:
# DQN training: act epsilon-greedily, store every transition in replay
# memory, and take one gradient step per environment step once a full batch
# is available.
huber_loss = nn.SmoothL1Loss()  # stateless, so build it once rather than every step
global_step = 0  # environment steps across all episodes; drives target-network syncs

for ep in range(episodes):
    state, _ = env.reset()
    frame_stack = deque(maxlen=4)
    state, frame_stack = stack_frames(state, frame_stack)
    total_reward = 0
    done = False

    while not done:
        if random.random() < epsilon:
            action = env.action_space.sample()
        else:
            # Inference only: no autograd graph is needed to pick an action.
            with torch.no_grad():
                state_t = torch.as_tensor(state, dtype=torch.float32, device=device).unsqueeze(0)
                action = q_net(state_t).argmax(dim=1).item()

        # step() returns (obs, reward, terminated, truncated, info). The
        # episode is over when either flag is set; ignoring `truncated`
        # would keep stepping an environment that has already ended.
        next_state, reward, terminated, truncated, _ = env.step(action)
        done = terminated or truncated
        next_state, frame_stack = stack_frames(next_state, frame_stack)
        # Store `terminated` (not `done`): a time-limit truncation is not a
        # true terminal state, so the target should still bootstrap from it.
        memory.push((state, action, reward, next_state, terminated))
        state = next_state
        total_reward += reward
        global_step += 1

        if len(memory) >= batch_size:
            states, actions, rewards, next_states, dones = memory.sample(batch_size)
            states = torch.as_tensor(states, dtype=torch.float32, device=device)
            next_states = torch.as_tensor(next_states, dtype=torch.float32, device=device)
            actions = torch.as_tensor(np.asarray(actions), dtype=torch.int64, device=device).unsqueeze(1)
            rewards = torch.as_tensor(np.asarray(rewards), dtype=torch.float32, device=device).unsqueeze(1)
            dones = torch.as_tensor(np.asarray(dones), dtype=torch.float32, device=device).unsqueeze(1)

            # Q(s, a) for the actions that were actually taken.
            q_values = q_net(states).gather(1, actions)
            # The TD target is a constant w.r.t. the online network, so it is
            # computed without tracking gradients through the target network.
            with torch.no_grad():
                next_q = target_net(next_states).max(1)[0].unsqueeze(1)
                target = rewards + gamma * next_q * (1 - dones)

            loss = huber_loss(q_values, target)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        # Sync the target network every `target_update` environment steps.
        if global_step % target_update == 0:
            target_net.load_state_dict(q_net.state_dict())

        epsilon = max(epsilon_min, epsilon * epsilon_decay)

    reward_history.append(total_reward)

    if ep % 10 == 0:
        print(f"Episode {ep}, Reward: {total_reward}, Epsilon: {epsilon:.3f}")

## 8. Visualize Training Rewards

In [None]:
# Plot the total reward collected in each training episode.
plt.plot(reward_history)
plt.title('DQN Training on Atari Breakout')
plt.xlabel('Episode')
plt.ylabel('Reward')
plt.show()

## Summary

- Built a convolutional **Deep Q-Network (DQN)** for Atari gameplay.
- Applied **frame stacking**, **experience replay**, and a periodically synced **target network** for stable learning.
- Trained the model using **epsilon-greedy** exploration.

This is the foundation for more advanced algorithms like **Double DQN**, **Dueling DQN**, and **Rainbow DQN**.