In [None]:
# 🔧 Setup: Run this cell first!
# Check GPU availability, report library versions, and seed the random number generators

import torch
import sys

# Select the compute device once, then report what was found
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
if device.type == 'cuda':
    print(f"✅ GPU available: {torch.cuda.get_device_name(0)}")
    print(f"   Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")
else:
    print("⚠️ No GPU detected. Some cells may run slowly.")
    print("   Go to Runtime → Change runtime type → GPU")

# Interpreter and framework versions, useful when comparing runs
python_version = sys.version.split()[0]
print(f"\n📦 Python {python_version}")
print(f"🔥 PyTorch {torch.__version__}")

# Seed every random number generator the notebook draws from
import random
import numpy as np

SEED = 42
for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    seed_fn(SEED)
if device.type == 'cuda':
    # Covers all visible GPUs, not just the current one
    torch.cuda.manual_seed_all(SEED)

print(f"🎲 Random seed set to {SEED}")

%matplotlib inline

# Training a DQN Agent to Play Pong -- Vizuara

## 1. Why Does This Matter?

In the first two notebooks, we built every component of the DQN algorithm: the convolutional neural network (Notebook 1), the replay buffer, and the target network (Notebook 2). Now it is time to put them all together and train an agent to play an actual Atari game.

We will train a DQN agent on **Pong** -- the classic game where two paddles hit a ball back and forth. The agent controls one paddle, and the built-in game AI controls the other. The score ranges from -21 (complete loss) to +21 (complete win). A random agent scores about -21 because it almost never hits the ball.

By the end of this notebook, you will have a fully trained DQN agent that learns to play Pong from raw pixels -- no hand-crafted features, no game-specific knowledge. The code you write here follows essentially the same algorithm DeepMind introduced in their groundbreaking 2013 paper, plus the target network added in the 2015 Nature follow-up (Mnih et al., 2015).

We will use a lightweight environment wrapper to keep training under 10 minutes on a T4 GPU.

---

## 2. Building Intuition

### The epsilon-greedy exploration schedule

Before we start training, let us understand how the agent explores. DQN uses epsilon-greedy exploration with **annealing**: epsilon starts high (fully random) and decreases over time (mostly greedy).

In [None]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import matplotlib.pyplot as plt
import random
from collections import deque
import time

# Epsilon schedule
def get_epsilon(step, eps_start=1.0, eps_end=0.1, decay_steps=100000):
    """Linear epsilon decay.

    Epsilon falls by ``(eps_start - eps_end) / decay_steps`` per step and is
    floored at ``eps_end``.

    Args:
        step: Number of environment steps taken so far.
        eps_start: Epsilon at step 0.
        eps_end: Lower bound that epsilon never drops below.
        decay_steps: Number of steps over which epsilon reaches ``eps_end``.
            A value of zero or less means "no annealing window" and yields
            ``eps_end`` immediately instead of dividing by zero.

    Returns:
        The exploration probability to use at ``step``.
    """
    if decay_steps <= 0:
        # Nothing to anneal over: skip the division and use the final value.
        return eps_end
    return max(eps_end, eps_start - step * (eps_start - eps_end) / decay_steps)

# Evaluate the default schedule over the first 200k steps
steps = np.arange(0, 200000)
epsilons = list(map(get_epsilon, steps))

# Draw the curve with the explicit Axes interface
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(steps, epsilons, color='#3498db', linewidth=2)
# The dashed line marks where the default decay window (100k steps) ends
ax.axvline(x=100000, color='gray', linestyle='--', alpha=0.5)
ax.text(50000, 0.7, 'Exploration phase\n(mostly random)',
        ha='center', fontsize=12, color='#e74c3c')
ax.text(150000, 0.2, 'Exploitation phase\n(mostly greedy)',
        ha='center', fontsize=12, color='#2ecc71')
ax.set_xlabel('Training Steps', fontsize=12)
ax.set_ylabel('Epsilon', fontsize=12)
ax.set_title('Epsilon-Greedy Annealing Schedule', fontsize=14, fontweight='bold')
ax.grid(True, alpha=0.3)
fig.tight_layout()
plt.show()

# Spot-check a few steps to make the schedule concrete
for t in (0, 25000, 50000, 75000, 100000, 150000):
    eps = get_epsilon(t)
    random_pct = eps * 100
    greedy_pct = (1 - eps) * 100
    print(f"  Step {t:>7,}: epsilon = {eps:.3f} ({random_pct:.1f}% random, {greedy_pct:.1f}% greedy)")

### How Pong works for the agent

In [None]:
# Let us understand the Pong action space
# This cell only prints a description; it does not query the environment.
# NOTE(review): the emulator is believed to report actions 2-5 as
# RIGHT/LEFT/RIGHTFIRE/LEFTFIRE, which move the paddle up/down in Pong --
# confirm with env.unwrapped.get_action_meanings().
print("Pong Action Space:")
print("  0: NOOP (do nothing)")
print("  1: FIRE (start the game / not used during play)")
print("  2: UP   (move paddle up)")
print("  3: DOWN (move paddle down)")
print("  4: UP + FIRE")
print("  5: DOWN + FIRE")
print()
print("Effective actions: UP, DOWN, NOOP")
print("The agent learns which action maximizes long-term score.")
print()
# Reward structure as seen by the agent after clipping
print("Reward signal:")
print("  +1 when the agent scores a point")
print("  -1 when the opponent scores a point")
print("   0 at all other time steps")
print()
print("A random agent scores about -21 (loses every rally).")
print("A trained DQN agent typically reaches +18 to +21 (wins almost every rally).")

---

## 3. The Mathematics

The complete DQN algorithm combines everything we have built:

**Epsilon-greedy policy:**

$$a = \begin{cases} \text{random action} & \text{with probability } \epsilon \\ \arg\max_a Q(s, a; \theta) & \text{with probability } 1 - \epsilon \end{cases}$$

**Loss function:**

$$L(\theta) = \mathbb{E}_{(s,a,r,s') \sim \mathcal{D}} \left[ \left( r + \gamma \max_{a'} Q(s', a'; \theta^-) - Q(s, a; \theta) \right)^2 \right]$$

**Epsilon annealing:**

$$\epsilon(t) = \max\left(\epsilon_{\text{end}},\; \epsilon_{\text{start}} - t \cdot \frac{\epsilon_{\text{start}} - \epsilon_{\text{end}}}{\text{decay\_steps}}\right)$$

In [None]:
# Hyperparameter table: the published value first, with the value used by
# this notebook's defaults in parentheses where it differs
banner = "=" * 55
print(banner)
print("DQN Hyperparameters (Mnih et al., 2015)")
print(banner)

hyperparams = dict([
    ("Replay buffer size",      "1,000,000 (we use 50,000)"),
    ("Mini-batch size",         "32"),
    ("Discount factor (gamma)", "0.99"),
    ("Learning rate",           "0.00025 (RMSProp) / 1e-4 (Adam)"),
    ("Epsilon start",           "1.0"),
    ("Epsilon end",             "0.1"),
    ("Epsilon decay frames",    "1,000,000 (we use 50,000)"),
    ("Target update freq (C)",  "10,000 (we use 1,000)"),
    ("Training start",          "50,000 frames (we use 1,000)"),
    ("Frame skip",              "4"),
    ("Frame stack",             "4"),
])

# One row per setting, names left-aligned in a 30-character column
for name, setting in hyperparams.items():
    print(f"  {name:30s} {setting}")

---

## 4. Let's Build It -- Component by Component

### Step 1: All components

In [None]:
# DQN network
class DQN(nn.Module):
    """Convolutional Q-network: a stack of 4 frames in, one Q-value per action out.

    The fully connected head takes 64 * 7 * 7 features, which is the size the
    three convolutions produce for 84x84 inputs (84 -> 20 -> 9 -> 7).
    """

    def __init__(self, n_actions):
        super().__init__()
        # Feature extractor: three strided convolutions, each followed by ReLU
        feature_layers = [
            nn.Conv2d(4, 32, kernel_size=8, stride=4),
            nn.ReLU(),
            nn.Conv2d(32, 64, kernel_size=4, stride=2),
            nn.ReLU(),
            nn.Conv2d(64, 64, kernel_size=3, stride=1),
            nn.ReLU(),
        ]
        self.conv = nn.Sequential(*feature_layers)

        # Value head: hidden layer of 512 units, then one output per action
        head_layers = [
            nn.Linear(64 * 7 * 7, 512),
            nn.ReLU(),
            nn.Linear(512, n_actions),
        ]
        self.fc = nn.Sequential(*head_layers)

    def forward(self, x):
        """Return Q-values of shape (batch, n_actions) for a batch of frame stacks."""
        features = self.conv(x)
        # Collapse channel and spatial dimensions, keeping the batch dimension
        flat = features.flatten(start_dim=1)
        return self.fc(flat)

# Replay buffer
class ReplayBuffer:
    """Fixed-capacity store of transitions with uniform random sampling."""

    def __init__(self, capacity):
        # A bounded deque drops the oldest transition once capacity is reached
        self.buffer = deque(maxlen=capacity)

    def push(self, state, action, reward, next_state, done):
        """Append one (state, action, reward, next_state, done) transition."""
        transition = (state, action, reward, next_state, done)
        self.buffer.append(transition)

    def sample(self, batch_size):
        """Draw ``batch_size`` distinct transitions and batch them into tensors.

        Returns:
            (states, actions, rewards, next_states, dones), where states and
            next_states are stacked along a new leading dimension, actions are
            int64, rewards are float32 and dones are bool.
        """
        picked = random.sample(self.buffer, batch_size)
        # Transpose the list of transitions into one tuple per field
        states, actions, rewards, next_states, dones = zip(*picked)
        state_batch = torch.stack(states)
        action_batch = torch.tensor(actions, dtype=torch.long)
        reward_batch = torch.tensor(rewards, dtype=torch.float32)
        next_state_batch = torch.stack(next_states)
        done_batch = torch.tensor(dones, dtype=torch.bool)
        return state_batch, action_batch, reward_batch, next_state_batch, done_batch

    def __len__(self):
        return len(self.buffer)

print("DQN, ReplayBuffer defined. Ready to train.")

### Step 2: Environment setup

In [None]:
# Install gymnasium with Atari support.
# pip is invoked through the kernel's own interpreter so the packages land in
# the environment this notebook imports from, and a failed install is reported
# instead of being silently discarded with the captured output.
import subprocess
import sys

install_result = subprocess.run(
    [sys.executable, '-m', 'pip', 'install', '-q',
     'gymnasium[atari]', 'gymnasium[accept-rom-license]'],
    capture_output=True, text=True)
if install_result.returncode != 0:
    print("pip install failed; the Atari environment may be unavailable.")
    # Show only the tail of pip's error output to keep the cell readable
    print(install_result.stderr.strip()[-2000:])

import gymnasium as gym
import cv2

class AtariWrapper:
    """
    Wraps an Atari environment with DQN preprocessing.
    - Frame skip (repeat action N times)
    - Grayscale + resize to 84x84
    - Stack 4 frames
    - Clip rewards to {-1, 0, +1}

    Args:
        env_name: Gymnasium environment id passed to ``gym.make``.
        frame_skip: How many emulator frames each chosen action is repeated for.
        stack_size: Number of most recent preprocessed frames that form a state.

    Raises:
        ValueError: If ``frame_skip`` or ``stack_size`` is smaller than 1.
    """
    def __init__(self, env_name="PongNoFrameskip-v4", frame_skip=4, stack_size=4):
        # step() needs at least one emulator frame to have an observation
        if frame_skip < 1:
            raise ValueError(f"frame_skip must be >= 1, got {frame_skip}")
        if stack_size < 1:
            raise ValueError(f"stack_size must be >= 1, got {stack_size}")
        self.env = gym.make(env_name)
        self.frame_skip = frame_skip
        self.stack_size = stack_size
        # Bounded deque: appending a new frame evicts the oldest one
        self.frames = deque(maxlen=stack_size)
        self.n_actions = self.env.action_space.n

    def _preprocess(self, frame):
        """Convert one RGB frame to an 84x84 float32 grayscale image.

        Pixel values are divided by 255; this assumes 8-bit input frames.
        """
        gray = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY)
        resized = cv2.resize(gray, (84, 84))
        return resized.astype(np.float32) / 255.0

    def _stacked_state(self):
        """Return the current frame stack as a (stack_size, 84, 84) tensor."""
        return torch.tensor(np.array(list(self.frames)))

    def reset(self, seed=None):
        """Start a new episode and return the initial stacked state.

        Args:
            seed: Optional seed forwarded to the underlying environment's
                ``reset`` so that episodes can be reproduced. ``None`` keeps
                the environment's own default behaviour.
        """
        obs, info = self.env.reset(seed=seed)
        frame = self._preprocess(obs)
        # Fill the whole stack with the first frame so the state is full-sized
        for _ in range(self.stack_size):
            self.frames.append(frame)
        return self._stacked_state()

    def step(self, action):
        """Repeat ``action`` for ``frame_skip`` frames.

        Returns:
            (state, clipped_reward, done): the updated frame stack, the summed
            reward over the skipped frames clipped to [-1, 1], and whether the
            episode was terminated or truncated.
        """
        total_reward = 0.0
        done = False
        for _ in range(self.frame_skip):
            obs, reward, terminated, truncated, info = self.env.step(action)
            total_reward += reward
            done = terminated or truncated
            if done:
                # Do not step an environment that has already finished
                break
        # Only the last observed frame of the skip window enters the stack
        self.frames.append(self._preprocess(obs))
        state = self._stacked_state()
        # Clip reward
        clipped_reward = np.clip(total_reward, -1.0, 1.0)
        return state, clipped_reward, done

    def close(self):
        """Release the underlying environment."""
        self.env.close()

# Probe for the Atari environment: build it once, show the initial state,
# and remember in ATARI_AVAILABLE whether that worked
try:
    env = AtariWrapper("PongNoFrameskip-v4")
    state = env.reset()
    print("Environment: PongNoFrameskip-v4")
    print(f"State shape: {state.shape}")
    print(f"Number of actions: {env.n_actions}")

    # Show what the agent sees: one panel per stacked frame
    fig, axes = plt.subplots(1, 4, figsize=(16, 4))
    for frame_no in range(1, 5):
        panel = axes[frame_no - 1]
        panel.imshow(state[frame_no - 1].numpy(), cmap='gray')
        panel.set_title(f'Frame {frame_no}')
        panel.axis('off')
    fig.suptitle('What the DQN agent sees: 4 stacked frames', fontweight='bold')
    plt.tight_layout()
    plt.show()
    env.close()
except Exception as err:
    # Any failure (missing ROMs, missing packages, ...) triggers the fallback
    print(f"Atari environment not available: {err}")
    print("We will use a substitute environment for training.")
    ATARI_AVAILABLE = False
else:
    ATARI_AVAILABLE = True

### Step 3: Substitute environment (fallback)

In [None]:
class PongSubstitute:
    """
    A simplified Pong-like environment for when Atari ROMs are unavailable.
    Ball bounces, paddle moves up/down. Similar reward structure.

    The playing field is an 84x84 float32 image. The agent's paddle is drawn
    on the right, the scripted opponent on the left, and a state is the stack
    of the 4 most recent rendered frames. An episode ends after 2000 steps.

    NOTE(review): the opponent follows the ball by up to 2 pixels per step
    while the ball's vertical speed stays below 2, so it looks unable to miss
    -- confirm whether a +1 reward is actually reachable.
    """
    def __init__(self):
        self.n_actions = 3  # 0=stay, 1=up, 2=down
        self.size = 84
        self.reset()

    def _serve(self):
        """Put the ball back at the centre, heading toward the agent's side."""
        self.ball_x = 42
        self.ball_y = 42
        self.ball_dx = 2
        self.ball_dy = random.choice([-1, 0, 1])

    def _stacked_state(self):
        """Return the current frame stack as a (4, 84, 84) float32 tensor."""
        return torch.tensor(np.array(list(self.frames)))

    def reset(self):
        """Restart the episode and return the initial stacked state."""
        self.paddle_y = 42
        self._serve()
        self.opponent_y = 42
        self.score = 0  # reset here; step() does not update it
        self.steps = 0
        self.frames = deque(maxlen=4)
        frame = self._render()
        # Fill the stack with the first frame so the state is full-sized
        for _ in range(4):
            self.frames.append(frame)
        return self._stacked_state()

    def _render(self):
        """Draw paddles, ball and centre line into a fresh 84x84 image.

        Positions are truncated to integers before slicing: the ball and the
        opponent paddle take float (NumPy) values once the ball has bounced
        off a paddle, and float slice indices raise a TypeError.
        """
        img = np.zeros((84, 84), dtype=np.float32)
        # Agent paddle (right side)
        py = int(max(0, min(self.size-12, self.paddle_y - 6)))
        img[py:py+12, 78:80] = 1.0
        # Opponent paddle (left side)
        oy = int(max(0, min(self.size-12, self.opponent_y - 6)))
        img[oy:oy+12, 4:6] = 1.0
        # Ball
        bx = max(0, min(self.size-3, int(self.ball_x)))
        by = max(0, min(self.size-3, int(self.ball_y)))
        img[by:by+3, bx:bx+3] = 1.0
        # Center line
        img[::4, 42] = 0.3
        return img

    def step(self, action):
        """Advance the game by one step.

        Args:
            action: 0 = stay, 1 = move the paddle up, 2 = move it down.

        Returns:
            (state, reward, done): the updated frame stack, -1 when the agent
            misses the ball, +1 when the opponent misses, 0 otherwise, and
            whether the 2000-step episode limit has been reached.
        """
        # Move paddle
        if action == 1:
            self.paddle_y = max(6, self.paddle_y - 3)
        elif action == 2:
            self.paddle_y = min(78, self.paddle_y + 3)

        # Move ball
        self.ball_x += self.ball_dx
        self.ball_y += self.ball_dy

        # Bounce off top/bottom
        if self.ball_y <= 1 or self.ball_y >= 82:
            self.ball_dy = -self.ball_dy

        # Check paddle hit (right side - agent)
        reward = 0.0
        done = False
        if self.ball_x >= 76:
            if abs(self.ball_y - self.paddle_y) < 8:
                self.ball_dx = -abs(self.ball_dx)
                # Off-centre hits deflect the ball more steeply
                self.ball_dy = (self.ball_y - self.paddle_y) / 4
            else:
                reward = -1.0
                self._serve()

        # Opponent paddle (simple AI)
        if self.ball_x < 10:
            if abs(self.ball_y - self.opponent_y) < 8:
                self.ball_dx = abs(self.ball_dx)
                self.ball_dy = (self.ball_y - self.opponent_y) / 4
            else:
                reward = 1.0
                self._serve()

        # The opponent chases the ball, limited to 2 pixels per step
        self.opponent_y += np.clip(self.ball_y - self.opponent_y, -2, 2)

        self.steps += 1
        if self.steps >= 2000:
            done = True

        frame = self._render()
        self.frames.append(frame)
        state = self._stacked_state()
        return state, np.clip(reward, -1, 1), done

    def close(self):
        """Nothing to release; present for interface parity with AtariWrapper."""
        pass

# Smoke-test the substitute environment and display its initial state
sub_env = PongSubstitute()
state = sub_env.reset()
print(f"Substitute env -- State shape: {state.shape}, Actions: {sub_env.n_actions}")

# One panel per stacked frame
fig, axes = plt.subplots(1, 4, figsize=(16, 4))
for frame_no in range(1, 5):
    panel = axes[frame_no - 1]
    panel.imshow(state[frame_no - 1].numpy(), cmap='gray')
    panel.set_title(f'Frame {frame_no}')
    panel.axis('off')
fig.suptitle('Substitute Pong Environment', fontweight='bold')
plt.tight_layout()
plt.show()
sub_env.close()

### Step 4: The complete training loop

In [None]:
def train_dqn(env_class, n_episodes=500, max_steps=2000,
              buffer_size=50000, batch_size=32, gamma=0.99,
              lr=1e-4, target_update=1000,
              eps_start=1.0, eps_end=0.1, eps_decay=50000,
              min_buffer=1000):
    """
    Complete DQN training loop.

    Args:
        env_class: "atari" trains on AtariWrapper("PongNoFrameskip-v4"); any
            other value trains on PongSubstitute.
        n_episodes: Number of episodes to run.
        max_steps: Maximum agent steps per episode before it is cut off.
        buffer_size: Capacity of the replay buffer.
        batch_size: Transitions sampled per gradient step.
        gamma: Discount factor used in the TD target.
        lr: Adam learning rate.
        target_update: Copy the online weights to the target network every
            this many environment steps.
        eps_start, eps_end, eps_decay: Passed to get_epsilon.
        min_buffer: Transitions to collect before learning starts. Learning
            never starts before the buffer holds ``batch_size`` transitions,
            because a batch cannot be sampled from fewer.

    Returns: episode_rewards, losses, trained online_net
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Create environment
    if env_class == "atari":
        env = AtariWrapper("PongNoFrameskip-v4")
    else:
        env = PongSubstitute()
    n_actions = env.n_actions

    # Sampling batch_size distinct transitions needs at least that many stored
    learn_threshold = max(min_buffer, batch_size)

    # try/finally so the environment is released even when training fails or
    # the cell is interrupted
    try:
        # Networks
        online_net = DQN(n_actions).to(device)
        target_net = DQN(n_actions).to(device)
        target_net.load_state_dict(online_net.state_dict())
        target_net.eval()

        optimizer = optim.Adam(online_net.parameters(), lr=lr)
        replay_buffer = ReplayBuffer(buffer_size)

        # Tracking
        episode_rewards = []
        losses = []
        step_count = 0
        best_reward = -float('inf')
        start_time = time.time()

        for episode in range(n_episodes):
            state = env.reset()
            episode_reward = 0

            for t in range(max_steps):
                # Epsilon-greedy action selection
                epsilon = get_epsilon(step_count, eps_start, eps_end, eps_decay)

                if random.random() < epsilon:
                    action = random.randint(0, n_actions - 1)
                else:
                    with torch.no_grad():
                        q_vals = online_net(state.unsqueeze(0).to(device))
                        action = q_vals.argmax(dim=1).item()

                # Environment step
                next_state, reward, done = env.step(action)
                replay_buffer.push(state, action, reward, next_state, done)
                state = next_state
                episode_reward += reward
                step_count += 1

                # Training step
                if len(replay_buffer) >= learn_threshold:
                    s, a, r, ns, d = replay_buffer.sample(batch_size)
                    s, a, r, ns, d = (s.to(device), a.to(device), r.to(device),
                                       ns.to(device), d.to(device))

                    # Q-values for taken actions
                    q_values = online_net(s).gather(1, a.unsqueeze(1)).squeeze(1)

                    # Target Q-values; no bootstrapping from finished transitions
                    with torch.no_grad():
                        next_q = target_net(ns).max(1)[0]
                        target = r + gamma * next_q * (~d).float()

                    loss = F.mse_loss(q_values, target)

                    optimizer.zero_grad()
                    loss.backward()
                    # Gradient clipping (stabilizes training)
                    torch.nn.utils.clip_grad_norm_(online_net.parameters(), 10.0)
                    optimizer.step()

                    losses.append(loss.item())

                # Update target network
                if step_count % target_update == 0:
                    target_net.load_state_dict(online_net.state_dict())

                if done:
                    break

            episode_rewards.append(episode_reward)
            if episode_reward > best_reward:
                best_reward = episode_reward

            # Progress logging
            if (episode + 1) % 25 == 0:
                avg_reward = np.mean(episode_rewards[-25:])
                avg_loss = np.mean(losses[-100:]) if losses else 0
                elapsed = time.time() - start_time
                eps = get_epsilon(step_count, eps_start, eps_end, eps_decay)
                print(f"Episode {episode+1:4d} | "
                      f"Avg Reward: {avg_reward:7.2f} | "
                      f"Best: {best_reward:6.1f} | "
                      f"Epsilon: {eps:.3f} | "
                      f"Buffer: {len(replay_buffer):6d} | "
                      f"Loss: {avg_loss:.4f} | "
                      f"Time: {elapsed:.0f}s")

        return episode_rewards, losses, online_net
    finally:
        env.close()

# Train on real Atari Pong when the probe succeeded, otherwise on the fallback
env_type = "atari" if ATARI_AVAILABLE else "substitute"
if env_type == "atari":
    print("Training on Atari Pong...")
else:
    print("Training on substitute Pong environment...")

print(f"Device: {torch.device('cuda' if torch.cuda.is_available() else 'cpu')}")
print("Starting training...\n")

# Overrides of the train_dqn defaults used for this run
training_kwargs = dict(
    n_episodes=300,
    eps_decay=30000,
    target_update=500,
    min_buffer=500,
)
rewards, losses, trained_net = train_dqn(env_class=env_type, **training_kwargs)

---

## 5. Your Turn

### TODO 1: Implement reward clipping analysis

In [None]:
# TODO: Analyze how reward clipping affects the Q-value distribution
#
# In the DQN paper, rewards are clipped to {-1, 0, +1}.
# This means the agent treats a +10 reward the same as a +1 reward.
#
# YOUR TASK:
# 1. Generate 1000 random transitions with varying rewards (e.g., -5 to +5)
# 2. Compute TD targets with and without reward clipping
# 3. Plot the distribution of TD targets for both cases
# 4. Explain why clipping helps training stability
#
# YOUR CODE HERE

### TODO 2: Visualize learned Q-values

In [None]:
# TODO: After training, visualize what the agent has learned
#
# 1. Create 10 different game states (ball in different positions)
# 2. Run each through the trained network
# 3. Create a heatmap: rows = states, columns = actions, values = Q-values
# 4. Interpret: does the agent learn to move the paddle toward the ball?
#
# YOUR CODE HERE
# Hint: use plt.imshow() for the heatmap with plt.colorbar()

---

## 6. Putting It All Together

In [None]:
# Visualize the training results as a 2x2 dashboard
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
(reward_ax, loss_ax), (hist_ax, box_ax) = axes

# 1. Episode rewards: raw curve plus a moving average once enough episodes exist
reward_ax.plot(rewards, alpha=0.3, color='gray', linewidth=0.5)
window = 20
if len(rewards) >= window:
    smoothed = np.convolve(rewards, np.ones(window)/window, mode='valid')
    smoothed_x = range(window-1, len(rewards))
    reward_ax.plot(smoothed_x, smoothed, color='#2ecc71', linewidth=2,
                   label=f'{window}-episode moving average')
reward_ax.axhline(y=0, color='gray', linestyle='--', alpha=0.5, label='Break even')
reward_ax.set_xlabel('Episode')
reward_ax.set_ylabel('Episode Reward')
reward_ax.set_title('Training Reward Over Time')
reward_ax.legend()
reward_ax.grid(True, alpha=0.3)

# 2. Training loss: raw values plus a 100-step moving average
if losses:
    loss_ax.plot(losses, alpha=0.1, color='gray')
    loss_window = 100
    if len(losses) >= loss_window:
        smoothed_loss = np.convolve(losses, np.ones(loss_window)/loss_window, mode='valid')
        smoothed_loss_x = range(loss_window-1, len(losses))
        loss_ax.plot(smoothed_loss_x, smoothed_loss, color='#e74c3c', linewidth=2)
loss_ax.set_xlabel('Training Step')
loss_ax.set_ylabel('Loss')
loss_ax.set_title('DQN Training Loss')
loss_ax.grid(True, alpha=0.3)

# 3. Reward distribution with the mean marked
mean_reward = np.mean(rewards)
hist_ax.hist(rewards, bins=30, color='#3498db', edgecolor='white', alpha=0.7)
hist_ax.axvline(x=mean_reward, color='red', linestyle='--', label=f'Mean: {mean_reward:.1f}')
hist_ax.set_xlabel('Episode Reward')
hist_ax.set_ylabel('Count')
hist_ax.set_title('Reward Distribution')
hist_ax.legend()
hist_ax.grid(True, alpha=0.3)

# 4. Learning progress (first half vs second half)
mid = len(rewards) // 2
first_half, second_half = rewards[:mid], rewards[mid:]
box_ax.boxplot([first_half, second_half], labels=['First Half', 'Second Half'])
box_ax.set_ylabel('Episode Reward')
box_ax.set_title('Learning Progress: First vs Second Half')
box_ax.grid(True, alpha=0.3)

fig.suptitle('DQN Training Results on Pong', fontsize=16, fontweight='bold')
plt.tight_layout()
plt.show()

# Text summary: compare the first and last 25 episodes
early_avg = np.mean(rewards[:25])
late_avg = np.mean(rewards[-25:])
print(f"\nTraining Summary:")
print(f"  Total episodes: {len(rewards)}")
print(f"  Final avg reward (last 25): {late_avg:.2f}")
print(f"  Best episode reward: {max(rewards):.1f}")
print(f"  Improvement: {early_avg:.2f} -> {late_avg:.2f}")

---

## 7. Training and Results

In [None]:
# Watch the trained agent play
def evaluate_agent(trained_net, env_class, n_episodes=5, max_steps=2000):
    """Evaluate the trained agent without exploration.

    Every action is the argmax of the network's Q-values. Frames sampled every
    50 steps of the first episode are shown as a film strip.

    Args:
        trained_net: Network that maps a batch of states to Q-values.
        env_class: "atari" evaluates on AtariWrapper("PongNoFrameskip-v4");
            any other value evaluates on PongSubstitute.
        n_episodes: Number of evaluation episodes.
        max_steps: Maximum agent steps per episode.

    Returns:
        List with the total reward of each episode.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    if env_class == "atari":
        env = AtariWrapper("PongNoFrameskip-v4")
    else:
        env = PongSubstitute()

    eval_rewards = []
    # try/finally so the environment is released even if evaluation fails
    try:
        for ep in range(n_episodes):
            state = env.reset()
            episode_reward = 0
            frames_to_show = []
            steps_taken = 0  # stays 0 if the step loop never runs

            for t in range(max_steps):
                with torch.no_grad():
                    q_vals = trained_net(state.unsqueeze(0).to(device))
                    action = q_vals.argmax(dim=1).item()

                if t % 50 == 0:
                    # Keep the oldest frame of the current stack for display
                    frames_to_show.append(state[0].numpy().copy())

                state, reward, done = env.step(action)
                episode_reward += reward
                steps_taken = t + 1
                if done:
                    break

            eval_rewards.append(episode_reward)
            print(f"  Episode {ep+1}: reward = {episode_reward:.1f} ({steps_taken} steps)")

            # Show frames from the first episode
            if ep == 0 and frames_to_show:
                n_show = min(8, len(frames_to_show))
                # squeeze=False keeps a 2-D axes array even for a single
                # panel, so indexing works for very short episodes too
                fig, axes = plt.subplots(1, n_show, figsize=(2*n_show, 2), squeeze=False)
                for i, ax in enumerate(axes[0]):
                    ax.imshow(frames_to_show[i], cmap='gray')
                    ax.axis('off')
                    ax.set_title(f't={i*50}', fontsize=8)
                plt.suptitle('Trained Agent Playing', fontweight='bold')
                plt.tight_layout()
                plt.show()
    finally:
        env.close()

    print(f"\nEvaluation: mean reward = {np.mean(eval_rewards):.2f} +/- {np.std(eval_rewards):.2f}")
    return eval_rewards

# Greedy evaluation on the same environment type that was used for training
print("Evaluating trained agent (no exploration)...")
eval_rewards = evaluate_agent(trained_net, env_type)

---

## 8. Final Output

In [None]:
# Recap of the pipeline, followed by headline numbers computed from the
# `rewards` list: mean of the first 25 episodes, mean of the last 25, and
# the best single episode.
print("=" * 60)
print("COMPLETE DQN TRAINING PIPELINE -- SUMMARY")
print("=" * 60)
print(f"""
Components:
  1. DQN Network:     4x84x84 -> Conv -> Conv -> Conv -> FC -> Q-values
  2. Replay Buffer:   Stores transitions, samples random mini-batches
  3. Target Network:  Frozen copy, updated every C steps
  4. Epsilon-Greedy:  Anneals from 1.0 (random) to 0.1 (greedy)
  5. Preprocessing:   RGB -> Grayscale -> 84x84 -> Stack 4 frames

Training Loop:
  For each step:
    1. Select action (epsilon-greedy)
    2. Execute in environment
    3. Store transition in replay buffer
    4. Sample mini-batch and compute TD loss
    5. Gradient descent on online network
    6. Periodically copy to target network

Results:
  Starting reward:  {np.mean(rewards[:25]):.2f}
  Final reward:     {np.mean(rewards[-25:]):.2f}
  Best reward:      {max(rewards):.1f}
  Total episodes:   {len(rewards)}

The agent learned to play Pong from raw pixels alone!
""")

---

## 9. Reflection and Next Steps

**What we built:**
- A complete, end-to-end DQN training pipeline
- Epsilon-greedy exploration with linear annealing
- Full training loop with logging and evaluation
- Trained an agent to play Pong from raw pixels

**Key takeaways:**
1. The DQN algorithm is surprisingly simple once you have all the components
2. Epsilon annealing is critical -- the agent needs to explore early and exploit later
3. Gradient clipping helps stabilize training
4. Reward clipping normalizes the learning signal across different games

**Think about:**
1. We used the architecture from the DQN paper for Pong. Could you use it for Breakout without changing anything other than the number of output actions? (Yes -- that is the power of DQN.)
2. Why does the agent sometimes plateau before improving? What is happening during those flat periods?
3. The DQN paper used 50 million training frames. We used far fewer. How would more training change the results?

**Next notebook:** We will explore DQN extensions -- Double DQN, which fixes the overestimation bias we have been ignoring, and examine how DQN performs across different Atari games.