In [None]:
# 🔧 Setup: run this cell before any other cell.
# Reports the compute device and library versions, then seeds every RNG.

import random
import sys

import numpy as np
import torch

# Pick the compute device once; later cells reuse the `device` global.
has_gpu = torch.cuda.is_available()
device = torch.device('cuda' if has_gpu else 'cpu')
if not has_gpu:
    print("⚠️ No GPU detected. Some cells may run slowly.")
    print("   Go to Runtime → Change runtime type → GPU")
else:
    gpu_props = torch.cuda.get_device_properties(0)
    print(f"✅ GPU available: {torch.cuda.get_device_name(0)}")
    print(f"   Memory: {gpu_props.total_memory / 1e9:.1f} GB")

print(f"\n📦 Python {sys.version.split()[0]}")
print(f"🔥 PyTorch {torch.__version__}")

# Seed Python, NumPy and PyTorch (CPU, plus every GPU if present) with one value.
SEED = 42
for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    seed_fn(SEED)
if has_gpu:
    torch.cuda.manual_seed_all(SEED)

print(f"🎲 Random seed set to {SEED}")

%matplotlib inline

# Experience Replay and Target Networks -- Vizuara

## 1. Why Does This Matter?

In the previous notebook, we built a DQN that can take raw pixel frames and output Q-values. The architecture is ready. But if we try to train it naively -- updating the network on each transition as it happens -- the training will be wildly unstable and the agent will learn nothing useful.

Two fundamental problems sabotage naive training:

**Correlated data.** Consecutive game frames are nearly identical. Frame 100 looks almost the same as frame 101. Training a neural network on such correlated data causes it to forget earlier experiences -- a phenomenon called catastrophic forgetting.

**Moving targets.** In Q-learning, the target value depends on the same network we are updating. Every gradient step shifts the target, so we are chasing a moving goal. This is like trying to hit an archery target that moves every time you release the arrow.

DeepMind's DQN work introduced two brilliantly simple solutions: **experience replay** (store transitions and sample randomly), used in the 2013 paper, and **target networks** (freeze the target for stability), added in the 2015 Nature paper. These two ideas -- not the CNN architecture -- are what made DQN actually work.

In this notebook, we will build both components from scratch, understand exactly why they are necessary, and visualize their impact on training stability.

---

## 2. Building Intuition

### Why correlated data breaks training

In [None]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import matplotlib.pyplot as plt
import random
from collections import deque

# Demo: training on correlated vs random data
# We have 4 "patterns" the network should learn
# Let us see what happens with sequential vs random ordering

# Seed every RNG this cell draws from, so that re-running the cell on its own
# reproduces the same figure: torch creates the patterns and the initial
# weights, and Python's `random` picks the pattern order in the shuffled run.
torch.manual_seed(42)
np.random.seed(42)
random.seed(42)

# Small fully-connected network used only for this demonstration
class SimpleNet(nn.Module):
    """Two-layer MLP mapping a 10-dimensional input to 4 outputs."""

    def __init__(self):
        super().__init__()
        hidden_units = 32
        layers = [
            nn.Linear(10, hidden_units),
            nn.ReLU(),
            nn.Linear(hidden_units, 4),
        ]
        self.fc = nn.Sequential(*layers)

    def forward(self, x):
        """Return the network's 4 outputs for each row of `x`."""
        return self.fc(x)

# Four input patterns: Gaussian noise plus an offset of +/-2 on one half of the vector
pattern_offsets = [
    [2.0] * 5 + [0.0] * 5,
    [0.0] * 5 + [2.0] * 5,
    [-2.0] * 5 + [0.0] * 5,
    [0.0] * 5 + [-2.0] * 5,
]
patterns = {}
for idx, offset in enumerate(pattern_offsets):
    patterns[idx] = torch.randn(10) + torch.tensor(offset)

# Each pattern's regression target is 5.0 in its own slot and 0.0 elsewhere
targets = {}
for idx in range(4):
    one_hot = [0.0] * 4
    one_hot[idx] = 5.0
    targets[idx] = torch.tensor(one_hot)

def _avg_loss_all_patterns(net):
    """Return the mean MSE of `net` over all four patterns (monitoring only)."""
    with torch.no_grad():
        total = sum(
            F.mse_loss(net(patterns[i].unsqueeze(0)), targets[i].unsqueeze(0)).item()
            for i in range(4)
        )
    return total / 4


# Training with SEQUENTIAL data (correlated - bad!)
net_seq = SimpleNet()
# Snapshot the untrained weights so the second run starts from exactly the same
# point. Clone them: state_dict() hands back tensors that share storage with
# the parameters, which SGD then updates in place.
initial_weights = {name: w.clone() for name, w in net_seq.state_dict().items()}
opt_seq = torch.optim.SGD(net_seq.parameters(), lr=0.01)
losses_seq = []

for epoch in range(200):
    # Sequential: train on pattern 0 for 50 steps, then 1, then 2, then 3
    pattern_id = (epoch * 4 // 200) % 4
    x = patterns[pattern_id].unsqueeze(0)
    t = targets[pattern_id].unsqueeze(0)
    loss = F.mse_loss(net_seq(x), t)
    opt_seq.zero_grad()
    loss.backward()
    opt_seq.step()
    # Track the loss on ALL patterns, not just the one being trained on
    losses_seq.append(_avg_loss_all_patterns(net_seq))

# Training with RANDOM data (uncorrelated - good!)
net_rand = SimpleNet()
# Start from the same initial weights as the sequential run, so the only
# difference between the two curves is the order of the training data.
net_rand.load_state_dict(initial_weights)
opt_rand = torch.optim.SGD(net_rand.parameters(), lr=0.01)
losses_rand = []

for epoch in range(200):
    # Random: pick a random pattern each step
    pattern_id = random.randint(0, 3)
    x = patterns[pattern_id].unsqueeze(0)
    t = targets[pattern_id].unsqueeze(0)
    loss = F.mse_loss(net_rand(x), t)
    opt_rand.zero_grad()
    loss.backward()
    opt_rand.step()
    losses_rand.append(_avg_loss_all_patterns(net_rand))

# Draw both loss curves on one set of axes
fig, ax = plt.subplots(figsize=(10, 5))
loss_curves = [
    (losses_seq, 'Sequential (correlated)', '#e74c3c'),
    (losses_rand, 'Random (uncorrelated)', '#2ecc71'),
]
for curve, curve_label, curve_color in loss_curves:
    ax.plot(curve, label=curve_label, color=curve_color, linewidth=2)
ax.set_xlabel('Training Step')
ax.set_ylabel('Average Loss Across All Patterns')
ax.set_title('Correlated data causes catastrophic forgetting')
ax.legend(fontsize=12)
ax.grid(True, alpha=0.3)
fig.tight_layout()
plt.show()

for takeaway in (
    "Sequential training: the network forgets earlier patterns!",
    "Random sampling: the network learns all patterns simultaneously.",
):
    print(takeaway)

### The moving target problem

In [None]:
# Demo: chasing a fixed target versus a target that moves with the estimate

lr = 0.1

# Case 1 -- the target stays at 10.0, so each update closes 10% of the remaining gap
fixed_target = 10.0
value = 0.0
values_fixed = [value]
for step in range(50):
    value += lr * (fixed_target - value)
    values_fixed.append(value)

# Case 2 -- the target is recomputed from the (noisy) current estimate before
# every update, like Q-learning without a target network
moving_target = 10.0
value = 0.0
values_moving = [value]
for step in range(50):
    noise = np.random.normal(0, 0.5)
    moving_target = 3.0 + 0.99 * (value + noise)
    value += lr * (moving_target - value)
    values_moving.append(value)

# Plot both trajectories; the dashed line marks the fixed target value
fig, ax = plt.subplots(figsize=(10, 5))
fixed_color, moving_color = '#2ecc71', '#e74c3c'
ax.plot(values_fixed, label='Fixed target (converges)', color=fixed_color, linewidth=2)
ax.axhline(y=10.0, color=fixed_color, linestyle='--', alpha=0.5)
ax.plot(values_moving, label='Moving target (unstable)', color=moving_color, linewidth=2)
ax.set_xlabel('Update Step')
ax.set_ylabel('Estimated Value')
ax.set_title('Fixed vs Moving Targets in Value Estimation')
ax.legend(fontsize=12)
ax.grid(True, alpha=0.3)
fig.tight_layout()
plt.show()

---

## 3. The Mathematics

### Experience replay

The agent stores transitions $(s_t, a_t, r_t, s_{t+1}, \text{done}_t)$ in a circular buffer $\mathcal{D}$ of fixed capacity $N$. During training, a random mini-batch is sampled uniformly:

$$(s_j, a_j, r_j, s_{j+1}, \text{done}_j) \sim \text{Uniform}(\mathcal{D})$$

This breaks temporal correlations because transitions from different episodes and time steps are mixed together in each batch.

### The DQN loss function

DQN maintains two networks: the **online network** (parameters $\theta$) that we train, and the **target network** (parameters $\theta^-$) that provides stable targets.

The loss function is:

$$L(\theta) = \mathbb{E}_{(s, a, r, s') \sim \mathcal{D}} \left[ \left( r + \gamma \max_{a'} Q(s', a'; \theta^-) - Q(s, a; \theta) \right)^2 \right]$$

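The term inside the square is the **temporal difference (TD) error**. For a terminal transition ($\text{done} = 1$) there is no next state to bootstrap from, so the target reduces to just $r$. The target network $\theta^-$ is updated by copying from $\theta$ every $C$ steps: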

$$\theta^- \leftarrow \theta \quad \text{every } C \text{ steps}$$

In [None]:
# Worked example: one TD-error computation with hand-picked numbers
print("=" * 50)
print("Computing TD Error -- Worked Example")
print("=" * 50)

# Given:
r = 3.0           # reward
gamma = 0.99       # discount factor
q_online = 7.0     # Q(s, a=1; theta) -- online network's prediction
# Target network predicts for next state:
q_target_a0 = 5.0  # Q(s', a'=0; theta^-)
q_target_a1 = 8.0  # Q(s', a'=1; theta^-)

# Step 1: max Q from target network
max_q_target = max(q_target_a0, q_target_a1)
print(f"\n1. max Q(s', a'; theta^-) = max({q_target_a0}, {q_target_a1}) = {max_q_target}")

# Step 2: compute target value
target = r + gamma * max_q_target
print(f"2. Target = r + gamma * max_Q = {r} + {gamma} * {max_q_target} = {target:.2f}")

# Step 3: TD error
td_error = target - q_online
print(f"3. TD error = target - Q(s,a;theta) = {target:.2f} - {q_online} = {td_error:.2f}")

# Step 4: loss
loss = td_error ** 2
print(f"4. Loss = (TD error)^2 = ({td_error:.2f})^2 = {loss:.2f}")

print(f"\nInterpretation: The online network predicted {q_online},")
print(f"but the target says it should be {target:.2f}.")
# The sign of the TD error gives the direction of the correction. One gradient
# step covers only a learning-rate-sized part of the gap, not all of it.
direction = "upward" if td_error > 0 else "downward"
print(f"Gradient descent will nudge the prediction {direction}, toward the target;")
print(f"the gap of {abs(td_error):.2f} closes only gradually over many updates.")

---

## 4. Let's Build It -- Component by Component

### Step 1: The Replay Buffer

In [None]:
class ReplayBuffer:
    """
    Fixed-capacity FIFO store of experience transitions.

    Each entry is a (state, action, reward, next_state, done) tuple. Once
    `capacity` entries are held, appending a new one evicts the oldest.
    """
    def __init__(self, capacity):
        # deque(maxlen=...) drops the oldest entry automatically when full.
        self.buffer = deque(maxlen=capacity)

    def push(self, state, action, reward, next_state, done):
        """Append one transition to the buffer."""
        transition = (state, action, reward, next_state, done)
        self.buffer.append(transition)

    def sample(self, batch_size):
        """
        Draw `batch_size` transitions at random, without replacement.

        Returns a 5-tuple of tensors (states, actions, rewards, next_states,
        dones). States and next states are stacked along a new leading
        dimension; actions are int64, rewards float32 and dones bool.
        """
        picked = random.sample(self.buffer, batch_size)
        # Transpose the list of transitions into one column per field.
        state_col, action_col, reward_col, next_state_col, done_col = zip(*picked)
        states = torch.stack(state_col)
        actions = torch.tensor(action_col, dtype=torch.long)
        rewards = torch.tensor(reward_col, dtype=torch.float32)
        next_states = torch.stack(next_state_col)
        dones = torch.tensor(done_col, dtype=torch.bool)
        return (states, actions, rewards, next_states, dones)

    def __len__(self):
        """Number of transitions currently stored."""
        return len(self.buffer)

# Quick check of the buffer: fill it with made-up transitions, then sample
buffer = ReplayBuffer(capacity=10000)
print(f"Empty buffer: {len(buffer)} transitions")

# 100 fake transitions with random frames, actions and rewards
frame_shape = (4, 84, 84)
for i in range(100):
    state = torch.randn(*frame_shape)
    action = random.randint(0, 3)
    reward = random.choice([-1.0, 0.0, 1.0])
    next_state = torch.randn(*frame_shape)
    # An episode ends with 5% probability
    done = random.random() < 0.05
    buffer.push(state, action, reward, next_state, done)

print(f"After 100 transitions: {len(buffer)}")

# Draw one mini-batch and show the shape of each field
sampled = buffer.sample(32)
states, actions, rewards, next_states, dones = sampled


def _first_five(tensor):
    """Return the first five entries of `tensor` as a plain list."""
    return tensor[:5].tolist()


print("\nSampled batch:")
print(f"  States:      {states.shape}")
print(f"  Actions:     {actions.shape} -- values: {_first_five(actions)}")
print(f"  Rewards:     {rewards.shape} -- values: {_first_five(rewards)}")
print(f"  Next states: {next_states.shape}")
print(f"  Dones:       {dones.shape} -- values: {_first_five(dones)}")

### Step 2: Verify random sampling breaks correlations

In [None]:
# Store transitions from a "sequential episode"
buffer_demo = ReplayBuffer(capacity=1000)

# Simulate an episode whose state encodes its own time step, so the time step
# of every sampled transition can be read back from the state tensor
for t in range(100):
    # State is just the time step (sequential!)
    state = torch.full((4, 84, 84), float(t) / 100)
    action = t % 4
    reward = 1.0 if t > 80 else 0.0  # Reward only near the end
    next_state = torch.full((4, 84, 84), float(t+1) / 100)
    done = (t == 99)
    buffer_demo.push(state, action, reward, next_state, done)

# Sample and check -- are the indices random?
states_batch, _, rewards_batch, _, _ = buffer_demo.sample(16)
# Round before casting: t/100 is stored as float32, so scaling back by 100 is
# not guaranteed to land exactly on the integer, and a bare .int() truncates.
time_steps = (states_batch[:, 0, 0, 0] * 100).round().int().tolist()

print("Sequential storage: t = 0, 1, 2, ..., 99")
print(f"Random sample of 16: t = {sorted(time_steps)}")
print(f"\nThe sample contains transitions from different time steps!")
print("This is exactly what breaks temporal correlations.")

# Visualize
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 4))

# Sequential: training step k always sees transition k
ax1.bar(range(100), range(100), width=1, color='#e74c3c', alpha=0.3)
ax1.set_title('Sequential Training')
ax1.set_xlabel('Training step')
ax1.set_ylabel('Transition time step')
ax1.text(50, 80, 'Correlated!', ha='center', fontsize=14, color='red', fontweight='bold')

# Random sampling: each training step sees a uniformly drawn transition.
# Draw all 100 points in a single scatter call.
train_steps = list(range(100))
sampled_steps = [random.randrange(100) for _ in train_steps]
ax2.scatter(train_steps, sampled_steps, c='#2ecc71', alpha=0.5, s=10)
ax2.set_title('Random Sampling from Buffer')
ax2.set_xlabel('Training step')
ax2.set_ylabel('Transition time step')
ax2.text(50, 80, 'Uncorrelated!', ha='center', fontsize=14, color='green', fontweight='bold')

plt.suptitle('Experience Replay Breaks Temporal Correlations', fontweight='bold')
plt.tight_layout()
plt.show()

### Step 3: The Target Network

In [None]:
class DQN(nn.Module):
    """Convolutional Q-network: 4-channel 84x84 input, one Q-value per action."""
    def __init__(self, n_actions):
        super().__init__()
        # (in_channels, out_channels, kernel_size, stride) for each conv layer
        conv_specs = [(4, 32, 8, 4), (32, 64, 4, 2), (64, 64, 3, 1)]
        conv_layers = []
        for in_ch, out_ch, kernel, stride in conv_specs:
            conv_layers.append(nn.Conv2d(in_ch, out_ch, kernel_size=kernel, stride=stride))
            conv_layers.append(nn.ReLU())
        self.conv = nn.Sequential(*conv_layers)

        # An 84x84 input shrinks to 20x20, 9x9 and finally 7x7 maps of 64 channels
        flat_features = 64 * 7 * 7
        self.fc = nn.Sequential(
            nn.Linear(flat_features, 512),
            nn.ReLU(),
            nn.Linear(512, n_actions),
        )

    def forward(self, x):
        """Return Q-values of shape (batch, n_actions) for a batch of frame stacks."""
        features = self.conv(x)
        # Collapse channel and spatial dimensions into one vector per sample
        return self.fc(features.view(features.shape[0], -1))

# Build the online network and a target network that starts as its exact copy
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
n_actions = 4

online_net = DQN(n_actions).to(device)
target_net = DQN(n_actions).to(device)

# Copy the online weights into the target network
online_weights = online_net.state_dict()
target_net.load_state_dict(online_weights)
# The target network is never trained directly, so keep it in eval mode
target_net.eval()

print("Online network and target network initialized with identical weights.")

# Sanity check: both networks must agree on an arbitrary input
test_input = torch.randn(1, 4, 84, 84).to(device)
with torch.no_grad():
    q_online, q_target = online_net(test_input), target_net(test_input)

print("\nSame input -> same output (before any training):")
for net_name, q_row in (("Online", q_online[0]), ("Target", q_target[0])):
    print(f"  {net_name}:  {q_row.cpu().numpy().round(4)}")
print(f"  Match:   {torch.allclose(q_online, q_target)}")

### Step 4: Computing the DQN loss

In [None]:
def compute_dqn_loss(online_net, target_net, batch, gamma=0.99):
    """
    Compute the DQN loss for a batch of transitions.

    L = E[(r + gamma * max_a' Q(s', a'; theta^-) - Q(s, a; theta))^2]

    Args:
        online_net: network being trained; gives Q(s, a; theta).
        target_net: network that provides the bootstrap values Q(s', a'; theta^-).
        batch: tuple (states, actions, rewards, next_states, dones) of tensors,
            in the format returned by ReplayBuffer.sample.
        gamma: discount factor.

    Returns:
        (loss, td_errors). `loss` is the scalar MSE between predicted and
        target Q-values. `td_errors` is a detached per-sample tensor of
        target - Q(s, a; theta), the same sign convention as the formula above.
    """
    # Run on the device the online network lives on, rather than relying on a
    # notebook-level `device` global being defined and up to date.
    net_device = next(online_net.parameters()).device
    states, actions, rewards, next_states, dones = (
        field.to(net_device) for field in batch
    )

    # Current Q-values: Q(s, a; theta) for the actions that were taken
    q_values = online_net(states)
    q_value = q_values.gather(1, actions.unsqueeze(1)).squeeze(1)

    # Target Q-values: r + gamma * max_a' Q(s', a'; theta^-)
    # Built under no_grad so no gradient reaches the target network.
    with torch.no_grad():
        next_q_values = target_net(next_states)
        max_next_q = next_q_values.max(1)[0]
        # If done, there is no next state -- target is just r
        target = rewards + gamma * max_next_q * (~dones).float()

    # MSE loss
    loss = F.mse_loss(q_value, target)

    # TD error is target minus prediction; positive means Q(s, a) was too low.
    td_errors = (target - q_value).detach()
    return loss, td_errors

# Smoke test: evaluate the loss on one mini-batch drawn from the buffer built earlier
batch = buffer.sample(32)
loss, td_errors = compute_dqn_loss(online_net, target_net, batch)
td_mean = td_errors.mean().item()
td_std = td_errors.std().item()
print(f"DQN Loss: {loss.item():.4f}")
print(f"TD errors -- mean: {td_mean:.4f}, std: {td_std:.4f}")

---

## 5. Your Turn

### TODO 1: Implement target network soft update

The original DQN copies weights every $C$ steps (hard update). An alternative is **soft update** (also called Polyak averaging): $\theta^- \leftarrow \tau \theta + (1 - \tau) \theta^-$, where $\tau$ is a small value like 0.005.

In [None]:
# TODO: Implement soft update and compare with hard update

def hard_update(online_net, target_net):
    """Overwrite the target network's state with a copy of the online network's."""
    online_weights = online_net.state_dict()
    target_net.load_state_dict(online_weights)

def soft_update(online_net, target_net, tau=0.005):
    """Polyak averaging: slowly blend online weights into target.

    Exercise stub: the body is intentionally left for the reader, so calling
    this function currently does nothing and returns None.

    Intended update, per parameter: theta^- <- tau * theta + (1 - tau) * theta^-

    Args:
        online_net: network whose weights should be blended in.
        target_net: network whose weights should be updated in place.
        tau: blending factor; a small value makes the target change slowly.
    """
    # YOUR CODE HERE
    # Hint: iterate over zip(target_net.parameters(), online_net.parameters())
    # For each pair, update: target_param.data = tau * online_param.data + (1-tau) * target_param.data
    pass

# Test your implementation:
# 1. Create two networks with different random weights
# 2. Apply soft_update 200 times
# 3. Check that target weights gradually approach online weights
# 4. Plot the distance between weights over time

### TODO 2: Analyze replay buffer statistics

In [None]:
# TODO: Fill the replay buffer with transitions and analyze the statistics
#
# 1. Create a buffer of size 10000
# 2. Fill it with 20000 transitions (so it wraps around)
# 3. Sample 1000 batches of size 32
# 4. For each batch, compute the average "time gap" between transitions
#    (how far apart in time are the sampled transitions?)
# 5. Plot the distribution of time gaps
#
# This shows that random sampling produces batches with diverse time gaps,
# which is exactly what we want for breaking correlations.
#
# YOUR CODE HERE

---

## 6. Putting It All Together

Let us now train a DQN with both experience replay and target networks on a simple environment.

In [None]:
# We will use a simplified environment to demonstrate the training loop
# This avoids needing a full Atari setup while showing the same principles

class SimpleGridEnv:
    """
    5x5 grid world: the agent starts at (0, 0) and must reach the goal at (4, 4).

    Observations are pseudo-images of shape (4, 84, 84).
    """
    # (dx, dy) for each action index: 0=up, 1=down, 2=left, 3=right
    _MOVES = ((0, -1), (0, 1), (-1, 0), (1, 0))

    def __init__(self):
        self.size = 5
        self.reset()

    def reset(self):
        """Put the agent back on the start cell and return the first observation."""
        self.agent_pos, self.goal_pos = [0, 0], [4, 4]
        return self._get_state()

    def _get_state(self):
        """Render the grid as a float32 tensor of 4 identical 84x84 frames."""
        frame = np.zeros((84, 84), dtype=np.float32)
        # Each grid cell is 16 px wide and holds a 12x12 square inset by 2 px.
        # The agent (1.0) is drawn first and the goal (0.5) second, so the goal
        # value wins when both occupy the same cell.
        for (cell_x, cell_y), intensity in ((self.agent_pos, 1.0), (self.goal_pos, 0.5)):
            left, top = cell_x * 16 + 2, cell_y * 16 + 2
            frame[top:top + 12, left:left + 12] = intensity
        # Stack 4 copies (no motion in this simple env)
        return torch.tensor(np.stack([frame] * 4))

    def step(self, action):
        """Apply `action` (0=up, 1=down, 2=left, 3=right); return (state, reward, done)."""
        dx, dy = self._MOVES[action]
        last = self.size - 1
        x, y = self.agent_pos
        # Clamp to the grid so that moving into a wall leaves the agent in place
        self.agent_pos = [min(last, max(0, x + dx)), min(last, max(0, y + dy))]

        done = (self.agent_pos == self.goal_pos)
        if done:
            reward = 1.0
        else:
            reward = -0.01
        return self._get_state(), reward, done

# Full DQN training run: experience replay + target network

# Hyperparameters
GAMMA = 0.99
BATCH_SIZE = 32
TARGET_UPDATE = 100
EPSILON_START = 1.0
EPSILON_END = 0.05
EPSILON_DECAY = 500

# Environment, networks, optimizer and replay memory
env = SimpleGridEnv()
online_net = DQN(n_actions=4).to(device)
target_net = DQN(n_actions=4).to(device)
target_net.load_state_dict(online_net.state_dict())
optimizer = torch.optim.Adam(online_net.parameters(), lr=1e-4)
replay_buffer = ReplayBuffer(capacity=10000)

episode_rewards = []
losses_log = []
step_count = 0


def _epsilon_at(step):
    """Exploration rate after `step` environment steps (exponential decay)."""
    return EPSILON_END + (EPSILON_START - EPSILON_END) * np.exp(-step / EPSILON_DECAY)


for episode in range(300):
    state = env.reset()
    episode_reward = 0

    # Episodes are cut off after 50 steps if the goal has not been reached
    for t in range(50):
        # Epsilon-greedy action selection
        epsilon = _epsilon_at(step_count)
        explore = random.random() < epsilon
        if explore:
            action = random.randint(0, 3)
        else:
            with torch.no_grad():
                q = online_net(state.unsqueeze(0).to(device))
            action = q.argmax(dim=1).item()

        # Act, store the transition, and advance
        next_state, reward, done = env.step(action)
        replay_buffer.push(state, action, reward, next_state, done)
        state = next_state
        episode_reward += reward
        step_count += 1

        # One gradient step per environment step once a full batch is available
        if len(replay_buffer) >= BATCH_SIZE:
            batch = replay_buffer.sample(BATCH_SIZE)
            loss, _ = compute_dqn_loss(online_net, target_net, batch, GAMMA)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            losses_log.append(loss.item())

        # Hard-copy the online weights into the target network every TARGET_UPDATE steps
        if step_count % TARGET_UPDATE == 0:
            target_net.load_state_dict(online_net.state_dict())

        if done:
            break

    episode_rewards.append(episode_reward)

# Plot episode rewards (left) and training loss (right)
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

# Left panel: raw rewards in the background, 20-episode moving average on top
window = 20
reward_kernel = np.ones(window) / window
smoothed = np.convolve(episode_rewards, reward_kernel, mode='valid')
smoothed_x = range(window - 1, len(episode_rewards))
ax1.plot(episode_rewards, alpha=0.3, color='gray')
ax1.plot(smoothed_x, smoothed, color='#2ecc71', linewidth=2)
ax1.set(
    xlabel='Episode',
    ylabel='Episode Reward',
    title='Training Reward (with replay + target network)',
)
ax1.grid(True, alpha=0.3)

# Right panel: raw loss in the background, 100-step moving average on top
if losses_log:
    loss_kernel = np.ones(100) / 100
    smoothed_loss = np.convolve(losses_log, loss_kernel, mode='valid')
    ax2.plot(losses_log, alpha=0.1, color='gray')
    ax2.plot(range(99, len(losses_log)), smoothed_loss, color='#e74c3c', linewidth=2)
ax2.set(xlabel='Training Step', ylabel='Loss', title='DQN Training Loss')
ax2.grid(True, alpha=0.3)

plt.suptitle('DQN Training with Experience Replay + Target Network', fontweight='bold')
plt.tight_layout()
plt.show()

final_avg_reward = np.mean(episode_rewards[-20:])
print(f"\nFinal avg reward (last 20 episodes): {final_avg_reward:.3f}")
print(f"Buffer size: {len(replay_buffer)}")
print(f"Total training steps: {step_count}")

---

## 7. Training and Results

Let us now compare training with and without our two innovations.

In [None]:
def train_dqn_variant(use_replay, use_target_net, n_episodes=200, label="",
                      batch_size=32, max_steps=50, target_update=100):
    """
    Train a DQN on SimpleGridEnv with replay and/or the target network toggled.

    Args:
        use_replay: if True, train on mini-batches sampled at random from the
            replay buffer; if False, train on the most recent `batch_size`
            transitions (temporally correlated).
        use_target_net: if True, bootstrap from a separate target network that
            is synced every `target_update` steps; if False, bootstrap from
            the online network itself.
        n_episodes: number of episodes to run.
        label: accepted for readability at the call site; not used here.
        batch_size: transitions per gradient step. Training starts once the
            buffer holds this many.
        max_steps: step limit per episode.
        target_update: environment steps between target-network syncs.

    Returns:
        List with the total reward of each episode.
    """
    env = SimpleGridEnv()
    online = DQN(4).to(device)
    target = DQN(4).to(device)
    target.load_state_dict(online.state_dict())
    opt = torch.optim.Adam(online.parameters(), lr=1e-4)
    buf = ReplayBuffer(10000)
    rewards = []
    step = 0

    for ep in range(n_episodes):
        state = env.reset()
        ep_reward = 0
        for t in range(max_steps):
            # Linear epsilon decay from 1.0 to 0.05 over the first 475 steps
            eps = max(0.05, 1.0 - step / 500)
            if random.random() < eps:
                action = random.randint(0, 3)
            else:
                with torch.no_grad():
                    action = online(state.unsqueeze(0).to(device)).argmax(1).item()

            ns, r, done = env.step(action)
            buf.push(state, action, r, ns, done)
            state = ns
            ep_reward += r
            step += 1

            if len(buf) >= batch_size:
                if use_replay:
                    batch = buf.sample(batch_size)
                else:
                    # No replay: use the newest transitions (correlated!).
                    # Index from the right end of the deque rather than
                    # copying the whole buffer into a list on every step.
                    recent = [buf.buffer[i] for i in range(-batch_size, 0)]
                    s, a, rr, ns2, d = zip(*recent)
                    batch = (torch.stack(s), torch.tensor(a, dtype=torch.long),
                             torch.tensor(rr, dtype=torch.float32),
                             torch.stack(ns2), torch.tensor(d, dtype=torch.bool))

                # Without a target network the online network bootstraps from itself
                tgt = target if use_target_net else online
                loss, _ = compute_dqn_loss(online, tgt, batch)
                opt.zero_grad()
                loss.backward()
                opt.step()

            if use_target_net and step % target_update == 0:
                target.load_state_dict(online.state_dict())

            if done:
                break
        rewards.append(ep_reward)

    return rewards

# Train the four on/off combinations of replay and target network, then plot them
print("Training 4 variants (this may take a minute)...")
r1, r2, r3, r4 = [
    train_dqn_variant(use_replay=replay_on, use_target_net=target_on, label=run_name)
    for replay_on, target_on, run_name in [
        (False, False, "No replay, no target"),
        (True, False, "Replay only"),
        (False, True, "Target net only"),
        (True, True, "Both (full DQN)"),
    ]
]

# Smooth each reward curve with a 15-episode moving average
window = 15
kernel = np.ones(window) / window
fig, ax = plt.subplots(figsize=(12, 6))

curves = [
    (r1, 'No replay, no target net', '#e74c3c'),
    (r2, 'Replay only', '#f39c12'),
    (r3, 'Target net only', '#9b59b6'),
    (r4, 'Both (full DQN)', '#2ecc71'),
]
for rewards, label, color in curves:
    smoothed = np.convolve(rewards, kernel, mode='valid')
    episodes = range(window - 1, len(rewards))
    ax.plot(episodes, smoothed, label=label, linewidth=2, color=color)

ax.set_xlabel('Episode', fontsize=12)
ax.set_ylabel('Episode Reward', fontsize=12)
ax.set_title('Impact of Experience Replay and Target Networks', fontsize=14, fontweight='bold')
ax.legend(fontsize=11)
ax.grid(True, alpha=0.3)
fig.tight_layout()
plt.show()

print("\nBoth innovations together give the most stable, highest-performing training.")

---

## 8. Final Output

In [None]:
# Recap of the notebook, framed by a 60-character rule
banner = "=" * 60
for header_line in (banner, "SUMMARY: Experience Replay and Target Networks", banner):
    print(header_line)

summary_text = """
EXPERIENCE REPLAY:
  - Stores transitions (s, a, r, s', done) in a circular buffer
  - Samples random mini-batches for training
  - Breaks temporal correlations between consecutive frames
  - Allows reuse of each transition in multiple updates

TARGET NETWORK:
  - Separate copy of the network, updated every C steps
  - Provides stable targets: r + gamma * max Q(s', a'; theta^-)
  - Prevents the "moving target" problem
  - Original DQN: hard copy every 10,000 steps

TOGETHER:
  - These two ideas made deep Q-learning practical
  - Without them, the CNN architecture from Notebook 1 fails to learn
  - They are now standard components in ALL deep RL algorithms
"""
print(summary_text)

---

## 9. Reflection and Next Steps

**What we built:**
- A replay buffer that stores and randomly samples transitions
- The target network mechanism for stable Q-value targets
- The complete DQN loss computation
- Demonstrated the impact of both innovations on training stability

**Key insights:**
1. Correlated sequential data causes catastrophic forgetting -- random sampling fixes this
2. Moving targets cause instability -- freezing the target network fixes this
3. Both innovations are necessary; either alone is insufficient

**Think about:**
1. The replay buffer stores transitions uniformly. What if some transitions are more "important" than others? (This leads to Prioritized Experience Replay.)
2. Why update the target network only every C steps? What happens if C is too small or too large?
3. The buffer has fixed capacity. What happens when it fills up -- do we lose important early experiences?

**Next notebook:** We will put everything together into a complete DQN training loop and train an agent to play Pong from raw pixels.