### Deep Q-Network

**Principle:** Deep Value Function Approximation

**Definition:** Uses a neural network to approximate Q-values, trained via experience replay and target networks (DQN).

**Algorithm Description:** Deep Q-Network approximates the Q-function using a deep neural network that takes states as input and outputs Q-values for each possible action. It uses experience replay (storing and randomly sampling past experiences) and target networks (periodically updated copies) to stabilize learning in high-dimensional spaces.

**Typical Use Cases:**
- Combines Q-learning with neural networks
- Famous for playing Atari games directly from pixels
- Model-free, off-policy control
- Solving problems with high-dimensional or continuous state spaces

**Assumptions:**
- Discrete actions
- Experience replay
- High-dimensional states
- Large datasets
- Neural network training



### ⚙️ Prerequisites & Setup

**Required Packages:**
- `gymnasium>=0.29.0` - RL environment
- `torch>=2.0.0` - Deep learning framework
- `numpy`, `matplotlib`, `seaborn` - Data processing and visualization

**Installation:**
```bash
pip install gymnasium torch numpy matplotlib seaborn
```

**⏱️ Estimated Training Time:**
- CPU: ~15-20 minutes
- GPU: ~5-8 minutes

**💡 Quick Test:** To reduce training time for testing, change `n_episodes=500` to `n_episodes=100` in the training cell.

### 1. Import Libraries

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import gymnasium as gym
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from collections import deque, namedtuple
import random

# Plotting defaults: seaborn "whitegrid" theme and a 12x6 default figure size
sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = (12, 6)

# Seed the three random number generators used in this file (PyTorch, NumPy
# and Python's `random`) with the same fixed value for reproducibility.
# NOTE(review): the environment is reset without a seed further down, so
# episode start states are presumably not covered by these seeds -- confirm.
torch.manual_seed(42)
np.random.seed(42)
random.seed(42)

### 2. Deep Q-Network (DQN)

DQN uses a neural network to approximate Q(s, a) instead of a table:

**Key innovations:**
1. **Experience Replay**: Store transitions in memory and sample randomly for training
2. **Target Network**: Separate network for computing target Q-values (stabilizes training)
3. **Neural Network**: Approximates Q-values for continuous/large state spaces

**Loss Function:** 
$L = \left(r + \gamma \max_{a'} Q_{\text{target}}(s', a') - Q(s, a)\right)^2$

In [None]:
class QNetwork(nn.Module):
    """Fully connected network that maps a state to one Q-value per action.

    Args:
        state_dim: Size of the input state vector.
        action_dim: Number of discrete actions (size of the output).
        hidden_dim: Width of each of the two hidden layers.
    """

    def __init__(self, state_dim, action_dim, hidden_dim=128):
        super().__init__()
        # Two hidden layers followed by a linear output head
        self.fc1 = nn.Linear(state_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        self.fc3 = nn.Linear(hidden_dim, action_dim)

    def forward(self, state):
        """Return the Q-values of all actions for ``state``."""
        hidden = state
        for layer in (self.fc1, self.fc2):
            hidden = F.relu(layer(hidden))
        # No activation on the output: Q-values are unbounded
        return self.fc3(hidden)

In [None]:
# One environment step as stored in replay memory
Transition = namedtuple('Transition', 'state action reward next_state done')


class ReplayBuffer:
    """Bounded FIFO memory of transitions with uniform random sampling.

    Args:
        capacity: Maximum number of transitions kept; once full, the oldest
            entry is dropped for each new one.
    """

    def __init__(self, capacity=10000):
        # A deque with maxlen evicts from the left automatically when full
        self.buffer = deque(maxlen=capacity)

    def push(self, state, action, reward, next_state, done):
        """Store a single transition at the end of the buffer."""
        record = Transition(
            state=state,
            action=action,
            reward=reward,
            next_state=next_state,
            done=done,
        )
        self.buffer.append(record)

    def sample(self, batch_size):
        """Return ``batch_size`` transitions drawn without replacement.

        Raises:
            ValueError: If ``batch_size`` exceeds the number of stored
                transitions (raised by ``random.sample``).
        """
        return random.sample(self.buffer, k=batch_size)

    def __len__(self):
        """Return the number of transitions currently stored."""
        return len(self.buffer)

In [None]:
class DQNAgent:
    """DQN agent with experience replay and a separate target network.

    Args:
        state_dim: Size of the state vector fed to the Q-network.
        action_dim: Number of discrete actions.
        learning_rate: Adam learning rate for the online Q-network.
        gamma: Discount factor applied to the bootstrapped next-state value.
        epsilon: Initial exploration probability for epsilon-greedy selection.
        epsilon_decay: Multiplicative factor applied by ``decay_epsilon``.
        epsilon_min: Lower bound for ``epsilon``.
        buffer_size: Capacity of the replay buffer.
        batch_size: Number of transitions sampled per training step.
    """

    def __init__(self, state_dim, action_dim, learning_rate=0.001,
                 gamma=0.99, epsilon=1.0, epsilon_decay=0.995,
                 epsilon_min=0.01, buffer_size=10000, batch_size=64):
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.gamma = gamma
        self.epsilon = epsilon
        self.epsilon_decay = epsilon_decay
        self.epsilon_min = epsilon_min
        self.batch_size = batch_size

        # Online network is trained; the target network starts as an exact
        # copy and is only refreshed through update_target_network().
        self.q_network = QNetwork(state_dim, action_dim)
        self.target_network = QNetwork(state_dim, action_dim)
        self.target_network.load_state_dict(self.q_network.state_dict())

        self.optimizer = optim.Adam(self.q_network.parameters(), lr=learning_rate)
        self.replay_buffer = ReplayBuffer(buffer_size)

    def select_action(self, state):
        """Pick an action epsilon-greedily.

        With probability ``epsilon`` a uniformly random action index is
        returned; otherwise the action with the highest online Q-value.

        Args:
            state: A single state (array-like of length ``state_dim``).

        Returns:
            The chosen action index as an int.
        """
        if np.random.random() < self.epsilon:
            return np.random.randint(self.action_dim)

        with torch.no_grad():
            # Add a batch dimension of 1 before the forward pass
            state_tensor = torch.as_tensor(state, dtype=torch.float32).unsqueeze(0)
            q_values = self.q_network(state_tensor)
            return q_values.argmax(dim=1).item()

    def train_step(self):
        """Run one gradient step on a batch sampled from the replay buffer.

        Returns:
            The scalar MSE loss as a float, or ``None`` when the buffer holds
            fewer than ``batch_size`` transitions (no update is performed).
        """
        if len(self.replay_buffer) < self.batch_size:
            return None

        transitions = self.replay_buffer.sample(self.batch_size)
        # Transpose the list of 5-field transitions into five per-field tuples
        states, actions, rewards, next_states, dones = zip(*transitions)

        # Stack through a single NumPy array first: building a tensor directly
        # from a sequence of ndarrays converts element by element and is slow.
        state_batch = torch.as_tensor(np.asarray(states), dtype=torch.float32)
        action_batch = torch.as_tensor(np.asarray(actions), dtype=torch.int64)
        reward_batch = torch.as_tensor(np.asarray(rewards), dtype=torch.float32)
        next_state_batch = torch.as_tensor(np.asarray(next_states), dtype=torch.float32)
        done_batch = torch.as_tensor(np.asarray(dones), dtype=torch.float32)

        # Q(s, a) for the actions actually taken. squeeze(1) removes only the
        # gather column so a batch of size 1 keeps shape (1,) and still
        # matches the target shape.
        q_all = self.q_network(state_batch)
        current_q_values = q_all.gather(1, action_batch.unsqueeze(1)).squeeze(1)

        # Bootstrapped targets from the frozen target network; the (1 - done)
        # factor removes the next-state value for transitions flagged as done.
        with torch.no_grad():
            max_next_q_values = self.target_network(next_state_batch).max(dim=1).values
            target_q_values = reward_batch + (1 - done_batch) * self.gamma * max_next_q_values

        loss = F.mse_loss(current_q_values, target_q_values)

        self.optimizer.zero_grad()
        loss.backward()

        # Gradient clipping for training stability
        torch.nn.utils.clip_grad_norm_(self.q_network.parameters(), max_norm=1.0)

        self.optimizer.step()

        return loss.item()

    def update_target_network(self):
        """Copy the online Q-network weights into the target network."""
        self.target_network.load_state_dict(self.q_network.state_dict())

    def decay_epsilon(self):
        """Multiply ``epsilon`` by ``epsilon_decay``, never going below ``epsilon_min``."""
        self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)

### 3. Train DQN Agent

In [None]:
def train_dqn(env, agent, n_episodes=500, target_update_freq=10, max_steps=500):
    """Train a DQN agent by interacting with ``env``.

    Each step selects an action, stores the transition in the agent's replay
    buffer and runs one training step. After every episode epsilon is decayed,
    and every ``target_update_freq`` episodes the target network is refreshed.

    Args:
        env: Environment whose ``reset()`` returns ``(state, info)`` and whose
            ``step(action)`` returns
            ``(next_state, reward, terminated, truncated, info)``.
        agent: Object providing ``select_action``, ``replay_buffer.push``,
            ``train_step``, ``update_target_network``, ``decay_epsilon`` and
            an ``epsilon`` attribute.
        n_episodes: Number of training episodes.
        target_update_freq: Target network refresh interval, in episodes.
        max_steps: Upper bound on steps per episode.

    Returns:
        Tuple ``(episode_rewards, episode_lengths, losses)``. ``losses`` holds
        the mean training loss of each episode in which at least one training
        step ran, so it may be shorter than the other two lists.
    """
    episode_rewards = []
    episode_lengths = []
    losses = []

    for episode in range(n_episodes):
        state, _ = env.reset()
        total_reward = 0
        steps_taken = 0
        episode_loss = []

        for _ in range(max_steps):
            # Select and perform action
            action = agent.select_action(state)
            next_state, reward, terminated, truncated, _ = env.step(action)
            steps_taken += 1

            # Store only true termination as the terminal flag. A truncated
            # episode (time limit) did not reach a terminal state, so the
            # learning target must still bootstrap from next_state.
            agent.replay_buffer.push(state, action, reward, next_state, float(terminated))

            # Train on batch from replay buffer
            loss = agent.train_step()
            if loss is not None:
                episode_loss.append(loss)

            total_reward += reward
            state = next_state

            # Either kind of ending stops the rollout
            if terminated or truncated:
                break

        # Update target network periodically
        if (episode + 1) % target_update_freq == 0:
            agent.update_target_network()

        # Decay epsilon
        agent.decay_epsilon()

        episode_rewards.append(total_reward)
        episode_lengths.append(steps_taken)
        if episode_loss:
            losses.append(np.mean(episode_loss))

        # Print progress
        if (episode + 1) % 50 == 0:
            avg_reward = np.mean(episode_rewards[-50:])
            print(f"Episode {episode + 1}/{n_episodes}, "
                  f"Avg Reward: {avg_reward:.2f}, "
                  f"Epsilon: {agent.epsilon:.3f}")

    return episode_rewards, episode_lengths, losses

In [None]:
# Build the CartPole environment and read its dimensions
env = gym.make('CartPole-v1')

state_dim = env.observation_space.shape[0]
action_dim = env.action_space.n

print(
    "Environment: CartPole-v1",
    f"State dimension: {state_dim}",
    f"Number of actions: {action_dim}",
    sep="\n",
)

# Instantiate the agent with its exploration schedule
agent = DQNAgent(
    state_dim=state_dim, action_dim=action_dim,
    learning_rate=0.001, gamma=0.99,
    epsilon=1.0, epsilon_decay=0.995, epsilon_min=0.01,
)

print("\nTraining DQN agent...")
# Run training; the returned histories feed the plots and summaries below
episode_rewards, episode_lengths, losses = train_dqn(
    env,
    agent,
    n_episodes=500,
    target_update_freq=10,
)

### 4. Visualize Training Results

In [None]:
def moving_average(data, window_size=20):
    """Return the simple moving average of ``data`` over full windows only.

    Args:
        data: One-dimensional sequence of numbers.
        window_size: Number of consecutive samples averaged per output value.

    Returns:
        A float array of length ``len(data) - window_size + 1``, or an empty
        array when ``data`` has fewer than ``window_size`` samples.

    Raises:
        ValueError: If ``window_size`` is smaller than 1.
    """
    if window_size < 1:
        raise ValueError("window_size must be a positive integer")

    values = np.asarray(data, dtype=float)
    # np.convolve(mode='valid') swaps the roles of its operands when the
    # kernel is longer than the signal, which would return values that are
    # not window averages; report "no full window" explicitly instead.
    if values.size < window_size:
        return np.empty(0, dtype=float)

    kernel = np.ones(window_size) / window_size
    return np.convolve(values, kernel, mode='valid')

# Reward level counted as a success, shared by the threshold line in the
# reward panel and by the rolling success-rate panel.
# NOTE(review): 195 matches the CartPole-v0 registration; the registered
# reward threshold for CartPole-v1 appears to be 475 -- confirm intent.
success_threshold = 195
window = 50

fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# Plot 1: Episode Rewards
axes[0, 0].plot(episode_rewards, alpha=0.3, label='Raw Rewards')
if len(episode_rewards) > 20:
    axes[0, 0].plot(moving_average(episode_rewards, 20),
                    label='Moving Average (20 episodes)', linewidth=2)
axes[0, 0].axhline(y=success_threshold, color='r', linestyle='--',
                   label=f'Solved Threshold ({success_threshold})')
axes[0, 0].set_xlabel('Episode')
axes[0, 0].set_ylabel('Total Reward')
axes[0, 0].set_title('DQN: Episode Rewards Over Time')
axes[0, 0].legend()
axes[0, 0].grid(True, alpha=0.3)

# Plot 2: Episode Lengths
axes[0, 1].plot(episode_lengths, alpha=0.3, label='Raw Lengths')
if len(episode_lengths) > 20:
    axes[0, 1].plot(moving_average(episode_lengths, 20),
                    label='Moving Average (20 episodes)', linewidth=2)
axes[0, 1].set_xlabel('Episode')
axes[0, 1].set_ylabel('Episode Length')
axes[0, 1].set_title('Episode Lengths Over Time')
axes[0, 1].legend()
axes[0, 1].grid(True, alpha=0.3)

# Plot 3: Training Loss
if losses:
    # The raw curve carries a label so legend() always has an entry, even
    # when there are too few points to draw the moving average.
    axes[1, 0].plot(losses, alpha=0.5, label='Raw Loss')
    if len(losses) > 20:
        axes[1, 0].plot(moving_average(losses, 20), linewidth=2, label='MA(20)')
    axes[1, 0].set_xlabel('Episode')
    axes[1, 0].set_ylabel('Loss')
    axes[1, 0].set_title('Training Loss')
    axes[1, 0].legend()
    axes[1, 0].grid(True, alpha=0.3)

# Plot 4: Success Rate (fraction of episodes in each rolling window whose
# reward reached the threshold)
if len(episode_rewards) >= window:
    success_rate = [
        np.mean([r >= success_threshold for r in episode_rewards[i:i + window]])
        for i in range(len(episode_rewards) - window + 1)
    ]
    axes[1, 1].plot(success_rate, linewidth=2, color='green')
    axes[1, 1].set_xlabel('Episode')
    axes[1, 1].set_ylabel('Success Rate')
    axes[1, 1].set_title(f'Success Rate (Rolling {window} episodes)')
    axes[1, 1].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

print("\nTraining Complete!")
print(f"Final epsilon: {agent.epsilon:.4f}")
print(f"Average reward (last 50 episodes): {np.mean(episode_rewards[-50:]):.2f}")

### 5. Evaluate Learned Policy

In [None]:
def evaluate_dqn(env, agent, n_episodes=100, max_steps=500, success_threshold=195):
    """Evaluate an agent with exploration disabled.

    ``agent.epsilon`` is set to 0 for the duration of the evaluation and is
    restored afterwards, including when the environment or agent raises.

    Args:
        env: Environment whose ``reset()`` returns ``(state, info)`` and whose
            ``step(action)`` returns
            ``(next_state, reward, terminated, truncated, info)``.
        agent: Object with an ``epsilon`` attribute and ``select_action``.
        n_episodes: Number of evaluation episodes.
        max_steps: Upper bound on steps per episode.
        success_threshold: Minimum total reward for an episode to count as
            a success.

    Returns:
        Dict with ``'mean_reward'``, ``'std_reward'`` and ``'success_rate'``
        (fraction of episodes with reward >= ``success_threshold``).
    """
    rewards = []

    # Save epsilon and set to 0 for pure exploitation
    original_epsilon = agent.epsilon
    agent.epsilon = 0.0

    try:
        for _ in range(n_episodes):
            state, _ = env.reset()
            total_reward = 0

            for _ in range(max_steps):
                action = agent.select_action(state)
                state, reward, terminated, truncated, _ = env.step(action)
                total_reward += reward

                if terminated or truncated:
                    break

            rewards.append(total_reward)
    finally:
        # Never leave the agent stuck in greedy mode if evaluation fails
        agent.epsilon = original_epsilon

    return {
        'mean_reward': np.mean(rewards),
        'std_reward': np.std(rewards),
        'success_rate': np.mean([r >= success_threshold for r in rewards])
    }

# Evaluate the trained agent greedily and print a summary
eval_results = evaluate_dqn(env, agent, n_episodes=100)

print("=" * 50)
print("DQN EVALUATION RESULTS")
print("=" * 50)
# The plus-minus sign was mis-encoded in the original output string
print(f"Mean Reward: {eval_results['mean_reward']:.2f} ± {eval_results['std_reward']:.2f}")
print(f"Success Rate (>=195): {eval_results['success_rate']:.2%}")
print("=" * 50)

# Release the environment's resources now that evaluation is finished
env.close()