# Module D: Reinforcement Learning - Complete Solutions

This notebook contains solutions to all exercises and challenges from Module D.

**Contents:**
1. Lab D.1 Solutions: MDP and Value Iteration
2. Lab D.2 Solutions: Q-Learning
3. Lab D.3 Solutions: Deep Q-Networks
4. Lab D.4 Solutions: Policy Gradients and PPO
5. Lab D.5 Solutions: RLHF Concepts

---

In [None]:
# Setup
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.distributions import Categorical
from collections import deque
import random
from typing import Tuple, List, Dict, Optional

try:
    import gymnasium as gym
except ImportError:
    !pip install gymnasium -q
    import gymnasium as gym

# Seed every random number generator the notebook draws from so that runs are
# repeatable. The stdlib `random` module is seeded as well because the replay
# buffer sampling and the DQN agent's epsilon-greedy exploration use it
# instead of NumPy.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# Use the GPU when one is visible to torch, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

print(f"Device: {device}")

---

## Lab D.1 Solutions: MDP and Value Iteration

### Exercise: Policy Iteration

In [None]:
class GridWorldMDP:
    """Square grid world (4x4 by default) with an optional slip probability.

    States are numbered row-major: state 0 is the top-left cell (the start)
    and state ``grid_size**2 - 1`` is the bottom-right cell (the goal).
    Actions are the keys of ``action_deltas``; each maps to a
    (row delta, column delta) pair.
    """

    def __init__(self, grid_size: int = 4, slip_prob: float = 0.0):
        n_cells = grid_size ** 2
        self.grid_size = grid_size
        self.n_states = n_cells
        self.n_actions = 4
        self.slip_prob = slip_prob
        self.start_state = 0
        self.goal_state = n_cells - 1
        self.gamma = 0.99
        # action id -> (row delta, column delta)
        self.action_deltas = {0: (-1, 0), 1: (0, 1), 2: (1, 0), 3: (0, -1)}

    def state_to_pos(self, state):
        """Convert a flat state index into a (row, col) pair."""
        return divmod(state, self.grid_size)

    def pos_to_state(self, row, col):
        """Convert a (row, col) pair into a flat state index."""
        return col + row * self.grid_size

    def get_next_state(self, state, action):
        """Return the state reached by taking ``action`` with no slipping.

        Moves that would leave the board are clamped, so walking into an
        edge leaves that coordinate unchanged.
        """
        last_index = self.grid_size - 1
        row, col = self.state_to_pos(state)
        d_row, d_col = self.action_deltas[action]
        target_row = max(0, min(last_index, row + d_row))
        target_col = max(0, min(last_index, col + d_col))
        return self.pos_to_state(target_row, target_col)

    def get_reward(self, state, action, next_state):
        """Return 1.0 for any transition landing on the goal, else -0.01."""
        return 1.0 if next_state == self.goal_state else -0.01

    def get_transition_probs(self, state, action):
        """Return ``{next_state: probability}`` for taking ``action`` in ``state``.

        With ``slip_prob == 0`` the move is deterministic. Otherwise the
        slip mass is spread evenly over all actions (the intended one
        included), and actions that land on the same cell have their
        probabilities added together.
        """
        if self.slip_prob == 0:
            return {self.get_next_state(state, action): 1.0}

        slip_share = self.slip_prob / self.n_actions
        probs = {}
        for candidate in range(self.n_actions):
            landing = self.get_next_state(state, candidate)
            if candidate == action:
                weight = 1.0 - self.slip_prob + slip_share
            else:
                weight = slip_share
            probs[landing] = probs.get(landing, 0) + weight
        return probs

In [None]:
# SOLUTION: Policy Iteration

def policy_evaluation(mdp, policy, threshold=1e-6):
    """
    Evaluate a policy to get V^π.

    Given a fixed policy, compute the value of each state under that policy
    by sweeping the Bellman expectation backup until the largest per-state
    change between two consecutive sweeps drops below ``threshold``.

    Args:
        mdp: Object exposing ``n_states``, ``goal_state``, ``gamma``,
            ``get_transition_probs(s, a)`` (a ``{next_state: prob}`` dict)
            and ``get_reward(s, a, next_state)``.
        policy: Indexable giving the action taken in each state.
        threshold: Convergence tolerance on the max absolute change.

    Returns:
        NumPy array of length ``mdp.n_states`` holding the most recent
        sweep. The goal state is treated as terminal and keeps value 0.
    """
    V = np.zeros(mdp.n_states)

    while True:
        V_new = np.zeros(mdp.n_states)

        for s in range(mdp.n_states):
            if s == mdp.goal_state:
                continue  # Terminal state

            # Use the policy's action (not max!)
            a = policy[s]
            transitions = mdp.get_transition_probs(s, a)

            # Expected value under policy
            V_new[s] = sum(
                prob * (mdp.get_reward(s, a, ns) + mdp.gamma * V[ns])
                for ns, prob in transitions.items()
            )

        # Adopt the new sweep *before* testing convergence so the value
        # returned is the latest iterate, not the one it was compared to.
        delta = np.max(np.abs(V_new - V))
        V = V_new
        if delta < threshold:
            return V


def policy_improvement(mdp, V):
    """
    Build the policy that acts greedily with respect to ``V``.

    Every non-goal state gets the action with the largest one-step
    lookahead value; ties go to the lowest action index. The goal state's
    entry is left at 0.

    Args:
        mdp: Object exposing ``n_states``, ``n_actions``, ``goal_state``,
            ``gamma``, ``get_transition_probs`` and ``get_reward``.
        V: Indexable of state values.

    Returns:
        Integer NumPy array of length ``mdp.n_states``.
    """
    def lookahead(state, action):
        # Expected immediate reward plus discounted value of the successor.
        return sum(
            prob * (mdp.get_reward(state, action, ns) + mdp.gamma * V[ns])
            for ns, prob in mdp.get_transition_probs(state, action).items()
        )

    greedy = np.zeros(mdp.n_states, dtype=int)
    for state in range(mdp.n_states):
        if state != mdp.goal_state:
            scores = [lookahead(state, action) for action in range(mdp.n_actions)]
            greedy[state] = np.argmax(scores)

    return greedy


def policy_iteration(mdp):
    """
    Find optimal policy using Policy Iteration.

    Alternates between:
    1. Policy Evaluation: Compute V^π
    2. Policy Improvement: Make policy greedy w.r.t. V^π

    The loop stops as soon as an improvement step returns the policy it
    was given, and prints how many iterations that took.

    Args:
        mdp: MDP object accepted by ``policy_evaluation`` and
            ``policy_improvement`` (must also expose ``n_actions`` and
            ``n_states``).

    Returns:
        Tuple ``(V, policy)``: the value array from the last evaluation
        and the policy that was evaluated.
    """
    # Start with random policy (drawn from NumPy's global RNG)
    policy = np.random.randint(0, mdp.n_actions, size=mdp.n_states)

    iteration = 0
    while True:
        iteration += 1

        # Policy Evaluation
        V = policy_evaluation(mdp, policy)

        # Policy Improvement
        new_policy = policy_improvement(mdp, V)

        # A policy that is already greedy for its own values is a fixed point
        if np.array_equal(policy, new_policy):
            print(f"✅ Policy Iteration converged in {iteration} iterations!")
            break

        policy = new_policy

    return V, policy


# Run policy iteration on the 4x4 grid and display the result
mdp = GridWorldMDP(grid_size=4)
V_pi, policy_pi = policy_iteration(mdp)

# Lay the flat per-state values out as the 4x4 board, row by row
value_grid = V_pi.reshape(4, 4).round(3)
print("\nOptimal Value Function:")
print(value_grid)

# Translate action ids into readable direction names before printing
action_names = ['Up', 'Right', 'Down', 'Left']
policy_labels = [action_names[a] for a in policy_pi]
policy_grid = np.array(policy_labels).reshape(4, 4)
print("\nOptimal Policy:")
print(policy_grid)

### Exercise: Obstacle Grid World

In [None]:
# SOLUTION: Grid with Obstacles

class ObstacleGridMDP(GridWorldMDP):
    """
    Grid world with obstacles (impassable cells).

    Obstacles are flat, row-major state indices. A move that would land on
    an obstacle leaves the agent where it is.
    """

    def __init__(self, grid_size: int = 6, obstacles: Optional[List[int]] = None):
        """
        Args:
            grid_size: Side length of the square grid.
            obstacles: State indices that cannot be entered. When omitted,
                a built-in wall pattern is used; its indices appear to be
                chosen for the default 6x6 grid, so pass an explicit list
                when using another size.
        """
        super().__init__(grid_size, slip_prob=0.0)

        # Default obstacles create a wall pattern
        if obstacles is None:
            self.obstacles = {8, 9, 14, 15, 20, 26, 27}
        else:
            self.obstacles = set(obstacles)

        print(f"Grid: {grid_size}x{grid_size}")
        print(f"Obstacles at: {sorted(self.obstacles)}")

    def get_next_state(self, state: int, action: int) -> int:
        """Return the next state, staying put if the move hits an obstacle."""
        # Get intended next state (already clamped to the board by the parent)
        next_state = super().get_next_state(state, action)

        # If it's an obstacle, stay in current state (bounce off)
        if next_state in self.obstacles:
            return state

        return next_state

    def visualize(self, V=None, policy=None):
        """
        Draw the grid with matplotlib and show it.

        Args:
            V: Optional flat array of state values, drawn as a heat map.
            policy: Optional per-state action ids, drawn as arrows on cells
                that are not an obstacle, the goal or the start.
        """
        fig, ax = plt.subplots(figsize=(8, 8))

        # Draw grid
        for i in range(self.grid_size + 1):
            ax.axhline(y=i, color='black', linewidth=1)
            ax.axvline(x=i, color='black', linewidth=1)

        # Color cells
        if V is not None:
            V_2d = V.reshape(self.grid_size, self.grid_size)
            ax.imshow(V_2d, cmap='RdYlGn',
                     extent=[0, self.grid_size, self.grid_size, 0])

        # Mark obstacles, start, goal
        arrow_map = {0: '↑', 1: '→', 2: '↓', 3: '←'}
        for s in range(self.n_states):
            row, col = self.state_to_pos(s)
            x, y = col + 0.5, row + 0.5  # centre of the cell

            if s in self.obstacles:
                ax.add_patch(plt.Rectangle((col, row), 1, 1, color='black'))
                ax.text(x, y, '🧱', ha='center', va='center', fontsize=16)
            elif s == self.goal_state:
                ax.text(x, y, '🎯', ha='center', va='center', fontsize=20)
            elif s == self.start_state:
                ax.text(x, y, '🚀', ha='center', va='center', fontsize=20)
            elif policy is not None:
                ax.text(x, y, arrow_map[policy[s]], ha='center', va='center',
                       fontsize=20, fontweight='bold')

        ax.set_xlim(0, self.grid_size)
        ax.set_ylim(self.grid_size, 0)  # inverted so row 0 is at the top
        ax.set_aspect('equal')
        ax.set_title('Grid World with Obstacles')
        plt.show()


# Solve the default 6x6 obstacle grid with policy iteration, then plot the
# resulting values and greedy actions
obstacle_mdp = ObstacleGridMDP(grid_size=6)
V_obs, policy_obs = policy_iteration(obstacle_mdp)
obstacle_mdp.visualize(V=V_obs, policy=policy_obs)

---

## Lab D.2 Solutions: Q-Learning

### Exercise: Hyperparameter Ablation

In [None]:
# SOLUTION: Q-Learning with various learning rates

def q_learning(env, n_episodes=2000, alpha=0.1, gamma=0.99,
               epsilon=1.0, epsilon_decay=0.995, epsilon_min=0.01):
    """
    Tabular Q-learning implementation.

    Args:
        env: Gymnasium-style environment with discrete observation and
            action spaces (``.n``), ``reset()`` returning ``(state, info)``
            and ``step()`` returning the 5-tuple
            ``(next_state, reward, terminated, truncated, info)``.
        n_episodes: Number of training episodes.
        alpha: Learning rate for the TD update.
        gamma: Discount factor.
        epsilon: Initial exploration probability.
        epsilon_decay: Multiplicative decay applied to epsilon per episode.
        epsilon_min: Lower bound for epsilon.

    Returns:
        Tuple ``(Q, rewards_history)``: the ``(n_states, n_actions)``
        Q-table and the list of per-episode total rewards.
    """
    n_states = env.observation_space.n
    n_actions = env.action_space.n
    Q = np.zeros((n_states, n_actions))

    rewards_history = []

    for _episode in range(n_episodes):
        state, _ = env.reset()
        total_reward = 0

        while True:
            # Epsilon-greedy action selection
            if np.random.random() < epsilon:
                action = env.action_space.sample()
            else:
                action = np.argmax(Q[state])

            # Take action
            next_state, reward, terminated, truncated, _ = env.step(action)
            total_reward += reward

            # Q-learning update. Only a true terminal state has zero future
            # value; a time-limit truncation merely cuts the rollout short,
            # so the target must still bootstrap from the next state.
            best_next = 0.0 if terminated else np.max(Q[next_state])
            td_target = reward + gamma * best_next
            td_error = td_target - Q[state, action]
            Q[state, action] += alpha * td_error

            state = next_state
            if terminated or truncated:
                break

        rewards_history.append(total_reward)
        epsilon = max(epsilon_min, epsilon * epsilon_decay)

    return Q, rewards_history


# Train one Q-table per learning rate on deterministic FrozenLake and compare
# how often the resulting greedy policy reaches the goal
env = gym.make("FrozenLake-v1", is_slippery=False)

learning_rates = [0.01, 0.1, 0.5, 0.9]
results = {}

for lr in learning_rates:
    print(f"Testing lr = {lr}...")
    Q, rewards = q_learning(env, n_episodes=1500, alpha=lr)

    # Greedy evaluation: 500 episodes, at most 100 steps each. The reward
    # received on the final step is what gets counted as a success.
    successes = 0
    for _ in range(500):
        state, _ = env.reset()
        steps_left = 100
        while steps_left > 0:
            steps_left -= 1
            action = np.argmax(Q[state])
            state, reward, terminated, truncated, _ = env.step(action)
            if terminated or truncated:
                successes += reward
                break

    success_rate = successes / 500
    results[lr] = success_rate
    print(f"  Success rate: {success_rate:.1%}")

# Bar chart with one bar per learning rate, in the order they were tested
bar_positions = range(len(learning_rates))
plt.figure(figsize=(8, 5))
plt.bar(bar_positions, list(results.values()), alpha=0.7)
plt.xticks(bar_positions, [str(rate) for rate in learning_rates])
plt.xlabel('Learning Rate')
plt.ylabel('Success Rate')
plt.title('Q-Learning: Effect of Learning Rate')
plt.grid(True, alpha=0.3)
plt.show()

env.close()

### Exercise: Double Q-Learning

In [None]:
# SOLUTION: Double Q-Learning

def double_q_learning(env, n_episodes=5000, alpha=0.1, gamma=0.99,
                      epsilon=1.0, epsilon_decay=0.9995, epsilon_min=0.05):
    """
    Double Q-learning to reduce overestimation bias.

    Uses two Q-tables:
    - Q1 selects, Q2 evaluates (and vice versa)

    On every step a coin flip decides which table is updated; that table
    picks the greedy next action and the other table supplies its value.

    Args:
        env: Gymnasium-style environment with discrete observation and
            action spaces and the 5-tuple ``step()`` API.
        n_episodes: Number of training episodes.
        alpha: Learning rate.
        gamma: Discount factor.
        epsilon: Initial exploration probability.
        epsilon_decay: Multiplicative decay applied to epsilon per episode.
        epsilon_min: Lower bound for epsilon.

    Returns:
        Tuple ``(Q, rewards_history)`` where ``Q`` is the element-wise mean
        of the two tables and ``rewards_history`` lists per-episode totals.
    """
    n_states = env.observation_space.n
    n_actions = env.action_space.n

    Q1 = np.zeros((n_states, n_actions))
    Q2 = np.zeros((n_states, n_actions))

    rewards_history = []

    for _episode in range(n_episodes):
        state, _ = env.reset()
        total_reward = 0

        while True:
            # Epsilon-greedy using sum of Q-tables
            if np.random.random() < epsilon:
                action = env.action_space.sample()
            else:
                action = np.argmax(Q1[state] + Q2[state])

            next_state, reward, terminated, truncated, _ = env.step(action)
            total_reward += reward

            # Double Q-learning update: pick which table learns this step
            if np.random.random() < 0.5:
                learner, evaluator = Q1, Q2
            else:
                learner, evaluator = Q2, Q1

            # Only a true terminal state has zero future value; a time-limit
            # truncation must still bootstrap from the next state.
            if terminated:
                target = reward
            else:
                best_action = np.argmax(learner[next_state])
                target = reward + gamma * evaluator[next_state, best_action]
            learner[state, action] += alpha * (target - learner[state, action])

            state = next_state
            if terminated or truncated:
                break

        rewards_history.append(total_reward)
        epsilon = max(epsilon_min, epsilon * epsilon_decay)

    return (Q1 + Q2) / 2, rewards_history


# Test on slippery FrozenLake
slippery_env = gym.make("FrozenLake-v1", is_slippery=True)

print("Training Double Q-Learning on Slippery FrozenLake...")
Q_double, rewards_double = double_q_learning(slippery_env, n_episodes=10000)

# Evaluate the greedy policy: up to 100 steps per episode, counting the
# reward received on the final step as a success
n_eval_episodes = 1000
successes = 0
for _ in range(n_eval_episodes):
    state, _ = slippery_env.reset()
    for _ in range(100):
        action = np.argmax(Q_double[state])
        state, reward, terminated, truncated, _ = slippery_env.step(action)
        if terminated or truncated:
            successes += reward
            break

# The `%` format spec already multiplies by 100, so it must be given the
# fraction of successful episodes (not successes / 10).
print(f"\nDouble Q-Learning Success Rate: {successes / n_eval_episodes:.1%}")

slippery_env.close()

---

## Lab D.3 Solutions: Deep Q-Networks

### Exercise: Double DQN

In [None]:
# SOLUTION: Double DQN

class QNetwork(nn.Module):
    """MLP with two ReLU hidden layers that outputs one Q-value per action."""

    def __init__(self, state_dim, action_dim, hidden_dim=64):
        super().__init__()
        # Layers are created in forward order and wrapped in one Sequential
        # stored as `network`.
        layers = [
            nn.Linear(state_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, action_dim),
        ]
        self.network = nn.Sequential(*layers)

    def forward(self, x):
        """Return the Q-values computed by the MLP for input ``x``."""
        q_values = self.network(x)
        return q_values


class ReplayBuffer:
    """Bounded FIFO store of (state, action, reward, next_state, done) tuples."""

    def __init__(self, capacity=100000):
        # A deque with maxlen drops the oldest transition once it is full.
        self.buffer = deque(maxlen=capacity)

    def push(self, state, action, reward, next_state, done):
        """Append one transition to the buffer."""
        transition = (state, action, reward, next_state, done)
        self.buffer.append(transition)

    def sample(self, batch_size):
        """Draw ``batch_size`` distinct transitions and return them as tensors.

        Returns a 5-tuple ``(states, actions, rewards, next_states, dones)``
        placed on the module-level ``device``; actions are long tensors and
        every other field is a float tensor.
        """
        drawn = random.sample(self.buffer, batch_size)
        # Transpose the list of transitions into one tuple per field.
        states, actions, rewards, next_states, dones = zip(*drawn)

        state_batch = torch.FloatTensor(np.array(states)).to(device)
        action_batch = torch.LongTensor(actions).to(device)
        reward_batch = torch.FloatTensor(rewards).to(device)
        next_state_batch = torch.FloatTensor(np.array(next_states)).to(device)
        done_batch = torch.FloatTensor(dones).to(device)
        return (state_batch, action_batch, reward_batch, next_state_batch, done_batch)

    def __len__(self):
        """Return the number of transitions currently stored."""
        return len(self.buffer)


class DoubleDQNAgent:
    """
    Agent trained with the Double DQN target.

    What sets it apart from plain DQN:
    - the online Q-network CHOOSES the greedy next action
    - the target network SCORES that chosen action
    """

    def __init__(self, state_dim, action_dim, lr=1e-3, gamma=0.99,
                 buffer_size=50000, batch_size=64, target_update_freq=100):
        # Hyperparameters
        self.action_dim = action_dim
        self.gamma = gamma
        self.batch_size = batch_size
        self.target_update_freq = target_update_freq

        # Online network and a target copy that starts with the same weights
        self.q_network = QNetwork(state_dim, action_dim).to(device)
        self.target_network = QNetwork(state_dim, action_dim).to(device)
        self.target_network.load_state_dict(self.q_network.state_dict())

        # Only the online network is optimised
        self.optimizer = optim.Adam(self.q_network.parameters(), lr=lr)
        self.buffer = ReplayBuffer(buffer_size)

        # Exploration schedule: epsilon shrinks after every training step
        self.epsilon = 1.0
        self.epsilon_end = 0.01
        self.epsilon_decay = 0.995
        self.train_steps = 0

    def select_action(self, state, training=True):
        """Return an epsilon-greedy action (purely greedy when not training)."""
        explore = training and random.random() < self.epsilon
        if explore:
            return random.randint(0, self.action_dim - 1)

        with torch.no_grad():
            batch = torch.FloatTensor(state).unsqueeze(0).to(device)
            q_values = self.q_network(batch)
        return q_values.argmax().item()

    def train_step(self):
        """Run one gradient step on a sampled minibatch.

        Returns the scalar loss, or ``None`` while the buffer holds fewer
        than ``batch_size`` transitions.
        """
        if len(self.buffer) < self.batch_size:
            return None

        states, actions, rewards, next_states, dones = self.buffer.sample(self.batch_size)

        # Double DQN target: online net picks the action, target net rates it
        with torch.no_grad():
            greedy_next = self.q_network(next_states).argmax(dim=1, keepdim=True)
            next_q = self.target_network(next_states).gather(1, greedy_next).squeeze(1)
            target_q = rewards + self.gamma * next_q * (1 - dones)

        # Q-values of the actions that were actually taken
        q_all = self.q_network(states)
        current_q = q_all.gather(1, actions.unsqueeze(1)).squeeze(1)

        loss = F.mse_loss(current_q, target_q)

        self.optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(self.q_network.parameters(), 1.0)
        self.optimizer.step()

        self.train_steps += 1

        # Periodically copy the online weights into the target network
        sync_due = self.train_steps % self.target_update_freq == 0
        if sync_due:
            self.target_network.load_state_dict(self.q_network.state_dict())

        decayed = self.epsilon * self.epsilon_decay
        self.epsilon = max(self.epsilon_end, decayed)

        return loss.item()


# Train Double DQN on CartPole
env = gym.make("CartPole-v1")
agent = DoubleDQNAgent(
    state_dim=env.observation_space.shape[0],
    action_dim=env.action_space.n
)

print("Training Double DQN on CartPole...")
rewards_history = []

for episode in range(300):
    state, _ = env.reset()
    total_reward = 0

    for step in range(500):
        action = agent.select_action(state)
        next_state, reward, terminated, truncated, _ = env.step(action)
        done = terminated or truncated

        # Store `terminated`, not `done`: the stored flag switches off
        # bootstrapping in the TD target, which is only right for a true
        # terminal state. A time-limit truncation ends the episode but the
        # next state still has value.
        agent.buffer.push(state, action, reward, next_state, terminated)
        agent.train_step()

        total_reward += reward
        state = next_state

        # Either kind of ending stops the rollout
        if done:
            break

    rewards_history.append(total_reward)

    # Progress report every 50 episodes
    if (episode + 1) % 50 == 0:
        avg = np.mean(rewards_history[-50:])
        print(f"Episode {episode + 1}: Avg Reward = {avg:.1f}")

print(f"\nFinal average (last 50): {np.mean(rewards_history[-50:]):.1f}")
env.close()

---

## Lab D.4 Solutions: Policy Gradients and PPO

### Complete PPO Implementation with All Components

In [None]:
# SOLUTION: Full PPO Implementation

class ActorCritic(nn.Module):
    """Shared-trunk network with a softmax policy head and a scalar value head."""

    def __init__(self, state_dim, action_dim, hidden_dim=128):
        super().__init__()

        # Feature extractor feeding both heads
        trunk = [nn.Linear(state_dim, hidden_dim), nn.Tanh()]
        self.shared = nn.Sequential(*trunk)

        # Policy head: action probabilities over the last dimension
        policy_head = [
            nn.Linear(hidden_dim, hidden_dim),
            nn.Tanh(),
            nn.Linear(hidden_dim, action_dim),
            nn.Softmax(dim=-1),
        ]
        self.actor = nn.Sequential(*policy_head)

        # Value head: one scalar per input row
        value_head = [
            nn.Linear(hidden_dim, hidden_dim),
            nn.Tanh(),
            nn.Linear(hidden_dim, 1),
        ]
        self.critic = nn.Sequential(*value_head)

    def forward(self, x):
        """Return ``(action_probs, state_value)`` for input ``x``."""
        features = self.shared(x)
        action_probs = self.actor(features)
        state_value = self.critic(features)
        return action_probs, state_value


class PPO:
    """
    Proximal Policy Optimization agent with a clipped surrogate objective
    and Generalized Advantage Estimation (GAE).

    Usage per environment step: call ``select_action(state)``, step the
    environment, then call ``store(reward, done)``. After a rollout, call
    ``update(next_state)`` to train on it and clear the collected data.
    """

    def __init__(self, state_dim, action_dim, lr=3e-4, gamma=0.99,
                 gae_lambda=0.95, clip_epsilon=0.2, value_coef=0.5,
                 entropy_coef=0.01, n_epochs=10, batch_size=64):
        """
        Args:
            state_dim: Size of the observation vector.
            action_dim: Number of discrete actions.
            lr: Adam learning rate.
            gamma: Discount factor.
            gae_lambda: GAE lambda.
            clip_epsilon: Clipping range for the probability ratio.
            value_coef: Weight of the value loss in the total loss.
            entropy_coef: Weight of the entropy bonus in the total loss.
            n_epochs: Passes over each rollout per update.
            batch_size: Minibatch size within each pass.
        """
        self.gamma = gamma
        self.gae_lambda = gae_lambda
        self.clip_epsilon = clip_epsilon
        self.value_coef = value_coef
        self.entropy_coef = entropy_coef
        self.n_epochs = n_epochs
        self.batch_size = batch_size

        self.network = ActorCritic(state_dim, action_dim).to(device)
        self.optimizer = optim.Adam(self.network.parameters(), lr=lr)

        self._reset_rollout()

    def _reset_rollout(self):
        """Empty the per-rollout buffers."""
        self.states = []
        self.actions = []
        self.log_probs = []
        self.values = []
        self.rewards = []
        self.dones = []

    def select_action(self, state):
        """Sample an action for ``state`` and record it for the next update.

        Stores the state, the action, its log-probability and the critic's
        value estimate. Returns the action as a Python int.
        """
        state_tensor = torch.FloatTensor(state).unsqueeze(0).to(device)

        with torch.no_grad():
            probs, value = self.network(state_tensor)

        dist = Categorical(probs)
        action = dist.sample()

        self.states.append(state)
        self.actions.append(action.item())
        self.log_probs.append(dist.log_prob(action).item())
        self.values.append(value.item())

        return action.item()

    def store(self, reward, done):
        """Record the reward and done flag for the most recent action."""
        self.rewards.append(reward)
        self.dones.append(done)

    def compute_gae(self, next_value):
        """Compute GAE advantages and value targets for the stored rollout.

        Args:
            next_value: Value estimate for the state following the last
                stored step (ignored if that step ended an episode).

        Returns:
            Tuple ``(advantages, returns)`` of NumPy arrays, one entry per
            stored step, with ``returns = advantages + values``.
        """
        n_steps = len(self.rewards)
        values = self.values + [next_value]

        # Fill a preallocated array back-to-front rather than inserting at
        # the head of a list, which costs O(n) per step.
        advantages = np.zeros(n_steps)
        gae = 0.0
        for t in reversed(range(n_steps)):
            not_done = 1 - self.dones[t]  # 0 cuts the recursion at episode ends
            delta = self.rewards[t] + self.gamma * values[t + 1] * not_done - values[t]
            gae = delta + self.gamma * self.gae_lambda * not_done * gae
            advantages[t] = gae

        returns = advantages + np.array(self.values)

        return advantages, returns

    def update(self, next_state):
        """Train on the stored rollout, then clear it.

        Args:
            next_state: Observation following the last stored step; its
                value estimate bootstraps the final advantage.
        """
        with torch.no_grad():
            next_tensor = torch.FloatTensor(next_state).unsqueeze(0).to(device)
            _, next_value = self.network(next_tensor)
            next_value = next_value.item()

        advantages, returns = self.compute_gae(next_value)

        states = torch.FloatTensor(np.array(self.states)).to(device)
        actions = torch.LongTensor(self.actions).to(device)
        old_log_probs = torch.FloatTensor(self.log_probs).to(device)
        advantages = torch.FloatTensor(advantages).to(device)
        returns = torch.FloatTensor(returns).to(device)

        # Normalise advantages. The sample std of a single element is NaN
        # and would poison every parameter, so skip it for a 1-step rollout.
        if advantages.numel() > 1:
            advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8)

        for _ in range(self.n_epochs):
            indices = np.random.permutation(len(states))

            for start in range(0, len(states), self.batch_size):
                end = start + self.batch_size
                idx = indices[start:end]

                probs, values = self.network(states[idx])
                # Drop only the trailing feature dim so a minibatch of one
                # keeps shape (1,) and still matches returns[idx].
                values = values.squeeze(-1)

                dist = Categorical(probs)
                new_log_probs = dist.log_prob(actions[idx])
                entropy = dist.entropy().mean()

                # Probability ratio between the new and the old policy
                ratio = (new_log_probs - old_log_probs[idx]).exp()

                # Clipped surrogate objective
                surr1 = ratio * advantages[idx]
                surr2 = torch.clamp(ratio, 1 - self.clip_epsilon, 1 + self.clip_epsilon) * advantages[idx]
                policy_loss = -torch.min(surr1, surr2).mean()

                value_loss = F.mse_loss(values, returns[idx])

                loss = policy_loss + self.value_coef * value_loss - self.entropy_coef * entropy

                self.optimizer.zero_grad()
                loss.backward()
                nn.utils.clip_grad_norm_(self.network.parameters(), 0.5)
                self.optimizer.step()

        # Clear buffers
        self._reset_rollout()


# Train PPO on LunarLander
# Gymnasium 1.0 replaced LunarLander-v2 with v3 and refuses to create v2.
# Try the current id first and fall back to v2 on older installs.
try:
    env = gym.make("LunarLander-v3")
except gym.error.Error:
    env = gym.make("LunarLander-v2")

agent = PPO(
    state_dim=env.observation_space.shape[0],
    action_dim=env.action_space.n
)

print("Training PPO on LunarLander...")
rewards_history = []
episodes = 0
rollout_length = 2048
state, _ = env.reset()
current_reward = 0

while episodes < 500:
    # Collect up to `rollout_length` steps; episodes may span several rollouts
    for _ in range(rollout_length):
        action = agent.select_action(state)
        next_state, reward, terminated, truncated, _ = env.step(action)
        done = terminated or truncated

        agent.store(reward, done)
        current_reward += reward

        if done:
            rewards_history.append(current_reward)
            episodes += 1
            current_reward = 0
            state, _ = env.reset()

            if episodes % 50 == 0:
                avg = np.mean(rewards_history[-50:])
                print(f"Episode {episodes}: Avg Reward = {avg:.1f}")

            if episodes >= 500:
                break
        else:
            state = next_state

    # `state` is the observation that follows the last stored step (a fresh
    # reset state if that step ended an episode)
    if len(agent.states) > 0:
        agent.update(state)

print(f"\nFinal average (last 50): {np.mean(rewards_history[-50:]):.1f}")
env.close()

---

## Lab D.5 Solutions: RLHF Concepts

### DPO Loss Implementation

In [None]:
# SOLUTION: DPO Loss (Conceptual Implementation)

def dpo_loss(
    policy_chosen_logps: torch.Tensor,
    policy_rejected_logps: torch.Tensor,
    reference_chosen_logps: torch.Tensor,
    reference_rejected_logps: torch.Tensor,
    beta: float = 0.1
) -> Tuple[torch.Tensor, Dict[str, float]]:
    """
    Direct Preference Optimization (DPO) loss.

    DPO optimizes preferences directly without a reward model.

    Loss = -log(sigmoid(beta * (log_ratio_chosen - log_ratio_rejected)))

    where log_ratio = log(policy(y|x)) - log(reference(y|x))

    Args:
        policy_chosen_logps: Log probs of chosen responses under policy
        policy_rejected_logps: Log probs of rejected responses under policy
        reference_chosen_logps: Log probs of chosen under reference (frozen)
        reference_rejected_logps: Log probs of rejected under reference
        beta: Temperature parameter (higher = more conservative)

    Returns:
        Tuple ``(loss, metrics)``:
        - loss: scalar tensor, the mean DPO loss over the batch
        - metrics: dict of Python floats with keys
          ``'reward_margin'`` (mean of beta * (chosen log ratio -
          rejected log ratio)) and ``'accuracy'`` (fraction of pairs
          whose chosen log ratio is strictly larger than the rejected one)
    """
    # Compute log ratios
    chosen_log_ratio = policy_chosen_logps - reference_chosen_logps
    rejected_log_ratio = policy_rejected_logps - reference_rejected_logps

    # DPO loss
    logits = beta * (chosen_log_ratio - rejected_log_ratio)
    loss = -F.logsigmoid(logits).mean()

    # Useful metrics (computed without tracking gradients)
    with torch.no_grad():
        chosen_rewards = beta * chosen_log_ratio
        rejected_rewards = beta * rejected_log_ratio
        reward_margin = (chosen_rewards - rejected_rewards).mean()
        accuracy = (chosen_log_ratio > rejected_log_ratio).float().mean()

    return loss, {
        'reward_margin': reward_margin.item(),
        'accuracy': accuracy.item()
    }


# Demo: evaluate the DPO loss on a small batch of made-up log probabilities
batch_size = 8

# Simulated log probabilities, shifted down because log probs are negative.
# The four draws stay in this order so the random stream is consumed the same way.
policy_chosen = torch.randn(batch_size) - 1
policy_rejected = torch.randn(batch_size) - 1.5
ref_chosen = torch.randn(batch_size) - 1
ref_rejected = torch.randn(batch_size) - 1

loss, metrics = dpo_loss(policy_chosen, policy_rejected, ref_chosen, ref_rejected)

report_lines = [
    "DPO Loss Demo:",
    f"  Loss: {loss.item():.4f}",
    f"  Reward Margin: {metrics['reward_margin']:.4f}",
    f"  Accuracy: {metrics['accuracy']:.2%}",
]
print("\n".join(report_lines))

---

## Summary

This solutions notebook covered:

1. **Lab D.1**: Policy Iteration and Obstacle Grid Worlds
2. **Lab D.2**: Q-Learning hyperparameter tuning and Double Q-Learning
3. **Lab D.3**: Double DQN implementation
4. **Lab D.4**: Complete PPO implementation with GAE
5. **Lab D.5**: DPO loss for RLHF alternatives

Key takeaways:
- RL is about learning from interaction to maximize reward
- Value-based methods (Q-learning, DQN) learn action values
- Policy-based methods (REINFORCE, PPO) directly optimize the policy
- PPO is the backbone of RLHF for training LLMs
- DPO provides a simpler alternative to full RLHF