### Q-Learning

**Principle:** Value Function Learning

**Definition:** Learns the expected return from each state-action pair using off-policy temporal difference (Q-Learning).

**Algorithm Description:** Q-Learning learns an action-value function Q(s,a) that estimates the expected cumulative reward of taking action a in state s and following the optimal policy thereafter. It updates Q-values using the Bellman equation and observed rewards, learning the optimal policy without requiring a model of the environment.

**Typical Use Cases:**
- Classic algorithm for discrete state/action spaces
- Finding optimal policy in model-free environments
- Off-policy learning (learns optimal policy while exploring)
- Simple control problems (e.g., grid worlds).

**Assumptions:**
- Discrete state/action spaces
- Model-free
- Off-policy learning
- Limitation: the tabular form is unsuitable for large or continuous state/action spaces



### 1. Import Libraries

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import gymnasium as gym
from collections import defaultdict
import pandas as pd

# Set plotting style
sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = (12, 6)

### 2. Q-Learning Agent

Q-Learning is an off-policy, value-based RL algorithm that learns the optimal action-value function Q(s, a) using the Bellman equation:

$$Q(s, a) \leftarrow Q(s, a) + \alpha \left[ r + \gamma \max_{a'} Q(s', a') - Q(s, a) \right]$$

Where:
- α = learning rate
- γ = discount factor
- r = reward
- s' = next state
- a' = an action available in the next state (the max is taken over all of them)

In [None]:
class QLearningAgent:
    """Tabular Q-Learning agent that explores with an epsilon-greedy policy."""

    def __init__(self, n_states, n_actions, learning_rate=0.1,
                 discount_factor=0.95, epsilon=1.0, epsilon_decay=0.995,
                 epsilon_min=0.01):
        """
        Store the hyperparameters and allocate an all-zero Q-table.

        Parameters:
        -----------
        n_states : int
            Number of rows of the Q-table (one per state)
        n_actions : int
            Number of columns of the Q-table (one per action)
        learning_rate : float
            Step size (alpha) applied to each TD error
        discount_factor : float
            Weight (gamma) given to the bootstrapped next-state value
        epsilon : float
            Starting probability of picking a random action
        epsilon_decay : float
            Multiplicative factor applied by ``decay_epsilon``
        epsilon_min : float
            Floor below which epsilon is never decayed
        """
        # Problem size
        self.n_states, self.n_actions = n_states, n_actions

        # Learning hyperparameters
        self.lr, self.gamma = learning_rate, discount_factor

        # Exploration schedule
        self.epsilon = epsilon
        self.epsilon_decay = epsilon_decay
        self.epsilon_min = epsilon_min

        # Q(s, a) estimates, one row per state, all starting at zero
        self.q_table = np.zeros(shape=(n_states, n_actions))

    def select_action(self, state):
        """Return an action for `state`: random with probability epsilon, else greedy."""
        explore = np.random.random() < self.epsilon
        if explore:
            # Uniformly random action index
            return np.random.randint(self.n_actions)
        # Greedy action; argmax resolves ties in favour of the lowest index
        return np.argmax(self.q_table[state])

    def update(self, state, action, reward, next_state, done):
        """Move Q(state, action) toward the one-step Q-Learning target."""
        old_estimate = self.q_table[state, action]

        # Off-policy bootstrap: best next-state value, or nothing when `done`
        if done:
            bootstrap = 0
        else:
            bootstrap = np.max(self.q_table[next_state])

        # TD error = [r + γ max Q(s',a')] - Q(s,a)
        td_error = reward + self.gamma * bootstrap - old_estimate

        # Q(s,a) ← Q(s,a) + α * TD error
        self.q_table[state, action] = old_estimate + self.lr * td_error

    def decay_epsilon(self):
        """Shrink epsilon by one decay step without going below epsilon_min."""
        decayed = self.epsilon * self.epsilon_decay
        self.epsilon = decayed if decayed > self.epsilon_min else self.epsilon_min

### 3. Create Environment and Train Agent

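We'll use the FrozenLake environment from Gymnasium. The agent must navigate from start (S) to goal (G) while avoiding holes (H). The environment is created with `is_slippery=True`, so the agent does not always move in the direction it chooses and the learned policy has to cope with stochastic transitions.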

In [None]:
def train_q_learning(env, agent, n_episodes=10000, max_steps=100):
    """
    Train Q-Learning agent

    Runs `n_episodes` episodes. In each one the agent picks an action,
    the environment is stepped, and the agent is updated with the observed
    transition. Epsilon is decayed once at the end of every episode, and a
    progress line is printed every 1000 episodes.

    Parameters:
    -----------
    env : gym.Env
        Environment to train on; `reset()` must return `(state, info)` and
        `step(action)` must return
        `(next_state, reward, terminated, truncated, info)`
    agent : QLearningAgent
        Q-Learning agent (needs `select_action`, `update`, `decay_epsilon`
        and an `epsilon` attribute)
    n_episodes : int
        Number of episodes to train
    max_steps : int
        Maximum steps per episode

    Returns:
    --------
    episode_rewards : list
        Rewards obtained in each episode
    episode_lengths : list
        Length of each episode (0 when `max_steps` is 0)
    """
    episode_rewards = []
    episode_lengths = []

    for episode in range(n_episodes):
        state, _info = env.reset()
        total_reward = 0
        # Counted explicitly so the length is defined even if the loop
        # body never runs (max_steps == 0)
        steps_taken = 0

        for _ in range(max_steps):
            # Select and perform action
            action = agent.select_action(state)
            next_state, reward, terminated, truncated, _info = env.step(action)
            steps_taken += 1

            # Update Q-table. Only a genuine terminal state has no future
            # value; a truncation (e.g. a time limit) merely stops the
            # episode, so the update must still bootstrap from next_state.
            agent.update(state, action, reward, next_state, terminated)

            total_reward += reward
            state = next_state

            # Either signal ends the episode
            if terminated or truncated:
                break

        # Decay epsilon after each episode
        agent.decay_epsilon()

        episode_rewards.append(total_reward)
        episode_lengths.append(steps_taken)

        # Print progress
        if (episode + 1) % 1000 == 0:
            avg_reward = np.mean(episode_rewards[-100:])
            print(f"Episode {episode + 1}/{n_episodes}, "
                  f"Avg Reward (last 100): {avg_reward:.3f}, "
                  f"Epsilon: {agent.epsilon:.3f}")

    return episode_rewards, episode_lengths

In [None]:
# Build the slippery FrozenLake task; no renderer is attached
env = gym.make('FrozenLake-v1', is_slippery=True, render_mode=None)

# The `.n` attribute of each space gives the Q-table dimensions
n_states, n_actions = env.observation_space.n, env.action_space.n

print("Environment: FrozenLake-v1")
print(f"Number of states: {n_states}")
print(f"Number of actions: {n_actions}")

# Tabular agent sized to the environment, starting fully exploratory
agent = QLearningAgent(
    n_states,
    n_actions,
    learning_rate=0.1,
    discount_factor=0.95,
    epsilon=1.0,
    epsilon_decay=0.995,
    epsilon_min=0.01,
)

# Run training and keep the per-episode reward and length histories
print("\nTraining Q-Learning agent...")
episode_rewards, episode_lengths = train_q_learning(
    env, agent, n_episodes=10000, max_steps=100
)

### 4. Visualize Training Results

In [None]:
# Calculate moving average for smoother visualization
def moving_average(data, window_size=100):
    """
    Calculate moving average

    Parameters:
    -----------
    data : sequence of numbers
        One-dimensional series to smooth
    window_size : int
        Number of consecutive values averaged into each output point

    Returns:
    --------
    numpy.ndarray
        Means of every full window, `len(data) - window_size + 1` values.
        Empty when `data` has fewer than `window_size` elements.

    Raises:
    -------
    ValueError
        If `window_size` is smaller than 1
    """
    if window_size < 1:
        raise ValueError("window_size must be a positive integer")

    values = np.asarray(data, dtype=float)

    # With fewer points than the window, 'valid' convolution swaps its
    # operands and returns numbers that are not window means, so report
    # "no full window" explicitly instead.
    if values.size < window_size:
        return np.empty(0, dtype=float)

    return np.convolve(values, np.ones(window_size) / window_size, mode='valid')

# Create subplots
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# Rolling statistics use a trailing window: the k-th smoothed value summarizes
# episodes k .. k + window - 1, so it is drawn at the window's last episode
# (x = k + window - 1) to line up with the raw per-episode curve.
window = 100

# Plot 1: Episode Rewards
axes[0, 0].plot(episode_rewards, alpha=0.3, label='Raw Rewards')
if len(episode_rewards) >= window:
    axes[0, 0].plot(np.arange(window - 1, len(episode_rewards)),
                    moving_average(episode_rewards, window),
                    label='Moving Average (100 episodes)', linewidth=2)
axes[0, 0].set_xlabel('Episode')
axes[0, 0].set_ylabel('Total Reward')
axes[0, 0].set_title('Episode Rewards Over Time')
axes[0, 0].legend()
axes[0, 0].grid(True, alpha=0.3)

# Plot 2: Episode Lengths
axes[0, 1].plot(episode_lengths, alpha=0.3, label='Raw Lengths')
if len(episode_lengths) >= window:
    axes[0, 1].plot(np.arange(window - 1, len(episode_lengths)),
                    moving_average(episode_lengths, window),
                    label='Moving Average (100 episodes)', linewidth=2)
axes[0, 1].set_xlabel('Episode')
axes[0, 1].set_ylabel('Episode Length')
axes[0, 1].set_title('Episode Lengths Over Time')
axes[0, 1].legend()
axes[0, 1].grid(True, alpha=0.3)

# Plot 3: Success Rate
# An episode counts as a success when its total reward is positive. Window
# counts come from differences of a cumulative sum, which gives the same
# fractions as averaging each window separately but in a single pass.
successes = np.asarray(episode_rewards) > 0
cumulative = np.concatenate(([0], np.cumsum(successes)))
success_rate = ((cumulative[window:] - cumulative[:-window]) / window).tolist()
axes[1, 0].plot(np.arange(window - 1, window - 1 + len(success_rate)),
                success_rate, linewidth=2)
axes[1, 0].set_xlabel('Episode')
axes[1, 0].set_ylabel('Success Rate')
axes[1, 0].set_title(f'Success Rate (Rolling {window} episodes)')
axes[1, 0].grid(True, alpha=0.3)

# Plot 4: Q-Table Heatmap (full table: rows are states, columns are actions)
im = axes[1, 1].imshow(agent.q_table, cmap='viridis', aspect='auto')
axes[1, 1].set_xlabel('Action')
axes[1, 1].set_ylabel('State')
axes[1, 1].set_title('Learned Q-Table Values')
plt.colorbar(im, ax=axes[1, 1])

plt.tight_layout()
plt.show()

# Print final statistics
print("\nTraining Complete!")
print(f"Final epsilon: {agent.epsilon:.4f}")
print(f"Average reward (last 100 episodes): {np.mean(episode_rewards[-100:]):.4f}")
print(f"Success rate (last 100 episodes): {np.mean([r > 0 for r in episode_rewards[-100:]]):.2%}")

### 5. Evaluate Learned Policy

Test the learned policy on multiple episodes to measure final performance.

In [None]:
def evaluate_policy(env, agent, n_episodes=100, max_steps=100):
    """
    Evaluate the learned policy

    Runs the agent with epsilon forced to 0 (no random actions) and does
    not call `agent.update`, so the Q-table is left untouched. The agent's
    epsilon is restored afterwards, even if an episode raises.

    Parameters:
    -----------
    env : gym.Env
        Environment to evaluate on
    agent : QLearningAgent
        Trained Q-Learning agent
    n_episodes : int
        Number of evaluation episodes
    max_steps : int
        Maximum steps per episode

    Returns:
    --------
    results : dict
        Evaluation metrics: 'mean_reward', 'std_reward', 'mean_length'
        and 'success_rate' (fraction of episodes with total reward > 0)
    """
    rewards = []
    lengths = []
    successes = []

    # Save current epsilon and set to 0 for pure exploitation
    original_epsilon = agent.epsilon
    agent.epsilon = 0.0

    try:
        for _episode in range(n_episodes):
            state, _info = env.reset()
            total_reward = 0
            # Counted explicitly so the length is defined even when the
            # step loop never runs (max_steps == 0)
            steps_taken = 0

            for _ in range(max_steps):
                action = agent.select_action(state)
                state, reward, terminated, truncated, _info = env.step(action)

                total_reward += reward
                steps_taken += 1

                if terminated or truncated:
                    break

            rewards.append(total_reward)
            lengths.append(steps_taken)
            successes.append(total_reward > 0)
    finally:
        # Restore original epsilon so a failed evaluation cannot leave the
        # agent stuck in pure-exploitation mode
        agent.epsilon = original_epsilon

    results = {
        'mean_reward': np.mean(rewards),
        'std_reward': np.std(rewards),
        'mean_length': np.mean(lengths),
        'success_rate': np.mean(successes)
    }

    return results

# Score the trained agent over 100 evaluation episodes
eval_results = evaluate_policy(env, agent, n_episodes=100)

# Emit the whole report with a single print call, one item per line
print(
    "=" * 50,
    "EVALUATION RESULTS",
    "=" * 50,
    f"Mean Reward: {eval_results['mean_reward']:.4f} ± {eval_results['std_reward']:.4f}",
    f"Mean Episode Length: {eval_results['mean_length']:.2f}",
    f"Success Rate: {eval_results['success_rate']:.2%}",
    "=" * 50,
    sep="\n",
)

### 6. Visualize Learned Policy

Display the optimal action for each state based on the learned Q-table.

In [None]:
# Extract the greedy policy from the Q-table (highest-valued action per state)
optimal_policy = np.argmax(agent.q_table, axis=1)

# Map action indices to arrow symbols for visualization
action_symbols = {0: '←', 1: '↓', 2: '→', 3: '↑'}

# Lay the states out on a square grid. The side length is derived from the
# number of states instead of being fixed at 4, so other square maps work too.
grid_size = int(round(np.sqrt(optimal_policy.size)))
if grid_size * grid_size != optimal_policy.size:
    raise ValueError(
        f"Cannot arrange {optimal_policy.size} states on a square grid"
    )
policy_grid = optimal_policy.reshape(grid_size, grid_size)

# Create figure
fig, axes = plt.subplots(1, 2, figsize=(14, 6))

# Plot 1: Policy as arrows (states are numbered row by row)
for i in range(grid_size):
    for j in range(grid_size):
        axes[0].text(j, i, action_symbols[policy_grid[i, j]],
                     ha='center', va='center', fontsize=20)
axes[0].set_xlim(-0.5, grid_size - 0.5)
axes[0].set_ylim(-0.5, grid_size - 0.5)
axes[0].set_xticks(range(grid_size))
axes[0].set_yticks(range(grid_size))
axes[0].grid(True)
# Put row 0 at the top so the picture matches the heatmap next to it
axes[0].invert_yaxis()
axes[0].set_title('Learned Policy (Optimal Actions)', fontsize=14, fontweight='bold')
axes[0].set_xlabel('Column')
axes[0].set_ylabel('Row')

# Plot 2: State Values (max Q-value for each state)
state_values = np.max(agent.q_table, axis=1).reshape(grid_size, grid_size)
im = axes[1].imshow(state_values, cmap='RdYlGn', aspect='auto')
axes[1].set_title('State Values (Max Q-value)', fontsize=14, fontweight='bold')
axes[1].set_xlabel('Column')
axes[1].set_ylabel('Row')
axes[1].set_xticks(range(grid_size))
axes[1].set_yticks(range(grid_size))

# Add values as text
for i in range(grid_size):
    for j in range(grid_size):
        axes[1].text(j, i, f'{state_values[i, j]:.3f}',
                     ha='center', va='center', color='black', fontsize=10)

plt.colorbar(im, ax=axes[1])
plt.tight_layout()
plt.show()

print("\nLearned Policy Summary:")
print("=" * 50)
print(f"Total states: {n_states}")
print(f"Actions: {['LEFT', 'DOWN', 'RIGHT', 'UP']}")
print(f"Optimal policy shape: {optimal_policy.shape}")