### Dyna-Q

**Principle:** Integrated Planning & Learning

**Definition:** Learns a model of the environment and uses it to simulate transitions for planning (Dyna-Q).

**Algorithm Description:** Dyna-Q combines model-free Q-Learning with model-based planning by learning both a Q-function and a model of environment dynamics. During learning, it performs real updates from actual experience and simulated updates by planning with the learned model, improving sample efficiency.

**Typical Use Cases:**
- Combining model-free (Q-learning) updates with model-based (planning) updates
- Improving sample efficiency when real interaction is costly
- Learning a model of the environment to generate simulated experience

**Assumptions:**
- Discrete state/action spaces
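- Model learning: the environment's transitions and rewards can be learned from observed experience
- Sample efficiency: real interaction is costly relative to planning with the model
- Simple environment dynamics: the tabular model below stores a single (next state, reward) outcome per state-action pair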



### 1. Import Libraries

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import gymnasium as gym
from collections import defaultdict
import random

# Plot styling for the whole notebook.
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)

# Seed both random number generators used in this notebook: NumPy's drives
# epsilon-greedy action selection, and the stdlib `random` module drives the
# sampling of state-action pairs during planning (`random.choice`).
np.random.seed(42)
random.seed(42)

### 2. Dyna-Q Algorithm

Dyna-Q integrates planning and learning:

**Key Components:**
1. **Direct RL:** Learn from real experience (Q-Learning)
2. **Model Learning:** Build model of environment dynamics
3. **Planning:** Use model to generate simulated experience

**Update Process:**
1. Take real action, observe transition
2. Update Q-table from real experience
3. Update environment model
4. Perform n planning steps using model

In [None]:
class DynaQAgent:
    """Tabular Dyna-Q agent: Q-learning combined with model-based planning.

    After every real transition the agent (a) applies a Q-learning update,
    (b) records the transition in a lookup-table model and (c) replays
    ``n_planning_steps`` transitions sampled from that model.

    The model keeps only the most recently observed outcome for each
    (state, action) pair, i.e. it treats the environment as deterministic.

    Attributes:
        q_table: Array of shape (n_states, n_actions) holding Q-value estimates.
        model: Maps (state, action) -> (next_state, reward).
        terminal_pairs: Set of (state, action) pairs whose most recently
            recorded transition ended the episode.
        visited_pairs: List of the (state, action) pairs stored in ``model``,
            kept as a list so planning can sample from it uniformly.
    """

    def __init__(self, n_states, n_actions, learning_rate=0.1,
                 discount_factor=0.95, epsilon=0.1, n_planning_steps=10):
        """Create an agent with a zero-initialised Q-table and an empty model.

        Args:
            n_states: Number of discrete states.
            n_actions: Number of discrete actions.
            learning_rate: Step size used in the Q-learning update.
            discount_factor: Discount applied to the bootstrapped next-state value.
            epsilon: Probability of taking a uniformly random action.
            n_planning_steps: Simulated updates performed after each real step.
        """
        self.n_states = n_states
        self.n_actions = n_actions
        self.lr = learning_rate
        self.gamma = discount_factor
        self.epsilon = epsilon
        self.n_planning_steps = n_planning_steps

        # Q-table
        self.q_table = np.zeros((n_states, n_actions))

        # Environment model: model[(s, a)] = (next_state, reward)
        self.model = {}

        # Pairs whose recorded transition was terminal. Stored separately so
        # the (next_state, reward) value format of `model` stays unchanged.
        self.terminal_pairs = set()

        # Track visited state-action pairs for planning
        self.visited_pairs = []

    def select_action(self, state):
        """Return an epsilon-greedy action for ``state``.

        With probability ``epsilon`` a uniformly random action is returned;
        otherwise the action with the highest Q-value (first index on ties).
        """
        if np.random.random() < self.epsilon:
            return np.random.randint(self.n_actions)
        return np.argmax(self.q_table[state])

    def q_learning_update(self, state, action, reward, next_state, done):
        """Apply one Q-learning update from real or simulated experience.

        Args:
            state: State in which the action was taken.
            action: Action taken.
            reward: Reward received.
            next_state: Resulting state.
            done: If true, the next-state value is not bootstrapped (target
                is just ``reward``).
        """
        current_q = self.q_table[state, action]
        max_next_q = 0 if done else np.max(self.q_table[next_state])
        td_target = reward + self.gamma * max_next_q
        self.q_table[state, action] = current_q + self.lr * (td_target - current_q)

    def update_model(self, state, action, reward, next_state, done=False):
        """Record the latest observed outcome of ``(state, action)``.

        Args:
            state: State in which the action was taken.
            action: Action taken.
            reward: Reward received.
            next_state: Resulting state.
            done: Whether the transition ended the episode. Defaults to
                ``False`` so existing four-argument calls keep working.
        """
        pair = (state, action)
        # The model dict gives an O(1) "seen before?" check; scanning the
        # visited list would cost O(len(visited_pairs)) on every real step.
        if pair not in self.model:
            self.visited_pairs.append(pair)
        self.model[pair] = (next_state, reward)

        # Keep the terminal flag in sync with the stored (latest) transition.
        if done:
            self.terminal_pairs.add(pair)
        else:
            self.terminal_pairs.discard(pair)

    def planning(self):
        """Perform ``n_planning_steps`` Q-updates on transitions replayed from the model."""
        if not self.visited_pairs:
            return

        for _ in range(self.n_planning_steps):
            # Sample random previously visited state-action pair
            pair = random.choice(self.visited_pairs)
            state, action = pair

            # Get predicted next state and reward from model
            next_state, reward = self.model[pair]

            # Replay with the recorded terminal flag so simulated updates use
            # the same target as the real update did.
            self.q_learning_update(state, action, reward, next_state,
                                   pair in self.terminal_pairs)

    def step(self, state, action, reward, next_state, done):
        """Complete Dyna-Q step: learn from the real transition, then plan.

        Args:
            state: State in which the action was taken.
            action: Action taken.
            reward: Reward received.
            next_state: Resulting state.
            done: Whether the transition ended the episode.
        """
        # (a) Direct RL: Update Q from real experience
        self.q_learning_update(state, action, reward, next_state, done)

        # (b) Model Learning: Update model (including the terminal flag)
        self.update_model(state, action, reward, next_state, done)

        # (c) Planning: Learn from simulated experience
        self.planning()

### 3. Train Dyna-Q Agent

In [None]:
def train_dyna_q(env, agent, n_episodes=300, max_steps=100):
    """Train ``agent`` on ``env`` and return per-episode statistics.

    Args:
        env: Environment whose ``reset()`` returns ``(state, info)`` and whose
            ``step(action)`` returns
            ``(next_state, reward, terminated, truncated, info)``.
        agent: Object providing ``select_action(state)``,
            ``step(state, action, reward, next_state, done)`` and a ``model``
            attribute supporting ``len()``.
        n_episodes: Number of episodes to run.
        max_steps: Maximum number of environment steps per episode.

    Returns:
        Tuple ``(episode_rewards, episode_lengths)``: the total reward and the
        number of steps taken in each episode.
    """
    episode_rewards = []
    episode_lengths = []

    for episode in range(n_episodes):
        state, _ = env.reset()
        total_reward = 0
        # Explicit counter so the length is well defined even when
        # max_steps is 0 and the inner loop never runs.
        steps_taken = 0

        for _ in range(max_steps):
            action = agent.select_action(state)
            next_state, reward, terminated, truncated, _info = env.step(action)

            # Dyna-Q step (learning + planning). Only a genuine terminal state
            # is reported as done: a time-limit truncation does not mean the
            # next state has zero value, so the agent should still bootstrap.
            agent.step(state, action, reward, next_state, terminated)

            total_reward += reward
            steps_taken += 1
            state = next_state

            # Either signal ends the episode.
            if terminated or truncated:
                break

        episode_rewards.append(total_reward)
        episode_lengths.append(steps_taken)

        if (episode + 1) % 50 == 0:
            avg_reward = np.mean(episode_rewards[-50:])
            print(f'Episode {episode+1}, Avg Reward: {avg_reward:.3f}, '
                  f'Model size: {len(agent.model)}')

    return episode_rewards, episode_lengths

# Create environment
env = gym.make('FrozenLake-v1', is_slippery=True, render_mode=None)

# Seed the environment's own RNG once, right after creation, so the slippery
# transitions are reproducible. Later reset() calls without a seed continue
# from this RNG state instead of re-seeding.
env.reset(seed=42)

# Create Dyna-Q agent
agent = DynaQAgent(
    n_states=env.observation_space.n,
    n_actions=env.action_space.n,
    learning_rate=0.1,
    discount_factor=0.95,
    epsilon=0.1,
    n_planning_steps=50,  # More planning steps
)

# Report the configured value so the message cannot drift from the setting.
print(f'Training Dyna-Q with {agent.n_planning_steps} planning steps per update...')
episode_rewards, episode_lengths = train_dyna_q(env, agent, n_episodes=300)

### 4. Visualize Results

In [None]:
# Rolling-window length shared by the moving average and the success rate.
window = 20
n_episodes_run = len(episode_rewards)

# A 'valid' rolling statistic has one value per full window. Plot each value at
# the index of the last episode in its window so the smoothed curves line up
# with the raw per-episode curve instead of being shifted left by window - 1.
window_ends = np.arange(n_episodes_run)[window - 1:]

fig, axes = plt.subplots(1, 2, figsize=(14, 5))
reward_ax, success_ax = axes

# Plot 1: Episode Rewards
reward_ax.plot(episode_rewards, alpha=0.3, label='Raw')
if n_episodes_run >= window:
    ma = np.convolve(episode_rewards, np.ones(window) / window, mode='valid')
    reward_ax.plot(window_ends, ma, linewidth=2, label=f'MA({window})')
reward_ax.set_xlabel('Episode')
reward_ax.set_ylabel('Reward')
reward_ax.set_title('Dyna-Q: Episode Rewards')
reward_ax.legend()
reward_ax.grid(alpha=0.3)

# Plot 2: Success Rate (fraction of episodes in the window with reward > 0)
success_rate = [np.mean([r > 0 for r in episode_rewards[i:i+window]])
                for i in range(n_episodes_run - window + 1)]
success_ax.plot(window_ends, success_rate, linewidth=2, color='green')
success_ax.set_xlabel('Episode')
success_ax.set_ylabel('Success Rate')
success_ax.set_title(f'Success Rate (Rolling {window} episodes)')
success_ax.grid(alpha=0.3)

plt.tight_layout()
plt.show()

# Summary over the last 100 episodes (or all of them if fewer were run).
print(f'\nFinal Success Rate: {np.mean([r > 0 for r in episode_rewards[-100:]]):.2%}')
print(f'Model learned {len(agent.model)} state-action transitions')
env.close()