# TP Apprentissage par Renforcement avec MiniGrid

**Auteur:** [Votre Nom]  
**Date:** Février 2026

Ce notebook vous guide à travers l'implémentation et la comparaison de Q-Learning et DQN sur MiniGrid.

## 📦 Installation et Imports

In [None]:
# Installation des dépendances (décommentez si nécessaire)
# !pip install gymnasium minigrid numpy matplotlib torch tqdm

In [None]:
import gymnasium as gym
import minigrid
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
from collections import defaultdict, deque
from tqdm.notebook import tqdm
import pickle

print("‚úì Imports r√©ussis!")

## 🔍 Partie 1: Exploration de MiniGrid

In [None]:
# Build the MiniGrid environment; the 'rgb_array' render mode is what the
# later env.render() call relies on to obtain an image for plt.imshow.
env = gym.make('MiniGrid-Empty-8x8-v0', render_mode='rgb_array')

# Gather the space descriptions first, then print them under the header.
env_details = [
    f"Espace d'observation: {env.observation_space}",
    f"Espace d'actions: {env.action_space}",
    f"Nombre d'actions: {env.action_space.n}",
]
print("Informations sur l'environnement:")
print("\n".join(env_details))

In [None]:
# Human-readable labels for the action ids, indexed by position (0..6).
actions = dict(enumerate([
    "Tourner √† gauche",
    "Tourner √† droite",
    "Avancer",
    "Ramasser",
    "D√©poser",
    "Basculer",
    "Terminer",
]))

# Only list the ids that exist in this environment's action space.
n_available_actions = env.action_space.n
for action_id, description in actions.items():
    if action_id >= n_available_actions:
        continue
    print(f"Action {action_id}: {description}")

In [None]:
# Reset to obtain the first observation, then grab a rendered frame.
obs, info = env.reset()
img = env.render()

# Show the frame without axes.
plt.figure(figsize=(6, 6))
plt.imshow(img)
plt.title("MiniGrid-Empty-8x8-v0")
plt.axis('off')
plt.show()

# Describe each observation entry: arrays by shape, everything else by value.
print("\nStructure de l'observation:")
if isinstance(obs, dict):
    for key, value in obs.items():
        is_array = isinstance(value, np.ndarray)
        summary = f"shape={value.shape}" if is_array else f"{value}"
        print(f"{key}: {summary}")

In [None]:
# Smoke test: drive the environment with uniformly sampled actions.
env.reset()
total_reward = 0

for step in range(50):
    action = env.action_space.sample()
    obs, reward, terminated, truncated, info = env.step(action)
    total_reward += reward

    # Keep stepping until the episode ends (goal reached or time limit).
    if not (terminated or truncated):
        continue

    print(f"√âpisode termin√© √† l'√©tape {step+1}")
    print(f"R√©compense totale: {total_reward}")
    break

env.close()

## 🧠 Partie 2: Implémentation Q-Learning

In [None]:
class QLearningAgent:
    """Tabular Q-Learning agent with an epsilon-greedy policy.

    Q-values are stored in ``q_table``, a mapping from a hashable encoding of
    the observation (see ``state_to_key``) to a NumPy vector holding one value
    per action. States that were never updated read as all zeros.
    """

    def __init__(self, action_space_size, learning_rate=0.1, gamma=0.99,
                 epsilon_start=1.0, epsilon_end=0.01, epsilon_decay=0.995):
        """Store the hyperparameters and create an empty Q-table.

        Args:
            action_space_size: Number of discrete actions.
            learning_rate: Step size applied to the temporal-difference error.
            gamma: Discount factor applied to the next state's best value.
            epsilon_start: Initial probability of picking a random action.
            epsilon_end: Lower bound that epsilon never drops below.
            epsilon_decay: Multiplicative factor applied by ``decay_epsilon``.
        """
        self.action_space_size = action_space_size
        self.learning_rate = learning_rate
        self.gamma = gamma
        self.epsilon = epsilon_start
        self.epsilon_end = epsilon_end
        self.epsilon_decay = epsilon_decay
        # Unknown states get a zero-initialised row the first time they are
        # indexed (which only ``update`` does).
        self.q_table = defaultdict(lambda: np.zeros(action_space_size))

    def state_to_key(self, obs):
        """Convert an observation into a hashable Q-table key.

        Dict observations that carry an ``'image'`` entry are keyed on that
        entry alone; any other observation is converted to an array as a
        whole. The values are flattened into a tuple.

        Args:
            obs: Observation returned by the environment.

        Returns:
            A tuple of plain Python scalars.
        """
        if isinstance(obs, dict) and 'image' in obs:
            obs = obs['image']
        # tolist() yields Python scalars, which hash and compare equal to the
        # corresponding NumPy scalars but are cheaper to hash and to pickle.
        return tuple(np.asarray(obs).ravel().tolist())

    def select_action(self, state, training=True):
        """Pick an action with an epsilon-greedy rule.

        Args:
            state: Current observation.
            training: When true, a random action is taken with probability
                ``epsilon``; when false the choice is always greedy.

        Returns:
            The chosen action as a Python ``int``.
        """
        if training and np.random.random() < self.epsilon:
            return int(np.random.randint(self.action_space_size))

        # Read without inserting: a pure lookup must not add rows to the
        # table (otherwise evaluation would grow the table that gets saved).
        q_values = self.q_table.get(self.state_to_key(state))
        if q_values is None:
            # Unseen state: every value is implicitly zero, and the argmax of
            # an all-zero row is action 0.
            return 0
        return int(np.argmax(q_values))

    def update(self, state, action, reward, next_state, done):
        """Apply one Q-Learning update for the given transition.

        Args:
            state: Observation before the action.
            action: Index of the action that was taken.
            reward: Reward received for the transition.
            next_state: Observation after the action.
            done: When true, the value of ``next_state`` is taken as 0, i.e.
                no bootstrapping happens. Callers should set it only for
                genuinely terminal states.
        """
        state_key = self.state_to_key(state)
        next_state_key = self.state_to_key(next_state)

        current_q = self.q_table[state_key][action]
        max_next_q = 0 if done else np.max(self.q_table[next_state_key])
        # Move the estimate towards the one-step TD target.
        td_target = reward + self.gamma * max_next_q
        self.q_table[state_key][action] = (
            current_q + self.learning_rate * (td_target - current_q)
        )

    def decay_epsilon(self):
        """Multiply epsilon by ``epsilon_decay``, never below ``epsilon_end``."""
        self.epsilon = max(self.epsilon_end, self.epsilon * self.epsilon_decay)

print("‚úì Classe QLearningAgent d√©finie")

In [None]:
# Train the Q-Learning agent on MiniGrid-Empty-8x8, recording the total
# reward and the number of steps of every episode.
env = gym.make('MiniGrid-Empty-8x8-v0')
agent_qlearning = QLearningAgent(env.action_space.n)

num_episodes = 1000
rewards_qlearning = []
steps_qlearning = []

for episode in tqdm(range(num_episodes), desc="Entra√Ænement Q-Learning"):
    state, _ = env.reset()
    episode_reward = 0
    episode_steps = 0

    # Hard cap on the episode length, on top of the environment's own limit.
    for step in range(500):
        action = agent_qlearning.select_action(state)
        next_state, reward, terminated, truncated, _ = env.step(action)
        done = terminated or truncated

        # Only a genuine terminal state may cut bootstrapping. A truncation
        # (time limit) is not a property of the state itself, so the update
        # still has to use the value of next_state; hence `terminated`, not
        # `done`, is passed here.
        agent_qlearning.update(state, action, reward, next_state, terminated)

        episode_reward += reward
        episode_steps += 1
        state = next_state

        # The episode itself ends on either signal.
        if done:
            break

    # Exploration is reduced once per episode, not once per step.
    agent_qlearning.decay_epsilon()
    rewards_qlearning.append(episode_reward)
    steps_qlearning.append(episode_steps)

env.close()
print("‚úì Entra√Ænement Q-Learning termin√©")

In [None]:
# Plot the Q-Learning training curves: raw per-episode values plus a moving
# average, for rewards (left) and episode lengths (right).

# Clamp the smoothing window to the number of recorded episodes. With a fixed
# window of 100 and fewer than 100 episodes, the 'valid' convolution and the
# x-range below end up with different lengths and plt.plot raises.
window = max(1, min(100, len(rewards_qlearning)))
kernel = np.ones(window) / window
moving_avg = np.convolve(rewards_qlearning, kernel, mode='valid')
moving_avg_steps = np.convolve(steps_qlearning, kernel, mode='valid')

plt.figure(figsize=(12, 4))

# Left panel: reward per episode.
plt.subplot(1, 2, 1)
plt.plot(rewards_qlearning, alpha=0.3, label='R√©compense')
# The first averaged point summarises episodes 0..window-1, so the smoothed
# curve starts at x = window - 1.
plt.plot(range(window-1, len(rewards_qlearning)), moving_avg,
         color='red', label=f'Moyenne mobile ({window})')
plt.xlabel('√âpisode')
plt.ylabel('R√©compense')
plt.title('Q-Learning - R√©compenses')
plt.legend()
plt.grid(True, alpha=0.3)

# Right panel: steps per episode.
plt.subplot(1, 2, 2)
plt.plot(steps_qlearning, alpha=0.3, label='√âtapes')
plt.plot(range(window-1, len(steps_qlearning)), moving_avg_steps,
         color='orange', label=f'Moyenne mobile ({window})')
plt.xlabel('√âpisode')
plt.ylabel('Nombre d\'√©tapes')
plt.title('Q-Learning - √âtapes par √©pisode')
plt.legend()
plt.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Summary over the last 100 episodes (or all of them if fewer were run).
print(f"R√©compense moyenne (derniers 100 √©p.): {np.mean(rewards_qlearning[-100:]):.3f}")
print(f"√âtapes moyennes (derniers 100 √©p.): {np.mean(steps_qlearning[-100:]):.1f}")

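## 🤖 Partie 3: Implémentation DQN

*(Note: L'implémentation complète de DQN est dans 3_dqn_agent.py)*

Pour utiliser DQN dans ce notebook, vous pouvez importer la classe depuis le fichier Python. Comme le nom du module commence par un chiffre, `import 3_dqn_agent` est une erreur de syntaxe : utilisez `importlib.import_module("3_dqn_agent")`.

## 📊 Partie 4: Évaluation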

In [None]:
# Evaluate the trained Q-Learning agent with a purely greedy policy.
env = gym.make('MiniGrid-Empty-8x8-v0')
# Named so that the success rate below stays a real percentage if the number
# of evaluation episodes is changed.
num_eval_episodes = 100
eval_rewards = []
eval_steps = []
successes = 0

for episode in range(num_eval_episodes):
    state, _ = env.reset()
    episode_reward = 0
    episode_steps = 0

    for step in range(500):
        # training=False disables the epsilon-random branch.
        action = agent_qlearning.select_action(state, training=False)
        state, reward, terminated, truncated, _ = env.step(action)

        episode_reward += reward
        episode_steps += 1

        if terminated or truncated:
            # An episode counts as a success when its final step paid a
            # positive reward.
            if reward > 0:
                successes += 1
            break

    eval_rewards.append(episode_reward)
    eval_steps.append(episode_steps)

env.close()

# Convert the raw count into a percentage; ':g' prints whole numbers without
# a trailing '.0', so the output for 100 episodes is unchanged.
success_rate = 100 * successes / num_eval_episodes

print("\n=== R√âSULTATS D'√âVALUATION Q-LEARNING ===")
print(f"R√©compense moyenne: {np.mean(eval_rewards):.3f} ¬± {np.std(eval_rewards):.3f}")
print(f"√âtapes moyennes: {np.mean(eval_steps):.1f} ¬± {np.std(eval_steps):.1f}")
print(f"Taux de succ√®s: {success_rate:g}%")

In [None]:
# Side-by-side histograms of the evaluation metrics, each with a dashed red
# line marking the mean.
plt.figure(figsize=(10, 5))

# One entry per panel: data, x-axis label, title, format of the mean shown in
# the legend, and extra keyword arguments for plt.hist.
histogram_panels = [
    (eval_rewards, 'R√©compense', 'Distribution des r√©compenses',
     '.2f', {}),
    (eval_steps, 'Nombre d\'√©tapes', 'Distribution des √©tapes',
     '.1f', {'color': 'orange'}),
]

for position, (values, x_label, panel_title, mean_format, hist_extra) in enumerate(
        histogram_panels, start=1):
    plt.subplot(1, 2, position)
    plt.hist(values, bins=20, edgecolor='black', alpha=0.7, **hist_extra)
    mean_value = np.mean(values)
    plt.axvline(mean_value, color='red', linestyle='--',
                label=f'Moyenne: {mean_value:{mean_format}}')
    plt.xlabel(x_label)
    plt.ylabel('Fr√©quence')
    plt.title(panel_title)
    plt.legend()
    plt.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## 💾 Sauvegarde des résultats

In [None]:
# Persist the learned Q-table together with the training and evaluation
# metrics in a single pickle file.
# NOTE(review): pickle files can execute code when loaded; only reload this
# file from a trusted location.
with open('qlearning_agent.pkl', 'wb') as output_file:
    saved_state = {
        # Copied into a plain dict: the table's default factory is a lambda,
        # which pickle cannot serialise.
        'q_table': dict(agent_qlearning.q_table),
        'training_rewards': rewards_qlearning,
        'training_steps': steps_qlearning,
        'eval_rewards': eval_rewards,
        'eval_steps': eval_steps,
    }
    pickle.dump(saved_state, output_file)

print("‚úì Agent sauvegard√© dans 'qlearning_agent.pkl'")

## 🎯 Conclusions

### À compléter:

1. **Performance observée:**
   - Taux de succès: ____%
   - Récompense moyenne: ____
   - Convergence après ____ épisodes

2. **Analyse:**
   - Points forts de Q-Learning:
   - Limitations observées:
   - Comparaison avec DQN:

3. **Améliorations possibles:**
   - 
   - 
   - 

## 📚 Prochaines étapes

1. Implémenter DQN (voir `3_dqn_agent.py`)
2. Comparer les deux algorithmes
3. Tester sur des environnements plus complexes
4. Expérimenter avec les hyperparamètres
5. Compléter le rapport final