# Multi-Agent Independent Q-Learning (IQL) Demo

This notebook demonstrates how to use the Independent Q-Learning (IQL) implementation with multiple agents in a simple gridworld environment.

In [None]:
import sys
import os
import torch
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import clear_output
import time

# Add the src directory to the path
sys.path.append('../')
from src import IQLAgent, MultiAgentGridworldEnv

## Environment Setup

First, let's configure and initialize our multi-agent gridworld environment.

In [None]:
# Settings for a small two-agent gridworld. Each agent's goal cell is the
# other agent's start cell, and a single obstacle sits at (2, 2).
env_config = dict(
    grid_size=(5, 5),                          # 5x5 grid
    num_agents=2,                              # two agents, ids 0 and 1
    agent_start_pos={0: (0, 0), 1: (4, 4)},    # agent 0 starts at (0, 0), agent 1 at (4, 4)
    agent_goal_pos={0: (4, 4), 1: (0, 0)},     # goals are the opposite corners
    obstacles_pos=[(2, 2)],                    # one obstacle in the middle cell
    max_steps=50,                              # step limit per episode
    observation_type='coords',                 # agents observe their coordinates
    reward_type='individual',                  # each agent gets its own reward
    slip_prob=0.0,                             # slip probability of zero
)

# Build the environment from the settings above
env = MultiAgentGridworldEnv(**env_config)

# Report what the environment exposes, one "label: value" line per attribute
print("\n".join([
    f"Grid size: {env.grid_height}x{env.grid_width}",
    f"Number of agents: {env.num_agents}",
    f"Agent start positions: {env.agent_start_pos}",
    f"Agent goal positions: {env.agent_goal_pos}",
    f"Obstacles: {env.obstacles_pos}",
    f"Action space size: {env.action_space_size}",
]))

# Show the grid as it looks right after a reset
print("\nInitial environment state:")
env.reset()
env.render()

## Agent Setup

Now, let's create our Independent Q-Learning agents.

In [None]:
# Choose the observation size matching the environment's observation type.
# Each candidate is a callable so only the matching entry reads from `env`;
# an unmatched type leaves the lookup at None and is rejected below.
obs_dim = next(
    (
        resolve()
        for kind, resolve in (
            ('coords', lambda: 2),
            ('local_grid_3x3', lambda: 9),
            ('full_state', lambda: env.grid_width * env.grid_height),
        )
        if env.observation_type == kind
    ),
    None,
)
if obs_dim is None:
    raise ValueError("Unsupported observation type")

action_dim = env.action_space_size

# Hyperparameters. All but BATCH_SIZE are handed positionally to IQLAgent
# below; BATCH_SIZE is consumed by the training cell.
BUFFER_CAPACITY = 10000
LEARNING_RATE = 1e-4
GAMMA = 0.99
EPSILON_START = 1.0
EPSILON_END = 0.05
EPSILON_DECAY = 0.99
TARGET_UPDATE_FREQ = 50
BATCH_SIZE = 64

# One independent learner per environment agent id
agents = {}
for agent_id in env.agent_ids:
    agents[agent_id] = IQLAgent(
        agent_id,
        obs_dim,
        action_dim,
        BUFFER_CAPACITY,
        LEARNING_RATE,
        GAMMA,
        EPSILON_START,
        EPSILON_END,
        EPSILON_DECAY,
        TARGET_UPDATE_FREQ,
    )

# Summarise each agent's dimensions and compute device
for agent_id, agent in agents.items():
    print("\n".join([
        f"Agent {agent_id}:",
        f"  Observation dimension: {agent.observation_dim}",
        f"  Action dimension: {agent.action_dim}",
        f"  Device: {agent.device}",
    ]))

## Training Loop

Let's train our agents using Independent Q-Learning.

In [None]:
def plot_rewards(rewards_history, avg_rewards_history):
    """Plot per-episode rewards together with their running average.

    Args:
        rewards_history: Sequence of per-episode reward totals; drawn as a
            faint line.
        avg_rewards_history: Sequence of running-average rewards, one entry
            per episode; drawn as a solid line on the same axes.
    """
    fig, ax = plt.subplots(figsize=(10, 5))

    # Raw rewards are noisy, so they are drawn semi-transparent behind the average
    ax.plot(rewards_history, alpha=0.3, label='Episode Rewards')
    ax.plot(avg_rewards_history, label='Avg Rewards (50 episodes)')

    ax.set_xlabel('Episode')
    ax.set_ylabel('Reward')
    ax.set_title('IQL Training Performance')
    ax.legend()
    ax.grid(True)

    plt.show()

In [None]:
# Training schedule: number of episodes, and how often (in environment steps
# or episodes) to learn, log, and render.
NUM_EPISODES = 500
LEARN_EVERY_N_STEPS = 4
LOG_FREQ = 50
RENDER_FREQ = 100

episode_rewards_history = []
avg_rewards_history = []
total_steps = 0

print(f"Starting IQL Training for {NUM_EPISODES} episodes...")

for episode in range(NUM_EPISODES):
    observations = env.reset()
    episode_rewards = dict.fromkeys(env.agent_ids, 0)
    # Only every RENDER_FREQ-th episode is visualised, and never episode 0
    show_episode = episode > 0 and episode % RENDER_FREQ == 0
    done = False

    while not done:
        total_steps += 1

        # Every agent chooses an action from its own observation
        joint_action = {}
        for agent_id, agent in agents.items():
            joint_action[agent_id] = agent.select_action(observations[agent_id])

        # Advance the environment with the combined action
        next_observations, rewards, dones, info = env.step(joint_action)

        # Each agent records only its own transition in its own buffer
        for agent_id, agent in agents.items():
            agent.replay_buffer.add(
                observations[agent_id],
                joint_action[agent_id],
                rewards[agent_id],
                next_observations[agent_id],
                dones[agent_id],
            )

        observations = next_observations

        # Learn on a fixed step cadence, once a full batch can be drawn
        if total_steps % LEARN_EVERY_N_STEPS == 0:
            for agent in agents.values():
                if len(agent.replay_buffer) >= BATCH_SIZE:
                    agent.learn(BATCH_SIZE)

        # Accumulate this step's reward per agent
        for agent_id in env.agent_ids:
            episode_rewards[agent_id] += rewards[agent_id]

        # The '__all__' entry decides when the episode ends
        done = dones['__all__']

        # Visualise non-terminal steps of the selected episodes
        if show_episode and not done:
            clear_output(wait=True)
            print(f"Episode {episode}/{NUM_EPISODES}")
            env.render()
            time.sleep(0.1)  # brief pause so each frame is visible

    # Episode finished: epsilon is decayed once per episode for every agent
    for agent in agents.values():
        agent.decay_epsilon()

    # Track the summed reward of all agents for this episode
    total_episode_reward = sum(episode_rewards.values())
    episode_rewards_history.append(total_episode_reward)

    # Mean over the most recent 50 episodes; the slice simply covers the
    # whole history while fewer than 50 episodes have been played
    avg_reward = np.mean(episode_rewards_history[-50:])
    avg_rewards_history.append(avg_reward)

    # Periodic progress report with a refreshed reward plot
    if (episode + 1) % LOG_FREQ == 0:
        clear_output(wait=True)
        print(f"Episode {episode + 1}/{NUM_EPISODES} | "
              f"Avg Reward (Last 50): {avg_reward:.2f} | "
              f"Epsilon: {agents[0].epsilon:.3f}")
        plot_rewards(episode_rewards_history, avg_rewards_history)

print("Training finished.")

# Final reward plot over the whole run
plot_rewards(episode_rewards_history, avg_rewards_history)

## Evaluation

Now let's evaluate the trained agents by visualizing their behavior.

In [None]:
def evaluate_agents(env, agents, num_episodes=5, render=True):
    """Run evaluation episodes with exploration disabled and report rewards.

    Each agent's ``epsilon`` is set to 0.0 only for the duration of its
    ``select_action`` call and is restored afterwards, even if the call
    raises, so evaluation never leaves an agent with a modified epsilon.
    (This yields greedy actions assuming ``select_action`` honours
    ``agent.epsilon``.)

    Args:
        env: Environment exposing ``reset()``, ``step(joint_action)``,
            ``render()`` and ``agent_ids``. ``step`` must return
            ``(observations, rewards, dones, info)`` where ``dones`` contains
            an ``'__all__'`` entry that ends the episode.
        agents: Mapping from agent id to an agent exposing ``epsilon`` and
            ``select_action(observation)``.
        num_episodes: Number of evaluation episodes to run.
        render: If true, render the environment after every step, pausing
            0.5 seconds per frame.

    Returns:
        The mean, over episodes, of the reward summed across all agents.
        NaN when ``num_episodes`` is 0.
    """
    total_rewards = []

    for episode in range(num_episodes):
        observations = env.reset()
        episode_rewards = {i: 0 for i in env.agent_ids}
        done = False
        step = 0

        if render:
            print(f"\nEvaluation Episode {episode + 1}/{num_episodes}, Step {step}")
            env.render()
            time.sleep(0.5)

        while not done:
            step += 1
            joint_action = {}
            for agent_id, agent in agents.items():
                original_epsilon = agent.epsilon
                agent.epsilon = 0.0  # disable exploration for this action only
                try:
                    joint_action[agent_id] = agent.select_action(observations[agent_id])
                finally:
                    # Restore even on error so a failed evaluation cannot
                    # leave the agent without exploration.
                    agent.epsilon = original_epsilon

            next_observations, rewards, dones, info = env.step(joint_action)

            # Accumulate this step's reward per agent
            for agent_id in env.agent_ids:
                episode_rewards[agent_id] += rewards[agent_id]

            observations = next_observations

            if render:
                clear_output(wait=True)
                print(f"Evaluation Episode {episode + 1}/{num_episodes}, Step {step}")
                print(f"Actions: {joint_action}")
                env.render()
                time.sleep(0.5)  # slow down for visualization

            # The '__all__' entry decides when the episode ends
            done = dones['__all__']

        # Episode finished: record the reward summed over all agents
        total_reward = sum(episode_rewards.values())
        total_rewards.append(total_reward)

        if render:
            print(f"Episode Rewards: {episode_rewards}")
            print(f"Total Reward: {total_reward}")

    # np.mean([]) would emit a RuntimeWarning; return NaN directly instead
    avg_reward = np.mean(total_rewards) if total_rewards else float("nan")
    print("\nEvaluation Results:")
    print(f"Average Total Reward: {avg_reward:.2f}")
    return avg_reward

In [None]:
# Watch the trained agents for three rendered episodes and keep the mean reward
avg_reward = evaluate_agents(
    env=env,
    agents=agents,
    num_episodes=3,
    render=True,
)

## Conclusion

In this notebook, we have seen how to set up, train, and evaluate Independent Q-Learning (IQL) agents in a multi-agent setting. The key insights are:

1. Each agent has its own Q-network and learns independently without explicitly considering other agents
2. Agents interact through the environment, indirectly affecting each other's state and rewards
3. Over time, agents learn to navigate the environment and reach their goals while avoiding obstacles

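IQL provides a simple yet effective baseline for multi-agent reinforcement learning. However, it has limitations in tasks that require tight coordination, because each agent treats the other agents as part of the environment rather than explicitly modeling their behavior. Since those other agents keep learning, the environment appears non-stationary from each agent's point of view.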