# Multi-Agent Deep Reinforcement Learning: Cooperative Tennis

This notebook implements a Multi-Agent Deep Deterministic Policy Gradient (MADDPG) system to train two cooperative agents to play tennis in the Unity ML-Agents environment.

## Project Overview

The goal is to train two agents controlling rackets to keep a ball in play for as long as possible. Each agent receives a reward of +0.1 for successfully hitting the ball over the net, and -0.01 for letting it hit the ground or go out of bounds.

<table style="margin: auto;">
  <tr>
      <td>Trained Agents in Action</td>
  </tr>
  <tr>
    <td><img src="animation.gif" align="center" width="500"></td>
  </tr>
</table>

## Key Features

- **Algorithm**: DDPG with distributional learning, noisy layers, and prioritized experience replay
- **Environment**: Unity ML-Agents Tennis (continuous control)
- **State Space**: 24 dimensions per agent (position & velocity of racket and ball)
- **Action Space**: 2 continuous actions per agent (movement toward/away from net, jumping)
- **Success Criterion**: Average score ≥ 0.5 over 100 consecutive episodes

In [None]:
import random
from collections import deque

import matplotlib.pyplot as plt
%matplotlib inline

from mlagents_envs.environment import UnityEnvironment
import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from ddpg_wrapper import Agent
from wrapper import ddpg

# Seed NumPy's and Python's global random number generators with a fixed value.
np.random.seed(0)
random.seed(0)

# Use the first CUDA device when PyTorch reports one; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')

# Allocate a throwaway one-element tensor on the selected device (presumably to
# initialise the device early). The trailing semicolon stops the notebook from
# echoing the tensor as cell output.
torch.empty(1, device=device);

## Environment Setup and Initialisation

First, I'll load the Unity Tennis environment and examine its properties. The environment consists of two agents that must learn to cooperate in order to achieve the longest possible rally.

In [None]:
# Initialize Unity Tennis environment
# Use project-relative path so no absolute user path is exposed
tennis_env = UnityEnvironment(
    file_name="Unity/Tennis.app"  # or any path to your own Tennis build
)

# Reset the environment. The return value is not used here; the initial
# per-agent data is read further down through get_steps().
tennis_env.reset()

# Get behavior name: list every behavior the environment exposes and keep the
# first one as the identifier used by the rest of the notebook.
behavior_names = list(tennis_env.behavior_specs)
brain_id = behavior_names[0]
spec = tennis_env.behavior_specs[brain_id]

# Get environment info
# get_steps() yields a (decision_steps, terminal_steps) pair for the behavior.
decision_steps, terminal_steps = tennis_env.get_steps(brain_id)
# The number of entries in decision_steps is taken as the number of agents.
n_agents = len(decision_steps)
# Size of the continuous action vector, as reported by the behavior spec.
action_dim = spec.action_spec.continuous_size
# First dimension of the first observation spec is used as the state length.
state_dim = spec.observation_specs[0].shape[0]

In [None]:
# Display environment specifications
print(f'Number of cooperative agents: {n_agents}\n')
print(f'Dimension of action space: {action_dim}\n')

# Examine observation space
# `env_state` is not defined until a later cell, so reading it here raises
# NameError on a top-to-bottom run. Use the DecisionSteps / TerminalSteps
# batch returned by get_steps() in the setup cell instead.
# obs is a list with one array per observation spec; index 0 matches the spec
# that state_dim was read from.
current_states = decision_steps.obs[0]
print(f'Each of the {current_states.shape[0]} agents observes a state vector of length: {state_dim}')
print(f'Initial state for agent 0:\n{current_states[0]}\n')

# The mlagents_envs API has no per-agent "done" array: an agent is done when
# its id appears in terminal_steps. Derive one flag per agent from that.
done_flags = np.isin(decision_steps.agent_id, terminal_steps.agent_id)

# Verify environment structure
print('\nEnvironment Structure:')
print(f"  Reward signals: {len(decision_steps.reward)}")
print(f"  Observation tensor shape: {current_states.shape}")
print(f"  Done flags: {len(done_flags)}")

In [None]:
# Test environment with random actions
#
# The original loop used the legacy `unityagents` pattern
# (`reset(train_mode=...)[brain]`, `step(actions)[brain]`), which does not match
# the mlagents_envs UnityEnvironment imported by this notebook: there, reset()
# and step() take no arguments and return None, actions are queued with
# set_actions(), and results are read back with get_steps().
from mlagents_envs.base_env import ActionTuple

num_test_episodes = 5

for episode_idx in range(num_test_episodes):
    tennis_env.reset()
    decision_steps, terminal_steps = tennis_env.get_steps(brain_id)
    # Map agent ids to fixed row indices for this episode. Ids may differ
    # between episodes, so the mapping is rebuilt after every reset.
    agent_rows = {agent_id: row for row, agent_id in enumerate(decision_steps.agent_id)}
    episode_rewards = np.zeros(n_agents)
    timesteps = 0

    while True:
        timesteps += 1
        # Sample actions from a standard normal distribution and clip them to
        # [-1, 1]; one row per agent that is currently waiting for an action.
        random_actions = np.random.randn(len(decision_steps), action_dim)
        random_actions = np.clip(random_actions, -1, 1).astype(np.float32)

        tennis_env.set_actions(brain_id, ActionTuple(continuous=random_actions))
        tennis_env.step()
        decision_steps, terminal_steps = tennis_env.get_steps(brain_id)

        # Rewards for agents that are still playing arrive in decision_steps;
        # rewards for agents whose episode just ended arrive in terminal_steps.
        # Ids that were not present at reset (a new episode) are skipped.
        for steps in (decision_steps, terminal_steps):
            for agent_id, reward in zip(steps.agent_id, steps.reward):
                row = agent_rows.get(agent_id)
                if row is not None:
                    episode_rewards[row] += reward

        # Stop as soon as any agent has finished its episode.
        if len(terminal_steps) > 0:
            break

    print(f'Episode {episode_idx + 1} | Steps: {timesteps} | Max score: {np.max(episode_rewards):.3f}')

In [None]:
# Hyperparameters handed to the agent constructor as keyword arguments
agent_config = dict(
    # Reproducibility
    seed=42,

    # Core learning parameters
    batch_size=1024,
    buffer_size=1_000_000,
    start_since=1024,
    gamma=0.95,
    update_every=1,
    n_updates=1,
    tau=0.2,

    # Network optimization
    actor_lr=0.001,
    critic_lr=0.001,
    clip=None,
    weight_decay=0,

    # Advanced features
    distributional=True,

    # Prioritized Experience Replay
    # NOTE(review): `a` looks like the priority exponent; at 0.0 sampling would
    # be uniform — confirm against the Agent implementation.
    priority_eps=0.001,
    a=0.0,

    # Multi-step returns
    n_multisteps=1,

    # Distributional RL support values
    v_min=-0.1,
    v_max=0.1,
    n_atoms=51,

    # Noisy networks for exploration
    initial_sigma=0.5,
    linear_type='noisy',
    factorized=True,
)

# Settings handed to the training routine as keyword arguments
training_config = dict(
    n_episodes=500,
    continue_after_solved=True,

    # Exploration schedule: all zero (exploration is left to the noisy layers,
    # so additive OU noise is disabled)
    eps_start=0.0,
    eps_min=0.0,
    eps_decay=0.0,

    # Importance-sampling schedule for prioritized replay
    beta_start=0.0,
    beta_end=0.0,
)

In [None]:
# Build the agent from the environment dimensions plus the hyperparameter dict
tennis_agent = Agent(state_dim, action_dim, n_agents, **agent_config)

# Show the agent's local actor and critic networks, each under its own heading
print("Actor Network Architecture:", tennis_agent.actor_local, sep="\n")
print("\nCritic Network Architecture:", tennis_agent.critic_local, sep="\n")

In [None]:
# Train the agent
# Runs the imported `ddpg` routine on the environment and agent, expanding
# training_config into keyword arguments. The result is kept as
# `training_scores`; later cells plot it as one score per episode and store it
# in the checkpoint.
training_scores = ddpg(tennis_env, tennis_agent, **training_config)

In [None]:
# Plot per-episode scores with their trailing 100-episode mean and save the figure
import os
from datetime import datetime

# Make sure the output folder is there before anything is written into it
os.makedirs('results', exist_ok=True)
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

# Single-axes figure with a dark background
fig, ax = plt.subplots(figsize=(12, 5), facecolor='#2E3440')
ax.set_facecolor('#3B4252')
ax.grid(True, alpha=0.3, linestyle='--', color='#4C566A')

# Shared x axis: one tick per recorded episode, starting at 0
episode_axis = np.arange(len(training_scores))

# Raw score of every episode
ax.plot(episode_axis, training_scores,
        label="Episode Score", color='#88C0D0', linewidth=1.5, alpha=0.6)

# Trailing mean: for each episode, average the scores of up to the 100
# episodes ending at (and including) that episode
window_means = []
for window_end in range(1, len(training_scores) + 1):
    window_start = max(0, window_end - 100)
    window_means.append(np.mean(training_scores[window_start:window_end]))
moving_avg = np.array(window_means)
ax.plot(episode_axis, moving_avg,
        label="100-Episode Average", color='#A3BE8C', linewidth=2.5, alpha=0.9)

# Horizontal reference line at the score that counts as solved
ax.axhline(y=0.5, label='Solved Threshold', color='#BF616A', linestyle='--', linewidth=2)

# Labels, title, legend and tick colours
ax.set_title('MADDPG Training Progress', color='#ECEFF4', fontsize=14, fontweight='bold')
ax.set_xlabel('Episode Number', color='#ECEFF4', fontsize=12)
ax.set_ylabel('Max Score per Episode', color='#ECEFF4', fontsize=12)
ax.tick_params(colors='#ECEFF4')
ax.legend(loc='upper left', framealpha=0.9)
plt.tight_layout()

# Write the figure into the results folder under a timestamped file name
plot_path = os.path.join('results', f'training_plot_{timestamp}.png')
plt.savefig(plot_path, dpi=300, bbox_inches='tight', facecolor='#2E3440')
print(f'Training plot saved to {plot_path}')

plt.show()

In [None]:
# Bundle both config dicts, the score history and the local actor/critic
# weights into one dictionary, then write it to a single checkpoint file
checkpoint_payload = {
    'agent_config': agent_config,
    'training_config': training_config,
    'episode_scores': training_scores,
    'actor_weights': tennis_agent.actor_local.state_dict(),
    'critic_weights': tennis_agent.critic_local.state_dict(),
}
torch.save(checkpoint_payload, 'trained_tennis_agent.pth')

In [None]:
# Load previously trained agent
# NOTE(review): torch.load unpickles the file, so only open checkpoints you
# created yourself or otherwise trust.
# NOTE(review): recent PyTorch releases default torch.load to
# weights_only=True, which can reject non-tensor objects stored in this file
# (e.g. the score history) — confirm against the installed version.
checkpoint = torch.load(
    f="trained_tennis_agent.pth",
    map_location=device
)

# Rebuild the agent with the hyperparameters it was saved with.
tennis_agent = Agent(state_dim, action_dim, n_agents, **checkpoint['agent_config'])

# strict=False tolerates differences between the checkpoint and the freshly
# built networks. Keep that tolerance, but report any mismatch instead of
# silently running with partially initialised weights.
for net_label, network, weights_key in (
    ('actor', tennis_agent.actor_local, 'actor_weights'),
    ('critic', tennis_agent.critic_local, 'critic_weights'),
):
    load_result = network.load_state_dict(checkpoint[weights_key], strict=False)
    if load_result.missing_keys or load_result.unexpected_keys:
        print(
            f"Warning: {net_label} checkpoint does not fully match the network | "
            f"missing keys: {list(load_result.missing_keys)} | "
            f"unexpected keys: {list(load_result.unexpected_keys)}"
        )

In [None]:
# Watch the trained agent play
#
# The original loop used the legacy `unityagents` pattern
# (`reset(train_mode=False)[brain]`, `step(actions)[brain]`), which does not
# match the mlagents_envs UnityEnvironment imported by this notebook: there,
# reset() and step() take no arguments and return None.
# NOTE(review): mlagents_envs has no `train_mode` flag; playback speed is
# normally set through an EngineConfigurationChannel passed to
# UnityEnvironment at construction — confirm if real-time playback is needed.
from mlagents_envs.base_env import ActionTuple

tennis_env.reset()
decision_steps, terminal_steps = tennis_env.get_steps(brain_id)
# Map agent ids to fixed row indices for this episode.
agent_rows = {agent_id: row for row, agent_id in enumerate(decision_steps.agent_id)}
# Assumes decision_steps keeps the same agent row order for the whole
# episode — TODO confirm.
current_states = decision_steps.obs[0]
episode_rewards = np.zeros(n_agents)
step_counter = 0

while True:
    # Assumes act() returns an array-like of shape (n_agents, action_dim) — TODO confirm.
    actions = np.asarray(tennis_agent.act(current_states), dtype=np.float32)
    tennis_env.set_actions(brain_id, ActionTuple(continuous=actions))
    tennis_env.step()
    decision_steps, terminal_steps = tennis_env.get_steps(brain_id)

    # Rewards for agents still playing arrive in decision_steps; rewards for
    # agents whose episode just ended arrive in terminal_steps. Ids that were
    # not present at reset (a new episode) are skipped.
    for steps in (decision_steps, terminal_steps):
        for agent_id, reward in zip(steps.agent_id, steps.reward):
            row = agent_rows.get(agent_id)
            if row is not None:
                episode_rewards[row] += reward
    step_counter += 1

    print(f"\rTimestep {step_counter} | Current max score: {np.max(episode_rewards):.3f}", end='')

    # Stop as soon as any agent has finished its episode.
    if len(terminal_steps) > 0:
        break
    current_states = decision_steps.obs[0]

print(f"\n\nFinal episode score: {np.max(episode_rewards):.3f}")

In [None]:
# Comprehensive evaluation over multiple episodes
#
# The original loop used the legacy `unityagents` pattern
# (`reset(train_mode=True)[brain]`, `step(actions)[brain]`), which does not
# match the mlagents_envs UnityEnvironment imported by this notebook: there,
# reset() and step() take no arguments and return None, actions are queued with
# set_actions(), and results are read back with get_steps().
from mlagents_envs.base_env import ActionTuple

n_eval_episodes = 150  # Extended evaluation for statistical significance

episode_scores = []
status_format = "\rEpisode {}/{} | Current: {:.3f} | Previous: {:.3f} | Mean: {:.3f}"

for ep_num in range(n_eval_episodes):
    tennis_env.reset()
    decision_steps, terminal_steps = tennis_env.get_steps(brain_id)
    # Map agent ids to fixed row indices for this episode. Ids may differ
    # between episodes, so the mapping is rebuilt after every reset.
    agent_rows = {agent_id: row for row, agent_id in enumerate(decision_steps.agent_id)}
    # Assumes decision_steps keeps the same agent row order for the whole
    # episode — TODO confirm.
    current_states = decision_steps.obs[0]
    ep_rewards = np.zeros(n_agents)

    # These only change when an episode finishes, so compute them once per
    # episode rather than on every timestep.
    prev_score = episode_scores[-1] if episode_scores else 0
    mean_score = np.mean(episode_scores) if episode_scores else 0

    while True:
        # Assumes act() returns an array-like of shape (n_agents, action_dim) — TODO confirm.
        actions = np.asarray(tennis_agent.act(current_states), dtype=np.float32)
        tennis_env.set_actions(brain_id, ActionTuple(continuous=actions))
        tennis_env.step()
        decision_steps, terminal_steps = tennis_env.get_steps(brain_id)

        # Rewards for agents still playing arrive in decision_steps; rewards
        # for agents whose episode just ended arrive in terminal_steps. Ids
        # that were not present at reset (a new episode) are skipped.
        for steps in (decision_steps, terminal_steps):
            for agent_id, reward in zip(steps.agent_id, steps.reward):
                row = agent_rows.get(agent_id)
                if row is not None:
                    ep_rewards[row] += reward

        print(status_format.format(
            ep_num + 1, n_eval_episodes,
            np.max(ep_rewards), prev_score, mean_score
        ), end='')

        # Stop as soon as any agent has finished its episode.
        if len(terminal_steps) > 0:
            break
        current_states = decision_steps.obs[0]

    # The episode score is the larger of the two agents' totals.
    episode_scores.append(np.max(ep_rewards))

    # Start a fresh status line every ten episodes so progress stays visible.
    if (ep_num + 1) % 10 == 0:
        print()

print("\n\nEvaluation complete!")
print(f"Average score over {n_eval_episodes} episodes: {np.mean(episode_scores):.3f}")
print(f"Standard deviation: {np.std(episode_scores):.3f}")

## Clean Up

Close the Unity environment to free system resources.

In [None]:
# Shut down the Unity environment created in the setup cell. Later cells that
# use `tennis_env` will need the environment to be created again first.
tennis_env.close()