In [1]:
!pip install pettingzoo
!pip install pygame
!pip install torch



In [10]:
# Step 1: Import required libraries and create the simple tag environment
import torch
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable
import torch.nn.functional as F
import numpy as np
from pettingzoo.mpe import simple_tag_v2
from collections import deque
import random

# Build the simple-tag scenario through the parallel API with this notebook's
# fixed settings (three obstacles, 50 cycles per episode, on-screen rendering).
env = simple_tag_v2.parallel_env(
    num_obstacles=3,
    max_cycles=50,
    render_mode="human",
)
# Reset once up front; later cells read `env.agents` and the per-agent spaces.
obs = env.reset()

In [11]:
class Actor(nn.Module):
    """Policy network: two ReLU hidden layers followed by a sigmoid output layer.

    Args:
        state_dim: Size of the observation vector fed to the network.
        action_dim: Number of output units (one score per action).
        hidden_dim: Width of both hidden layers.
    """

    def __init__(self, state_dim, action_dim, hidden_dim):
        super().__init__()
        # Attribute names double as state_dict keys, so they must stay fc1..fc3.
        self.fc1 = nn.Linear(state_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        self.fc3 = nn.Linear(hidden_dim, action_dim)

    def forward(self, x):
        """Return per-action scores squashed into (0, 1) for observation(s) ``x``."""
        hidden = x
        for layer in (self.fc1, self.fc2):
            hidden = F.relu(layer(hidden))
        return torch.sigmoid(self.fc3(hidden))


class Critic(nn.Module):
    """Q-network scoring a (state, action) pair with a single scalar.

    Args:
        state_dim: Width of the state tensor passed to ``forward``.
        action_dim: Width of the action tensor passed to ``forward``.
        hidden_dim: Width of both hidden layers.
    """

    def __init__(self, state_dim, action_dim, hidden_dim):
        super().__init__()
        joint_dim = state_dim + action_dim
        # Attribute names double as state_dict keys, so they must stay fc1..fc3.
        self.fc1 = nn.Linear(joint_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        self.fc3 = nn.Linear(hidden_dim, 1)

    def forward(self, state, action):
        """Return a ``(batch, 1)`` value for batched ``state`` and ``action``.

        Both inputs are concatenated along dim 1, so they must be 2-D with the
        same batch size.
        """
        features = torch.cat((state, action), dim=1)
        for layer in (self.fc1, self.fc2):
            features = F.relu(layer(features))
        return self.fc3(features)

In [19]:
class MADDPGAgent:
    """One MADDPG learner: its own actor plus a critic over joint inputs.

    Each agent owns an actor, a critic, a target copy of each, and one Adam
    optimizer per online network.

    Args:
        state_dim: Size of this agent's own observation (actor input).
        action_dim: Number of actor outputs.
        hidden_dim: Hidden-layer width shared by actor and critic.
        lr_actor: Learning rate of the actor optimizer.
        lr_critic: Learning rate of the critic optimizer.
        agent: Identifier stored as ``self.name`` (used as a dict key by the
            training and evaluation loops).
        device: Torch device on which all four networks are placed.
        critic_state_dim: Width of the state tensor given to the critic. The
            training loop feeds it the concatenation of every agent's
            observation. Defaults to 70, the value that used to be hard-coded.
        critic_action_dim: Width of the action tensor given to the critic (the
            concatenation of every agent's action vector). Defaults to 20, the
            value that used to be hard-coded.
    """

    def __init__(self, state_dim, action_dim, hidden_dim, lr_actor, lr_critic, agent, device,
                 critic_state_dim=70, critic_action_dim=20):
        self.actor = Actor(state_dim, action_dim, hidden_dim).to(device)
        self.target_actor = Actor(state_dim, action_dim, hidden_dim).to(device)
        self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=lr_actor)

        # Critic sizes are parameters instead of literals so the class can be
        # reused when the number of agents or the observation size changes.
        self.critic = Critic(critic_state_dim, critic_action_dim, hidden_dim).to(device)
        self.target_critic = Critic(critic_state_dim, critic_action_dim, hidden_dim).to(device)
        self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=lr_critic)

        self.name = agent
        self.device = device

        # Start the targets as exact copies of the online networks.
        self.update_target_networks()

    def update_target_networks(self):
        """Hard-copy the online actor/critic weights into their target networks."""
        self.target_actor.load_state_dict(self.actor.state_dict())
        self.target_critic.load_state_dict(self.critic.state_dict())

    def act(self, state, noise=None):
        """Return the actor's output for ``state`` as a NumPy array.

        Args:
            state: NumPy array holding this agent's observation.
            noise: Optional object exposing ``noise()``; when given, its sample
                is added in place to the action (keeping the action's dtype).

        Returns:
            NumPy array of raw actor outputs, optionally perturbed by noise.
        """
        obs = torch.from_numpy(state).float().to(self.device)
        # Acting never needs gradients; skipping the graph avoids wasted memory.
        with torch.no_grad():
            action = self.actor(obs).cpu().numpy()
        # Explicit None check: a noise object must not be skipped just because
        # it happens to evaluate as falsy.
        if noise is not None:
            action += noise.noise()
        return action

def soft_update(target, source, tau):
    """Blend ``source`` parameters into ``target`` in place (Polyak averaging).

    Every target parameter becomes ``(1 - tau) * target + tau * source``.

    Args:
        target: Module whose parameters are overwritten.
        source: Module providing the new values; its parameters are paired with
            ``target``'s in ``parameters()`` order.
        tau: Interpolation factor; 0 leaves ``target`` untouched, 1 copies
            ``source``.
    """
    # no_grad replaces the legacy ``.data`` access: the in-place write stays out
    # of the autograd graph but is still visible to autograd's version tracking.
    with torch.no_grad():
        for target_param, source_param in zip(target.parameters(), source.parameters()):
            target_param.copy_(target_param * (1.0 - tau) + source_param * tau)

In [13]:
def _collate_experiences(experiences, agent_names, device):
    """Convert sampled transitions into per-agent batch tensors on ``device``.

    Args:
        experiences: Sequence of ``(states, actions, rewards, next_states, dones)``
            tuples, each element a dict keyed by agent name.
        agent_names: Names of the agents to extract.
        device: Torch device that receives every tensor.

    Returns:
        Tuple of five dicts keyed by agent name: states, actions, rewards,
        next_states, dones. States/actions/next_states are stacked along a new
        leading batch dimension; rewards and dones are ``(batch, 1)`` float
        tensors so they broadcast against the critic output.
    """
    states, actions, rewards, next_states, dones = zip(*experiences)

    def stack_arrays(records, name):
        return torch.stack([torch.from_numpy(rec[name]).float() for rec in records]).to(device)

    def stack_scalars(records, name):
        # Created directly on `device`; leaving these on the CPU breaks the
        # TD-target arithmetic when the networks live on a GPU.
        values = [float(rec[name]) for rec in records]
        return torch.tensor(values, dtype=torch.float32, device=device).unsqueeze(1)

    return (
        {name: stack_arrays(states, name) for name in agent_names},
        {name: stack_arrays(actions, name) for name in agent_names},
        {name: stack_scalars(rewards, name) for name in agent_names},
        {name: stack_arrays(next_states, name) for name in agent_names},
        {name: stack_scalars(dones, name) for name in agent_names},
    )


def _maddpg_update(agents, batch, gamma, tau):
    """Run one critic, actor and target-network update for every agent.

    ``batch`` is the tuple produced by ``_collate_experiences``. Joint critic
    inputs are concatenated in the order of ``agents``.
    """
    obs, acts, rews, next_obs, dones = batch
    joint_states = torch.cat([obs[a.name] for a in agents], dim=1)
    joint_next_states = torch.cat([next_obs[a.name] for a in agents], dim=1)
    joint_actions = torch.cat([acts[a.name] for a in agents], dim=1)

    for agent in agents:
        # Critic update. The TD target is a constant, so it is built without a
        # graph. Target actors are re-evaluated for each agent because earlier
        # iterations of this loop have already soft-updated some of them.
        with torch.no_grad():
            next_actions = torch.cat([a.target_actor(next_obs[a.name]) for a in agents], dim=1)
            target_q_values = agent.target_critic(joint_next_states, next_actions)
            expected_q_values = rews[agent.name] + (1 - dones[agent.name]) * gamma * target_q_values
        q_values = agent.critic(joint_states, joint_actions)
        critic_loss = F.mse_loss(q_values, expected_q_values)
        agent.critic_optimizer.zero_grad()
        critic_loss.backward()
        agent.critic_optimizer.step()

        # Actor update: maximise this agent's critic at the actions proposed by
        # the current policies. The backward pass also leaves gradients on the
        # critic and on the other actors; each optimizer zeroes its own
        # gradients before its next step, so those are never applied.
        policy_actions = torch.cat([a.actor(obs[a.name]) for a in agents], dim=1)
        actor_loss = -agent.critic(joint_states, policy_actions).mean()
        agent.actor_optimizer.zero_grad()
        actor_loss.backward()
        agent.actor_optimizer.step()

        # Let the target networks trail the freshly updated online networks.
        soft_update(agent.target_actor, agent.actor, tau)
        soft_update(agent.target_critic, agent.critic, tau)


def train_maddpg(agents, env, episodes, noise, device, batch_size=128, gamma=0.99, tau=0.05):
    """Train ``agents`` on ``env`` and return the summed reward of each episode.

    Args:
        agents: Agent objects exposing ``name``, ``act``, actor/critic networks,
            their target copies and optimizers.
        env: Parallel environment whose ``reset()`` returns a dict of
            observations and whose ``step()`` returns
            ``(observations, rewards, terminations, truncations, infos)``.
        episodes: Number of episodes to run.
        noise: Exploration-noise object forwarded to every ``agent.act`` call
            (may be ``None``).
        device: Torch device for the training batches.
        batch_size: Transitions sampled per update; updates start once the
            replay memory holds at least this many.
        gamma: Discount factor of the TD target.
        tau: Soft-update factor for the target networks.

    Returns:
        List with one entry per episode: the reward summed over all agents and
        steps.
    """
    memory = deque(maxlen=100000)
    rewards_list = []
    agent_names = [agent.name for agent in agents]

    for episode in range(episodes):
        states = env.reset()
        episode_rewards = 0

        while True:
            # Actors emit one score per discrete action; the environment gets
            # the argmax while the replay memory keeps the raw score vector.
            actions = {agent.name: agent.act(states[agent.name], noise) for agent in agents}
            action_vals = {name: np.argmax(scores) for name, scores in actions.items()}
            next_states, rewards, terminations, truncations, _ = env.step(action_vals)

            # An agent is done when it terminated OR was truncated; previously
            # the termination flags were thrown away.
            dones = {
                name: bool(terminations.get(name, False)) or bool(truncations[name])
                for name in truncations
            }

            memory.append((states, actions, rewards, next_states, dones))
            episode_rewards += np.sum(list(rewards.values()))

            if all(dones.values()):
                break

            if len(memory) >= batch_size:
                experiences = random.sample(memory, batch_size)
                batch = _collate_experiences(experiences, agent_names, device)
                _maddpg_update(agents, batch, gamma, tau)

            states = next_states

        rewards_list.append(episode_rewards)
        print(f"Episode {episode + 1}/{episodes}, Reward: {episode_rewards}")

    return rewards_list


In [14]:
def test_maddpg(agents, env, episodes):
    for episode in range(episodes):
        states = env.reset()
        episode_rewards = 0

        while True:
            actions = {agent.name : np.argmax(agent.act(states[agent.name])) for agent in agents}
            next_states, rewards, _, dones, _ = env.step(actions)
            env.render()
            episode_rewards += np.sum(list(rewards.values()))

            if all(value == True for value in dones.values()):
                break

            states = next_states

        print(f"Test Episode {episode + 1}/{episodes}, Reward: {episode_rewards}")


In [15]:
class OUNoise:
    """Ornstein-Uhlenbeck style exploration noise with a persistent state.

    Args:
        action_dim: Length of the noise vector.
        mu: Value the state starts from and is pulled back towards.
        theta: Strength of the pull towards ``mu``.
        sigma: Scale of the Gaussian perturbation added each step.
    """

    def __init__(self, action_dim, mu=0.0, theta=0.15, sigma=0.2):
        self.action_dim = action_dim
        self.mu = mu
        self.theta = theta
        self.sigma = sigma
        # reset() creates `self.state`, so no separate initialisation is needed.
        self.reset()

    def reset(self):
        """Put the internal state back to ``mu`` in every dimension."""
        self.state = self.mu * np.ones(self.action_dim)

    def noise(self):
        """Advance the process by one step and return the new state array."""
        current = self.state
        pull = self.theta * (self.mu - current)
        kick = self.sigma * np.random.randn(len(current))
        self.state = current + (pull + kick)
        return self.state

In [11]:
# Evaluation driver: rebuild one agent per environment agent, restore the saved
# weights and roll the policies out with rendering.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_agents = len(env.agents)
state_dims = {agent : env.observation_space(agent).shape[0] for agent in env.agents}
action_dim = env.action_space('agent_0').n

print(state_dims, action_dim, '\n',type(env.action_space('adversary_0')))

hidden_dim = 128
lr_actor = 1e-4
# Was 1e3 (missing minus sign): a step size of 1000 would blow up the critic as
# soon as these optimizers are used for training.
lr_critic = 1e-3
episodes = 5000
test_episodes = 100

# Create the agents
agents = [MADDPGAgent(state_dims[agent], action_dim, hidden_dim, lr_actor, lr_critic, agent, device) for agent in env.agents]

for agent in agents:
    print(agent.name)
    # map_location lets checkpoints saved on a GPU load on a CPU-only machine.
    # NOTE(review): torch.load unpickles the file; only load trusted checkpoints.
    agent.actor.load_state_dict(torch.load(f"simple_tag_models/{agent.name}_actor.pth", map_location=device))
    agent.critic.load_state_dict(torch.load(f"simple_tag_models/{agent.name}_critic.pth", map_location=device))
    # Keep the target networks consistent with the restored weights.
    agent.update_target_networks()

# Test the trained agents
test_maddpg(agents, env, test_episodes)

{'adversary_0': 12, 'adversary_1': 12, 'adversary_2': 12, 'agent_0': 10} 5 
 <class 'gymnasium.spaces.discrete.Discrete'>
adversary_0
adversary_1
adversary_2
agent_0
Test Episode 1/100, Reward: -4873.094272649464
Test Episode 2/100, Reward: -4924.55821315223
Test Episode 3/100, Reward: -5171.792812069902
Test Episode 4/100, Reward: -4854.449733963409


KeyboardInterrupt: 