In [48]:
# Step 1: Import required libraries and create the simple tag environment
import os
import random
from collections import deque

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from pettingzoo.mpe import simple_tag_v2
from torch.autograd import Variable
from torch.distributions import Categorical

# Build the simple-tag environment through its parallel API: no obstacles,
# episodes capped at 50 cycles, rendered in a window ("human" mode).
env = simple_tag_v2.parallel_env(
    num_obstacles=0,
    max_cycles=50,
    render_mode="human",
)
# Reset once up front; later cells read `env.agents` and the spaces.
obs = env.reset()

In [50]:
class Actor(nn.Module):
    """Policy network mapping an observation vector to action probabilities.

    Architecture: ``input_dim -> 128 (ReLU) -> 64 (tanh) -> output_dim (softmax)``.

    Args:
        input_dim: Size of the observation vector.
        output_dim: Number of discrete actions.
    """

    def __init__(self, input_dim, output_dim):
        super().__init__()
        # Attribute names double as state_dict keys, so they stay fc1/fc2/fc3.
        self.fc1 = nn.Linear(input_dim, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, output_dim)

    def forward(self, x):
        """Return a probability distribution over actions along the last axis."""
        hidden = F.relu(self.fc1(x))
        hidden = torch.tanh(self.fc2(hidden))
        logits = self.fc3(hidden)
        return F.softmax(logits, dim=-1)

class Critic(nn.Module):
    """Value network mapping an input vector to a single scalar estimate.

    Architecture: ``input_dim -> 128 (ReLU) -> 64 (tanh) -> 1`` with no
    activation on the output.

    Args:
        input_dim: Size of the input vector fed to the critic.
    """

    def __init__(self, input_dim):
        super().__init__()
        # Attribute names double as state_dict keys, so they stay fc1/fc2/fc3.
        self.fc1 = nn.Linear(input_dim, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, 1)

    def forward(self, x):
        """Return the value estimate; the last axis of the result has size 1."""
        features = torch.tanh(self.fc2(F.relu(self.fc1(x))))
        return self.fc3(features)

class MAPPOAgent:
    """One agent of the multi-agent setup: a private actor plus its own critic.

    Args:
        state_dim: Size of this agent's own observation vector (actor input).
        action_dim: Number of discrete actions (actor output size).
        lr_actor: Adam learning rate for the actor.
        lr_critic: Adam learning rate for the critic.
        agent: Name of the agent in the environment; stored as ``self.name``.
        eps_clip: Clipping range used by the surrogate objective.
        K_epochs: Number of optimisation passes per sampled batch.
        device: Torch device the networks live on.
        critic_input_dim: Input size of the critic. Defaults to 46, the value
            that was previously hard-coded (3 observations of size 12 plus one
            of size 10, i.e. the concatenation of all agents' observations in
            the default setup). Pass the summed observation sizes when the
            number or kind of agents changes.
    """

    def __init__(self, state_dim, action_dim, lr_actor, lr_critic, agent, eps_clip, K_epochs, device,
                 critic_input_dim=46):
        self.actor = Actor(state_dim, action_dim).to(device)
        self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=lr_actor)

        self.critic = Critic(critic_input_dim).to(device)
        self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=lr_critic)

        self.eps_clip = eps_clip
        self.K_epochs = K_epochs
        self.name = agent
        self.device = device

    def act(self, state, noise=None):
        """Return the actor's action probabilities for one observation.

        Args:
            state: NumPy observation for this agent.
            noise: Optional object exposing ``noise()``; its output is added
                in place to the probabilities. ``None`` disables it.

        Returns:
            NumPy array of (optionally perturbed) action scores. With noise
            added the values no longer form a probability distribution.
        """
        state = torch.from_numpy(state).float().to(self.device)
        # Acting never needs gradients; no_grad avoids building a graph and
        # replaces the deprecated `.data` access.
        with torch.no_grad():
            action = self.actor(state).cpu().numpy()
        # Explicit None check: do not depend on the noise object's truthiness.
        if noise is not None:
            action += noise.noise()
        return action


In [45]:
def _collate_batch(experiences, agent_names, device):
    """Turn sampled transitions into per-agent tensors plus joint critic inputs.

    Every tensor is created directly on ``device`` so that rewards, done flags
    and actions can be combined with network outputs without device mismatch.
    """
    states, actions, rewards, next_states, dones = zip(*experiences)

    def per_agent(field, name, dtype):
        # `field` is a tuple of {agent_name: value} dicts, one per transition.
        return torch.as_tensor(np.array([step[name] for step in field]), dtype=dtype, device=device)

    batch = {
        "obs": {name: per_agent(states, name, torch.float32) for name in agent_names},
        "actions": {name: per_agent(actions, name, torch.long) for name in agent_names},
        "rewards": {name: per_agent(rewards, name, torch.float32) for name in agent_names},
        "dones": {name: per_agent(dones, name, torch.float32) for name in agent_names},
    }
    next_obs = {name: per_agent(next_states, name, torch.float32) for name in agent_names}

    # The critic sees every agent's observation concatenated in a fixed order.
    batch["joint_obs"] = torch.cat([batch["obs"][name] for name in agent_names], dim=1)
    batch["joint_next_obs"] = torch.cat([next_obs[name] for name in agent_names], dim=1)
    return batch


def _ppo_update(agent, batch, gamma):
    """Run ``agent.K_epochs`` critic and clipped-surrogate actor updates on one batch."""
    obs = batch["obs"][agent.name]
    taken = batch["actions"][agent.name].unsqueeze(1)  # (B, 1) index for gather
    reward = batch["rewards"][agent.name]               # (B,)
    not_done = 1.0 - batch["dones"][agent.name]         # (B,)

    # Probabilities of the taken actions under the policy *before* this round of
    # updates. Computed once so the ratio is meaningful for K_epochs > 1.
    # The floor guards the division below against a zero probability.
    with torch.no_grad():
        old_probs = agent.actor(obs).gather(1, taken).squeeze(1).clamp_min(1e-8)

    for _ in range(agent.K_epochs):
        # Critic outputs are (B, 1); flatten to (B,) so the TD error stays (B,)
        # instead of silently broadcasting to a (B, B, 1) tensor.
        state_values = agent.critic(batch["joint_obs"]).squeeze(-1)
        with torch.no_grad():
            next_values = agent.critic(batch["joint_next_obs"]).squeeze(-1)
        advantages = reward + not_done * gamma * next_values - state_values

        # Critic regresses onto the one-step TD target.
        critic_loss = advantages.pow(2).mean()
        agent.critic_optimizer.zero_grad()
        critic_loss.backward()
        agent.critic_optimizer.step()

        # gather picks the taken action's probability for any action count.
        new_probs = agent.actor(obs).gather(1, taken).squeeze(1)

        # Clipped surrogate objective: plain probability ratio pi_new / pi_old.
        ratio = new_probs / old_probs
        fixed_advantages = advantages.detach()
        surrogate1 = ratio * fixed_advantages
        surrogate2 = torch.clamp(ratio, 1 - agent.eps_clip, 1 + agent.eps_clip) * fixed_advantages
        actor_loss = -torch.min(surrogate1, surrogate2).mean()

        agent.actor_optimizer.zero_grad()
        actor_loss.backward()
        agent.actor_optimizer.step()


def train_mappo(agents, env, episodes, noise, device, batch_size=128, gamma=0.99, tau=0.05):
    """Train every agent with a clipped-surrogate actor and a joint-observation critic.

    Each environment step is stored in a replay buffer; once the buffer holds
    ``batch_size`` transitions, a random batch is drawn after every step and
    each agent is updated on it.

    NOTE(review): sampling from a replay buffer means batches mix transitions
    gathered under older policies, whereas the surrogate ratio here is taken
    relative to the current policy at update time. Confirm this is intended.

    Args:
        agents: Objects exposing ``name``, ``act(state, noise)``, ``actor``,
            ``critic``, ``actor_optimizer``, ``critic_optimizer``, ``eps_clip``
            and ``K_epochs``. Their order fixes the layout of the critic input.
        env: Parallel-API environment whose ``reset()`` returns a dict of
            observations and whose ``step()`` returns a 5-tuple of dicts
            (observations, rewards, terminations, truncations, infos).
        episodes: Number of episodes to run.
        noise: Exploration noise forwarded to ``agent.act`` (may be ``None``).
        device: Torch device on which batch tensors are created.
        batch_size: Transitions per update; also the minimum buffer fill.
        gamma: Discount factor for the one-step TD target.
        tau: Unused; kept so existing callers keep working.

    Returns:
        List with the summed reward of all agents for each episode.
    """
    memory = deque(maxlen=100000)
    rewards_list = []
    # Use the fixed agent list rather than the env's live agent list, which
    # the environment may mutate while an episode is running.
    agent_names = [agent.name for agent in agents]

    for episode in range(episodes):
        states = env.reset()
        episode_rewards = 0

        while True:
            actions = {agent.name : agent.act(states[agent.name], noise) for agent in agents}
            # The environment takes discrete actions: pick the highest-scoring one.
            action_vals = {name : np.argmax(scores) for name, scores in actions.items()}
            next_states, rewards, terminations, truncations, _ = env.step(action_vals)
            env.render()

            # An agent is finished on termination OR truncation; looking at only
            # one of the two can leave the loop running past the episode end.
            dones = {name : bool(terminations[name]) or bool(truncations[name]) for name in agent_names}

            memory.append((states, action_vals, rewards, next_states, dones))
            episode_rewards += np.sum(list(rewards.values()))

            if all(dones.values()):
                break

            if len(memory) >= batch_size:
                batch = _collate_batch(random.sample(memory, batch_size), agent_names, device)
                for agent in agents:
                    _ppo_update(agent, batch, gamma)

            states = next_states

        rewards_list.append(episode_rewards)
        print(f"Episode {episode + 1}/{episodes}, Reward: {episode_rewards}")

    return rewards_list



In [46]:
class OUNoise:
    """Ornstein-Uhlenbeck style noise process with one component per action.

    Args:
        action_dim: Length of the noise vector.
        mu: Value the state starts at and is pulled back towards.
        theta: Strength of the pull towards ``mu``.
        sigma: Scale of the Gaussian perturbation added each step.
    """

    def __init__(self, action_dim, mu=0.5, theta=0.15, sigma=0.2):
        self.action_dim, self.mu = action_dim, mu
        self.theta, self.sigma = theta, sigma
        # reset() creates `self.state`.
        self.reset()

    def reset(self):
        """Put every component of the state back at ``mu``."""
        self.state = self.mu * np.ones(self.action_dim)

    def noise(self):
        """Advance the process by one step and return the new state array."""
        current = self.state
        pull = self.theta * (self.mu - current)
        kick = self.sigma * np.random.randn(len(current))
        self.state = current + (pull + kick)
        return self.state

In [51]:
# Driver cell: build the agents, train them, persist the results, then evaluate.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_agents = len(env.agents)
state_dims = {agent : env.observation_space(agent).shape[0] for agent in env.agents}
action_dim = env.action_space('agent_0').n

print(state_dims, action_dim, '\n',type(env.action_space('adversary_0')))

hidden_dim = 128
lr_actor = 1e-4
lr_critic = 1e-3
episodes = 2000
test_episodes = 10
eps_clip = 0.4

# Create the output directory up front so a missing folder fails fast,
# not after the whole training run has finished.
MODEL_DIR = "simple_tag_models"
os.makedirs(MODEL_DIR, exist_ok=True)

# Create the agents
agents = [MAPPOAgent(state_dims[agent], action_dim, lr_actor, lr_critic, agent, eps_clip, 1, device) for agent in env.agents]

# Create noise for exploration
noise = OUNoise(action_dim)

# Train the agents
rewards = train_mappo(agents, env, episodes, noise, device)

# Save rewards right after training so a failure in a later step
# (saving, reloading, evaluation) cannot lose the training curve.
np.save(f"{MODEL_DIR}/rewards.npy", np.array(rewards))

for agent in agents:
    torch.save(agent.actor.state_dict(), f"{MODEL_DIR}/{agent.name}_actor.pth")
    torch.save(agent.critic.state_dict(), f"{MODEL_DIR}/{agent.name}_critic.pth")

# Reload the actors from disk; map_location keeps this working when the
# checkpoint was written on a different device than the current one.
for agent in agents:
    agent.actor.load_state_dict(torch.load(f"{MODEL_DIR}/{agent.name}_actor.pth", map_location=device))

# Test the trained agents
# NOTE(review): `test_maddpg` is not defined in any cell visible here — confirm
# it exists (or is meant to be a MAPPO-specific test function) before running.
test_maddpg(agents, env, test_episodes)

{'adversary_0': 12, 'adversary_1': 12, 'adversary_2': 12, 'agent_0': 10} 5 
 <class 'gymnasium.spaces.discrete.Discrete'>
Episode 1/2000, Reward: -104.75804775955447
Episode 2/2000, Reward: -23.69604049074809
Episode 3/2000, Reward: -27.11355955524026
Episode 4/2000, Reward: -66.26501479531304
Episode 5/2000, Reward: -317.7102066939231
Episode 6/2000, Reward: -22.25582448105389
Episode 7/2000, Reward: -11.30350216134312
Episode 8/2000, Reward: -29.931055776407714
Episode 9/2000, Reward: -41.58050321652352
Episode 10/2000, Reward: 34.684186626174004
Episode 11/2000, Reward: 0.0
Episode 12/2000, Reward: 0.0
Episode 13/2000, Reward: -12.47362604240265
Episode 14/2000, Reward: -58.60399308628546
Episode 15/2000, Reward: -24.87889047831528
Episode 16/2000, Reward: 0.0
Episode 17/2000, Reward: 0.0
Episode 18/2000, Reward: 0.0
Episode 19/2000, Reward: 0.0
Episode 20/2000, Reward: 59.98144997411807
Episode 21/2000, Reward: -0.6009422994151326
Episode 22/2000, Reward: 0.0
Episode 23/2000, Rewar

KeyboardInterrupt: 