In [1]:
import numpy as np
import torch
import torch.nn as nn
import torch_geometric.nn as pyg_nn
from pettingzoo.mpe import simple_tag_v2
import os

# Initialize the environment
env = simple_tag_v2.parallel_env(render_mode=None, num_adversaries=4, num_good=1, num_obstacles=2)
env.reset()

# Parameters
num_class_a = 3
num_class_b = 1
num_adversaries = 4
num_good_agents = 1  # Only one agent being chased by adversaries
num_obstacles = 2

# Initial lists of agents
adversary_agents = [agent for agent in env.agents if 'adversary' in agent]
good_agents = [agent for agent in env.agents if 'agent' in agent]

print("Adversary Agents:", adversary_agents)
print("Good Agents:", good_agents)


Adversary Agents: ['adversary_0', 'adversary_1', 'adversary_2', 'adversary_3']
Good Agents: ['agent_0']


### Define Embedding and CGN layer

In [2]:
# Define Encoder
class Encoder(nn.Module):
    def __init__(self, input_dim, hidden_dim, num_layers=1):
        super(Encoder, self).__init__()
        self.lstm = nn.LSTM(input_dim, hidden_dim, num_layers, batch_first=True)
        
    def forward(self, src):
        # src: [batch_size, seq_len, input_dim]
        outputs, (hidden, cell) = self.lstm(src)
        # hidden, cell: [num_layers, batch_size, hidden_dim]
        return hidden, cell

# Define Decoder (if needed)
# In this case, we might not need a Decoder unless we're generating sequences
# For simplicity, let's assume we only need the Encoder's hidden state

# Define GCN layer
class GCNLayer(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(GCNLayer, self).__init__()
        self.conv1 = pyg_nn.GCNConv(input_dim, output_dim)
    
    def forward(self, x, edge_index):
        return self.conv1(x, edge_index)

# Parameters
communication_range = 1.5  # Define the communication range
input_dim = 5             # Observation dimensions (including class identifier)
hidden_dim = 16           # Hidden dimension for LSTM encoder
gcn_output_dim = 16       # Dimension of GCN output

# Initialize encoder and GCN layer
encoder = Encoder(input_dim=input_dim, hidden_dim=hidden_dim)
gcn_layer = GCNLayer(input_dim=hidden_dim, output_dim=gcn_output_dim)

# Example usage:
batch_size = 4
input_actor_network_max_dim = 40  # Temporary solution with padding


### Encoder-Decoder

In [3]:
#output_decoder_dim = 18
#input_actor_network_max_dim = 40 # Temporary solution with padding

### Observation Wrapper

In [4]:
def adversary_observation_wrapper(observations, num_class_A, num_class_B, adversary_agents, good_agents, num_adversaries, num_good_agents, num_obstacles):
    # Ensure the number of class A and class B agents matches the current number of adversaries
    assert num_class_A + num_class_B == len(adversary_agents), "Number of agents assigned to Class A and Class B must match the total number of adversaries."
    
    class_a_agents = adversary_agents[:num_class_A]
    class_b_agents = adversary_agents[num_class_A:num_class_A + num_class_B]
    
    updated_observations = {}
    adversary_positions = {}  # Store adversary positions for communication
    
    # Step 1: Modify observations to include class identifier
    for agent, obs in observations.items():
        if agent in adversary_agents:
            agent_class = 0 if agent in class_a_agents else 1  # Class A: 0, Class B: 1
            updated_obs = np.concatenate([obs, [agent_class]])  # Add class identifier
            updated_observations[agent] = updated_obs
        elif agent in good_agents:
            updated_observations[agent] = obs  # Good agents keep their observation
        else:
            # Handle any unexpected agents
            updated_observations[agent] = obs
    
    # Step 2: Gather positions for communication
    for agent in adversary_agents:
        position = updated_observations[agent][2:4]  # Assume position is at index 2:4
        adversary_positions[agent] = position
    
    # Step 3: Communication and gather data for GCN input
    node_features = {}
    edge_index = []
    agent_to_idx = {agent: idx for idx, agent in enumerate(adversary_agents)}
    
    for agent in adversary_agents:
        own_position = adversary_positions[agent]
        neighbors = []
        
        # Find neighbors within communication range
        for other_agent in adversary_agents:
            if agent != other_agent:
                other_position = adversary_positions[other_agent]
                distance = np.linalg.norm(own_position - other_position)
                if distance <= communication_range:
                    neighbors.append(other_agent)
                    edge_index.append([agent_to_idx[agent], agent_to_idx[other_agent]])  # Add edge
        
        # Step 4: Collect observations of neighbors to form a sequence
        neighbor_obs_list = []
        for neighbor in neighbors:
            neighbor_obs = updated_observations[neighbor][:5]  # Exclude class identifier
            neighbor_obs_list.append(neighbor_obs)
        
        # If no neighbors, use a zero tensor
        if len(neighbor_obs_list) == 0:
            neighbor_obs_seq = torch.zeros(1, 1, input_dim)  # [batch_size, seq_len, input_dim]
        else:
            neighbor_obs_seq = torch.tensor(neighbor_obs_list, dtype=torch.float32)
            neighbor_obs_seq = neighbor_obs_seq.unsqueeze(0)  # Add batch dimension
        
        # Step 5: Pass the sequence through the Encoder
        hidden, cell = encoder(neighbor_obs_seq)
        # Use the last layer's hidden state
        encoder_output = hidden[-1, :, :]  # [batch_size, hidden_dim]
        node_features[agent] = encoder_output.squeeze(0)  # Remove batch dimension
    
    # Convert node_features to a list in the order of agents
    node_features_list = [node_features[agent] for agent in adversary_agents]
    node_features_tensor = torch.stack(node_features_list)  # [num_nodes, hidden_dim]
    
    # Convert edge_index to tensor
    if edge_index:
        edge_index = torch.tensor(edge_index, dtype=torch.long).t().contiguous()
    else:
        edge_index = torch.empty((2, 0), dtype=torch.long)
    
    # Step 6: Pass the gathered information into a GCN
    gcn_output = gcn_layer(node_features_tensor, edge_index)
    
    # Step 7: Concatenate GCN output with agent's own observation
    final_inputs = {}
    for agent_idx, agent in enumerate(adversary_agents):
        own_obs = torch.tensor(updated_observations[agent], dtype=torch.float32)
        final_input = torch.cat([own_obs, gcn_output[agent_idx]])
        final_inputs[agent] = final_input  # This will be used as input to the Actor network
    
    return updated_observations, final_inputs  # Return the updated observations and inputs for Actor network


### Initialize the Environment and Run a Step

In [5]:
# Initialize environment
env.reset()

# Sample action spaces for all agents
actions = {agent: env.action_space(agent).sample() for agent in env.agents}

# Step through the environment
observations, rewards, terminations, truncations, infos = env.step(actions)

# Update adversary_agents and good_agents based on current observations
adversary_agents = [agent for agent in observations.keys() if 'adversary' in agent]
good_agents = [agent for agent in observations.keys() if 'agent' in agent]

# Apply the observation wrapper for adversaries
observations, final_inputs = adversary_observation_wrapper(
    observations, num_class_a, num_class_b, adversary_agents, good_agents, num_adversaries, num_good_agents, num_obstacles)

print("Updated Observations:")


Updated Observations:


  neighbor_obs_seq = torch.tensor(neighbor_obs_list, dtype=torch.float32)


### Actor Network


In [6]:
class ActorNetwork(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(ActorNetwork, self).__init__()
        self.fc1 = nn.Linear(input_dim, 64)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(64, output_dim)  # Output dimension should match the action space
    
    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        return x

# Initialize Actor networks for adversaries
actor_networks = {}
for agent in adversary_agents:
    input_dim = len(env.observation_space(agent).low) + 1 + gcn_output_dim  # Observation length plus class id plus GCN output
    output_dim = env.action_space(agent).n  # Assuming discrete action space
    print(f'Agent: {agent}, Input Dim: {input_dim}, Output Dim: {output_dim}')
    input_dim = max(input_dim, input_actor_network_max_dim)
    actor_networks[agent] = ActorNetwork(input_dim, output_dim)


Agent: adversary_0, Input Dim: 35, Output Dim: 5
Agent: adversary_1, Input Dim: 35, Output Dim: 5
Agent: adversary_2, Input Dim: 35, Output Dim: 5
Agent: adversary_3, Input Dim: 35, Output Dim: 5


### Training Loop

In [7]:
# Training parameters
num_episodes = 50  # Total number of episodes to run
print_interval = 10  # Print rewards every 10 episodes

# Initialize reward tracking
episode_rewards = []

# Optimizers for Actor networks and GCN (assuming we are training them)
learning_rate = 0.001
actor_optimizers = {agent: torch.optim.Adam(actor_networks[agent].parameters(), lr=learning_rate) for agent in adversary_agents}
gcn_optimizer = torch.optim.Adam(gcn_layer.parameters(), lr=learning_rate)
encoder_optimizer = torch.optim.Adam(encoder.parameters(), lr=learning_rate)

# Loss function (placeholder, you need to define based on your RL algorithm)
loss_fn = nn.MSELoss()

# Training loop
for episode in range(1, num_episodes + 1):
    observations = env.reset()
    done = False
    cumulative_reward = 0  # Reset cumulative reward for the episode
    
    while not done:
        # Update adversary_agents and good_agents based on current observations
        adversary_agents = [agent for agent in observations.keys() if 'adversary' in agent]
        good_agents = [agent for agent in observations.keys() if 'agent' in agent]
        
        num_adversaries = len(adversary_agents)
        num_good_agents = len(good_agents)
        
        # Apply the observation wrapper for adversaries
        observations, final_inputs = adversary_observation_wrapper(
            observations, num_class_a, num_class_b, adversary_agents, good_agents, num_adversaries, num_good_agents, num_obstacles)
        
        actions = {}
        for agent in env.agents:
            if agent in adversary_agents:
                # Pad the input to match the maximum input dimension
                m = nn.ConstantPad1d((0, input_actor_network_max_dim - final_inputs[agent].shape[0]), 0)
                final_inputs_pad = m(final_inputs[agent])
                
                # Get the input for the Actor network
                actor_input = final_inputs_pad
                # Get action probabilities (assuming discrete action space)
                action_probs = actor_networks[agent](actor_input)
                # Sample an action (for simplicity, we take the action with the highest probability)
                action = torch.argmax(action_probs).item()
                actions[agent] = action
            elif agent in good_agents:
                # For good agents, sample random actions
                actions[agent] = env.action_space(agent).sample()
            else:
                # Handle any unexpected agents
                actions[agent] = env.action_space(agent).sample()
        
        # Step the environment
        next_observations, rewards, terminations, truncations, infos = env.step(actions)
        
        # Update cumulative reward
        cumulative_reward += sum(rewards.values())
        
        # Placeholder for training step (you need to implement your RL algorithm here)
        # For example, compute loss and update networks
        
        # For simplicity, let's assume we have a target value (dummy value here)
        target = torch.zeros(1)
        loss = 0
        for agent in adversary_agents:
            # Pad the input to match the maximum input dimension
            m = nn.ConstantPad1d((0, input_actor_network_max_dim - final_inputs[agent].shape[0]), 0)
            final_inputs_pad = m(final_inputs[agent])
            
            # Get the predicted value
            actor_input = final_inputs_pad
            prediction = actor_networks[agent](actor_input)
            # Compute loss (this is a placeholder)
            loss += loss_fn(prediction.unsqueeze(0), target)
        
        # Backpropagation
        encoder_optimizer.zero_grad()
        gcn_optimizer.zero_grad()
        for optimizer in actor_optimizers.values():
            optimizer.zero_grad()
        
        loss.backward()
        
        encoder_optimizer.step()
        gcn_optimizer.step()
        for optimizer in actor_optimizers.values():
            optimizer.step()
        
        # Update observations
        observations = next_observations
        
        # Check if all agents are done
        done = all(terminations.values()) or all(truncations.values())
    
    # Append cumulative reward for the episode
    episode_rewards.append(cumulative_reward)
    
    # Print rewards every 'print_interval' episodes
    if episode % print_interval == 0:
        avg_reward = sum(episode_rewards[-print_interval:]) / print_interval
        print(f"Episode {episode}: Average Reward: {avg_reward}")


  return F.mse_loss(input, target, reduction=self.reduction)


RuntimeError: input.size(-1) must be equal to input_size. Expected 5, got 40

### Save the Models

In [41]:
# Create a directory to save models
model_dir = 'saved_models'
if not os.path.exists(model_dir):
    os.makedirs(model_dir)


def save_models(num_class_A, num_class_B, adversary_agents):
    class_a_agents = adversary_agents[:num_class_A]
    class_b_agents = adversary_agents[num_class_A:num_class_A + num_class_B]
    for agent in class_a_agents:
        torch.save(actor_networks[agent].state_dict(), os.path.join(model_dir, f"actor_class_A.pth"))
        break  # Save only one model for Class A (assuming they share weights)
    for agent in class_b_agents:
        torch.save(actor_networks[agent].state_dict(), os.path.join(model_dir, f"actor_class_B.pth"))
        break  # Save only one model for Class B (assuming they share weights)

save_models(num_class_a, num_class_b, adversary_agents)
# Save the GCN and Encoder models
torch.save(gcn_layer.state_dict(), os.path.join(model_dir, "gcn_model.pth"))
torch.save(encoder.state_dict(), os.path.join(model_dir, "encoder_model.pth"))

print("Models saved successfully.")


ctype:['adversary_0', 'adversary_1', 'adversary_2']
ctype:['adversary_3']
agent A:adversary_0
agent B:adversary_3
Models saved successfully.


### Load the Models and Test with Different Number of Adversaries

In [42]:
# Load the models
# Define Encoder and GCN layer
encoder = Encoder(input_dim=5, hidden_dim=hidden_dim)  # Adjust input_dim as needed
encoder.load_state_dict(torch.load(os.path.join(model_dir, "encoder_model.pth")))

gcn_layer = GCNLayer(input_dim=hidden_dim, output_dim=gcn_output_dim)
gcn_layer.load_state_dict(torch.load(os.path.join(model_dir, "gcn_model.pth")))

# Re-initialize Actor networks for new agents and load the saved models
actor_networks = {}

# You might test with a different number of adversaries here
# For example, testing with 5 adversaries
test_num_adversaries = 5
test_num_class_a = 3
test_num_class_b = 2

# Initialize the environment with the new number of adversaries
env = simple_tag_v2.parallel_env(render_mode=None, num_adversaries=test_num_adversaries, num_good=1, num_obstacles=2)
env.reset()

# Get the new lists of adversaries and good agents
adversary_agents = [agent for agent in env.agents if 'adversary' in agent]
good_agents = [agent for agent in env.agents if 'agent' in agent]

# Initialize Actor networks for the new adversaries
class_a_agents = adversary_agents[:test_num_class_a]
class_b_agents = adversary_agents[test_num_class_a:test_num_class_a + test_num_class_b]

for agent in adversary_agents:
    input_dim = input_actor_network_max_dim  # Use the same padded input dim as before
    output_dim = env.action_space(agent).n  # Assuming discrete action space
    actor_net = ActorNetwork(input_dim, output_dim)
    
    # Load the appropriate saved model based on the class of the agent
    if agent in class_a_agents:
        actor_net.load_state_dict(torch.load(os.path.join(model_dir, f"actor_class_A.pth")))
    else:
        actor_net.load_state_dict(torch.load(os.path.join(model_dir, f"actor_class_B.pth")))
    
    actor_networks[agent] = actor_net

# Run the environment for testing
num_epochs = 50
print_interval = 10  # Calculate average reward every 10 epochs
episode_rewards = []

for epoch in range(1, num_epochs + 1):
    cumulative_reward = 0  # Reset cumulative reward for the epoch
    observations = env.reset()
    done = False

    while not done:
        # Update adversary_agents and good_agents based on current observations
        adversary_agents = [agent for agent in observations.keys() if 'adversary' in agent]
        good_agents = [agent for agent in observations.keys() if 'agent' in agent]
        
        num_adversaries = len(adversary_agents)
        num_good_agents = len(good_agents)
        
        # Apply the observation wrapper for adversaries
        observations, final_inputs = adversary_observation_wrapper(
            observations, test_num_class_a, test_num_class_b, adversary_agents, good_agents, num_adversaries, num_good_agents, num_obstacles
        )

        actions = {}
        for agent in adversary_agents:
            # Pad the input to match the maximum input dimension
            m = nn.ConstantPad1d((0, input_actor_network_max_dim - final_inputs[agent].shape[0]), 0)
            final_inputs_pad = m(final_inputs[agent])
            
            # Pass through the Actor network
            action_probs = actor_networks[agent](final_inputs_pad)
            action = torch.argmax(action_probs).item()  # Take the action with the highest probability
            actions[agent] = action

        # For good agents, sample random actions
        for agent in good_agents:
            actions[agent] = env.action_space(agent).sample()

        # Step the environment
        next_observations, rewards, terminations, truncations, infos = env.step(actions)

        # Update cumulative reward for this epoch
        cumulative_reward += sum(rewards.values())

        # Check if all agents are done
        if all(terminations.values()) or all(truncations.values()):
            done = True

        # Update observations
        observations = next_observations

    # Track the cumulative reward for the epoch
    episode_rewards.append(cumulative_reward)

    # Print average reward every 'print_interval' epochs
    if epoch % print_interval == 0:
        avg_reward = sum(episode_rewards[-print_interval:]) / print_interval
        print(f"Epoch {epoch}: Average Reward over last {print_interval} epochs: {avg_reward}")

print("Test completed.")


RuntimeError: mat1 and mat2 shapes cannot be multiplied (1x35 and 5x8)