In [5]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import random

class MultiAgentEnvironment:
    """Toy 2-D environment with a keeper, a taker and a ball.

    The observation is the 6-vector
    ``[ball_x, ball_y, keeper_x, keeper_y, taker_x, taker_y]``.
    On every step both agents are displaced by their action and the ball then
    moves one unit of distance towards the taker.  An episode ends after
    ``max_steps`` steps.
    """

    def __init__(self):
        self.state_dim = 6  # Dimensionality of the state space (ball position + keeper position + taker position)
        self.action_dim = 2  # Dimensionality of the action space (x, y displacement for each agent)
        self.max_steps = 1000  # Maximum number of steps before episode termination
        # reset() creates keeper_pos, taker_pos, ball_pos and current_step, so
        # the start-of-episode logic lives in exactly one place.
        self.reset()

    def reset(self):
        """Start a new episode and return the initial state.

        The keeper starts at the origin; the taker and the ball are placed
        uniformly at random in ``[-1, 1] x [-1, 1]``.

        Returns:
            The initial 6-element state array (see ``_get_state``).
        """
        self.keeper_pos = np.array([0.0, 0.0])
        self.taker_pos = np.array([random.uniform(-1, 1), random.uniform(-1, 1)])
        self.ball_pos = np.array([random.uniform(-1, 1), random.uniform(-1, 1)])
        self.current_step = 0  # Current step in the episode
        return self._get_state()

    def step(self, keeper_action, taker_action):
        """Advance the environment by one step.

        Args:
            keeper_action: Displacement added to the keeper position.  Anything
                NumPy can broadcast against a length-2 array is accepted; a
                scalar is added to both coordinates.
            taker_action: Displacement added to the taker position, with the
                same broadcasting rules.

        Returns:
            A tuple ``(next_state, keeper_reward, taker_reward, done)`` where
            ``keeper_reward`` is the negative keeper-ball distance,
            ``taker_reward`` is the taker-ball distance and ``done`` is true
            once ``max_steps`` steps have been taken.
        """
        self.keeper_pos += keeper_action
        self.taker_pos += taker_action

        # Move the ball one unit of distance towards the taker.  When the ball
        # already sits exactly on the taker there is no direction to move in;
        # normalising the zero vector would fill ball_pos with NaN, so the
        # ball is left where it is in that case.
        # NOTE(review): the step length is always 1, so a ball closer than one
        # unit overshoots the taker - confirm this is the intended dynamics.
        ball_move = self.taker_pos - self.ball_pos
        distance = np.linalg.norm(ball_move)
        if distance > 0:
            self.ball_pos += ball_move / distance

        self.current_step += 1
        done = self.current_step >= self.max_steps

        keeper_reward = -np.linalg.norm(self.keeper_pos - self.ball_pos)  # Negative distance to ball
        taker_reward = np.linalg.norm(self.taker_pos - self.ball_pos)  # Positive distance to ball
        return self._get_state(), keeper_reward, taker_reward, done

    def _get_state(self):
        """Return a fresh array ``[ball, keeper, taker]`` of the current positions."""
        return np.concatenate([self.ball_pos, self.keeper_pos, self.taker_pos])


class QNetwork(nn.Module):
    """Fully connected network: two 64-unit ReLU hidden layers and a linear head.

    Maps an input of size ``input_dim`` to ``output_dim`` raw (unactivated)
    outputs.
    """

    def __init__(self, input_dim, output_dim):
        super().__init__()
        hidden_units = 64
        # Layers are registered in forward order under the names fc1/fc2/fc3.
        self.fc1 = nn.Linear(input_dim, hidden_units)
        self.fc2 = nn.Linear(hidden_units, hidden_units)
        self.fc3 = nn.Linear(hidden_units, output_dim)

    def forward(self, x):
        """Run ``x`` through both hidden layers and return the head's output."""
        hidden = x
        for layer in (self.fc1, self.fc2):
            hidden = torch.relu(layer(hidden))
        # No activation on the final layer.
        return self.fc3(hidden)


class IndependentQLearningAgent:
    """Q-learning agent that owns its own Q-network and optimizer.

    The agent learns only from the transitions passed to ``update`` and
    performs one gradient step per transition (no replay buffer, no target
    network).

    Args:
        input_dim: Size of the state vector fed to the Q-network.
        output_dim: Number of discrete actions (one Q-value per action).
        lr: Adam learning rate.
        gamma: Discount factor applied to the bootstrapped next-state value.
        epsilon: Probability of picking a uniformly random action in
            ``select_action``.  The default of ``0.0`` keeps the policy
            purely greedy.
    """

    def __init__(self, input_dim, output_dim, lr=0.001, gamma=0.99, epsilon=0.0):
        self.q_network = QNetwork(input_dim, output_dim)
        self.optimizer = optim.Adam(self.q_network.parameters(), lr=lr)
        self.gamma = gamma
        self.epsilon = epsilon
        self.num_actions = output_dim
        self.loss_fn = nn.MSELoss()

    def select_action(self, state):
        """Return the index of the action to take in ``state``.

        With probability ``epsilon`` a random action index is returned;
        otherwise the index of the largest Q-value is returned.
        """
        # The epsilon > 0 guard keeps the default greedy path from consuming
        # any random numbers.
        if self.epsilon > 0 and random.random() < self.epsilon:
            return random.randrange(self.num_actions)
        with torch.no_grad():
            q_values = self.q_network(torch.tensor(state, dtype=torch.float32))
        return q_values.argmax().item()

    def update(self, state, action, reward, next_state, done=False):
        """Take one gradient step on a single transition.

        The regression target is ``reward + gamma * max_a Q(next_state, a)``,
        or just ``reward`` when ``done`` is true.

        Args:
            state: State in which ``action`` was taken.
            action: Index of the action that was taken.
            reward: Scalar reward received for the transition.
            next_state: State observed after the transition.
            done: Pass ``True`` when ``next_state`` is terminal so that no
                future value is bootstrapped from it.  Defaults to ``False``,
                which always bootstraps.
        """
        predicted_q = self.q_network(torch.tensor(state, dtype=torch.float32))[action]

        if done:
            next_value = 0.0
        else:
            # The target is treated as a constant, so skip building an
            # autograd graph for the next-state forward pass.
            with torch.no_grad():
                next_q_values = self.q_network(
                    torch.tensor(next_state, dtype=torch.float32)
                )
            next_value = next_q_values.max().item()

        target = torch.tensor(reward + self.gamma * next_value, dtype=torch.float32)
        loss = self.loss_fn(predicted_q, target)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()


if __name__ == "__main__":
    # Train one keeper and one taker, each with its own Q-network, against
    # the shared environment and report the per-episode reward totals.
    num_episodes = 20
    env = MultiAgentEnvironment()
    keeper = IndependentQLearningAgent(env.state_dim, env.action_dim)
    taker = IndependentQLearningAgent(env.state_dim, env.action_dim)

    for episode in range(num_episodes):
        total_keeper_reward = total_taker_reward = 0
        state = env.reset()
        done = False
        while not done:
            # Both agents choose from the same joint observation.
            keeper_action, taker_action = (
                keeper.select_action(state),
                taker.select_action(state),
            )
            next_state, keeper_reward, taker_reward, done = env.step(
                keeper_action, taker_action
            )

            total_keeper_reward += keeper_reward
            total_taker_reward += taker_reward

            # Each agent learns only from its own action and reward.
            keeper.update(state, keeper_action, keeper_reward, next_state)
            taker.update(state, taker_action, taker_reward, next_state)

            state = next_state
        print(f"Episode {episode + 1}/{num_episodes}, Keeper Reward: {total_keeper_reward}, Taker Reward: {total_taker_reward}")


Episode 1/20, Keeper Reward: -131356.7453975291, Taker Reward: 500.0
Episode 2/20, Keeper Reward: -1101.066171133041, Taker Reward: 500.0
Episode 3/20, Keeper Reward: -621.2533612573226, Taker Reward: 500.0
Episode 4/20, Keeper Reward: -1160.042802791231, Taker Reward: 500.0
Episode 5/20, Keeper Reward: -4264.006279668631, Taker Reward: 500.0
Episode 6/20, Keeper Reward: -612.1241716649247, Taker Reward: 500.0
Episode 7/20, Keeper Reward: -1513.9227210023787, Taker Reward: 500.0
Episode 8/20, Keeper Reward: -859.0678247214785, Taker Reward: 500.0
Episode 9/20, Keeper Reward: -508.20899812200935, Taker Reward: 500.0
Episode 10/20, Keeper Reward: -928.7351330086195, Taker Reward: 500.0
Episode 11/20, Keeper Reward: -1024.6549173469894, Taker Reward: 500.0
Episode 12/20, Keeper Reward: -1050.576208885849, Taker Reward: 500.0
Episode 13/20, Keeper Reward: -814.0986366559483, Taker Reward: 500.0
Episode 14/20, Keeper Reward: -1398.92991297358, Taker Reward: 500.27144614138393
Episode 15/20,