In [5]:
# model and env
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import random
import gymnasium as gym
from collections import deque
import time

# Run tensors on the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

class FootballEnv(gym.Env):
    """Grid-world football pitch on which several players are stepped one at a time.

    The pitch is a ``grid_rows`` x ``grid_cols`` character grid stored in
    ``self.layout``:

    * ``"O"`` - outer wall; moving onto it (or off the grid) ends the episode
      with a penalty,
    * ``"G"`` / ``"g"`` - goal cells in the last / first column,
    * ``"D"`` / ``"d"`` - penalty-box markers next to each goal,
    * ``"M"`` - the middle column,
    * ``"."`` - ordinary turf.

    Only ``"O"`` and ``"G"`` influence ``step``; the other markers are purely
    informational. Unlike the standard Gymnasium API, ``reset``, ``step`` and
    ``render`` take the player objects explicitly, because the environment is
    shared by all of them.
    """

    # (row delta, column delta) for movement actions 0-7.
    _MOVES = (
        (0, -1), (1, 0), (0, 1), (-1, 0),
        (-1, -1), (-1, 1), (1, -1), (1, 1),
    )

    def __init__(self, grid_rows=10, grid_cols=10):
        """Build the static pitch layout and declare action/observation spaces."""
        super().__init__()
        self.grid_rows = grid_rows
        self.grid_cols = grid_cols

        self.action_space = gym.spaces.Discrete(10)
        # Teammate coordinates produced by _get_state lie in [-1, 0) and
        # opponent coordinates in (0, 1], so the lower bound must be -1.
        # The shape (7,) corresponds to two players (2 coordinates each)
        # plus the three trailing scalars.
        self.observation_space = gym.spaces.Box(low=-1, high=1, shape=(7,), dtype=np.float32)

        mid_row = grid_rows // 2
        mid_col = grid_cols // 2
        # Clamp the upper edges at row 0: a negative slice start would wrap
        # around to the bottom rows instead of clipping at the top of the pitch.
        box_top = max(mid_row - 8, 0)
        goal_top = max(mid_row - 4, 0)

        self.layout = np.full((grid_rows, grid_cols), ".")
        self.layout[mid_row, mid_col] = "C"
        self.layout[box_top:mid_row + 9, -6:-1] = "D"
        self.layout[box_top:mid_row + 9, 0:5] = "d"
        self.layout[:, mid_col] = "M"
        self.layout[:, -1] = "O"
        self.layout[:, 0] = "O"
        self.layout[0, :] = "O"
        self.layout[-1, :] = "O"
        # Goals are written last so they replace the wall in their columns.
        self.layout[goal_top:mid_row + 5, -1] = "G"
        self.layout[goal_top:mid_row + 5, 0] = "g"

        self.ball_pos = (mid_row, mid_col)
        self.episode_steps = 0

    def _get_state(self, player, players):
        """Return the observation vector for ``player``.

        Layout of the vector: (row, col) of every player on ``player``'s team
        (including ``player`` itself, in ``players`` order), then (row, col) of
        every opponent, then ``has_ball`` as 0/1, then two constants derived
        from the grid size (middle row and last column, normalised).

        Coordinates are scaled as ``index / grid_size - 1``; opponent
        coordinates are additionally sign-flipped.
        """
        teammates = []
        opponents = []
        for other in players:
            coords = (
                other.position[0] / self.grid_rows - 1,
                other.position[1] / self.grid_cols - 1,
            )
            if player.team == other.team:
                teammates.extend(coords)
            else:
                opponents.extend(-1 * value for value in coords)

        return np.array([
            *teammates,
            *opponents,
            float(player.has_ball),
            self.grid_rows // 2 / (self.grid_rows - 1),
            (self.grid_cols - 1) / (self.grid_cols - 1),
        ], dtype=np.float32)

    def reset(self, players, seed=None, options=None):
        """Start a new episode: re-place every player and the ball.

        Team 0 spawns in the left quarter and every other team in the right
        quarter, within three rows of the middle row; no two players share a
        cell. The ball is dropped on a free cell in the central half of the
        pitch. Possession flags are cleared so that nothing leaks over from
        the previous episode.

        ``seed`` is forwarded to ``gym.Env.reset`` only; the placement itself
        draws from the global ``random`` module.

        Returns:
            A pair of empty dicts.
        """
        super().reset(seed=seed)

        mid_row = self.grid_rows // 2
        occupied_positions = set()
        for player in players:
            if player.team == 0:
                col_low, col_high = 1, self.grid_cols // 4
            else:
                col_low, col_high = 3 * self.grid_cols // 4, self.grid_cols - 2

            while True:
                spot = (
                    random.randint(mid_row - 3, mid_row + 3),
                    random.randint(col_low, col_high),
                )
                if spot not in occupied_positions:
                    break
            occupied_positions.add(spot)
            player.position = spot
            # A possession flag left over from the previous episode would make
            # the first move of this episode drag the ball onto the player.
            player.has_ball = False
            player.prev_ball_statues = False

        while True:
            ball_spot = (
                random.randint(1, self.grid_rows - 2),  # keep off the boundary walls
                random.randint(self.grid_cols // 4, 3 * self.grid_cols // 4),  # keep away from the goals
            )
            if ball_spot not in occupied_positions:
                break
        self.ball_pos = ball_spot

        self.episode_steps = 0
        return {}, {}

    def step(self, action, player, players):
        """Apply ``action`` for a single ``player`` and score the outcome.

        Actions 0-7 move the player by one cell (see ``_MOVES``). Actions 8
        and 9 currently leave the player where it is; only the reward logic
        runs for them.

        Args:
            action: Integer action index.
            player: The player that acts; its position and possession flags
                are updated in place.
            players: All players, needed to build the returned observation.

        Returns:
            ``(observation, reward, done, truncated, info)``. Walking into a
            wall or off the grid returns immediately with ``done`` and
            ``truncated`` both ``True`` and an extra penalty of 20.
        """
        self.episode_steps += 1
        reward = player.step_penalty
        done = False

        if action < 8:  # Movement actions
            d_row, d_col = self._MOVES[action]
            old_pos = tuple(player.position)
            new_pos = (old_pos[0] + d_row, old_pos[1] + d_col)

            in_bounds = 0 <= new_pos[0] < self.grid_rows and 0 <= new_pos[1] < self.grid_cols
            if not in_bounds or self.layout[new_pos[0], new_pos[1]] == "O":
                reward -= 20  # Massive penalty
                done = True  # End episode if the player goes out
                return self._get_state(player, players), reward, done, True, {}

            # Only carry the ball if it is really on the player's square: the
            # has_ball flag may be stale when another player has since taken
            # the ball away.
            carrying = player.has_ball and old_pos == self.ball_pos
            player.prev_position = player.position
            player.position = new_pos
            if carrying:
                self.ball_pos = new_pos

        # Possession is decided purely by standing on the ball's cell.
        player.prev_ball_statues = player.has_ball
        player.has_ball = tuple(player.position) == self.ball_pos

        # Reward shaping
        if player.has_ball:
            reward += player.ball_possession_bonus
            reward += 0.001
            # Scaled bonus for carrying the ball towards the last column.
            dist_to_goal = self.grid_cols - 1 - player.position[1]
            if dist_to_goal < 10:
                reward += player.near_goal_bonus * (1 - dist_to_goal / 10.0)
        else:
            dist_to_ball = np.sqrt((player.position[0] - self.ball_pos[0]) ** 2 +
                                   (player.position[1] - self.ball_pos[1]) ** 2)
            if dist_to_ball < 5:
                reward += player.near_ball_bonus

        # Goal reward: the ball sits on a right-hand goal cell.
        if self.layout[self.ball_pos[0], self.ball_pos[1]] == 'G':
            reward += player.goal_reward
            done = True

        truncated = self.episode_steps >= 3000
        return self._get_state(player, players), reward, done, truncated, {}

    def render(self, players):
        """Print an ASCII picture of the pitch, ball and players to stdout.

        Walls are drawn as ``#``, right-hand goal cells as ``|``, the middle
        column as ``.``, the ball as ``B``, team-0 players as ``P`` and all
        other players as ``Q``. Every other cell is ``-``. Players are drawn
        after the ball and therefore hide it when they share a cell.
        """
        glyphs = {'O': '#', 'G': '|', 'M': '.'}
        grid = np.full((self.grid_rows, self.grid_cols), '-')
        for cell, glyph in glyphs.items():
            grid[self.layout == cell] = glyph

        grid[self.ball_pos[0], self.ball_pos[1]] = 'B'

        for player in players:
            grid[player.position[0], player.position[1]] = 'P' if player.team == 0 else 'Q'

        border = '-' * (self.grid_cols + 2)
        print(border)
        for row in grid:
            print('|' + ''.join(row) + '|')
        print(border)


class Player():
    """State and reward coefficients of a single footballer.

    Args:
        role: Free-form role label stored on the instance.
        team: Team identifier; team 0 starts on the left side of the pitch,
            any other team on the right side.
        env: Object exposing ``grid_rows`` and ``grid_cols``; used only to
            pick the initial position.
    """

    def __init__(self, role, team, env):
        self.role = role
        self.team = team

        mid_row = env.grid_rows // 2
        row = random.randint(mid_row - 3, mid_row + 3)
        # Spawn on the player's own side of the pitch.
        if team == 0:
            col = random.randint(1, env.grid_cols // 4)
        else:
            col = random.randint(3 * env.grid_cols // 4, env.grid_cols - 2)
        # Positions are tuples so that they compare equal to a tuple ball
        # position; a list never equals a tuple.
        self.position = (row, col)
        self.prev_position = []
        self.prev_ball_statues = False
        self.has_ball = False

        # Reward coefficients.
        self.goal_reward = 52
        self.step_penalty = -0.001
        self.ball_possession_bonus = 0.008
        self.near_ball_bonus = 0.00001
        self.near_goal_bonus = 0.00002

class DQN(nn.Module):
    """Fully connected Q-network: input -> 128 -> 128 -> 64 -> one value per action."""

    def __init__(self, input_dim, action_size):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, 128)
        self.fc2 = nn.Linear(128, 128)
        self.fc3 = nn.Linear(128, 64)
        self.fc4 = nn.Linear(64, action_size)

        # Xavier-uniform weights and zero biases for every linear layer.
        for linear in (self.fc1, self.fc2, self.fc3, self.fc4):
            nn.init.xavier_uniform_(linear.weight)
            nn.init.zeros_(linear.bias)

    def forward(self, x):
        """Return the Q-values for ``x``; ReLU follows each of the three hidden layers."""
        hidden = x
        for linear in (self.fc1, self.fc2, self.fc3):
            hidden = F.relu(linear(hidden))
        return self.fc4(hidden)


class DQNAgent:
    """Epsilon-greedy DQN learner with a replay buffer and a target network.

    Args:
        state_size: Length of the observation vector fed to the network.
        action_size: Number of discrete actions.
    """

    def __init__(self, state_size, action_size):
        self.state_size = state_size
        self.action_size = action_size
        self.gamma = 0.99  # Discount factor
        self.epsilon = 1.0  # Exploration rate
        self.epsilon_min = 0.05
        self.epsilon_decay = 0.995
        self.learning_rate = 0.0003
        self.memory = deque(maxlen=100000)
        self.batch_size = 256
        self.target_update_freq = 5  # Update target network every N episodes

        self.device = device
        self.model = DQN(state_size, action_size).to(self.device)
        self.target_model = DQN(state_size, action_size).to(self.device)
        self.target_model.load_state_dict(self.model.state_dict())
        self.optimizer = optim.Adam(self.model.parameters(), lr=self.learning_rate)

        self.rewards_history = []
        self.episode_count = 0

    def remember(self, state, action, reward, next_state, done):
        """Append one transition to the replay buffer."""
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state, evaluate=False):
        """Choose an action for ``state``.

        With probability ``epsilon`` a uniformly random action is returned,
        unless ``evaluate`` is true; otherwise the action with the highest
        predicted Q-value is returned.
        """
        if not evaluate and np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)

        state_batch = torch.as_tensor(state, dtype=torch.float32, device=self.device).unsqueeze(0)
        with torch.no_grad():
            q_values = self.model(state_batch)
        return torch.argmax(q_values).item()

    def replay(self):
        """Run one gradient step on a random minibatch from the replay buffer.

        Returns:
            The MSE loss of the step as a float, or ``0`` when the buffer
            holds fewer than ``batch_size`` transitions.
        """
        if len(self.memory) < self.batch_size:
            return 0

        minibatch = random.sample(self.memory, self.batch_size)
        states, actions, rewards, next_states, dones = zip(*minibatch)

        states = torch.as_tensor(np.array(states), dtype=torch.float32, device=self.device)
        actions = torch.as_tensor(actions, dtype=torch.int64, device=self.device)
        rewards = torch.as_tensor(rewards, dtype=torch.float32, device=self.device)
        next_states = torch.as_tensor(np.array(next_states), dtype=torch.float32, device=self.device)
        dones = torch.as_tensor(dones, dtype=torch.float32, device=self.device)

        # Squeeze only the gathered column: a bare squeeze() would also drop
        # the batch dimension when batch_size == 1 and mismatch the target.
        curr_q_values = self.model(states).gather(1, actions.unsqueeze(1)).squeeze(1)

        with torch.no_grad():
            next_q_values = self.target_model(next_states).max(1)[0]

        # No bootstrapping from transitions flagged as finished.
        target_q_values = rewards + (1 - dones) * self.gamma * next_q_values

        loss = F.mse_loss(curr_q_values, target_q_values)

        self.optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1.0)
        self.optimizer.step()

        return loss.item()

    def update_target_model(self):
        """Copy the online network's weights into the target network."""
        self.target_model.load_state_dict(self.model.state_dict())

    def decay_epsilon(self):
        """Multiply ``epsilon`` by ``epsilon_decay`` without dropping below ``epsilon_min``."""
        if self.epsilon > self.epsilon_min:
            self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)


  and should_run_async(code)


In [6]:
# Training the agents
def train_multi_agent(players, episodes, max_steps, *, verbose=False):
    """Train one independent DQN agent per player on the module-level ``env``.

    Within every environment step the observations of all players are taken
    first; the players then act in list order, each transition is stored in
    the acting agent's replay buffer and followed by one replay update. An
    episode stops as soon as any player's step reports ``done`` or
    ``truncated``, or after ``max_steps`` rounds.

    Args:
        players: Player objects; ``agents[i]`` controls ``players[i]``.
        episodes: Number of training episodes.
        max_steps: Maximum number of rounds per episode.
        verbose: When true, print rewards, mean losses and the first agent's
            epsilon after every episode.

    Returns:
        The list of trained agents, in the same order as ``players``.
    """
    # The observation holds two coordinates per player plus three trailing
    # scalars, so the network input must grow with the number of players.
    state_size = 2 * len(players) + 3
    action_size = int(env.action_space.n)
    agents = [DQNAgent(state_size, action_size) for _ in players]

    for episode in range(episodes):
        env.reset(players)
        total_rewards = [0.0 for _ in players]
        losses = [[] for _ in players]
        episode_over = False

        for _step in range(max_steps):
            states = [env._get_state(player, players) for player in players]
            for i, (agent, player) in enumerate(zip(agents, players)):
                action = agent.act(states[i])
                next_state, reward, done, truncated, _info = env.step(action, player, players)
                episode_over = done or truncated
                agent.remember(states[i], action, reward, next_state, episode_over)
                if len(agent.memory) >= agent.batch_size:
                    losses[i].append(agent.replay())
                total_rewards[i] += reward
                if episode_over:
                    break
            # Leave the step loop right away instead of idling through the
            # remaining iterations once the episode has finished.
            if episode_over:
                break

        for idx, agent in enumerate(agents):
            agent.episode_count += 1
            if agent.episode_count % agent.target_update_freq == 0:
                agent.update_target_model()
            agent.decay_epsilon()
            agent.rewards_history.append(total_rewards[idx])

        if verbose and agents:
            avg_losses = [np.mean(loss) if loss else 0.0 for loss in losses]
            print(f"Episode {episode + 1}/{episodes}")
            print(f"Total Rewards: {total_rewards}")
            print(f"Average Losses: {avg_losses}")
            print(f"Epsilon: {agents[0].epsilon:.3f}")
    return agents

# Build the pitch, create one forward ('F') per team, and train an agent for each.
env = FootballEnv()
players = [Player('F', team, env) for team in (0, 1)]
agents = train_multi_agent(players, episodes=100, max_steps=1000)


In [8]:
# Evaluating the agents
def evaluate_multi_agent(agents, env, players, episodes=1, max_steps=1000, render=True):
    """Run the agents greedily (no exploration) and report their rewards.

    In every round the observations of all players are collected first and
    the players then act in a freshly shuffled order. An episode ends when a
    step reports ``done`` or ``truncated``, or after ``max_steps`` rounds.
    The per-episode totals are printed as the evaluation proceeds.

    Args:
        agents: Agents exposing ``act(state, evaluate=True)``; ``agents[i]``
            controls ``players[i]``.
        env: Environment exposing ``reset``, ``render``, ``_get_state`` and
            ``step``.
        players: The player objects.
        episodes: Number of evaluation episodes.
        max_steps: Maximum number of rounds per episode.
        render: When true, draw the pitch before each round and pause 0.1 s.

    Returns:
        The per-player mean of the episode totals (``np.mean`` over episodes).
    """
    per_episode_totals = []

    for episode in range(episodes):
        env.reset(players)
        totals = [0.0] * len(players)
        finished = False

        rounds_played = 0
        while rounds_played < max_steps and not finished:
            rounds_played += 1

            if render:
                env.render(players)
                time.sleep(0.1)

            # Snapshot every player's observation before anyone moves.
            observations = [env._get_state(p, players) for p in players]

            # Players act one after another in a random order.
            for who in np.random.permutation(len(players)):
                chosen = agents[who].act(observations[who], evaluate=True)
                _, reward, done, truncated, _ = env.step(chosen, players[who], players)
                totals[who] += reward

                finished = bool(done or truncated)
                if finished:
                    break

        per_episode_totals.append(totals)
        print(f"Evaluation Episode {episode+1}:")
        print(f"Total Rewards: {totals}")
        print("-" * 50)

    return np.mean(per_episode_totals, axis=0)


# Evaluate the trained agents greedily over 100 episodes without rendering.
avg_rewards = evaluate_multi_agent(agents, env, players, episodes=100, render=False)
print(f"Average rewards across evaluation episodes: {avg_rewards}")

Evaluation Episode 1:
Total Rewards: [8.013988000000133, -0.9900100000000197]
--------------------------------------------------
Evaluation Episode 2:
Total Rewards: [8.011987999999922, -0.9900200000000198]
--------------------------------------------------
Evaluation Episode 3:
Total Rewards: [-0.9900000000000198, -0.9900100000000197]
--------------------------------------------------
Evaluation Episode 4:
Total Rewards: [-0.9900100000000197, -0.9900000000000198]
--------------------------------------------------
Evaluation Episode 5:
Total Rewards: [-0.9900000000000198, -1.0000000000000007]
--------------------------------------------------
Evaluation Episode 6:
Total Rewards: [-0.9900000000000198, -1.0000000000000007]
--------------------------------------------------
Evaluation Episode 7:
Total Rewards: [-0.9900100000000197, -0.9900000000000198]
--------------------------------------------------
Evaluation Episode 8:
Total Rewards: [-0.9900000000000198, -0.9900000000000198]
-------

KeyboardInterrupt: 