In [1]:
# model and env
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import random
import gymnasium as gym
from collections import deque
import time

# Torch device shared by every network below: CUDA when available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

class FootballEnv(gym.Env):
    def __init__(self, grid_rows=30, grid_cols=30):
        """Create the pitch layout and the per-episode bookkeeping fields.

        Args:
            grid_rows: Number of rows in the pitch grid.
            grid_cols: Number of columns in the pitch grid.
        """
        super().__init__()
        self.grid_rows, self.grid_cols = grid_rows, grid_cols
        mid_row, mid_col = grid_rows // 2, grid_cols // 2

        self.action_space = gym.spaces.Discrete(10)
        # NOTE(review): the declared shape (8,) is fixed, while _get_state
        # returns a vector whose length depends on the number of players.
        self.observation_space = gym.spaces.Box(low=0, high=1, shape=(8,), dtype=np.float32)

        # Cell codes are painted in this order on purpose: later writes
        # overwrite earlier ones (e.g. the midline "M" replaces the centre "C").
        pitch = np.full((grid_rows, grid_cols), ".")
        box_rows = slice(mid_row - 8, mid_row + 9)
        goal_rows = slice(mid_row - 4, mid_row + 5)

        pitch[mid_row, mid_col] = "C"
        pitch[box_rows, -6:-1] = "D"
        pitch[box_rows, 0:5] = "d"
        pitch[:, mid_col] = "M"
        # "O" marks the wall around the pitch ...
        pitch[:, -1] = "O"
        pitch[:, 0] = "O"
        pitch[0, :] = "O"
        pitch[-1, :] = "O"
        # ... except for the goal mouths: "G" on the right edge, "g" on the left.
        pitch[goal_rows, -1] = "G"
        pitch[goal_rows, 0] = "g"
        self.layout = pitch

        self.ball_pos = (mid_row, mid_col)
        self.episode_steps = 0
        self.goal_team = None
        self.team_has_ball = -1

    def _get_state(self, player, players):
        """Build the normalised observation vector for ``player``.

        The vector is laid out as: (row, col) of every member of ``player``'s
        team in ``players`` order (``player`` included), (row, col) of every
        opponent, the ball (row, col), ``player.has_ball``,
        ``self.team_has_ball``, and finally the target goal (row, col).
        Positions are divided by ``grid_rows - 1`` / ``grid_cols - 1``.

        Args:
            player: The player whose point of view is encoded.
            players: All players on the pitch.

        Returns:
            A 1-D ``np.float32`` array.
        """
        row_scale = self.grid_rows - 1
        col_scale = self.grid_cols - 1

        teammates, opponents = [], []
        for other in players:
            bucket = teammates if player.team == other.team else opponents
            bucket.extend([other.position[0] / row_scale,
                           other.position[1] / col_scale])

        # With no opponents on the pitch, pad with one zero pair per teammate.
        if not opponents:
            for _ in range(len(teammates) // 2):
                opponents.extend([0 / row_scale, 0 / col_scale])

        # Team 0 targets the last column, team 1 the first one.
        goal_x = self.grid_rows // 2 / row_scale
        if player.team == 0:
            goal_y = col_scale / col_scale
        else:
            goal_y = 0

        tail = [
            self.ball_pos[0] / row_scale,
            self.ball_pos[1] / col_scale,
            float(player.has_ball),
            float(self.team_has_ball),
            goal_x,
            goal_y,
        ]
        return np.array(teammates + opponents + tail, dtype=np.float32)

    def reset(self, players, seed=None, options=None, episode_num=0):
        """Start a new episode: place the players, then drop the ball near one.

        Args:
            players: Players to reposition; their possession flags and move
                counters are cleared in place.
            seed: Forwarded to ``super().reset``.
            options: Accepted but not used by this method.
            episode_num: Drives the ball-placement curriculum. Note that it is
                the fourth parameter, so it has to be passed by keyword (a
                second positional argument is taken as ``seed``).

        Returns:
            A pair of empty dicts.
        """
        super().reset(seed=seed)

        centre_row = self.grid_rows // 2
        taken = set()
        for player in players:
            player.has_ball = False
            player.prev_position = None
            player.moves_with_ball = 0
            player.moves_without_ball = 0

            # Team 0 starts in the left quarter, the other team in the right one.
            if player.team == 0:
                col_lo, col_hi = 1, self.grid_cols // 4
            else:
                col_lo, col_hi = 3 * self.grid_cols // 4, self.grid_cols - 2

            # Redraw until the cell is not already used by another player.
            spot = None
            while spot is None or spot in taken:
                spot = (random.randint(centre_row - 3, centre_row + 3),
                        random.randint(col_lo, col_hi))
            player.position = spot
            taken.add(spot)

        # Curriculum: the ball may land up to `radius` cells away from a player.
        # The radius starts at 1 and grows by 1 every 500 episodes, capped at
        # a quarter of the pitch width.
        radius = min(1 + (episode_num // 500), self.grid_cols // 4)
        anchor = random.choice(players)

        for _ in range(50):
            angle = random.uniform(0, 2 * np.pi)
            dist = random.uniform(0, radius)
            candidate = (anchor.position[0] + int(round(dist * np.sin(angle))),
                         anchor.position[1] + int(round(dist * np.cos(angle))))

            inside = (1 <= candidate[0] < self.grid_rows - 1 and
                      1 <= candidate[1] < self.grid_cols - 1)
            if inside and candidate not in taken:
                self.ball_pos = candidate
                break
        else:
            # Every attempt failed: fall back to any free cell in the middle
            # half of the pitch.
            while True:
                candidate = (random.randint(1, self.grid_rows - 2),
                             random.randint(self.grid_cols // 4, 3 * self.grid_cols // 4))
                if candidate not in taken:
                    self.ball_pos = candidate
                    break

        self.episode_steps = 0
        self.goal_team = None
        self.team_has_ball = -1
        return {}, {}

    def step(self, action, player, players):
        """Apply one player's action and score it from that player's view.

        Args:
            action: Integer action. Values 0-7 move the player one cell (four
                orthogonal directions, then four diagonals, as (row, col)
                offsets); any value >= 8 leaves the player where it is.
            player: The acting player. Its position, possession flag and move
                counters are updated in place.
            players: All players on the pitch (used to build the observation).

        Returns:
            ``(state, reward, done, truncated, info)``. ``done`` is True when
            the ball sits in a goal cell after the move, or when the move was
            illegal. ``info['goal_team']`` is the scoring team recorded on the
            environment (``None`` if nobody has scored); after an illegal move
            ``info`` is empty and both ``done`` and ``truncated`` are True.
        """
        self.episode_steps += 1
        reward = player.step_penalty

        # Possession as it stood before the move: the ball only travels with a
        # player that was already carrying it.
        had_ball_before_move = player.has_ball

        if action < 8:
            d_row, d_col = [(0, -1), (1, 0), (0, 1), (-1, 0),
                            (-1, -1), (-1, 1), (1, -1), (1, 1)][action]
            new_pos = (player.position[0] + d_row, player.position[1] + d_col)

            legal = (0 <= new_pos[0] < self.grid_rows and
                     0 <= new_pos[1] < self.grid_cols and
                     self.layout[new_pos[0], new_pos[1]] != "O")
            if not legal:
                # Leaving the grid or walking into a wall cell ends the
                # episode immediately with a heavy penalty.
                reward -= 70
                return self._get_state(player, players), reward, True, True, {}

            player.prev_position = player.position
            player.position = new_pos
            if had_ball_before_move:
                self.ball_pos = new_pos

        # (attribute name kept as-is, typo included, in case other code reads it)
        player.prev_ball_statues = player.has_ball
        player.has_ball = player.position == self.ball_pos
        if player.has_ball:
            player.moves_with_ball += 1
            player.moves_without_ball = 0
            # team_has_ball is only ever overwritten here; it is not cleared
            # when a carrier walks away from a loose ball.
            self.team_has_ball = player.team

        if player.has_ball or self.team_has_ball == player.team:
            # Possession bonus decays slightly the longer the ball is held.
            reward += player.ball_possession_bonus * (0.99 ** (player.moves_with_ball - 1))
        else:
            player.moves_without_ball += 1
            player.moves_with_ball = 0
            # Penalty grows linearly with the time spent without the ball.
            reward += player.no_possession * player.moves_without_ball

        if player.has_ball:
            # Distance (in columns) to the goal this player's team scores in:
            # team 0 scores in the last column ('G'), the other team in the
            # first column ('g'). The original always used the last column,
            # which rewarded team 1 for approaching the wrong goal.
            if player.team == 0:
                dist_to_goal = self.grid_cols - 1 - player.position[1]
            else:
                dist_to_goal = player.position[1]
            if dist_to_goal < 3:
                reward += player.near_goal_bonus * (1 - dist_to_goal / 10.0)
        else:
            dist_to_ball = np.sqrt((player.position[0] - self.ball_pos[0]) ** 2 +
                                   (player.position[1] - self.ball_pos[1]) ** 2)
            reward += player.near_ball_bonus / (dist_to_ball + 1)

        # A ball resting in a goal cell ends the episode: 'G' counts for
        # team 0, 'g' for team 1.
        current_ball_cell = self.layout[self.ball_pos[0], self.ball_pos[1]]
        goal_team = None
        if current_ball_cell == 'G':
            goal_team = 0
        elif current_ball_cell == 'g':
            goal_team = 1

        done = goal_team is not None
        if done:
            # Only the acting player's reward is returned from this call.
            if player.team == goal_team:
                reward += player.goal_reward
            else:
                reward += player.opp_scoring
            self.goal_team = goal_team

        # episode_steps counts calls to step(), i.e. one per player per turn.
        truncated = self.episode_steps >= 3000
        return self._get_state(player, players), reward, done, truncated, {'goal_team': self.goal_team}

    def render(self, players):
        """Print an ASCII picture of the pitch to stdout.

        Walls are drawn as ``#``, goal cells as ``|``, the ball as ``B``,
        team-0 players as ``P`` and all other players as ``Q``. Players are
        drawn last, so a player standing on the ball hides it.

        Args:
            players: Players to draw.
        """
        canvas = np.full((self.grid_rows, self.grid_cols), '-')
        canvas[self.layout == 'O'] = '#'
        canvas[(self.layout == 'G') | (self.layout == 'g')] = '|'

        canvas[self.ball_pos[0], self.ball_pos[1]] = 'B'
        for player in players:
            marker = 'P' if player.team == 0 else 'Q'
            canvas[player.position[0], player.position[1]] = marker

        frame = '-' * (self.grid_cols + 2)
        print(frame)
        for line in canvas:
            print('|' + ''.join(line) + '|')
        print(frame)


    def resolve_collisions(self, players):
        """
        Resolve every cell that holds more than one player after a round of moves.

        For each contested cell the ball (if involved) is awarded through
        skill-based random draws, one player keeps the cell and the others are
        moved away. Player positions, ``has_ball`` flags, move counters,
        ``self.ball_pos`` and ``self.team_has_ball`` are updated in place.

        Args:
            players: All players on the pitch. They are used as dictionary
                keys, so they must be hashable.

        Returns:
            dict mapping every player to the reward accumulated from collision
            outcomes during this call (0 when the player was not involved).
        """
        # Dictionary to track positions and players at each position
        position_map = {}
        
        # Group players by position
        for player in players:
            position = player.position
            if position in position_map:
                position_map[position].append(player)
            else:
                position_map[position] = [player]
        
        # Track rewards from collisions to return to the players
        collision_rewards = {player: 0 for player in players}
        
        # Process collisions (positions with more than one player)
        for position, players_at_position in position_map.items():
            if len(players_at_position) > 1:
                # Determine ball possession before collision
                ball_at_position = (self.ball_pos == position)
                ball_carrier = None
                
                # Check if any player at this position has the ball
                # (the first one found is treated as the carrier)
                for p in players_at_position:
                    if p.has_ball:
                        ball_carrier = p
                        break
                
                # If one player has the ball and there are opponents, simulate tackle/dribble
                if ball_carrier is not None:
                    opponents = [p for p in players_at_position if p.team != ball_carrier.team]
                    
                    if opponents:
                        # Reduce to a one-on-one duel: with several opponents, one is picked at random
                        defender = opponents[0] if len(opponents) == 1 else random.choice(opponents)
                        
                        # Determine outcome based on skills
                        # Ball carrier tries to dribble past defender
                        dribble_success = random.random() < ball_carrier.dribble_success_prob
                        # Defender tries to tackle ball carrier
                        tackle_success = random.random() < defender.tackle_success_prob
                        
                        # The ball changes hands only when the tackle roll succeeds
                        # AND the dribble roll fails; every other combination keeps
                        # the ball with the carrier.
                        if tackle_success and not dribble_success:
                            # Defender successfully tackles
                            previous_carrier = ball_carrier
                            ball_carrier.has_ball = False
                            defender.has_ball = True
                            self.ball_pos = defender.position
                            self.team_has_ball = defender.team
                            
                            # Apply rewards/penalties
                            collision_rewards[defender] += defender.ball_gained_reward
                            collision_rewards[previous_carrier] += previous_carrier.ball_lost_penalty
                            
                            # Update ball carrier reference
                            ball_carrier = defender
                        else:
                            # Attacker keeps the ball
                            collision_rewards[ball_carrier] += ball_carrier.dribble_success_prob * 2  # Small reward scaled by the carrier's dribble skill
                    
                # If there was no ball carrier but the ball is at this position,
                # players try to get the ball based on tackle probabilities
                elif ball_at_position:
                    # Group players by team
                    team_players = {}
                    for p in players_at_position:
                        if p.team in team_players:
                            team_players[p.team].append(p)
                        else:
                            team_players[p.team] = [p]
                    
                    # If there are players from both teams, they compete for the ball
                    if len(team_players) > 1:
                        # Calculate team tackle probabilities (average of all players)
                        team_tackle_probs = {}
                        for team, team_list in team_players.items():
                            team_tackle_probs[team] = sum(p.tackle_success_prob for p in team_list) / len(team_list)
                        
                        # Normalize probabilities so they sum to 1 (skipped when all are 0)
                        total_prob = sum(team_tackle_probs.values())
                        if total_prob > 0:
                            for team in team_tackle_probs:
                                team_tackle_probs[team] /= total_prob
                        
                        # Determine which team gets the ball by sampling the
                        # cumulative distribution with a single random draw
                        rand_val = random.random()
                        cumulative_prob = 0
                        winning_team = list(team_tackle_probs.keys())[0]  # Default if no bucket matches
                        
                        for team, prob in team_tackle_probs.items():
                            cumulative_prob += prob
                            if rand_val <= cumulative_prob:
                                winning_team = team
                                break
                        
                        # Choose a random player from the winning team
                        ball_carrier = random.choice(team_players[winning_team])
                        ball_carrier.has_ball = True
                        self.ball_pos = ball_carrier.position
                        self.team_has_ball = ball_carrier.team
                        
                        # Apply rewards
                        for p in players_at_position:
                            if p.team == winning_team:
                                collision_rewards[p] += p.ball_gained_reward * 0.5  # Each winning-team player here gets half of its own ball_gained_reward
                    else:
                        # Only players from one team, randomly choose one to get the ball
                        only_team = list(team_players.keys())[0]
                        ball_carrier = random.choice(team_players[only_team])
                        ball_carrier.has_ball = True
                        self.ball_pos = ball_carrier.position
                        self.team_has_ball = ball_carrier.team
                
                # Resolve position conflicts
                # The ball carrier keeps the cell; without a carrier a random player keeps it
                staying_player = ball_carrier if ball_carrier else random.choice(players_at_position)
                
                # Move all players except the staying one back to their previous positions
                # NOTE(review): the previous position is not checked for being free,
                # so a displaced player could land on another occupied cell.
                for player in players_at_position:
                    if player != staying_player:
                        if player.prev_position is not None:
                            player.position = player.prev_position
                        else:
                            # If no previous position, find a nearby empty space
                            self._find_empty_space(player, players)
        
        # Update all players' ball possession status based on final positions
        for player in players:
            player.has_ball = (player.position == self.ball_pos)
            
            # Update counters for with/without ball
            if player.has_ball:
                player.moves_with_ball += 1
                player.moves_without_ball = 0
            else:
                player.moves_without_ball += 1
                player.moves_with_ball = 0
        
        return collision_rewards
    def _find_empty_space(self, player, all_players):
        """
        Relocate ``player`` to the first free cell around its current position.

        The eight neighbouring cells are tried first, then every offset inside
        squares of half-width 2, 3 and 4. A cell is free when it lies on the
        grid, is not a wall ("O") and no player in ``all_players`` stands on
        it. If nothing is free the player stays put and a warning is printed.
        """
        taken = {other.position for other in all_players}
        origin = player.position

        def candidate_offsets():
            # Immediate neighbours first, in a fixed order ...
            yield from ((0, -1), (1, 0), (0, 1), (-1, 0),
                        (-1, -1), (-1, 1), (1, -1), (1, 1))
            # ... then progressively larger squares, scanned row by row.
            for radius in range(2, 5):
                for d_row in range(-radius, radius + 1):
                    for d_col in range(-radius, radius + 1):
                        if d_row != 0 or d_col != 0:
                            yield d_row, d_col

        for d_row, d_col in candidate_offsets():
            row, col = origin[0] + d_row, origin[1] + d_col
            if not (0 <= row < self.grid_rows and 0 <= col < self.grid_cols):
                continue
            if self.layout[row, col] == "O" or (row, col) in taken:
                continue
            player.position = (row, col)
            return

        # If no position found, keep player at current position and log the issue
        print(f"Warning: Could not find empty space for player at {player.position}")




In [2]:
# player class
class Player:
    """A footballer with role-specific reward shaping and duel skills.

    Attributes set on every instance: ``role``, ``team``, ``position``,
    ``prev_position``, ``has_ball``, the move counters, the goal symbols and
    the reward/skill parameters (shared ones plus those of the chosen role).
    """

    # Parameters that differ between roles.
    _ROLE_PARAMS = {
        'ATK': {
            'goal_reward': 1500,
            'ball_possession_bonus': 0.2,
            'opp_scoring': -50,
            'no_possession': -0.005,
            'dribble_success_prob': 0.7,
            'tackle_success_prob': 0.3,
            'ball_gained_reward': 10,
            'ball_lost_penalty': -5,
        },
        'DF': {
            'goal_reward': 300,
            'ball_possession_bonus': 0.15,
            'opp_scoring': -500,
            'no_possession': -0.002,
            'dribble_success_prob': 0.4,
            'tackle_success_prob': 0.6,
            'ball_gained_reward': 15,
            'ball_lost_penalty': -3,
        },
    }

    def __init__(self, role, team, env):
        """Create a player.

        Args:
            role: ``'ATK'`` or ``'DF'``.
            team: Team id; team 0 starts on the left quarter line, any other
                team on the right one.
            env: Object exposing ``grid_rows`` and ``grid_cols``.

        Raises:
            ValueError: If ``role`` is not a known role. (Previously such a
                player was created without any attributes and failed later.)
        """
        try:
            role_params = self._ROLE_PARAMS[role]
        except KeyError:
            known = ", ".join(sorted(self._ROLE_PARAMS))
            raise ValueError(f"Unknown role {role!r}; expected one of: {known}") from None

        self.role = role
        self.team = team
        start_col = env.grid_cols // 4 if team == 0 else 3 * env.grid_cols // 4
        self.position = (env.grid_rows // 2, start_col)
        self.prev_position = None
        self.has_ball = False
        self.moves_without_ball = 0
        self.moves_with_ball = 0

        # Cell symbols of the goal this team scores in and of the opposing one.
        self.goal_sym = 'G' if team == 0 else 'g'
        self.opp_goal_sym = 'g' if team == 0 else 'G'

        # Reward parameters shared by every role.
        self.step_penalty = -0.03
        self.near_ball_bonus = 0.001
        self.near_goal_bonus = 0.002

        # Role-specific rewards and duel probabilities.
        for name, value in role_params.items():
            setattr(self, name, value)

  and should_run_async(code)


In [3]:
# dqn network
class Original_DQN(nn.Module):
    """Four-layer MLP Q-network (input -> 128 -> 128 -> 64 -> actions).

    The layer attribute names ``fc1`` .. ``fc4`` define the ``state_dict``
    keys, so they must stay as they are for checkpoints to load.
    """

    def __init__(self, input_dim, action_size):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, 128)
        self.fc2 = nn.Linear(128, 128)
        self.fc3 = nn.Linear(128, 64)
        self.fc4 = nn.Linear(64, action_size)

        # Xavier-uniform weights and zero biases on every layer.
        for linear in (self.fc1, self.fc2, self.fc3, self.fc4):
            nn.init.xavier_uniform_(linear.weight)
            nn.init.zeros_(linear.bias)

    def forward(self, x):
        """Return one Q-value per action for each row of ``x``."""
        hidden = x
        for linear in (self.fc1, self.fc2, self.fc3):
            hidden = F.relu(linear(hidden))
        return self.fc4(hidden)

class DQN(nn.Module):
    """Q-network that reuses pre-trained hidden layers with fresh input/output layers.

    ``fc1`` and ``fc4`` are new layers sized for the new state/action spaces;
    ``fc2`` and ``fc3`` start from the weights of the supplied layers.
    """

    def __init__(self, new_input_dim, new_action_size,old_fc2,old_fc3):
        """Build the network.

        Args:
            new_input_dim: Size of the observation vector.
            new_action_size: Number of discrete actions.
            old_fc2: Pre-trained ``nn.Linear`` feeding from 128 features.
            old_fc3: Pre-trained ``nn.Linear`` producing 64 features.
        """
        super(DQN, self).__init__()
        print(new_input_dim,new_action_size)
        self.fc1 = nn.Linear(new_input_dim, 128)
        # Copy the pre-trained layers instead of storing the passed objects.
        # Storing them directly made every DQN built from the same layers
        # (online network, target network, other agents) share one set of
        # parameters, so the target network could never lag behind the
        # online one for these layers.
        self.fc2 = self._clone_linear(old_fc2)
        self.fc3 = self._clone_linear(old_fc3)
        self.fc4 = nn.Linear(64, new_action_size)

        nn.init.xavier_uniform_(self.fc1.weight)
        nn.init.zeros_(self.fc1.bias)
        nn.init.xavier_uniform_(self.fc4.weight)
        nn.init.zeros_(self.fc4.bias)

    @staticmethod
    def _clone_linear(layer):
        """Return an independent ``nn.Linear`` holding a copy of ``layer``'s weights."""
        clone = nn.Linear(layer.in_features, layer.out_features,
                          bias=layer.bias is not None)
        clone.load_state_dict(layer.state_dict())
        return clone

    def forward(self, x):
        """Return one Q-value per action for each row of ``x``."""
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = F.relu(self.fc3(x))
        return self.fc4(x)

class DQNAgent:
    """Epsilon-greedy DQN learner with a replay buffer and a target network."""

    def __init__(self, state_size, action_size,old_fc2,old_fc3):
        """Set hyper-parameters and build the online/target networks.

        Args:
            state_size: Length of the observation vector.
            action_size: Number of discrete actions.
            old_fc2: Pre-trained layer handed to ``DQN``.
            old_fc3: Pre-trained layer handed to ``DQN``.
        """
        self.state_size = state_size
        self.action_size = action_size

        # Discounting and exploration schedule.
        self.gamma = 0.99
        self.epsilon = 1.0
        self.epsilon_min = 0.02
        self.epsilon_decay = 0.99

        # Optimisation and replay settings.
        self.learning_rate = 0.0005
        self.memory = deque(maxlen=200000)
        self.batch_size = 256
        self.target_update_freq = 2

        self.device = device
        self.model = DQN(state_size, action_size, old_fc2, old_fc3).to(self.device)
        self.target_model = DQN(state_size, action_size, old_fc2, old_fc3).to(self.device)
        self.update_target_model()
        self.optimizer = optim.Adam(self.model.parameters(), lr=self.learning_rate)

        self.rewards_history = []
        self.episode_count = 0

    def remember(self, state, action, reward, next_state, done):
        """Append one transition to the replay buffer."""
        transition = (state, action, reward, next_state, done)
        self.memory.append(transition)

    def act(self, state, evaluate=False):
        """Pick an action: random with probability epsilon, else greedy.

        With ``evaluate=True`` the choice is always greedy.
        """
        explore = (not evaluate) and np.random.rand() <= self.epsilon
        if explore:
            return random.randrange(self.action_size)

        batch_of_one = torch.FloatTensor(state).unsqueeze(0).to(self.device)
        with torch.no_grad():
            q_values = self.model(batch_of_one)
        return torch.argmax(q_values).item()

    def replay(self):
        """Run one gradient step on a random minibatch.

        Returns:
            The loss value, or 0 when the buffer holds fewer than
            ``batch_size`` transitions.
        """
        if len(self.memory) < self.batch_size:
            return 0

        minibatch = random.sample(self.memory, self.batch_size)
        state_b, action_b, reward_b, next_state_b, done_b = zip(*minibatch)

        states = torch.FloatTensor(np.array(state_b)).to(self.device)
        actions = torch.LongTensor(list(action_b)).to(self.device)
        rewards = torch.FloatTensor(list(reward_b)).to(self.device)
        next_states = torch.FloatTensor(np.array(next_state_b)).to(self.device)
        dones = torch.FloatTensor(list(done_b)).to(self.device)

        # Q(s, a) for the actions actually taken.
        predicted = self.model(states).gather(1, actions.unsqueeze(1)).squeeze()
        # Bootstrapped target from the target network; terminal transitions
        # contribute the immediate reward only.
        best_next = self.target_model(next_states).max(1)[0].detach()
        target = rewards + (1 - dones) * self.gamma * best_next

        loss = F.mse_loss(predicted, target)
        self.optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1.0)
        self.optimizer.step()
        return loss.item()

    def update_target_model(self):
        """Copy the online network's weights into the target network."""
        self.target_model.load_state_dict(self.model.state_dict())

    def decay_epsilon(self):
        """Multiply epsilon by the decay factor while it is above the floor."""
        if self.epsilon <= self.epsilon_min:
            return
        self.epsilon *= self.epsilon_decay

In [None]:
#training
def train_multi_agent(players, episodes, max_steps):
    """Train one DQN agent per player on a fresh ``FootballEnv``.

    Each turn every player acts once, collisions are resolved, rewards are
    adjusted, and each agent stores its transition and runs a replay step.

    Args:
        players: Players taking part; ``players[i]`` is controlled by agent ``i``.
        episodes: Number of episodes to run.
        max_steps: Maximum number of turns per episode.

    Returns:
        The list of trained ``DQNAgent`` objects, in ``players`` order.
    """
    # Index of the ``has_ball`` flag in the vector built by
    # FootballEnv._get_state: it is the fourth entry from the end
    # (has_ball, team_has_ball, goal_x, goal_y close the vector), which holds
    # for any number of players. The original code read index 6, which is the
    # ball row once three players are on the pitch.
    has_ball_idx = -4

    env = FootballEnv()
    pre_trained_model_path = "/kaggle/input/codeee/dqn_football_final.pth"
    # NOTE: torch.load unpickles the file; only load checkpoints you trust.
    checkpoint = torch.load(pre_trained_model_path, map_location=torch.device("cpu"))

    pre_trained_model = Original_DQN(7, 8)
    pre_trained_model.load_state_dict(checkpoint["model_state_dict"])
    pre_trained_model.eval()

    # The hidden layers of the pre-trained network seed every new agent.
    old_fc2 = pre_trained_model.fc2
    old_fc3 = pre_trained_model.fc3
    agents = [DQNAgent(6 + len(players) * 2, 8, old_fc2, old_fc3) for _ in players]

    for episode in range(episodes):
        # episode_num must go by keyword: the second positional parameter of
        # reset() is ``seed``, so passing it positionally left the
        # ball-placement curriculum stuck at episode 0.
        env.reset(players, episode_num=episode)
        states = [env._get_state(p, players) for p in players]
        total_rewards = [0.0 for _ in players]
        done = False
        truncated = False
        goal_team = None

        for _ in range(max_steps):
            if done or truncated:
                break

            actions = [agent.act(state) for agent, state in zip(agents, states)]
            experiences = [None] * len(players)
            # True for players whose env.step() call already included the
            # goal reward, so it is not added a second time below.
            goal_reward_in_step = [False] * len(players)
            any_player_done = False

            # First, all players take their actions independently.
            for i, player in enumerate(players):
                # Remember where the player came from; collision resolution
                # sends displaced players back there.
                player.prev_position = player.position

                next_state, reward, player_done, player_truncated, info = env.step(
                    actions[i], player, players
                )
                experiences[i] = (states[i], actions[i], reward, next_state,
                                  player_done or player_truncated)

                if player_done or player_truncated:
                    done = player_done
                    truncated = player_truncated
                    any_player_done = True

                step_goal = info.get('goal_team')
                if step_goal is not None:
                    goal_team = step_goal
                    # step() reports done together with a goal team exactly
                    # when it applied the goal reward to this player.
                    goal_reward_in_step[i] = bool(player_done)

            # If the game is still running, resolve collisions and refresh
            # every player's next state.
            if not any_player_done:
                env.resolve_collisions(players)

                for i, player in enumerate(players):
                    state, action, reward, _, terminal = experiences[i]
                    updated_state = env._get_state(player, players)

                    had_ball = bool(state[has_ball_idx])
                    if not had_ball and player.has_ball:
                        # Gained the ball during this turn.
                        reward += 10
                    elif had_ball and not player.has_ball:
                        # Lost the ball during this turn.
                        reward -= 10

                    experiences[i] = (state, action, reward, updated_state, terminal)
                    states[i] = updated_state

            # A goal ends the episode for everybody; players whose own step
            # did not already include the goal reward receive it here.
            if goal_team is not None:
                for i, player in enumerate(players):
                    state, action, reward, next_state, _ = experiences[i]
                    if not goal_reward_in_step[i]:
                        if player.team == goal_team:
                            reward += player.goal_reward
                        else:
                            reward += player.opp_scoring
                    experiences[i] = (state, action, reward, next_state, True)

            # Store experiences in replay memory and train. The episode total
            # uses the final reward of every turn, including the last one.
            for i, agent in enumerate(agents):
                total_rewards[i] += experiences[i][2]
                agent.remember(*experiences[i])
                if len(agent.memory) >= agent.batch_size:
                    agent.replay()

        # Update target models and decay epsilon once per episode.
        for idx, agent in enumerate(agents):
            agent.episode_count += 1
            if agent.episode_count % agent.target_update_freq == 0:
                agent.update_target_model()
            agent.decay_epsilon()
            agent.rewards_history.append(total_rewards[idx])

        # Print progress
        if episode % 1000 == 0:
            print(total_rewards)
            print(f"Episode: {episode}, Avg Rewards: {np.mean(total_rewards):.2f}, Epsilon: {agents[0].epsilon:.2f}")

    return agents

# Line-up: one team-0 attacker against a team-1 defender and a team-1 attacker.
env = FootballEnv()
players = [
    Player(role, team, env)
    for role, team in (('ATK', 0), ('DF', 1), ('ATK', 1))
]
trained_agents = train_multi_agent(players, episodes=7001, max_steps=500)

In [None]:
# Evaluating the agents
def evaluate_multi_agent(agents, env, players, episodes=1, max_steps=3000, render=True):
    """Run greedy (no exploration) episodes and report per-player rewards.

    Args:
        agents: Trained agents; ``agents[i]`` controls ``players[i]``.
        env: Environment to play in.
        players: Players taking part.
        episodes: Number of evaluation episodes.
        max_steps: Maximum number of turns per episode.
        render: When True, draw the pitch before every turn and pause 0.1 s.

    Returns:
        ``np.mean`` of the per-episode reward lists along axis 0, i.e. one
        average total reward per player.
    """
    episode_rewards = []

    # Print state dimensions for debugging
    sample_state = env._get_state(players[0], players)
    print(f"Evaluation state size: {len(sample_state)}")
    print(f"Agent input size: {agents[0].state_size}")

    for episode in range(episodes):
        # episode_num must go by keyword: the second positional parameter of
        # reset() is ``seed``, so the positional call never reached the
        # late-curriculum ball placement it asked for.
        env.reset(players, episode_num=10000)
        total_rewards = [0.0 for _ in players]
        done = False
        truncated = False
        goal_team = None

        for _ in range(max_steps):
            if done or truncated:
                break

            if render:
                env.render(players)
                time.sleep(0.1)

            states = [env._get_state(player, players) for player in players]
            actions = [agent.act(state, evaluate=True) for agent, state in zip(agents, states)]
            # True for players whose env.step() call already included the
            # goal reward, so it is not counted twice below.
            goal_reward_in_step = [False] * len(players)
            any_player_done = False

            # First, all players take their actions independently.
            for i, player in enumerate(players):
                # Remember where the player came from; collision resolution
                # sends displaced players back there.
                player.prev_position = player.position

                _, reward, player_done, player_truncated, info = env.step(
                    actions[i], player, players
                )
                total_rewards[i] += reward

                if player_done or player_truncated:
                    done = player_done
                    truncated = player_truncated
                    any_player_done = True

                step_goal = info.get('goal_team')
                if step_goal is not None:
                    goal_team = step_goal
                    # step() reports done together with a goal team exactly
                    # when it applied the goal reward to this player.
                    goal_reward_in_step[i] = bool(player_done)

            # If the game is still running, resolve collisions.
            if not any_player_done:
                env.resolve_collisions(players)

                # NOTE(review): this flat possession bonus differs from the
                # collision rewards used during training; adjust if the two
                # are meant to match.
                for i, player in enumerate(players):
                    if player.has_ball:
                        total_rewards[i] += 0.5

            for i, player in enumerate(players):
                print(player.position, player.has_ball, i, actions[i])

            # A goal ends the episode; players whose own step did not already
            # include the goal reward receive it here.
            if goal_team is not None:
                for i, player in enumerate(players):
                    if goal_reward_in_step[i]:
                        continue
                    if player.team == goal_team:
                        total_rewards[i] += player.goal_reward
                    else:
                        total_rewards[i] += player.opp_scoring

        episode_rewards.append(total_rewards)
        print(f"Episode {episode+1} Total Rewards: {total_rewards}")

    return np.mean(episode_rewards, axis=0)


# Play a single rendered evaluation episode with the agents trained above.
avg_rewards = evaluate_multi_agent(trained_agents, env, players, episodes=1, render=True)
print(f"Average rewards across evaluation episodes: {avg_rewards}")

In [2]:
# Scratch cell: prints a fixed string.
print("Hwllo")

Hwllo
