In [1]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import random
import gymnasium as gym
from collections import deque
import time

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [2]:
class FootballEnv(gym.Env):
    def __init__(self, grid_rows=50, grid_cols=50):
        super(FootballEnv, self).__init__()
        self.grid_rows = grid_rows
        self.grid_cols = grid_cols

        self.action_space = gym.spaces.Discrete(10)  # Actions for the first player
        self.gk_action_space = gym.spaces.Discrete(12)  # Actions for the goalkeeper
        self.observation_space = gym.spaces.Box(low=0, high=1, shape=(11,), dtype=np.float32)  # Updated state size

        # Initialize field layout
        self.layout = np.zeros((grid_rows, grid_cols), dtype=str)
        self.layout[:, :] = "."
        self.layout[self.grid_rows//2, self.grid_cols//2] = "C"
        self.layout[self.grid_rows//2-8 : self.grid_rows//2+9, -6:-1] = "D"
        self.layout[self.grid_rows//2-8 : self.grid_rows//2+9, 0:5] = "d"
        self.layout[:, self.grid_cols//2] = "M"
        self.layout[:, -1] = "O"
        self.layout[:, 0] = "O"
        self.layout[0, :] = "O"
        self.layout[-1, :] = "O"
        self.layout[self.grid_rows//2-4 : self.grid_rows//2+5, -1] = "G"
        self.layout[self.grid_rows//2-4 : self.grid_rows//2+5, 0] = "g"

        # Initialize positions
        self.player_pos = (self.grid_rows//2, 5)
        self.gk_pos = (self.grid_rows//2, self.grid_cols - 5)  # Goalkeeper position
        self.ball_pos = (self.grid_rows//2, self.grid_cols//2)
        self.has_ball = False
        self.gk_has_ball = False
        self.episode_steps = 0

        # Reward parameters
        self.goal_reward = 50
        self.step_penalty = -0.001
        self.ball_possession_bonus = 0.005
        self.near_ball_bonus = 0.0001
        self.near_goal_bonus = 0.0002
        self.gk_stop_ball_reward = 10  # Reward for blocking or diving successfully
        self.gk_lose_ball_penalty = -5  # Penalty for losing the ball
        self.gk_hold_ball_reward = 20   # Reward for holding the ball

    def _get_state(self):
        # Enhanced state representation including has_ball flag and goalkeeper position
        return np.array([
            self.player_pos[0] / (self.grid_rows - 1),
            self.player_pos[1] / (self.grid_cols - 1),
            self.ball_pos[0] / (self.grid_rows - 1),
            self.ball_pos[1] / (self.grid_cols - 1),
            float(self.has_ball),  # Add has_ball as explicit state feature
            self.gk_pos[0] / (self.grid_rows - 1),
            self.gk_pos[1] / (self.grid_cols - 1),
            float(self.gk_has_ball),  # Add gk_has_ball as explicit state feature
            self.grid_rows // 2 / (self.grid_rows - 1),  # Goal Y position
            (self.grid_cols - 1) / (self.grid_cols - 1),  # Goal X position
            self.grid_cols // 2 / (self.grid_cols - 1)  # Center X position
        ], dtype=np.float32)

    def reset(self, seed=None, options=None):
        super().reset(seed=seed)
        
        # Start with player on left side of field
        player_row = random.randint(self.grid_rows//2-3, self.grid_rows//2+3)
        player_col = random.randint(1, self.grid_cols//4)
        self.player_pos = (player_row, player_col)
        
        # Place ball near player to start
        ball_row = random.randint(max(1, player_row-3), min(self.grid_rows-2, player_row+3))
        ball_col = random.randint(max(1, player_col-3), min(self.grid_cols//3, player_col+3))
        self.ball_pos = (ball_row, ball_col)
        
        # Initialize goalkeeper position
        self.gk_pos = (self.grid_rows//2, self.grid_cols - 5)
        
        self.has_ball = (self.player_pos == self.ball_pos)
        self.gk_has_ball = False
        self.episode_steps = 0
        return self._get_state(), {}

    def step(self, action):
        self.episode_steps += 1
        reward = self.step_penalty
        gk_reward = 0
        done = False

        # Player action handling (unchanged)
        if action < 8:  # Movement actions
            dx, dy = [(0, -1), (1, 0), (0, 1), (-1, 0), (-1, -1), (-1, 1), (1, -1), (1, 1)][action]
            new_pos = (self.player_pos[0] + dx, self.player_pos[1] + dy)

            if 0 <= new_pos[0] < self.grid_rows and 0 <= new_pos[1] < self.grid_cols and self.layout[new_pos] != "O":
                self.player_pos = new_pos
                if self.has_ball:
                    self.ball_pos = new_pos
            else:
                reward -= 20  # Massive penalty
                done = True  # End episode if the player goes out
                return self._get_state(), reward, gk_reward, done, True, {}

        elif action == 8 and self.has_ball:  # Long shot
            goal_y_center = self.grid_rows // 2
            target_y = min(max(goal_y_center + random.randint(-2, 2), 0), self.grid_rows-1)
            new_ball_col = min(self.ball_pos[1] + 10, self.grid_cols - 1)
            self.ball_pos = (target_y, new_ball_col)
            self.has_ball = False

        elif action == 9 and self.has_ball:  # Short pass
            new_ball_col = min(self.ball_pos[1] + 5, self.grid_cols - 1)
            self.ball_pos = (self.ball_pos[0], new_ball_col)
            self.has_ball = False

        # Check if player gets the ball
        self.has_ball = self.player_pos == self.ball_pos

        # Goalkeeper action handling
        gk_action = self.gk_action_space.sample()  # Random action for now
        if gk_action < 8:  # Movement actions
            dx, dy = [(0, -1), (1, 0), (0, 1), (-1, 0), (-1, -1), (-1, 1), (1, -1), (1, 1)][gk_action]
            new_gk_pos = (self.gk_pos[0] + dx, self.gk_pos[1] + dy)

            if 0 <= new_gk_pos[0] < self.grid_rows and 0 <= new_gk_pos[1] < self.grid_cols and self.layout[new_gk_pos] != "O":
                self.gk_pos = new_gk_pos

        elif gk_action == 8:  # Block
            if np.linalg.norm(np.array(self.gk_pos) - np.array(self.ball_pos)) <= 2:  # Block range
                self.gk_pos = self.ball_pos
                self.gk_has_ball = True
                self.has_ball = False
                gk_reward += self.gk_stop_ball_reward + self.gk_hold_ball_reward
                done = True

        # elif gk_action == 9:  # Dive Left
        #     if self.ball_pos[1] < self.gk_pos[1]:  # Ball is to the left
        #         gk_reward += self.gk_stop_ball_reward
        #         self.gk_has_ball = True
        #         self.has_ball = False

        # elif gk_action == 10:  # Dive Right
        #     if self.ball_pos[1] > self.gk_pos[1]:  # Ball is to the right
        #         gk_reward += self.gk_stop_ball_reward
        #         self.gk_has_ball = True
        #         self.has_ball = False

        # elif gk_action == 11:  # Clear
        #     if self.gk_has_ball:
        #         self.ball_pos = (self.gk_pos[0], max(0, self.gk_pos[1] - 10))  # Kick ball away
        #         self.gk_has_ball = False
        #         gk_reward += self.gk_hold_ball_reward

        # Check if goalkeeper gets the ball (only if not already handled by block/dive)
        if not self.gk_has_ball and self.gk_pos == self.ball_pos:
            self.gk_has_ball = True
            self.has_ball = False
            gk_reward += self.gk_stop_ball_reward

        # Check if goalkeeper loses the ball
        if self.gk_has_ball and self.gk_pos != self.ball_pos:
            self.gk_has_ball = False
            gk_reward += self.gk_lose_ball_penalty

        # Check if goalkeeper holds the ball (only if not already handled by block/dive)
        if self.gk_has_ball and self.gk_pos == self.ball_pos:
            gk_reward += self.gk_hold_ball_reward

        # Debugging: Print GK reward and action
        #print(f"GK Action: {gk_action}, GK Reward: {gk_reward}")

        truncated = self.episode_steps >= 3000
        return self._get_state(), reward, gk_reward, done, truncated, {}

    def render(self):
        grid = np.full((self.grid_rows, self.grid_cols), '-')
        
        # Draw field elements
        for i in range(self.grid_rows):
            for j in range(self.grid_cols):
                if self.layout[i,j] == 'O':
                    grid[i,j] = '#'
                elif self.layout[i,j] == 'G':
                    grid[i,j] = '|'
                elif self.layout[i,j] == 'M':
                    grid[i,j] = '.'
        
        # Draw player, goalkeeper, and ball
        grid[self.player_pos[0], self.player_pos[1]] = 'P'
        grid[self.gk_pos[0], self.gk_pos[1]] = 'G'
        if not self.has_ball and not self.gk_has_ball:
            grid[self.ball_pos[0], self.ball_pos[1]] = 'o'
        
        # Print the grid
        print('-' * (self.grid_cols + 2))
        for row in grid:
            print('|' + ''.join(row) + '|')
        print('-' * (self.grid_cols + 2))
        print(f"Has ball: {self.has_ball}, GK has ball: {self.gk_has_ball}, Steps: {self.episode_steps}")

In [3]:
class DQN(nn.Module):
    def __init__(self, input_dim, action_size):
        super(DQN, self).__init__()
        self.fc1 = nn.Linear(input_dim, 128)
        self.fc2 = nn.Linear(128, 128)
        self.fc3 = nn.Linear(128, 64)
        self.fc4 = nn.Linear(64, action_size)
        
        # Initialize weights with better defaults
        for layer in [self.fc1, self.fc2, self.fc3, self.fc4]:
            nn.init.xavier_uniform_(layer.weight)
            nn.init.zeros_(layer.bias)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = F.relu(self.fc3(x))
        return self.fc4(x)


class DQNAgent:
    def __init__(self, state_size, action_size):
        self.state_size = state_size
        self.action_size = action_size
        self.gamma = 0.99  # Discount factor
        self.epsilon = 1.0  # Exploration rate
        self.epsilon_min = 0.05
        self.epsilon_decay = 0.995
        self.learning_rate = 0.0003
        self.memory = deque(maxlen=100000)
        self.batch_size = 256
        self.target_update_freq = 5  # Update target network every N episodes
        
        self.device = device
        self.model = DQN(state_size, action_size).to(self.device)
        self.target_model = DQN(state_size, action_size).to(self.device)
        self.target_model.load_state_dict(self.model.state_dict())
        self.optimizer = optim.Adam(self.model.parameters(), lr=self.learning_rate)
        
        self.rewards_history = []
        self.episode_count = 0

    def remember(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state, evaluate=False):
        if not evaluate and np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        
        state = torch.FloatTensor(state).unsqueeze(0).to(self.device)
        with torch.no_grad():
            q_values = self.model(state)
        return torch.argmax(q_values).item()

    def replay(self):
        if len(self.memory) < self.batch_size:
            return 0
            
        # Sample minibatch from memory
        minibatch = random.sample(self.memory, self.batch_size)
        
        states = torch.FloatTensor([experience[0] for experience in minibatch]).to(self.device)
        actions = torch.LongTensor([experience[1] for experience in minibatch]).to(self.device)
        rewards = torch.FloatTensor([experience[2] for experience in minibatch]).to(self.device)
        next_states = torch.FloatTensor([experience[3] for experience in minibatch]).to(self.device)
        dones = torch.FloatTensor([experience[4] for experience in minibatch]).to(self.device)
        
        # Current Q values
        curr_q_values = self.model(states).gather(1, actions.unsqueeze(1))
        
        # Target Q values
        with torch.no_grad():
            next_q_values = self.target_model(next_states).max(1)[0]
        
        # Compute target
        target_q_values = rewards + (1 - dones) * self.gamma * next_q_values
        
        # Compute loss
        loss = F.mse_loss(curr_q_values.squeeze(), target_q_values)
        
        # Backpropagation
        self.optimizer.zero_grad()
        loss.backward()
        # Gradient clipping to prevent exploding gradients
        torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1.0)  
        self.optimizer.step()
        
        return loss.item()

    def update_target_model(self):
        self.target_model.load_state_dict(self.model.state_dict())

    def decay_epsilon(self):
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

    def save(self, filepath):
        torch.save({
            'model_state_dict': self.model.state_dict(),
            'optimizer_state_dict': self.optimizer.state_dict(),
            'epsilon': self.epsilon,
            'episode_count': self.episode_count,
            'rewards_history': self.rewards_history
        }, filepath)
        print(f"Model saved to {filepath}")

    def load(self, filepath):
        checkpoint = torch.load(filepath)
        self.model.load_state_dict(checkpoint['model_state_dict'])
        self.target_model.load_state_dict(checkpoint['model_state_dict'])
        self.optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        self.epsilon = checkpoint['epsilon']
        self.episode_count = checkpoint['episode_count']
        self.rewards_history = checkpoint['rewards_history']
        print(f"Model loaded from {filepath}")

    def train(self, env, episodes, max_steps=2000, save_freq=50, render_freq=20):
        for episode in range(episodes):
            state, _ = env.reset()
            total_reward = 0
            done = False
            truncated = False
            losses = []
            
            for step in range(max_steps):
                action = self.act(state)
                next_state, reward, done, truncated, _ = env.step(action)
                total_reward += reward
                
                # Store experience in memory
                self.remember(state, action, reward, next_state, done or truncated)
                
                # Train model with replay
                if len(self.memory) >= self.batch_size:
                    loss = self.replay()
                    losses.append(loss)
                
                state = next_state
                
                if done or truncated:
                    break
                    
            # Update target network periodically
            if episode % self.target_update_freq == 0:
                self.update_target_model()
                
            # Decay exploration rate
            self.decay_epsilon()
            
            # Record stats
            self.episode_count += 1
            self.rewards_history.append(total_reward)
            
            # Print episode statistics
            avg_loss = np.mean(losses) if losses else 0
            print(f"Episode {episode}: Reward = {total_reward:.1f}, Steps = {step+1}, Epsilon = {self.epsilon:.3f}, Avg Loss = {avg_loss:.5f}")
            
            # Save the model periodically
            if episode > 0 and episode % save_freq == 0:
                self.save(f"dqn_football_ep{episode}.pth")

            if episode % 10 == 0:
                avg_reward = np.mean(self.rewards_history[-10:])
                print(f"Last 10 episodes average reward: {avg_reward:.2f}")
                
            # Render occasionally to see progress
            # if episode % render_freq == 0:
            #     print(f"\n--- Episode {episode} Rendering ---")
            #     test_env = FootballEnv()
            #     self.evaluate(test_env, render=True)
                
    def evaluate(self, env, episodes=1, render=True):
        total_rewards = []
        
        for episode in range(episodes):
            state, _ = env.reset()
            total_reward = 0
            done = False
            truncated = False
            
            while not done and not truncated:
                action = self.act(state, evaluate=True)  # No exploration
                next_state, reward, done, truncated, _ = env.step(action)
                total_reward += reward
                
                if render:
                    print(action)
                    env.render()
                    time.sleep(0.5)  # Pause to make rendering visible
                    
                state = next_state
                
            total_rewards.append(total_reward)
            print(f"Evaluation episode {episode}: Reward = {total_reward}")
            
        return np.mean(total_rewards)

In [4]:
class GoalkeeperDQN(nn.Module):
    def __init__(self, input_dim, action_size):
        super(GoalkeeperDQN, self).__init__()
        self.fc1 = nn.Linear(input_dim, 128)
        self.fc2 = nn.Linear(128, 128)
        self.fc3 = nn.Linear(128, 64)
        self.fc4 = nn.Linear(64, action_size)
        
        # Initialize weights with better defaults
        for layer in [self.fc1, self.fc2, self.fc3, self.fc4]:
            nn.init.xavier_uniform_(layer.weight)
            nn.init.zeros_(layer.bias)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = F.relu(self.fc3(x))
        return self.fc4(x)


class GoalkeeperDQNAgent:
    def __init__(self, state_size, action_size):
        self.state_size = state_size
        self.action_size = action_size
        self.gamma = 0.99  # Discount factor
        self.epsilon = 1.0  # Exploration rate
        self.epsilon_min = 0.05
        self.epsilon_decay = 0.995
        self.learning_rate = 0.0003
        self.memory = deque(maxlen=100000)
        self.batch_size = 256
        self.target_update_freq = 5  # Update target network every N episodes
        
        self.device = device
        self.model = GoalkeeperDQN(state_size, action_size).to(self.device)
        self.target_model = GoalkeeperDQN(state_size, action_size).to(self.device)
        self.target_model.load_state_dict(self.model.state_dict())
        self.optimizer = optim.Adam(self.model.parameters(), lr=self.learning_rate)
        
        self.rewards_history = []
        self.episode_count = 0

    def remember(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state, evaluate=False):
        if not evaluate and np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        
        state = torch.FloatTensor(state).unsqueeze(0).to(self.device)
        with torch.no_grad():
            q_values = self.model(state)
        return torch.argmax(q_values).item()

    def replay(self):
        if len(self.memory) < self.batch_size:
            return 0
            
        # Sample minibatch from memory
        minibatch = random.sample(self.memory, self.batch_size)
        
        states = torch.FloatTensor([experience[0] for experience in minibatch]).to(self.device)
        actions = torch.LongTensor([experience[1] for experience in minibatch]).to(self.device)
        rewards = torch.FloatTensor([experience[2] for experience in minibatch]).to(self.device)
        next_states = torch.FloatTensor([experience[3] for experience in minibatch]).to(self.device)
        dones = torch.FloatTensor([experience[4] for experience in minibatch]).to(self.device)
        
        # Current Q values
        curr_q_values = self.model(states).gather(1, actions.unsqueeze(1))
        
        # Target Q values
        with torch.no_grad():
            next_q_values = self.target_model(next_states).max(1)[0]
        
        # Compute target
        target_q_values = rewards + (1 - dones) * self.gamma * next_q_values
        
        # Compute loss
        loss = F.mse_loss(curr_q_values.squeeze(), target_q_values)
        
        # Backpropagation
        self.optimizer.zero_grad()
        loss.backward()
        # Gradient clipping to prevent exploding gradients
        torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1.0)  
        self.optimizer.step()
        
        return loss.item()

    def update_target_model(self):
        self.target_model.load_state_dict(self.model.state_dict())

    def decay_epsilon(self):
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

    def save(self, filepath):
        torch.save({
            'model_state_dict': self.model.state_dict(),
            'optimizer_state_dict': self.optimizer.state_dict(),
            'epsilon': self.epsilon,
            'episode_count': self.episode_count,
            'rewards_history': self.rewards_history
        }, filepath)
        print(f"Goalkeeper model saved to {filepath}")

    def load(self, filepath):
        checkpoint = torch.load(filepath)
        self.model.load_state_dict(checkpoint['model_state_dict'])
        self.target_model.load_state_dict(checkpoint['model_state_dict'])
        self.optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        self.epsilon = checkpoint['epsilon']
        self.episode_count = checkpoint['episode_count']
        self.rewards_history = checkpoint['rewards_history']
        print(f"Goalkeeper model loaded from {filepath}")

In [11]:
class FootballTrainer:
    def __init__(self, env, player_agent, gk_agent, episodes=1000, max_steps=2000, save_freq=50):
        self.env = env
        self.player_agent = player_agent
        self.gk_agent = gk_agent
        self.episodes = episodes
        self.max_steps = max_steps
        self.save_freq = save_freq
        
        self.player_rewards_history = []
        self.gk_rewards_history = []
        
    def train(self):
        for episode in range(self.episodes):
            state, _ = self.env.reset()
            player_total_reward = 0
            gk_total_reward = 0
            done = False
            truncated = False
            player_losses = []
            gk_losses = []
            
            for step in range(self.max_steps):
                # Player action
                player_action = self.player_agent.act(state)
                
                # Step environment using player action
                # Check the actual return values from your env.step method
                next_state, player_reward, gk_reward, done, truncated, info = self.env.step(player_action)
                
                # Record rewards
                player_total_reward += player_reward
                gk_total_reward += gk_reward
                
                # Store player experience
                self.player_agent.remember(state, player_action, player_reward, next_state, done or truncated)
                
                # Store goalkeeper experience (using same state but with its own reward)
                # Get actual goalkeeper action (if your env tracks this)
                gk_action = self.env.gk_action_space.sample()  # Use your env's actual GK action if available
                self.gk_agent.remember(state, gk_action, gk_reward, next_state, done or truncated)
                
                # Train models with replay
                if len(self.player_agent.memory) >= self.player_agent.batch_size:
                    player_loss = self.player_agent.replay()
                    player_losses.append(player_loss)
                
                if len(self.gk_agent.memory) >= self.gk_agent.batch_size:
                    gk_loss = self.gk_agent.replay()
                    gk_losses.append(gk_loss)
                
                state = next_state
                
                if done or truncated:
                    break
                    
            # Update target networks periodically
            if episode % self.player_agent.target_update_freq == 0:
                self.player_agent.update_target_model()
                self.gk_agent.update_target_model()
                
            # Decay exploration rates
            self.player_agent.decay_epsilon()
            self.gk_agent.decay_epsilon()
            
            # Record stats
            self.player_agent.episode_count += 1
            self.gk_agent.episode_count += 1
            self.player_agent.rewards_history.append(player_total_reward)
            self.gk_agent.rewards_history.append(gk_total_reward)
            
            # Print episode statistics
            player_avg_loss = np.mean(player_losses) if player_losses else 0
            gk_avg_loss = np.mean(gk_losses) if gk_losses else 0
            
            print(f"Episode {episode}:")
            print(f"  Player: Reward = {player_total_reward:.1f}, Epsilon = {self.player_agent.epsilon:.3f}, Avg Loss = {player_avg_loss:.5f}")
            print(f"  Goalkeeper: Reward = {gk_total_reward:.1f}, Epsilon = {self.gk_agent.epsilon:.3f}, Avg Loss = {gk_avg_loss:.5f}")
            print(f"  Steps = {step+1}")
            
            # Save the models periodically
            if episode > 0 and episode % self.save_freq == 0:
                self.player_agent.save(f"player_dqn_football_ep{episode}.pth")
                self.gk_agent.save(f"goalkeeper_dqn_football_ep{episode}.pth")

            if episode % 10 == 0:
                player_avg_reward = np.mean(self.player_agent.rewards_history[-10:])
                gk_avg_reward = np.mean(self.gk_agent.rewards_history[-10:])
                print(f"Last 10 episodes average - Player: {player_avg_reward:.2f}, Goalkeeper: {gk_avg_reward:.2f}\n")
    
    def evaluate(self, episodes=5, render=True):
        player_total_rewards = []
        gk_total_rewards = []
        
        for episode in range(episodes):
            state, _ = self.env.reset()
            player_total_reward = 0
            gk_total_reward = 0
            done = False
            truncated = False
            
            step_count = 0
            while not done and not truncated and step_count < self.max_steps:
                # Get player action using trained policy
                player_action = self.player_agent.act(state, evaluate=True)
                
                # Step environment
                next_state, player_reward, gk_reward, done, truncated, _ = self.env.step(player_action)
                
                # Record rewards
                player_total_reward += player_reward
                gk_total_reward += gk_reward
                
                if render:
                    print(f"Step {step_count} - Player action: {player_action}")
                    self.env.render()
                    time.sleep(0.5)  # Pause to make rendering visible
                    
                state = next_state
                step_count += 1
                
            player_total_rewards.append(player_total_reward)
            gk_total_rewards.append(gk_total_reward)
            
            print(f"Evaluation episode {episode}:")
            print(f"  Player reward: {player_total_reward:.2f}")
            print(f"  Goalkeeper reward: {gk_total_reward:.2f}")
            print(f"  Steps: {step_count}")
            
        print("\nEvaluation Summary:")
        print(f"  Average player reward: {np.mean(player_total_rewards):.2f}")
        print(f"  Average goalkeeper reward: {np.mean(gk_total_rewards):.2f}")
        
        return np.mean(player_total_rewards), np.mean(gk_total_rewards)

In [12]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Set random seeds for reproducibility
random.seed(42)
np.random.seed(42)
torch.manual_seed(42)

# Initialize environment and agents
env = FootballEnv()
state_size = 11  # Based on the observation space in your environment
player_action_size = 10  # Based on the action space for the player
gk_action_size = 12  # Based on the action space for the goalkeeper

# Create agents
player_agent = DQNAgent(state_size, player_action_size)
gk_agent = GoalkeeperDQNAgent(state_size, gk_action_size)

# Create trainer
trainer = FootballTrainer(
    env=env,
    player_agent=player_agent,
    gk_agent=gk_agent,
    episodes=500,  # Number of training episodes
    max_steps=2000,  # Maximum steps per episode
    save_freq=50    # Save model every 50 episodes
)

Using device: cuda


In [13]:
# Start training
print("Starting training...")
trainer.train()

Starting training...
GK Action: 6, GK Reward: 0
GK Action: 0, GK Reward: 0
GK Action: 1, GK Reward: 0
GK Action: 9, GK Reward: 5
GK Action: 2, GK Reward: 0
GK Action: 11, GK Reward: 0
GK Action: 9, GK Reward: 5
GK Action: 11, GK Reward: 0
GK Action: 9, GK Reward: 5
GK Action: 5, GK Reward: 0
GK Action: 1, GK Reward: 0
GK Action: 8, GK Reward: 0
GK Action: 6, GK Reward: 0
GK Action: 6, GK Reward: 0
GK Action: 11, GK Reward: 0
Episode 0:
  Player: Reward = -20.0, Epsilon = 0.995, Avg Loss = 0.00000
  Goalkeeper: Reward = 15.0, Epsilon = 0.995, Avg Loss = 0.00000
  Steps = 16
Last 10 episodes average - Player: -20.02, Goalkeeper: 15.00

GK Action: 11, GK Reward: 0
GK Action: 10, GK Reward: 0
GK Action: 1, GK Reward: 0
GK Action: 2, GK Reward: 0
GK Action: 1, GK Reward: 0
GK Action: 10, GK Reward: 0
GK Action: 6, GK Reward: 0
GK Action: 8, GK Reward: 0
GK Action: 9, GK Reward: 5
GK Action: 8, GK Reward: 0
GK Action: 0, GK Reward: 0
GK Action: 10, GK Reward: 0
GK Action: 7, GK Reward: 0
GK 

  states = torch.FloatTensor([experience[0] for experience in minibatch]).to(self.device)


GK Action: 7, GK Reward: 0
GK Action: 2, GK Reward: 0
GK Action: 0, GK Reward: 0
GK Action: 1, GK Reward: 0
GK Action: 6, GK Reward: 0
GK Action: 5, GK Reward: 0
GK Action: 8, GK Reward: 0
GK Action: 9, GK Reward: 5
GK Action: 1, GK Reward: 0
GK Action: 10, GK Reward: 0
GK Action: 11, GK Reward: 0
GK Action: 11, GK Reward: 0
GK Action: 1, GK Reward: 0
GK Action: 0, GK Reward: 0
GK Action: 10, GK Reward: 0
GK Action: 2, GK Reward: 0
GK Action: 8, GK Reward: 0
GK Action: 7, GK Reward: 0
GK Action: 4, GK Reward: 0
GK Action: 0, GK Reward: 0
GK Action: 11, GK Reward: 0
GK Action: 6, GK Reward: 0
GK Action: 4, GK Reward: 0
GK Action: 0, GK Reward: 0
GK Action: 7, GK Reward: 0
GK Action: 8, GK Reward: 0
GK Action: 11, GK Reward: 0
GK Action: 9, GK Reward: 5
GK Action: 11, GK Reward: 0
GK Action: 8, GK Reward: 0
GK Action: 9, GK Reward: 5
GK Action: 4, GK Reward: 0
GK Action: 6, GK Reward: 0
GK Action: 9, GK Reward: 5
GK Action: 3, GK Reward: 0
GK Action: 11, GK Reward: 0
GK Action: 10, GK Re

In [15]:

# Evaluate trained agents
print("\nEvaluating trained agents...")
trainer.evaluate(episodes=5, render=True)



Evaluating trained agents...
GK Action: 4, GK Reward: 0
Step 0 - Player action: 6
----------------------------------------------------
|##################################################|
|#------------------------.-----------------------#|
|#------------------------.-----------------------#|
|#------------------------.-----------------------#|
|#------------------------.-----------------------#|
|#------------------------.-----------------------#|
|#------------------------.-----------------------#|
|#------------------------.-----------------------#|
|#------------------------.-----------------------#|
|#------------------------.-----------------------#|
|#------------------------.-----------------------#|
|#------------------------.-----------------------#|
|#------------------------.-----------------------#|
|#------------------------.-----------------------#|
|#------------------------.-----------------------#|
|#------------------------.-----------------------#|
|#--------------

KeyboardInterrupt: 

In [None]:
# Save final models
player_agent.save("final_player_model.pth")
gk_agent.save("final_goalkeeper_model.pth")
print("Training complete!")