In [52]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import random
import gymnasium as gym
from collections import deque
import time

# Compute device shared by the networks below: CUDA when PyTorch reports it
# as available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

class FootballEnv(gym.Env):
    """Grid-world football environment with one player and one ball.

    Observation: 7 ``float32`` values -- player row/column, ball row/column
    (each divided by ``grid size - 1``), a possession flag, and the row/column
    of the scoring goal in the same normalised units.

    Actions (``Discrete(10)``):
        0-7  move one cell (four axis-aligned and four diagonal directions);
        8    long shot, only effective while the player holds the ball;
        9    short pass, only effective while the player holds the ball.

    The episode terminates when the ball lies on a ``"G"`` cell (the goal on
    the last column) or when the player tries to move off the grid or onto a
    boundary (``"O"``) cell. It is truncated once ``max_episode_steps`` steps
    have been taken.

    All random draws use ``self.np_random`` so that ``reset(seed=...)``
    makes episodes reproducible.
    """

    # (row, column) offsets for the movement actions 0-7.
    _MOVES = (
        (0, -1), (1, 0), (0, 1), (-1, 0),
        (-1, -1), (-1, 1), (1, -1), (1, 1),
    )

    def __init__(self, grid_rows=50, grid_cols=50, max_episode_steps=3000):
        """Build the field layout and set the reward constants.

        Args:
            grid_rows: Number of rows of the field grid.
            grid_cols: Number of columns of the field grid.
            max_episode_steps: Step count at which ``step`` reports truncation.
        """
        super().__init__()
        self.grid_rows = grid_rows
        self.grid_cols = grid_cols
        self.max_episode_steps = max_episode_steps

        self.action_space = gym.spaces.Discrete(10)
        self.observation_space = gym.spaces.Box(low=0, high=1, shape=(7,), dtype=np.float32)

        # Field layout, one character per cell. Later assignments overwrite
        # earlier ones, so the order below matters.
        mid_row = grid_rows // 2
        mid_col = grid_cols // 2
        self.layout = np.full((grid_rows, grid_cols), ".", dtype="<U1")
        self.layout[mid_row, mid_col] = "C"
        self.layout[mid_row - 8 : mid_row + 9, -6:-1] = "D"
        self.layout[mid_row - 8 : mid_row + 9, 0:5] = "d"
        self.layout[:, mid_col] = "M"  # also replaces the "C" written above
        self.layout[:, -1] = "O"
        self.layout[:, 0] = "O"
        self.layout[0, :] = "O"
        self.layout[-1, :] = "O"
        self.layout[mid_row - 4 : mid_row + 5, -1] = "G"
        self.layout[mid_row - 4 : mid_row + 5, 0] = "g"

        # Initial positions (replaced by reset()).
        self.player_pos = (mid_row, 5)
        self.ball_pos = (mid_row, mid_col)
        self.has_ball = False
        self.episode_steps = 0

        # Reward parameters
        self.goal_reward = 50
        self.step_penalty = -0.001
        self.ball_possession_bonus = 0.005
        self.near_ball_bonus = 0.0001
        self.near_goal_bonus = 0.0002

    def _randint(self, low, high):
        """Return a Python int drawn uniformly from ``[low, high]`` (inclusive)."""
        return int(self.np_random.integers(low, high, endpoint=True))

    def _get_state(self):
        """Return the 7-element normalised observation as a float32 array."""
        row_scale = self.grid_rows - 1
        col_scale = self.grid_cols - 1
        return np.array([
            self.player_pos[0] / row_scale,
            self.player_pos[1] / col_scale,
            self.ball_pos[0] / row_scale,
            self.ball_pos[1] / col_scale,
            float(self.has_ball),  # explicit possession feature
            self.grid_rows // 2 / row_scale,  # goal row
            col_scale / col_scale,  # goal column (always 1.0)
        ], dtype=np.float32)

    def reset(self, seed=None, options=None):
        """Start a new episode and return ``(observation, info)``.

        The player is placed within 3 rows of the middle row, in the left
        quarter of the grid; the ball is placed within 3 cells of the player,
        no further right than one third of the grid width.

        Args:
            seed: Optional seed forwarded to ``gym.Env.reset``; it seeds
                ``self.np_random``, which drives every draw made here.
            options: Accepted for API compatibility; not used.
        """
        super().reset(seed=seed)

        mid_row = self.grid_rows // 2
        player_row = self._randint(mid_row - 3, mid_row + 3)
        player_col = self._randint(1, self.grid_cols // 4)
        self.player_pos = (player_row, player_col)

        ball_row = self._randint(max(1, player_row - 3), min(self.grid_rows - 2, player_row + 3))
        ball_col = self._randint(max(1, player_col - 3), min(self.grid_cols // 3, player_col + 3))
        self.ball_pos = (ball_row, ball_col)

        self.has_ball = (self.player_pos == self.ball_pos)
        self.episode_steps = 0
        return self._get_state(), {}

    def step(self, action):
        """Apply ``action`` and return ``(obs, reward, terminated, truncated, info)``."""
        self.episode_steps += 1
        reward = self.step_penalty
        done = False

        if action < 8:  # Movement actions
            d_row, d_col = self._MOVES[action]
            target = (self.player_pos[0] + d_row, self.player_pos[1] + d_col)
            inside = 0 <= target[0] < self.grid_rows and 0 <= target[1] < self.grid_cols

            if not inside or self.layout[target] == "O":
                # Leaving the pitch ends the episode with a large penalty.
                # Both the terminated and truncated flags are reported True.
                reward -= 20
                return self._get_state(), reward, True, True, {}

            self.player_pos = target
            if self.has_ball:
                # The ball travels with the player while in possession.
                self.ball_pos = target

        elif action == 8 and self.has_ball:  # Long shot
            # Aim within 2 rows of the goal centre, 10 columns ahead
            # (clamped to the last column).
            goal_row = self.grid_rows // 2
            aim_row = min(max(goal_row + self._randint(-2, 2), 0), self.grid_rows - 1)
            shot_col = min(self.ball_pos[1] + 10, self.grid_cols - 1)
            self.ball_pos = (aim_row, shot_col)

        elif action == 9 and self.has_ball:  # Short pass
            # Same row, 5 columns ahead (clamped to the last column).
            pass_col = min(self.ball_pos[1] + 5, self.grid_cols - 1)
            self.ball_pos = (self.ball_pos[0], pass_col)

        # Possession is simply "player and ball share a cell"; this also
        # releases the ball after a shot or pass moved it away.
        self.has_ball = self.player_pos == self.ball_pos

        # Reward shaping
        if self.has_ball:
            reward += self.ball_possession_bonus

        dist_to_ball = np.sqrt((self.player_pos[0] - self.ball_pos[0]) ** 2 +
                               (self.player_pos[1] - self.ball_pos[1]) ** 2)
        if dist_to_ball < 5 and not self.has_ball:
            reward += self.near_ball_bonus

        if self.has_ball:
            # Bonus grows linearly over the last 10 columns before the goal line.
            dist_to_goal = self.grid_cols - 1 - self.player_pos[1]
            if dist_to_goal < 10:
                reward += self.near_goal_bonus * (1 - dist_to_goal / 10.0)

        # Goal reward
        if self.layout[self.ball_pos[0], self.ball_pos[1]] == 'G':
            reward += self.goal_reward
            done = True

        truncated = self.episode_steps >= self.max_episode_steps
        return self._get_state(), reward, done, truncated, {}

    def render(self):
        """Print an ASCII view of the field, the player and the loose ball."""
        grid = np.full((self.grid_rows, self.grid_cols), '-')

        # Draw field elements: boundary, scoring goal and midline.
        for cell, glyph in (('O', '#'), ('G', '|'), ('M', '.')):
            grid[self.layout == cell] = glyph

        # Draw player, and the ball only when it is not being carried.
        grid[self.player_pos[0], self.player_pos[1]] = 'P'
        if not self.has_ball:
            grid[self.ball_pos[0], self.ball_pos[1]] = 'o'

        # Print the grid
        print('-' * (self.grid_cols + 2))
        for row in grid:
            print('|' + ''.join(row) + '|')
        print('-' * (self.grid_cols + 2))
        print(f"Has ball: {self.has_ball}, Steps: {self.episode_steps}")


class DQN(nn.Module):
    """Fully connected Q-network: input_dim -> 128 -> 128 -> 64 -> action_size.

    ReLU follows each of the three hidden layers; the final layer is linear
    and yields one Q-value per action.
    """

    def __init__(self, input_dim, action_size):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, 128)
        self.fc2 = nn.Linear(128, 128)
        self.fc3 = nn.Linear(128, 64)
        self.fc4 = nn.Linear(64, action_size)

        # Re-initialise every layer: Xavier-uniform weights, zero biases.
        for linear in (self.fc1, self.fc2, self.fc3, self.fc4):
            nn.init.xavier_uniform_(linear.weight)
            nn.init.zeros_(linear.bias)

    def forward(self, x):
        """Map a batch of states to a batch of per-action Q-values."""
        hidden = x
        for linear in (self.fc1, self.fc2, self.fc3):
            hidden = F.relu(linear(hidden))
        return self.fc4(hidden)


class DQNAgent:
    """DQN agent with an experience-replay buffer and a target network.

    The online network (``model``) is trained on minibatches sampled from
    ``memory``; the target network (``target_model``) supplies the bootstrap
    values and is synchronised with the online network every
    ``target_update_freq`` episodes. Actions are chosen epsilon-greedily,
    with ``epsilon`` decayed multiplicatively once per episode.
    """

    def __init__(self, state_size, action_size):
        """Create the networks, optimizer, replay buffer and hyper-parameters.

        Args:
            state_size: Length of the observation vector fed to the network.
            action_size: Number of discrete actions (network output width).
        """
        self.state_size = state_size
        self.action_size = action_size
        self.gamma = 0.99  # Discount factor
        self.epsilon = 1.0  # Exploration rate
        self.epsilon_min = 0.05
        self.epsilon_decay = 0.995
        self.learning_rate = 0.0003
        self.memory = deque(maxlen=100000)
        self.batch_size = 256
        self.target_update_freq = 5  # Update target network every N episodes

        self.device = device
        self.model = DQN(state_size, action_size).to(self.device)
        self.target_model = DQN(state_size, action_size).to(self.device)
        self.target_model.load_state_dict(self.model.state_dict())
        self.optimizer = optim.Adam(self.model.parameters(), lr=self.learning_rate)

        self.rewards_history = []
        self.episode_count = 0

    def remember(self, state, action, reward, next_state, done):
        """Append one transition to the replay buffer.

        ``done`` should be True only when ``next_state`` is terminal, because
        ``replay`` drops the bootstrap term for such transitions.
        """
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state, evaluate=False):
        """Return an action index for ``state``.

        With probability ``epsilon`` a uniformly random action is returned,
        unless ``evaluate`` is True; otherwise the action with the highest
        predicted Q-value.
        """
        if not evaluate and np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)

        state_batch = torch.as_tensor(
            state, dtype=torch.float32, device=self.device
        ).unsqueeze(0)
        with torch.no_grad():
            q_values = self.model(state_batch)
        return torch.argmax(q_values).item()

    def replay(self):
        """Run one gradient step on a random minibatch.

        Returns:
            The minibatch MSE loss as a float, or ``0`` when the buffer holds
            fewer than ``batch_size`` transitions (no update is performed).
        """
        if len(self.memory) < self.batch_size:
            return 0

        # Sample minibatch from memory
        minibatch = random.sample(self.memory, self.batch_size)
        states, actions, rewards, next_states, dones = zip(*minibatch)

        # Stack through NumPy first: building a tensor directly from a list
        # of ndarrays is very slow and triggers a PyTorch warning.
        states = torch.as_tensor(np.asarray(states, dtype=np.float32), device=self.device)
        actions = torch.as_tensor(np.asarray(actions, dtype=np.int64), device=self.device)
        rewards = torch.as_tensor(np.asarray(rewards, dtype=np.float32), device=self.device)
        next_states = torch.as_tensor(np.asarray(next_states, dtype=np.float32), device=self.device)
        dones = torch.as_tensor(np.asarray(dones, dtype=np.float32), device=self.device)

        # Q(s, a) of the actions actually taken. squeeze(1) removes only the
        # gather dimension, so the result stays 1-D for any batch size.
        curr_q_values = self.model(states).gather(1, actions.unsqueeze(1)).squeeze(1)

        # Bootstrap value from the target network; no gradient flows through it.
        with torch.no_grad():
            next_q_values = self.target_model(next_states).max(1)[0]

        # Terminal transitions (dones == 1) contribute the reward only.
        target_q_values = rewards + (1 - dones) * self.gamma * next_q_values

        loss = F.mse_loss(curr_q_values, target_q_values)

        # Backpropagation
        self.optimizer.zero_grad()
        loss.backward()
        # Gradient clipping to prevent exploding gradients
        torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1.0)
        self.optimizer.step()

        return loss.item()

    def update_target_model(self):
        """Copy the online network's weights into the target network."""
        self.target_model.load_state_dict(self.model.state_dict())

    def decay_epsilon(self):
        """Multiply ``epsilon`` by ``epsilon_decay`` while it exceeds ``epsilon_min``."""
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

    def save(self, filepath):
        """Write model/optimizer state and training statistics to ``filepath``."""
        torch.save({
            'model_state_dict': self.model.state_dict(),
            'optimizer_state_dict': self.optimizer.state_dict(),
            'epsilon': self.epsilon,
            'episode_count': self.episode_count,
            'rewards_history': self.rewards_history
        }, filepath)
        print(f"Model saved to {filepath}")

    def load(self, filepath):
        """Restore a checkpoint written by ``save``.

        Tensors are mapped onto ``self.device`` so a checkpoint saved on a
        GPU machine can be loaded on a CPU-only one. The target network is
        set to the same weights as the online network.
        """
        checkpoint = torch.load(filepath, map_location=self.device)
        self.model.load_state_dict(checkpoint['model_state_dict'])
        self.target_model.load_state_dict(checkpoint['model_state_dict'])
        self.optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        self.epsilon = checkpoint['epsilon']
        self.episode_count = checkpoint['episode_count']
        self.rewards_history = checkpoint['rewards_history']
        print(f"Model loaded from {filepath}")

    def train(self, env, episodes, max_steps=2000, save_freq=50, render_freq=20):
        """Train the agent on ``env``.

        Args:
            env: Environment whose ``reset()`` returns ``(obs, info)`` and
                whose ``step(action)`` returns
                ``(obs, reward, terminated, truncated, info)``.
            episodes: Number of episodes to run.
            max_steps: Upper bound on steps per episode.
            save_freq: A checkpoint is saved every ``save_freq`` episodes
                (skipping episode 0).
            render_freq: Currently unused; kept for interface compatibility.
        """
        for episode in range(episodes):
            state, _ = env.reset()
            total_reward = 0
            steps_taken = 0
            losses = []

            for _ in range(max_steps):
                action = self.act(state)
                next_state, reward, done, truncated, _ = env.step(action)
                total_reward += reward
                steps_taken += 1

                # Store only true termination as the terminal flag: a
                # truncated episode (e.g. a step limit) did not reach a
                # terminal state, so its target must still bootstrap.
                self.remember(state, action, reward, next_state, done)

                # Train model with replay
                if len(self.memory) >= self.batch_size:
                    losses.append(self.replay())

                state = next_state

                if done or truncated:
                    break

            # Update target network periodically
            if episode % self.target_update_freq == 0:
                self.update_target_model()

            # Decay exploration rate
            self.decay_epsilon()

            # Record stats
            self.episode_count += 1
            self.rewards_history.append(total_reward)

            # Print episode statistics
            avg_loss = np.mean(losses) if losses else 0
            print(f"Episode {episode}: Reward = {total_reward:.1f}, Steps = {steps_taken}, Epsilon = {self.epsilon:.3f}, Avg Loss = {avg_loss:.5f}")

            # Save the model periodically
            if episode > 0 and episode % save_freq == 0:
                self.save(f"dqn_football_ep{episode}.pth")

            if episode % 10 == 0:
                avg_reward = np.mean(self.rewards_history[-10:])
                print(f"Last 10 episodes average reward: {avg_reward:.2f}")

    def evaluate(self, env, episodes=1, render=True):
        """Run greedy (no-exploration) episodes and return the mean total reward.

        Each episode runs until the environment reports termination or
        truncation. When ``render`` is True, every step prints the chosen
        action, renders the environment and pauses for half a second.
        """
        total_rewards = []

        for episode in range(episodes):
            state, _ = env.reset()
            total_reward = 0
            done = False
            truncated = False

            while not done and not truncated:
                action = self.act(state, evaluate=True)  # No exploration
                next_state, reward, done, truncated, _ = env.step(action)
                total_reward += reward

                if render:
                    print(action)
                    env.render()
                    time.sleep(0.5)  # Pause to make rendering visible

                state = next_state

            total_rewards.append(total_reward)
            print(f"Evaluation episode {episode}: Reward = {total_reward}")

        return np.mean(total_rewards)

In [None]:
# Create the environment with its default 50x50 grid.
env = FootballEnv()
# state_size and action_size are given as literals here; they match the
# 7-element observation and the 10 discrete actions declared by FootballEnv.
agent = DQNAgent(state_size=7, action_size=10)  # Updated state size
# Train the agent for 5000 episodes, writing a checkpoint every 50 episodes.
agent.train(env, episodes=5000, save_freq=50)
# Saving the final model is currently disabled:
# agent.save("dqn_football_final.pth")

In [None]:
# Run one greedy (no-exploration) episode with per-step rendering; the call
# returns the mean total reward over the evaluated episodes.
agent.evaluate(env, episodes=1, render=True)