In [8]:
import numpy as np 
import matplotlib.pyplot as plt
import time
import seaborn as sns
import random
from gymnasium import Env, spaces
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

In [9]:
# Select the compute device once for the whole notebook: use the GPU when
# CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

Using device: cuda


In [10]:
def bresenham_line(x0, y0, x1, y1):
    """Rasterise the segment from (x0, y0) to (x1, y1) with Bresenham's algorithm.

    Args:
        x0, y0: Integer coordinates of the start cell.
        x1, y1: Integer coordinates of the end cell.

    Returns:
        A list of ``(x, y)`` tuples walking from the start cell to the end
        cell, both endpoints included. Identical endpoints give a one-element
        list.
    """
    span_x = abs(x1 - x0)
    span_y = abs(y1 - y0)
    step_x = 1 if x0 < x1 else -1
    step_y = 1 if y0 < y1 else -1
    error = span_x - span_y

    cur_x, cur_y = x0, y0
    cells = [(cur_x, cur_y)]

    while not (cur_x == x1 and cur_y == y1):
        # Both tests use the error value from before this iteration's updates.
        doubled = 2 * error
        if doubled > -span_y:
            error -= span_y
            cur_x += step_x
        if doubled < span_x:
            error += span_x
            cur_y += step_y
        cells.append((cur_x, cur_y))

    return cells

class Agent:
    """A single player on the grid with 10 discrete actions.

    Actions 0-7 are the eight compass moves, 8 is "shoot" and 9 is "stay
    still"; the environment interprets them.

    Attributes:
        pos: The fixed start cell passed to the constructor, or ``None`` when
            the agent should spawn on a random cell.
        position: Current ``(row, column)`` cell as a NumPy array.
        shoot_speed: Initial ball speed returned by ``shoot_ball``.
        shoot_speed_decay: Amount the environment subtracts from the ball
            speed on every step while the ball is in transit.

    Note:
        ``policy`` reads ``self.q_table``, which this class never creates.
        ``FootballEnv.__init__`` assigns it; calling ``policy`` on an agent
        that has not been attached to an environment raises ``AttributeError``.
    """

    def __init__(self, field_height, field_width, pos=None):
        self.field_height = field_height
        self.field_width = field_width
        self.action_space = spaces.Discrete(10)  # 8 moves + shoot + stay still
        self.pos = pos
        self.position = self._spawn_position()
        self.shoot_speed = 4
        self.shoot_speed_decay = 1

    def _spawn_position(self):
        """Return a fresh start cell: the fixed ``pos`` if given, else a random cell."""
        if self.pos is not None:
            return np.array(self.pos)
        return np.array([
            np.random.randint(0, self.field_height),
            np.random.randint(0, self.field_width)
        ])

    def policy(self, state, epsilon):
        """Pick an action epsilon-greedily from the Q-table.

        Args:
            state: Row index into ``self.q_table``.
            epsilon: Probability of sampling a random action instead of the
                greedy one.

        Returns:
            The chosen action index.
        """
        if np.random.uniform(0, 1) < epsilon:
            return self.action_space.sample()
        else:
            return np.argmax(self.q_table[state, :])

    def reset_position(self):
        """Put the agent back on its start cell and return that position.

        With a fixed ``pos`` the agent returns to exactly that cell; previously
        it was left wherever the last episode ended, so every episode after the
        first started from a stale position. Without a fixed ``pos`` a new
        random cell is drawn.
        """
        self.position = self._spawn_position()
        return self.position

    def shoot_ball(self, current_position, goal_position):
        """Plan a shot from ``current_position`` towards ``goal_position``.

        Args:
            current_position: ``(row, column)`` of the shooter.
            goal_position: ``(row, column)`` of the target cell.

        Returns:
            A tuple ``(trajectory_points, speed, direction)`` where
            ``trajectory_points`` is the Bresenham path excluding the starting
            cell (unless start and target coincide, in which case it is the
            single shared cell), ``speed`` is ``self.shoot_speed`` and
            ``direction`` is the unit vector towards the target as a float
            array (all zeros when start and target coincide).
        """
        # Convert positions to integers for Bresenham algorithm
        start_x, start_y = int(current_position[0]), int(current_position[1])
        target_x, target_y = int(goal_position[0]), int(goal_position[1])

        # Calculate trajectory points using Bresenham's algorithm
        trajectory_points = bresenham_line(start_x, start_y, target_x, target_y)

        # Skip the first point (current position)
        if len(trajectory_points) > 1:
            trajectory_points = trajectory_points[1:]

        # Calculate direction vector (normalized)
        direction = np.array([goal_position[0] - current_position[0],
                              goal_position[1] - current_position[1]])
        direction_norm = np.linalg.norm(direction)

        if direction_norm > 0:
            direction = direction / direction_norm
        else:
            # Float zeros so the returned dtype matches the normalised branch.
            direction = np.zeros(2)

        return trajectory_points, self.shoot_speed, direction
    

class FootballEnv(Env):
    """Single-agent football pitch on a ``field_height`` x ``field_width`` grid.

    Positions are ``(row, column)`` pairs and the whole right-most column
    (``goal_y``) is the goal. The agent picks the ball up by stepping onto its
    cell, carries it while moving, and can shoot it towards the nearest goal
    cell. A shot follows a Bresenham trajectory; its speed drops by
    ``agent.shoot_speed_decay`` at the start of every step until the ball
    stops or enters the goal column, which ends the episode.

    Note:
        ``reset`` and ``step`` return custom tuples (described on each method)
        rather than the standard Gymnasium ``(obs, info)`` and 5-tuple.
    """

    # Row/column offsets for the eight movement actions.
    _MOVE_DELTAS = {
        0: np.array([0, 1]),   # Right
        1: np.array([1, 1]),   # Down-Right
        2: np.array([1, 0]),   # Down
        3: np.array([1, -1]),  # Down-Left
        4: np.array([0, -1]),  # Left
        5: np.array([-1, -1]), # Up-Left
        6: np.array([-1, 0]),  # Up
        7: np.array([-1, 1])   # Up-Right
    }

    def __init__(self, field_height, field_width, agent):
        """Build the pitch, attach ``agent`` and reset to the initial state.

        Args:
            field_height: Number of rows.
            field_width: Number of columns; the last column is the goal.
            agent: The ``Agent`` that plays on this pitch. Its ``q_table`` is
                (re)created here as a zero table of shape
                ``(field_height * field_width, agent.action_space.n)``.
        """
        super(FootballEnv, self).__init__()

        self.field_height = field_height  # x-axis (rows)
        self.field_width = field_width  # y-axis (columns)
        self.goal_y = self.field_width - 1  # Goal at the far right (y-axis)

        # Observation space: the agent's cell flattened row-major into one index
        # (see _get_agent_position). The ball is not part of this index.
        self.observation_space = spaces.Discrete(field_height * field_width)

        self.field = np.full((self.field_height, self.field_width), '.', dtype=str)
        self.rewards = np.full((self.field_height, self.field_width), -1, dtype=np.float32)

        # Goal area has a higher reward
        self.rewards[:, self.goal_y] = 10.0

        self.ball_holder = 0
        self.agent = agent
        self.ball_in_transit = False
        self.ball_transit_speed = 0
        self.ball_transit_direction = None
        self.ball_trajectory = []
        self.ball_trajectory_index = 0

        self.agent.q_table = np.zeros((self.observation_space.n, agent.action_space.n))

        self.reset()

    def reset(self, seed=None, options=None):
        """Start a new episode.

        The agent is re-placed via ``agent.reset_position()`` and the ball is
        put on its kickoff cell. If the agent stands on that cell it starts in
        possession.

        Args:
            seed: Accepted for API compatibility; not used.
            options: Accepted for API compatibility; not used.

        Returns:
            ``(state, ball_pos, ball_holder)`` where ``state`` is the agent's
            flattened cell index, ``ball_pos`` the ball's ``(row, column)``
            array and ``ball_holder`` is 1 if the agent has the ball, else 0.
        """
        self.agents_position = self.agent.reset_position()
        # Fixed kickoff cell: row 2, column 0 (left edge, not the field centre).
        # NOTE: this assumes field_height >= 3; smaller pitches put the ball
        # outside the grid.
        self.ball_pos = np.array([2, 0])
        self.ball_holder = 0
        self.ball_in_transit = False
        self.ball_transit_speed = 0
        self.ball_transit_direction = None
        self.ball_trajectory = []
        self.ball_trajectory_index = 0
        self.done = False

        # Agent spawning on the ball starts the episode holding it.
        if np.array_equal(self.agent.position, self.ball_pos):
            self.ball_holder = 1
            self.ball_pos = self.agent.position.copy()

        return self._get_agent_position(), self.ball_pos, self.ball_holder

    def _get_agent_position(self):
        """Return the agent's cell as a single row-major index."""
        return self.agent.position[0] * self.field_width + self.agent.position[1]

    def closest_goal_cell(self, agent_pos):
        """Return the ``(row, goal_y)`` goal cell nearest to ``agent_pos`` (Euclidean)."""
        # select one cell from the goal area
        goal_cells = [(i, self.goal_y) for i in range(self.field_height)]
        # return the closest goal cell to the agent
        closest_cell = min(goal_cells, key=lambda cell: np.linalg.norm(np.array(agent_pos) - np.array(cell)))
        return closest_cell

    def step(self, action):
        """Apply one agent action, then advance a ball that is in transit.

        Args:
            action: 0-7 move one cell (ignored if it would leave the grid),
                8 shoots towards the closest goal cell (only while holding the
                ball), 9 does nothing. Any other value is treated as a no-op.

        Returns:
            ``(state, ball_pos, ball_holder, reward, done, info)``. ``reward``
            is the reward-grid value at the ball's cell (-1, or 10 in the goal
            column); on the step a shot enters the goal column it is instead
            1000 plus that cell's value and ``done`` becomes True. ``info`` is
            an empty dict.
        """
        reward = 0  # Initialize rewards

        if action in self._MOVE_DELTAS:
            new_position = self.agent.position + self._MOVE_DELTAS[action]

            # Check if the new position is within bounds
            if (0 <= new_position[0] < self.field_height) and (0 <= new_position[1] < self.field_width):
                self.agent.position = new_position
                # Stepping onto a free ball picks it up.
                if self.ball_holder == 0 and np.array_equal(self.agent.position, self.ball_pos):
                    self.ball_holder = 1
                if self.ball_holder == 1:
                    self.ball_pos = self.agent.position.copy()  # Ball is with the agent

        elif action == 8:  # Shoot
            # Check if the agent is holding the ball
            if self.ball_holder == 1:
                closest_shot = self.closest_goal_cell(self.agent.position)
                print(f"Agent shooting towards: {closest_shot}")
                # Calculate the trajectory of the ball
                self.ball_trajectory, self.ball_transit_speed, self.ball_transit_direction = self.agent.shoot_ball(self.agent.position, closest_shot)
                self.ball_in_transit = True
                self.ball_holder = 0  # Ball is no longer held by the agent
                self.ball_trajectory_index = 0  # Reset trajectory index

        elif action == 9:  # Stay still
            pass  # Agent doesn't move

        # Handle ball movement if in transit
        if self.ball_in_transit:
            # First apply speed decay at the beginning of the step
            # (this includes the step in which the shot was taken).
            if self.ball_transit_speed > 0:
                self.ball_transit_speed -= self.agent.shoot_speed_decay

                # If speed has decayed to zero or below, stop the ball
                if self.ball_transit_speed <= 0:
                    self.ball_in_transit = False
                    self.ball_transit_speed = 0
                    print("Ball stopped due to speed decay")

            # If ball is still in transit after speed decay
            if self.ball_in_transit:
                # Calculate how many steps to move based on current speed
                steps_to_move = max(1, int(round(self.ball_transit_speed)))

                # Move the ball along its trajectory by the number of steps determined by speed
                for _ in range(steps_to_move):
                    if self.ball_trajectory_index < len(self.ball_trajectory):
                        # Get the next position from the trajectory
                        next_pos = self.ball_trajectory[self.ball_trajectory_index]
                        self.ball_pos = np.array(next_pos)
                        self.ball_trajectory_index += 1

                        # Check if ball reached the goal
                        if self.ball_pos[1] == self.goal_y:
                            # Add goal reward (1000) + field reward (10) = 1010
                            goal_reward = 1000
                            field_reward = self.rewards[self.ball_pos[0], self.ball_pos[1]]
                            total_reward = goal_reward + field_reward
                            reward += total_reward
                            print(f"Goal! Reward: {total_reward}")
                            self.ball_holder = 0  # Ball is no longer held by the agent
                            self.done = True  # End the episode on goal
                            self.ball_in_transit = False
                            break
                    else:
                        # Ball has completed its trajectory
                        self.ball_in_transit = False
                        self.ball_transit_speed = 0
                        print("Ball reached end of trajectory")
                        break

                # A ball that came to rest on the agent's cell is reclaimed. This is
                # skipped once a goal has been scored so the scored ball stays
                # released (a shot taken from inside the goal column lands on the
                # shooter's own cell and used to be handed straight back).
                if (not self.done and not self.ball_in_transit
                        and np.array_equal(self.agent.position, self.ball_pos)):
                    self.ball_holder = 1  # Agent reclaims the ball

        # Only add field reward if we haven't already scored a goal (to avoid double counting)
        if not self.done:
            # Get the current state reward from the rewards matrix
            state_reward = self.rewards[self.ball_pos[0], self.ball_pos[1]]
            reward += state_reward

        # Return the new state, reward, and done flag
        return self._get_agent_position(), self.ball_pos, self.ball_holder, reward, self.done, {}


    def render(self):
        """Print the pitch as text.

        Legend: ``.`` empty, ``G`` goal column, ``B`` free ball, ``A`` agent,
        ``*`` remaining cells of an in-flight shot. Layers are drawn in that
        order so the ball and the agent stay visible inside the goal column.
        """
        field_copy = np.full((self.field_height, self.field_width), '.', dtype=str)

        # Background first: render 'G' for goal area
        field_copy[:, self.goal_y] = 'G'

        # Then place the ball (if it's not held by an agent)
        if self.ball_holder == 0:
            field_copy[self.ball_pos[0], self.ball_pos[1]] = 'B'  # (x, y) indexing

        # render 'A' for agent (drawn last so it is never hidden)
        field_copy[self.agent.position[0], self.agent.position[1]] = 'A'

        # If ball is in transit, show trajectory
        if self.ball_in_transit:
            for idx, (x, y) in enumerate(self.ball_trajectory):
                if idx >= self.ball_trajectory_index:  # Only show remaining trajectory
                    if 0 <= x < self.field_height and 0 <= y < self.field_width:
                        if field_copy[x, y] == '.':  # Only mark empty cells
                            field_copy[x, y] = '*'

        print("\n".join(["".join(row) for row in field_copy]) + "\n")

In [None]:
# field_height = 7
# field_width = 10

# # Create agent and environment
# agent = Agent(field_height, field_width)
# env = FootballEnv(field_height, field_width, agent)


# # Save trained models
# # torch.save(agent.actor.state_dict(), "football_actor.pth")
# # torch.save(agent.critic.state_dict(), "football_critic.pth")

# print("Training complete")

KeyboardInterrupt: 

In [11]:
# Build a 5x11 pitch with the agent pinned to row 2, column 0. That is also the
# ball's kickoff cell, so the agent starts the episode holding the ball.
agent = Agent(5, 11, pos=(2, 0))
env = FootballEnv(5, 11, agent)
# The constructor already resets the environment; this explicit reset is redundant.
env.reset()
env.render()

# Scripted rollout: move right (0), shoot (8), then two no-ops (9, 9) while the ball rolls; print the reward at each step
for action in [0, 8, 9, 9]:
    state, ball_pos, ball_holder, reward, done, _ = env.step(action)
    print(f"Action: {action}, State: {state}, Ball Position: {ball_pos}, Ball Holder: {ball_holder}, Reward: {reward}")
    env.render()
    time.sleep(1)  # Pause for a second to visualize the rendering

..........G
..........G
A.........G
..........G
..........G

Action: 0, State: 23, Ball Position: [2 1], Ball Holder: 1, Reward: -1.0
..........G
..........G
.A........G
..........G
..........G

Agent shooting towards: (2, 10)
Action: 8, State: 23, Ball Position: [2 4], Ball Holder: 0, Reward: -1.0
..........G
..........G
.A..B*****G
..........G
..........G

Action: 9, State: 23, Ball Position: [2 6], Ball Holder: 0, Reward: -1.0
..........G
..........G
.A....B***G
..........G
..........G

Action: 9, State: 23, Ball Position: [2 7], Ball Holder: 0, Reward: -1.0
..........G
..........G
.A.....B**G
..........G
..........G

