In [1]:
import numpy as np 
import matplotlib.pyplot as plt 
import gymnasium as gym      
from gymnasium import * 
import time
import seaborn as sns
import random

In [None]:
class FootballEnv(gym.Env):
    """Grid-world football pitch in which a single agent walks over the cells.

    The pitch is a ``grid_rows x grid_cols`` character array (``self.layout``):

    * ``"."`` open field, ``"O"`` boundary lines, ``"M"`` the halfway line
    * ``"D"`` / ``"d"`` the penalty areas on the right / left side
    * ``"G"`` / ``"g"`` the goal mouths on the right / left boundary
    * ``"X"`` / ``"x"`` the two penalty spots
    * ``"B"`` the ball and ``"P"`` a randomly placed player marker

    The agent position is ``self.state`` as ``(row, col)``; observations are the
    flattened index ``row * grid_cols + col``.

    NOTE(review): ``step`` returns ``(obs, reward, done, info)`` and ``reset``
    returns only the observation. That is kept for backward compatibility, but
    it differs from the ``(obs, info)`` / five-tuple convention of the current
    Gymnasium API.
    """

    def __init__(self, grid_rows=51,grid_cols=121):
        """Build the pitch layout, the per-cell reward table and the markers.

        Args:
            grid_rows: Number of rows of the pitch grid.
            grid_cols: Number of columns of the pitch grid.
        """
        super().__init__()
        self.grid_rows = grid_rows
        self.grid_cols = grid_cols

        # 8 movement directions + 3 reserved actions (kick, pass, dribble).
        # Actions 8-10 have no effect in step() yet.
        self.action_space = spaces.Discrete(11)

        # One discrete observation per grid cell.
        self.observation_space = spaces.Discrete(grid_rows*grid_cols)

        mid_row = self.grid_rows // 2
        mid_col = self.grid_cols // 2

        self.layout = np.full((grid_rows, grid_cols), ".", dtype=str)
        # The centre mark is overwritten by the halfway line and later by the
        # ball; the assignment is kept so the drawing order is unchanged.
        self.layout[mid_row, mid_col] = "C"
        self.layout[mid_row-8 : mid_row+9, -6:-1] = "D"
        self.layout[mid_row-8 : mid_row+9, 0:5] = "d"
        self.layout[:, mid_col] = "M"
        self.layout[:, -1] = "O"
        self.layout[:, 0] = "O"
        self.layout[0, :] = "O"
        self.layout[-1, :] = "O"
        self.layout[mid_row-4 : mid_row+5, -1] = "G"
        self.layout[mid_row-4 : mid_row+5, 0] = "g"
        self.layout[mid_row, 11] = "X"
        self.layout[mid_row, -13] = "x"

        # The agent starts in the bottom-left corner.
        self.state = (self.grid_rows-1,0)

        # Reward table, derived from the layout before the ball/player markers
        # are drawn so that those markers do not disturb it.
        self.rewards = np.zeros((grid_rows,grid_cols))
        self.rewards[self.layout == "G"] = 100
        self.rewards[self.layout == "."] = -1
        # The right-hand penalty area is stored as "D" ("*" is only the glyph
        # render() prints for it), so the bonus has to key on "D".
        self.rewards[self.layout == "D"] = 1

        # Cells that end an episode. Stepping into either goal mouth is treated
        # as terminal.
        # NOTE(review): the original check compared cells with the ints 1 and
        # 2, which never match the string layout; confirm both goals should
        # terminate.
        self._terminal_cells = (self.layout == "G") | (self.layout == "g")

        self.ball_position = (mid_row, mid_col)
        self.player_position = (np.random.randint(0,self.grid_rows-1),np.random.randint(0,self.grid_cols-1))
        self.goal_post_opp = "G"
        self.goal_post_self = "g"
        self.penalty_post_opp = "X"
        self.penalty_post_self = "x"
        self.layout[self.ball_position[0], self.ball_position[1]] = "B"
        self.layout[self.player_position[0], self.player_position[1]] = "P"

    def _get_state_index(self):
        """Return the agent position flattened in row-major order."""
        return self.state[0] * self.grid_cols + self.state[1]

    def step(self,action):
        """Move the agent one cell and report the outcome.

        Args:
            action: 0 left, 1 down, 2 right, 3 up, 4 up-left, 5 up-right,
                6 down-left, 7 down-right. Any other value leaves the agent
                where it is.

        Returns:
            ``(state_index, reward, done, info)`` where ``reward`` is read from
            ``self.rewards`` at the new cell and ``done`` is True when that
            cell is a goal mouth.
        """
        x,y = self.state
        last_row = self.grid_rows - 1
        last_col = self.grid_cols - 1

        # Moves are clamped so the agent can never leave the grid.
        if action == 0: #left
            y = max(0, y-1)
        elif action == 1: #down
            x = min(last_row, x+1)
        elif action == 2: #right
            y = min(last_col, y+1)
        elif action == 3: #up
            x = max(0, x-1)
        elif action == 4: #up-left
            x = max(0, x-1)
            y = max(0, y-1)
        elif action == 5: #up-right
            x = max(0, x-1)
            y = min(last_col, y+1)
        elif action == 6: #down-left
            x = min(last_row, x+1)
            y = max(0, y-1)
        elif action == 7: #down-right
            x = min(last_row, x+1)
            y = min(last_col, y+1)

        self.state = (x,y)

        reward = self.rewards[x,y]
        done = bool(self._terminal_cells[x,y])

        return self._get_state_index(),reward,done,{}

    def reset(self, seed=None, options=None):
        """Put the agent back in the bottom-left corner.

        Args:
            seed: Optional seed forwarded to ``gym.Env.reset``.
            options: Accepted for API compatibility; unused.

        Returns:
            The flattened index of the start cell.
        """
        super().reset(seed=seed)
        self.state = (self.grid_rows-1,0)
        return self._get_state_index()

    def render(self):
        """Print the pitch as space-separated glyphs, one text line per row."""
        # Layout symbols that are printed with a different glyph.
        glyph_for = {"M": "|", "D": "*", "d": "^", "O": "o"}
        # Symbols printed as themselves; anything else is shown as open field.
        unchanged = ("C", "G", "g", "B", "P", "X", "x")

        grid = np.full(self.layout.shape, '.', dtype=str)
        for symbol in unchanged:
            grid[self.layout == symbol] = symbol
        for symbol, glyph in glyph_for.items():
            grid[self.layout == symbol] = glyph

        for cells in grid:
            # Leading space, then every cell followed by one space.
            print(" " + " ".join(cells) + " ")


In [17]:
# Build the default-size pitch and print its ASCII rendering.
env = FootballEnv()
env.reset()  # the returned state index is not needed here
env.render()

 o o o o o o o o o o o o o o o o o o o o o o o o o o o o o o o o o o o o o o o o o o o o o o o o o o o o o o o o o o o o o o o o o o o o o o o o o o o o o o o o o o o o o o o o o o o o o o o o o o o o o o o o o o o o o o o o o o o o o o o o o 
 o . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . | . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . o 
 o . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . | . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . o 
 o . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . | . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . o 
 o . . . . . . . . . . .

In [18]:
import gymnasium as gym
from gymnasium import spaces
import numpy as np

class FootballEnv(gym.Env):
    """Small football task: walk to the ball, carry it to the right edge, kick.

    The field is ``field_width x field_height`` cells. Positions are stored as
    ``[x, y]`` arrays. The observation is
    ``(player_x, player_y, ball_x, ball_y)``.

    Rewards: -1 on every step, +2 once when the ball is picked up, +10 when a
    kick is made while holding the ball in the goal column (which also ends the
    episode).
    """

    def __init__(self):
        super().__init__()

        # Field size (10x5 grid for simplicity)
        self.field_width = 10
        self.field_height = 5
        self.goal_x = self.field_width - 1  # Goal is at the far right

        # Action Space: Move in 4 directions + kick
        self.action_space = spaces.Discrete(5)  # 0: Left, 1: Right, 2: Up, 3: Down, 4: Kick

        # Observation Space: (player_x, player_y, ball_x, ball_y)
        self.observation_space = spaces.Box(low=0, high=max(self.field_width, self.field_height),
                                            shape=(4,), dtype=np.int32)

        self.reset()

    def reset(self, seed=None, options=None):
        """Start a new episode with random player and ball positions.

        The player spawns in the left half and the ball in the right half
        (never in the last two columns).

        Args:
            seed: Optional seed; it seeds ``self.np_random``, from which the
                positions are drawn, so a seeded reset is reproducible.
            options: Accepted for API compatibility; unused.

        Returns:
            ``(observation, info)`` with an empty ``info`` dict.
        """
        super().reset(seed=seed)

        # Draw from the environment's own generator instead of the global
        # np.random state, otherwise `seed` would have no effect.
        rng = self.np_random
        self.player_pos = np.array([rng.integers(0, self.field_width // 2),
                                    rng.integers(0, self.field_height)], dtype=np.int32)
        self.ball_pos = np.array([rng.integers(self.field_width // 2, self.field_width - 2),
                                  rng.integers(0, self.field_height)], dtype=np.int32)
        self.done = False
        self.has_ball = False

        return self._get_obs(), {}

    def step(self, action):
        """Apply one action.

        Args:
            action: 0 left, 1 right, 2 up, 3 down, 4 kick.

        Returns:
            ``(observation, reward, terminated, truncated, info)``;
            ``truncated`` is always False and ``info`` is empty.
        """
        reward = -1  # Small penalty for each step (encourages optimal play)

        # Movement actions (clamped to the field)
        if action == 0:  # Left
            self.player_pos[0] = max(0, self.player_pos[0] - 1)
        elif action == 1:  # Right
            self.player_pos[0] = min(self.field_width - 1, self.player_pos[0] + 1)
        elif action == 2:  # Up
            self.player_pos[1] = max(0, self.player_pos[1] - 1)
        elif action == 3:  # Down
            self.player_pos[1] = min(self.field_height - 1, self.player_pos[1] + 1)

        if self.has_ball:
            # The ball travels with the player, so possession is visible in
            # the observation (ball position == player position).
            self.ball_pos = self.player_pos.copy()
        elif np.array_equal(self.player_pos, self.ball_pos):
            # Picking up the ball: rewarded only on the first contact so the
            # bonus cannot be collected again by standing on the ball cell.
            self.has_ball = True
            reward += 2  # Small reward for reaching the ball

        # Kicking the ball
        if action == 4 and self.has_ball:
            if self.player_pos[0] == self.goal_x:  # If near the goal
                reward += 10  # Reward for scoring
                self.done = True

        return self._get_obs(), reward, self.done, False, {}

    def _get_obs(self):
        """Return ``[player_x, player_y, ball_x, ball_y]`` as int32."""
        # Cast explicitly so the array matches the dtype declared in
        # observation_space.
        return np.concatenate([self.player_pos, self.ball_pos]).astype(np.int32)

    def render(self):
        """Print the field: ``B`` for the ball, ``P`` for the player.

        The player is drawn last, so it hides the ball when both share a cell.
        """
        field = np.full((self.field_height, self.field_width), '.', dtype=str)
        field[self.ball_pos[1], self.ball_pos[0]] = 'B'
        field[self.player_pos[1], self.player_pos[0]] = 'P'
        print("\n".join(["".join(row) for row in field]) + "\n")



In [19]:
import random

class RandomAgent:
    """Baseline agent that picks uniformly random actions."""

    def __init__(self, action_space):
        """Remember the action space to sample from.

        Args:
            action_space: Either an object exposing the number of discrete
                actions as ``.n`` (optionally with a ``.start`` offset), or a
                plain integer action count.
        """
        self.action_space = action_space

    def select_action(self, obs):
        """Return a uniformly random action; the observation is ignored.

        The range is taken from the stored action space instead of being
        hard-coded, so the agent stays valid if the number of actions changes.
        """
        n_actions = int(getattr(self.action_space, "n", self.action_space))
        first_action = int(getattr(self.action_space, "start", 0))
        return first_action + random.randrange(n_actions)

# Roll out one episode with the random baseline, printing the field each step.
env = FootballEnv()
agent = RandomAgent(env.action_space)

obs, _ = env.reset()
done = False

while not done:
    env.render()
    action = agent.select_action(obs)
    obs, reward, terminated, truncated, _ = env.step(action)
    # An episode is over on either signal; looking only at `terminated` would
    # loop forever on an environment that ends episodes by truncation.
    done = terminated or truncated

print("Game Over!")
env.render()


.......B..
..........
..........
..P.......
..........

.......B..
..........
..........
..........
..P.......

.......B..
..........
..........
..........
...P......

.......B..
..........
..........
...P......
..........

.......B..
..........
..........
..P.......
..........

.......B..
..........
..........
...P......
..........

.......B..
..........
..........
....P.....
..........

.......B..
..........
..........
...P......
..........

.......B..
..........
..........
....P.....
..........

.......B..
..........
....P.....
..........
..........

.......B..
..........
..........
....P.....
..........

.......B..
..........
..........
....P.....
..........

.......B..
..........
....P.....
..........
..........

.......B..
..........
...P......
..........
..........

.......B..
..........
...P......
..........
..........

.......B..
..........
..........
...P......
..........

.......B..
..........
...P......
..........
..........

.......B..
...P......
..........
..........
....

In [22]:
import torch
import torch.nn as nn
import torch.optim as optim
import random
import numpy as np
from collections import deque

class DQN(nn.Module):
    """Fully connected Q-network: two ReLU hidden layers of 64 units each.

    Maps a batch of ``input_dim``-sized inputs to ``output_dim`` values.
    """

    def __init__(self, input_dim, output_dim):
        super().__init__()
        hidden_units = 64
        self.fc1 = nn.Linear(input_dim, hidden_units)
        self.fc2 = nn.Linear(hidden_units, hidden_units)
        self.fc3 = nn.Linear(hidden_units, output_dim)

    def forward(self, x):
        """Return the output of the final linear layer (no activation)."""
        first_hidden = self.fc1(x).relu()
        second_hidden = self.fc2(first_hidden).relu()
        return self.fc3(second_hidden)

class DQNAgent:
    """Epsilon-greedy DQN agent with a replay buffer and a target network."""

    def __init__(self, input_dim, action_space, lr=0.001, gamma=0.99, epsilon=1.0, epsilon_min=0.01, epsilon_decay=0.995):
        """Create the online/target networks, optimizer and replay buffer.

        Args:
            input_dim: Size of one observation vector.
            action_space: Number of discrete actions (an integer).
            lr: Adam learning rate.
            gamma: Discount factor used in the bootstrapped target.
            epsilon: Initial probability of taking a random action.
            epsilon_min: Floor that decay will not push ``epsilon`` below.
            epsilon_decay: Factor applied to ``epsilon`` after each training
                step.
        """
        self.action_space = action_space
        self.gamma = gamma
        self.epsilon = epsilon
        self.epsilon_min = epsilon_min
        self.epsilon_decay = epsilon_decay
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        self.model = DQN(input_dim, action_space).to(self.device)
        self.target_model = DQN(input_dim, action_space).to(self.device)
        # Both networks start from identical weights.
        self.target_model.load_state_dict(self.model.state_dict())

        self.optimizer = optim.Adam(self.model.parameters(), lr=lr)
        self.criterion = nn.MSELoss()
        self.memory = deque(maxlen=10000)

    def select_action(self, state):
        """Pick an action: random with probability ``epsilon``, else greedy."""
        if random.random() < self.epsilon:
            return random.randint(0, self.action_space - 1)
        state = torch.tensor(state, dtype=torch.float32, device=self.device).unsqueeze(0)
        with torch.no_grad():
            return torch.argmax(self.model(state)).item()

    def store_experience(self, state, action, reward, next_state, done):
        """Append one transition to the replay buffer."""
        self.memory.append((state, action, reward, next_state, done))

    def train(self, batch_size=32):
        """Run one gradient step on a random minibatch from the replay buffer.

        Does nothing until the buffer holds at least ``batch_size``
        transitions. After the update, ``epsilon`` is decayed but never below
        ``epsilon_min``.
        """
        if len(self.memory) < batch_size:
            return

        batch = random.sample(self.memory, batch_size)
        states, actions, rewards, next_states, dones = zip(*batch)

        # Convert lists of NumPy arrays into a single NumPy array before converting to a tensor
        states = torch.tensor(np.array(states), dtype=torch.float32, device=self.device)
        next_states = torch.tensor(np.array(next_states), dtype=torch.float32, device=self.device)
        actions = torch.tensor(actions, dtype=torch.int64, device=self.device).unsqueeze(1)
        rewards = torch.tensor(rewards, dtype=torch.float32, device=self.device)
        dones = torch.tensor(dones, dtype=torch.float32, device=self.device)

        # squeeze(1) removes only the gathered action axis; a bare squeeze()
        # would also drop the batch axis when batch_size == 1.
        q_values = self.model(states).gather(1, actions).squeeze(1)

        # The target is a constant for the loss, so no graph is built for it.
        with torch.no_grad():
            next_q_values = self.target_model(next_states).max(1)[0]
            targets = rewards + self.gamma * next_q_values * (1 - dones)

        loss = self.criterion(q_values, targets)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        if self.epsilon > self.epsilon_min:
            # Clamp so the last decay step cannot undershoot the floor.
            self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)

    def update_target_model(self):
        """Copy the online network's weights into the target network."""
        self.target_model.load_state_dict(self.model.state_dict())


In [None]:
import gymnasium as gym
import time

# Train a DQN agent on the football environment.
env = FootballEnv()
agent = DQNAgent(input_dim=env.observation_space.shape[0], action_space=env.action_space.n)

num_episodes = 1000
batch_size = 32
target_update_freq = 10  # Update target model every 10 episodes
max_steps_per_episode = 500  # Cap so an episode that never scores cannot stall training

for episode in range(num_episodes):
    state, _ = env.reset()
    total_reward = 0
    done = False
    steps = 0

    while not done:
        action = agent.select_action(state)
        next_state, reward, terminated, truncated, _ = env.step(action)
        steps += 1

        # Cut the episode off at the step limit; the environment itself does
        # not impose one.
        truncated = truncated or steps >= max_steps_per_episode
        done = terminated or truncated

        # Only a real terminal state should stop bootstrapping in the target,
        # so the stored flag is `terminated`, not the time-limit cut-off.
        agent.store_experience(state, action, reward, next_state, terminated)
        agent.train(batch_size)

        state = next_state
        total_reward += reward

    if episode % target_update_freq == 0:
        agent.update_target_model()

    print(f"Episode {episode + 1}, Total Reward: {total_reward}, Epsilon: {agent.epsilon:.4f}")

env.close()