In [27]:
# Import necessary libraries
import pandas as pd
import requests
import time
import os
from dotenv import load_dotenv

In [28]:
# Authentication header required for requests to the football-data.org API.
# The key is read from the environment; load_dotenv() is called first so a
# local .env file can supply it.
load_dotenv()
API_KEY = os.getenv("API_KEY")

# os.getenv returns None when the variable is missing. Stop here with a clear
# message rather than letting later cells fail on an opaque API error payload.
if not API_KEY:
    raise RuntimeError(
        "API_KEY is not set. Define it in your environment or in a .env file "
        "before running this notebook."
    )

headers = {"X-Auth-Token": API_KEY}

In [29]:
# List the competitions (leagues and cups) returned by the API
url_competitions = "https://api.football-data.org/v4/competitions/"
# A timeout keeps the cell from hanging indefinitely on a stalled connection
r_competitions = requests.get(url_competitions, headers=headers, timeout=30)
# Surface 4xx/5xx responses here instead of as a KeyError on the payload below
r_competitions.raise_for_status()
data_competitions = r_competitions.json()

# data_competitions["competitions"] holds one record per competition
competitions = data_competitions["competitions"]
df_competitions = pd.json_normalize(competitions)
leagues = df_competitions[["id", "name", "code", "type"]]
leagues

# We will collect match data for the following leagues
# Premier League (England)
# La Liga (Spain)
# Serie A (Italy)
# Bundesliga (Germany)
# Ligue 1 (France)

Unnamed: 0,id,name,code,type
0,2013,Campeonato Brasileiro Série A,BSA,LEAGUE
1,2016,Championship,ELC,LEAGUE
2,2021,Premier League,PL,LEAGUE
3,2001,UEFA Champions League,CL,CUP
4,2018,European Championship,EC,CUP
5,2015,Ligue 1,FL1,LEAGUE
6,2002,Bundesliga,BL1,LEAGUE
7,2019,Serie A,SA,LEAGUE
8,2003,Eredivisie,DED,LEAGUE
9,2017,Primeira Liga,PPL,LEAGUE


## Fetching Data

In [None]:
# Seasons to download (passed to the API as the `season` query parameter)
years = [2024, 2023]

# Competition ids to download
league_ids = [
    2021,  # Premier League
    2019,  # Serie A
    2002,  # Bundesliga
    2015,  # Ligue 1
    2014,  # not in the listing above; presumably Primera Division (La Liga) - TODO confirm
]

# Accumulates one DataFrame of matches per team request
all_matches = []

def get_teams_and_matches(league_id, year, pause_seconds=9, timeout=30):
    """Download every team's matches for one competition and season.

    First requests the teams of ``league_id`` for ``year``, then requests each
    team's matches for that season and competition. Each successful response is
    flattened with ``pd.json_normalize`` and appended to the module-level
    ``all_matches`` list.

    Parameters
    ----------
    league_id : int
        Competition id used in the API URL.
    year : int
        Value sent as the ``season`` query parameter.
    pause_seconds : float, default 9
        Delay after each team request, to avoid hitting API rate limits.
    timeout : float, default 30
        Per-request timeout in seconds handed to ``requests.get``.

    Raises
    ------
    requests.HTTPError
        If the teams request returns a 4xx/5xx status.
    """
    # Fetch the teams taking part in this competition/season
    url_teams = f"https://api.football-data.org/v4/competitions/{league_id}/teams?season={year}"
    response_teams = requests.get(url_teams, headers=headers, timeout=timeout)
    # Without the team list nothing else can be done, so fail loudly here
    response_teams.raise_for_status()
    season_teams = response_teams.json()["teams"]

    # Iterate over the raw records: an empty team list is then a no-op instead
    # of a KeyError on a column-less DataFrame
    for team in season_teams:
        team_id = team["id"]
        url_matches = f"https://api.football-data.org/v4/teams/{team_id}/matches?season={year}&competitions={league_id}"
        response_matches = requests.get(url_matches, headers=headers, timeout=timeout)
        data_matches = response_matches.json()

        # Keep going when a single team fails, but report it so missing data is visible
        if "matches" in data_matches:
            all_matches.append(pd.json_normalize(data_matches["matches"]))
        else:
            if isinstance(data_matches, dict):
                reason = data_matches.get("message", "no 'matches' key in response")
            else:
                reason = "unexpected response shape"
            print(
                f"Skipped team {team_id} (league {league_id}, season {year}): "
                f"HTTP {response_matches.status_code} - {reason}"
            )

        time.sleep(pause_seconds)  # avoid hitting API rate limits


# Loop through each league and year to collect match data
for league in league_ids:
    for year in years:
        get_teams_and_matches(league, year)

# Concatenate all match data into a single DataFrame
df_all_matches = pd.concat(all_matches, ignore_index=True)

# Matches are fetched per team, so every fixture arrives twice: once in the
# home side's list and once in the away side's. Keep one row per match id.
if "id" in df_all_matches.columns:
    df_all_matches = df_all_matches.drop_duplicates(subset="id", ignore_index=True)

## Data cleaning and transformation

In [30]:
# Inspect the flattened column names produced by pd.json_normalize before choosing a subset
df_all_matches.columns

Index(['id', 'utcDate', 'status', 'matchday', 'stage', 'group', 'lastUpdated',
       'referees', 'area.id', 'area.name', 'area.code', 'area.flag',
       'competition.id', 'competition.name', 'competition.code',
       'competition.type', 'competition.emblem', 'season.id',
       'season.startDate', 'season.endDate', 'season.currentMatchday',
       'season.winner', 'homeTeam.id', 'homeTeam.name', 'homeTeam.shortName',
       'homeTeam.tla', 'homeTeam.crest', 'awayTeam.id', 'awayTeam.name',
       'awayTeam.shortName', 'awayTeam.tla', 'awayTeam.crest', 'score.winner',
       'score.duration', 'score.fullTime.home', 'score.fullTime.away',
       'score.halfTime.home', 'score.halfTime.away', 'odds.msg',
       'season.winner.id', 'season.winner.name', 'season.winner.shortName',
       'season.winner.tla', 'season.winner.crest', 'season.winner.address',
       'season.winner.website', 'season.winner.founded',
       'season.winner.clubColors', 'season.winner.venue',
       'season.winner

In [31]:
# Keep only the fields used for analysis: kickoff timestamp, match week, league,
# both teams (id + short name), the result and the full-time score
df = df_all_matches[
    [
        "utcDate",
        "matchday",
        "competition.name",
        "homeTeam.id",
        "homeTeam.shortName",
        "awayTeam.id",
        "awayTeam.shortName",
        "score.winner",
        "score.fullTime.home",
        "score.fullTime.away",
    ]
]

In [32]:
# Work on an independent copy so the column assignments in later cells do not
# write into a slice of df_all_matches (avoids pandas' SettingWithCopyWarning)
df = df.copy()

In [33]:
# Replace the dotted API field names with short snake_case names
df = df.rename(
    columns={
        "utcDate": "date",
        "matchday": "match_week",
        "competition.name": "league",
        "homeTeam.id": "home_id",
        "homeTeam.shortName": "home_name",
        "awayTeam.id": "away_id",
        "awayTeam.shortName": "away_name",
        "score.winner": "winner",
        "score.fullTime.home": "home_goals",
        "score.fullTime.away": "away_goals",
    }
)

In [34]:
# Normalise one league label and one club short name (home and away columns alike)
df = df.assign(
    league=df["league"].replace({"Primera Division": "LaLiga"}),
    home_name=df["home_name"].replace({"Barça": "Barcelona"}),
    away_name=df["away_name"].replace({"Barça": "Barcelona"}),
)

In [35]:
# Parse the timestamp, derive a calendar date and an hour-of-day column from it,
# then discard the original column.
# NOTE(review): the hour is taken from the parsed value as-is; the source field is
# utcDate, so this is presumably the UTC hour rather than local kickoff time.
df = (
    df.assign(date=lambda frame: pd.to_datetime(frame["date"]))
    .assign(
        match_date=lambda frame: frame["date"].dt.date,
        match_hour=lambda frame: frame["date"].dt.hour,
    )
    .drop(columns=["date"])
)

In [36]:
# Move match_date and match_hour so they sit directly after the first column
cols = [name for name in df.columns if name not in ("match_date", "match_hour")]
new_order = [cols[0], "match_date", "match_hour", *cols[1:]]

# Apply the new column order
df = df[new_order]

In [37]:
# Preview the first rows of the cleaned frame
df.head()

Unnamed: 0,match_week,match_date,match_hour,league,home_id,home_name,away_id,away_name,winner,home_goals,away_goals
0,1,2024-08-17,14,Premier League,57,Arsenal,76,Wolverhampton,HOME_TEAM,2,0
1,2,2024-08-24,16,Premier League,58,Aston Villa,57,Arsenal,AWAY_TEAM,0,2
2,3,2024-08-31,11,Premier League,57,Arsenal,397,Brighton Hove,DRAW,1,1
3,4,2024-09-15,13,Premier League,73,Tottenham,57,Arsenal,AWAY_TEAM,0,1
4,5,2024-09-22,15,Premier League,65,Man City,57,Arsenal,DRAW,2,2


In [None]:
# Save the DataFrame as CSV for the predictor (path is relative to the working directory).
# NOTE(review): with pandas' default index=True the row index is written as an
# unnamed first column; confirm the predictor expects it before switching to index=False.
df.to_csv("top5_leagues_matches.csv")