In [1]:
# Import necessary libraries
import pandas as pd
import requests
import time
import os
from dotenv import load_dotenv

In [2]:
# Authentication header required for requests to the football-data.org API.
# The key is read from the API_KEY environment variable, after load_dotenv()
# has had the chance to populate the environment.

load_dotenv()
API_KEY = os.getenv("API_KEY")

# os.getenv returns None when the variable is unset; say so here instead of
# letting every later request go out with an empty token.
if not API_KEY:
    print("Warning: API_KEY is not set; requests to football-data.org will not be authenticated.")

headers = { "X-Auth-Token": API_KEY }

In [3]:
# See all available competitions (leagues) data
url_competitions = "https://api.football-data.org/v4/competitions/"
# A timeout keeps the cell from hanging forever on a stalled connection.
r_competitions = requests.get(url_competitions, headers=headers, timeout=30)
# Fail with the HTTP status instead of a KeyError on "competitions" below.
r_competitions.raise_for_status()
data_competitions = r_competitions.json()

# data_competitions["competitions"] contains all leagues
competitions = data_competitions["competitions"]
df_competitions = pd.json_normalize(competitions)
leagues = df_competitions[["id", "name", "code", "type"]]
leagues

# We will collect match data for the following leagues
# Premier League (England)
# La Liga (Spain)
# Serie A (Italy)
# Bundesliga (Germany)
# Ligue 1 (France)

Unnamed: 0,id,name,code,type
0,2013,Campeonato Brasileiro Série A,BSA,LEAGUE
1,2016,Championship,ELC,LEAGUE
2,2021,Premier League,PL,LEAGUE
3,2001,UEFA Champions League,CL,CUP
4,2018,European Championship,EC,CUP
5,2015,Ligue 1,FL1,LEAGUE
6,2002,Bundesliga,BL1,LEAGUE
7,2019,Serie A,SA,LEAGUE
8,2003,Eredivisie,DED,LEAGUE
9,2017,Primeira Liga,PPL,LEAGUE


In [None]:
# Fetch team names and IDs for one example competition/season
# (competition 2021, season 2024) to inspect the teams payload.
url_teams = "https://api.football-data.org/v4/competitions/2021/teams?season=2024"
# A timeout keeps the cell from hanging forever on a stalled connection.
response_teams = requests.get(url_teams, headers=headers, timeout=30)
# Fail with the HTTP status instead of a KeyError on "teams" below.
response_teams.raise_for_status()
data_teams = response_teams.json()

season_teams = data_teams["teams"]
df_teams = pd.json_normalize(season_teams)

df_teams = df_teams[["id", "name", "shortName", "tla", "crest"]]
df_teams

Unnamed: 0,id,name,shortName,tla,crest
0,57,Arsenal FC,Arsenal,ARS,https://crests.football-data.org/57.png
1,58,Aston Villa FC,Aston Villa,AVL,https://crests.football-data.org/58.png
2,61,Chelsea FC,Chelsea,CHE,https://crests.football-data.org/61.png
3,62,Everton FC,Everton,EVE,https://crests.football-data.org/62.png
4,63,Fulham FC,Fulham,FUL,https://crests.football-data.org/63.png
5,64,Liverpool FC,Liverpool,LIV,https://crests.football-data.org/64.png
6,65,Manchester City FC,Man City,MCI,https://crests.football-data.org/65.png
7,66,Manchester United FC,Man United,MUN,https://crests.football-data.org/66.png
8,67,Newcastle United FC,Newcastle,NEW,https://crests.football-data.org/67.png
9,73,Tottenham Hotspur FC,Tottenham,TOT,https://crests.football-data.org/73.png


## Fetching Data

In [None]:
# Seasons to collect (the last two).
years = [
    2024,
    2023,
]

# Competition IDs to collect. Per the competitions table shown earlier:
# 2021 = Premier League, 2019 = Serie A, 2002 = Bundesliga, 2015 = Ligue 1.
# NOTE(review): 2014 is presumably La Liga -- it is not in the visible table; confirm.
league_ids = [
    2021,
    2019,
    2002,
    2015,
    2014,
]

# Accumulates one DataFrame per team and season.
all_matches = list()

def _add_team_perspective(df_team_matches, team_shortname):
    """Add team/opponent/home flag/result/goals columns for one team's matches."""
    home_name = df_team_matches["homeTeam.shortName"]
    away_name = df_team_matches["awayTeam.shortName"]
    home_goals = df_team_matches["score.fullTime.home"]
    away_goals = df_team_matches["score.fullTime.away"]
    winner = df_team_matches["score.winner"]

    is_home = home_name == team_shortname
    team_won = (is_home & (winner == "HOME_TEAM")) | (~is_home & (winner == "AWAY_TEAM"))

    df_team_matches["team"] = team_shortname
    df_team_matches["opponent"] = away_name.where(is_home, home_name)
    df_team_matches["team_is_home"] = is_home.astype(int)
    # Start from 0 ("anything else"), then overwrite draws and wins.
    result = pd.Series(0, index=df_team_matches.index)
    df_team_matches["result"] = result.mask(winner == "DRAW", 2).mask(team_won, 1)
    df_team_matches["gf"] = home_goals.where(is_home, away_goals)
    df_team_matches["ga"] = away_goals.where(is_home, home_goals)
    return df_team_matches


def get_teams_and_matches(league_id, year, pause_seconds=9):
    """Collect the matches of every team in one league season.

    Requests the team list for ``league_id`` / ``year`` from the
    football-data.org v4 API, then requests each team's matches in that
    competition and season. For every team that has matches, one DataFrame
    is appended to the module-level ``all_matches`` list, with these
    team-perspective columns added:

    * ``team`` / ``opponent`` -- short names of the team and its opponent
    * ``team_is_home`` -- 1 when the team is the home side, else 0
    * ``result`` -- 1 = team won, 2 = draw, 0 = otherwise (a loss, or no
      winner recorded for the match)
    * ``gf`` / ``ga`` -- full-time goals for / against the team

    Parameters
    ----------
    league_id : competition ID used in the API URLs.
    year : season start year passed as the ``season`` query parameter.
    pause_seconds : seconds to sleep after each team's match request to stay
        under the API rate limit (default 9).

    Raises
    ------
    requests.HTTPError
        If the team-list request returns an error status.
    """
    # Fetch team names and IDs
    url_teams = f"https://api.football-data.org/v4/competitions/{league_id}/teams?season={year}"
    response_teams = requests.get(url_teams, headers=headers, timeout=30)
    # Surface the HTTP status directly rather than a KeyError on "teams".
    response_teams.raise_for_status()
    season_teams = response_teams.json()["teams"]

    for team in season_teams:
        team_id = team["id"]
        team_shortname = team.get("shortName")

        url_matches = f"https://api.football-data.org/v4/teams/{team_id}/matches?season={year}&competitions={league_id}"
        response_matches = requests.get(url_matches, headers=headers, timeout=30)
        data_matches = response_matches.json()

        # An error payload has no "matches" key, and an empty list has none of
        # the team/score columns to derive from; skip both, but say so.
        matches = data_matches.get("matches")
        if matches:
            df_team_matches = pd.json_normalize(matches)
            all_matches.append(_add_team_perspective(df_team_matches, team_shortname))
        else:
            print(
                f"No matches for team {team_shortname} (id {team_id}), "
                f"league {league_id}, season {year} "
                f"(HTTP {response_matches.status_code}); skipping."
            )

        time.sleep(pause_seconds)  # avoid hitting API rate limits


# Collecting every league/season pair is slow: get_teams_and_matches pauses
# after each team request. The full run is therefore opt-in; by default only
# one season of competition 2021 is fetched.
FETCH_ALL_LEAGUES = False

if FETCH_ALL_LEAGUES:
    targets = [(league, year) for league in league_ids for year in years]
else:
    targets = [(2021, 2025)]

for league, year in targets:
    get_teams_and_matches(league, year)

# pd.concat raises an unhelpful "No objects to concatenate" on an empty list,
# so report the real problem instead.
if not all_matches:
    raise RuntimeError("No match data was collected; check the API key, the season and the rate limits.")

# Concatenate all match data into a single DataFrame
df_all_matches = pd.concat(all_matches, ignore_index=True)

## Data cleaning and transformation

In [5]:
# List every column of the combined match frame to choose which ones to keep.
df_all_matches.columns

Index(['id', 'utcDate', 'status', 'matchday', 'stage', 'group', 'lastUpdated',
       'referees', 'area.id', 'area.name', 'area.code', 'area.flag',
       'competition.id', 'competition.name', 'competition.code',
       'competition.type', 'competition.emblem', 'season.id',
       'season.startDate', 'season.endDate', 'season.currentMatchday',
       'season.winner', 'homeTeam.id', 'homeTeam.name', 'homeTeam.shortName',
       'homeTeam.tla', 'homeTeam.crest', 'awayTeam.id', 'awayTeam.name',
       'awayTeam.shortName', 'awayTeam.tla', 'awayTeam.crest', 'score.winner',
       'score.duration', 'score.fullTime.home', 'score.fullTime.away',
       'score.halfTime.home', 'score.halfTime.away', 'odds.msg'],
      dtype='object')

In [6]:
# Columns kept for the analysis: kickoff timestamp, match status, matchday,
# competition name, season id, both teams' ids and short names, the winner
# flag and the full-time score. Every other column of df_all_matches is left out.
analysis_columns = [
    "utcDate",
    "status",
    "matchday",
    "competition.name",
    "season.id",
    "homeTeam.id",
    "homeTeam.shortName",
    "awayTeam.id",
    "awayTeam.shortName",
    "score.winner",
    "score.fullTime.home",
    "score.fullTime.away",
]
df = df_all_matches[analysis_columns]

In [10]:
# Keep only finished matches.
# Each match is fetched once for each of its two teams, and the columns
# selected above carry no team-specific information, so every match appears
# twice; drop the exact duplicates.
# Copy last so that df is an independent frame (the filtered result, not the
# unfiltered input, is what later cells assign into), which avoids
# SettingWithCopyWarning.
df = df[df["status"] == "FINISHED"].drop_duplicates().copy()
df.head(20)

Unnamed: 0,utcDate,status,matchday,competition.name,season.id,homeTeam.id,homeTeam.shortName,awayTeam.id,awayTeam.shortName,score.winner,score.fullTime.home,score.fullTime.away
0,2025-08-17T15:30:00Z,FINISHED,1,Premier League,2403,66,Man United,57,Arsenal,AWAY_TEAM,0.0,1.0
1,2025-08-23T16:30:00Z,FINISHED,2,Premier League,2403,57,Arsenal,341,Leeds United,HOME_TEAM,5.0,0.0
38,2025-08-16T11:30:00Z,FINISHED,1,Premier League,2403,58,Aston Villa,67,Newcastle,DRAW,0.0,0.0
39,2025-08-23T14:00:00Z,FINISHED,2,Premier League,2403,402,Brentford,58,Aston Villa,HOME_TEAM,1.0,0.0
76,2025-08-17T13:00:00Z,FINISHED,1,Premier League,2403,61,Chelsea,354,Crystal Palace,DRAW,0.0,0.0
77,2025-08-22T19:00:00Z,FINISHED,2,Premier League,2403,563,West Ham,61,Chelsea,AWAY_TEAM,1.0,5.0
114,2025-08-18T19:00:00Z,FINISHED,1,Premier League,2403,341,Leeds United,62,Everton,HOME_TEAM,1.0,0.0
115,2025-08-24T13:00:00Z,FINISHED,2,Premier League,2403,62,Everton,397,Brighton Hove,HOME_TEAM,2.0,0.0
152,2025-08-16T14:00:00Z,FINISHED,1,Premier League,2403,397,Brighton Hove,63,Fulham,DRAW,1.0,1.0
153,2025-08-24T15:30:00Z,FINISHED,2,Premier League,2403,63,Fulham,66,Man United,DRAW,1.0,1.0


In [None]:
# Shorter, flatter column names; columns not listed here keep their API names.
readable_names = {
    "utcDate": "date",
    "matchday": "week",
    "competition.name": "league",
    "homeTeam.id": "home_id",
    "homeTeam.shortName": "home_name",
    "awayTeam.id": "away_id",
    "awayTeam.shortName": "away_name",
    "score.winner": "winner",
    "score.fullTime.home": "home_goals",
    "score.fullTime.away": "away_goals",
}
df = df.rename(columns=readable_names)

In [34]:
# Normalise naming: one league label and one team label are replaced with
# the spelling used in the rest of the analysis.
df["league"] = df["league"].replace({"Primera Division": "LaLiga"})
for name_column in ("home_name", "away_name"):
    df[name_column] = df[name_column].replace({"Barça": "Barcelona"})

In [35]:
# Parse the kickoff timestamp, then replace it with two derived columns:
# the calendar date and the hour of day (minutes are discarded).
# The source field is utcDate, so both values are in UTC.
kickoff = pd.to_datetime(df["date"])
df = df.assign(
    match_date=kickoff.dt.date,
    match_hour=kickoff.dt.hour,
).drop(columns=["date"])

In [36]:
# Move match_date and match_hour so they sit directly after the first column;
# all other columns keep their relative order.
date_parts = ("match_date", "match_hour")
cols = [name for name in df.columns if name not in date_parts]
new_order = [cols[0], *date_parts, *cols[1:]]

# Apply the new column order
df = df.loc[:, new_order]

In [37]:
# Preview the first rows of the cleaned frame.
# NOTE(review): the saved output of this cell shows a `match_week` column and
# no `status` / `season.id` columns, which does not match the selection and
# rename cells above (they keep `status` and `season.id` and name the column
# `week`) -- the output looks stale; re-run to refresh.
df.head()

Unnamed: 0,match_week,match_date,match_hour,league,home_id,home_name,away_id,away_name,winner,home_goals,away_goals
0,1,2024-08-17,14,Premier League,57,Arsenal,76,Wolverhampton,HOME_TEAM,2,0
1,2,2024-08-24,16,Premier League,58,Aston Villa,57,Arsenal,AWAY_TEAM,0,2
2,3,2024-08-31,11,Premier League,57,Arsenal,397,Brighton Hove,DRAW,1,1
3,4,2024-09-15,13,Premier League,73,Tottenham,57,Arsenal,AWAY_TEAM,0,1
4,5,2024-09-22,15,Premier League,65,Man City,57,Arsenal,DRAW,2,2


In [None]:
# Export the cleaned matches for the predictor.
# to_csv is called with its defaults, so the DataFrame index is written as
# the first, unnamed column of the file.
output_csv = "top5_leagues_matches.csv"
df.to_csv(output_csv)