## Import libraries

In [1]:
import requests
import pandas as pd
import time

## API key and Premier League ID

In [2]:
# Read the API key from a local file so it is never hard-coded in the notebook.
# strip() removes every leading/trailing whitespace character (including the
# '\r' left behind by CRLF line endings), which would otherwise end up inside
# the HTTP header value.
with open('api_key.txt', 'r') as file:
    api_key = file.read().strip()

# Root URL that every endpoint path below is appended to
base_url = 'https://v3.football.api-sports.io/'

In [3]:
# Authentication header sent with every request made in this notebook
headers = {'x-apisports-key': api_key}

In [4]:
# Look up the Premier League's numeric ID via the leagues endpoint
leagues_endpoint = f'{base_url}leagues'

# Filter by both name and country so that only the English league matches
params = {
    'name': 'Premier League',
    'country': 'England'
}

response = requests.get(leagues_endpoint, headers=headers, params=params)
# Surface HTTP-level failures here instead of as a confusing JSON/KeyError below
response.raise_for_status()
leagues = response.json()

# An empty result list would otherwise show up as an opaque IndexError on [0]
if not leagues.get('response'):
    raise RuntimeError(
        f"League lookup returned no results (API errors: {leagues.get('errors')})"
    )

# Extract the league ID from the first match
premier_league_id = leagues['response'][0]['league']['id']
print(f"Premier League ID: {premier_league_id}")

Premier League ID: 39


In [5]:
# The first lookup result carries the list of seasons the API covers
league_info = leagues['response'][0]
available_seasons = league_info['seasons']

# One line per season: year, date range, and a marker on the current one
print("\nAvailable Seasons for Premier League:")
for season in available_seasons:
    current_flag = ' (Current Season)' if season['current'] else ''
    start_date, end_date = season.get('start', 'N/A'), season.get('end', 'N/A')
    print(f"- {season['year']}: {start_date} to {end_date}{current_flag}")


Available Seasons for Premier League:
- 2010: 2010-08-14 to 2011-05-17
- 2011: 2011-08-13 to 2012-05-13
- 2012: 2012-08-18 to 2013-05-19
- 2013: 2013-08-17 to 2014-05-11
- 2014: 2014-08-16 to 2015-05-24
- 2015: 2015-08-08 to 2016-05-17
- 2016: 2016-08-13 to 2017-05-21
- 2017: 2017-08-11 to 2018-05-13
- 2018: 2018-08-10 to 2019-05-12
- 2019: 2019-08-09 to 2020-07-26
- 2020: 2020-09-12 to 2021-05-23
- 2021: 2021-08-13 to 2022-05-22
- 2022: 2022-08-05 to 2023-05-28
- 2023: 2023-08-11 to 2024-05-19
- 2024: 2024-08-16 to 2025-05-25 (Current Season)


## Retrieve the list of available statistics

In [6]:
# Season to explore, given as the year in which the season starts
season = '2023'

# Endpoint to get fixtures
fixtures_endpoint = f'{base_url}fixtures'

# Parameters to filter fixtures
params = {
    'league': premier_league_id,
    'season': season
}

# Fetch fixtures
response = requests.get(fixtures_endpoint, headers=headers, params=params)
fixtures = response.json()

# Check if fixtures are available (.get also covers a payload without 'response')
if not fixtures.get('response'):
    print("No fixtures found for the selected season and league.")
else:
    # Pick the first fixture to explore stats
    first_fixture = fixtures['response'][0]
    first_fixture_id = first_fixture['fixture']['id']

    # Extract and print general fixture information
    fixture_info = first_fixture['fixture']
    teams_info = first_fixture['teams']
    goals_info = first_fixture['goals']

    print(f"\nFixture ID: {first_fixture_id}")
    print(f"Date: {fixture_info['date']}")
    print(f"Venue: {fixture_info['venue']['name']} ({fixture_info['venue']['city']})")
    # `or` also covers a 'referee' key that is present but null, which a
    # .get() default alone would not replace
    print(f"Referee: {fixture_info.get('referee') or 'N/A'}")
    print(f"{teams_info['home']['name']} vs {teams_info['away']['name']}")
    print(f"Score: {goals_info['home']} - {goals_info['away']}")

    # Determine match result; goals are None for a fixture without a score,
    # and comparing None values would raise a TypeError
    home_goals, away_goals = goals_info['home'], goals_info['away']
    if home_goals is None or away_goals is None:
        match_result = "Not played yet"
    elif home_goals > away_goals:
        match_result = f"{teams_info['home']['name']} Wins"
    elif home_goals < away_goals:
        match_result = f"{teams_info['away']['name']} Wins"
    else:
        match_result = "Draw"
    print(f"Result: {match_result}")

    # Endpoint to get fixture statistics
    stats_endpoint = f'{base_url}fixtures/statistics'
    stats_params = {'fixture': first_fixture_id}

    stats_response = requests.get(stats_endpoint, headers=headers, params=stats_params)
    stats = stats_response.json()

    # List every statistic type the API reports for each team in this fixture
    print(f"\nAvailable statistics for fixture ID {first_fixture_id}:")
    for team_stats in stats.get('response', []):
        team_name = team_stats['team']['name']
        print(f"\nStats for {team_name}:")
        for stat in team_stats['statistics']:
            print(f"{stat['type']}: {stat['value']}")



Fixture ID: 1035037
Date: 2023-08-11T19:00:00+00:00
Venue: Turf Moor (Burnley)
Referee: C. Pawson
Burnley vs Manchester City
Score: 0 - 3
Result: Manchester City Wins

Available statistics for fixture ID 1035037:

Stats for Burnley:
Shots on Goal: 1
Shots off Goal: 3
Total Shots: 6
Blocked Shots: 2
Shots insidebox: 5
Shots outsidebox: 1
Fouls: 11
Corner Kicks: 6
Offsides: 0
Ball Possession: 34%
Yellow Cards: None
Red Cards: 1
Goalkeeper Saves: 5
Total passes: 365
Passes accurate: 290
Passes %: 79%
expected_goals: 0.33

Stats for Manchester City:
Shots on Goal: 8
Shots off Goal: 4
Total Shots: 17
Blocked Shots: 5
Shots insidebox: 14
Shots outsidebox: 3
Fouls: 8
Corner Kicks: 5
Offsides: 1
Ball Possession: 66%
Yellow Cards: None
Red Cards: 0
Goalkeeper Saves: 1
Total passes: 706
Passes accurate: 634
Passes %: 90%
expected_goals: 2.08


## Get Fixtures and Statistics

### Parameters

In [43]:
# Output CSV paths, one per season (2021/22 through 2023/24)
csv_output_file_2021, csv_output_file_2022, csv_output_file_2023 = (
    "../data/API-Football/API-Football-Data-2021:22.csv",
    "../data/API-Football/API-Football-Data-2022:23.csv",
    "../data/API-Football/API-Football-Data-2023:24.csv",
)

# Self-assignment: changes nothing, but raises NameError right here if the
# league lookup cell has not been run in this kernel
premier_league_id = premier_league_id

# Statistic types (as named by the API) to keep for each team
desired_stats = [
    "Shots on Goal",
    "Shots off Goal",
    "Total Shots",
    "Blocked Shots",
    "Shots insidebox",
    "Shots outsidebox",
    "Fouls",
    "Corner Kicks",
    "Offsides",
    "Ball Possession",
    "Yellow Cards",
    "Red Cards",
    "Goalkeeper Saves",
    "Total passes",
    "Passes accurate",
    "Passes %",
    "expected_goals",
]

In [18]:
def _match_result(home_goals, away_goals):
    """Label a final score as 'Home Win', 'Away Win' or 'Draw'."""
    if home_goals > away_goals:
        return 'Home Win'
    if home_goals < away_goals:
        return 'Away Win'
    return 'Draw'


def _team_stat_columns(team_stats, prefix):
    """Flatten one team's statistics into '<prefix> <stat>' entries, keeping only desired_stats."""
    return {
        # A statistic reported as None is stored as 0
        f"{prefix} {stat['type']}": (stat['value'] if stat['value'] is not None else 0)
        for stat in team_stats['statistics']
        if stat['type'] in desired_stats
    }


def get_season_data(season_in):
    """Collect match details and team statistics for every scored fixture of a season.

    Uses the module-level ``base_url``, ``headers``, ``premier_league_id`` and
    ``desired_stats``. One statistics request is made per fixture, followed by a
    one-second pause.

    Args:
        season_in: Season identifier sent as the ``season`` query parameter
            (e.g. ``'2023'``).

    Returns:
        A list with one dict per fixture that has a score. Each dict holds the
        basic match details, a ``'Result'`` label and ``'Home <stat>'`` /
        ``'Away <stat>'`` entries for the desired statistics the API returned.
        The list is empty when no fixtures are found.
    """
    # One dict per match is accumulated here
    match_data = []

    # Fetch all fixtures for the requested season
    fixtures_endpoint = f'{base_url}fixtures'
    params = {
        'league': premier_league_id,
        'season': season_in
    }

    response = requests.get(fixtures_endpoint, headers=headers, params=params)
    fixtures = response.json()

    # .get() treats a payload without a 'response' key like an empty result
    if not fixtures.get('response'):
        print(f"No fixtures found for the {season_in} season.")
    else:
        print(f"Found {len(fixtures['response'])} fixtures. Gathering data...")

        for fixture in fixtures['response']:
            fixture_id = fixture['fixture']['id']
            fixture_info = fixture['fixture']
            teams_info = fixture['teams']
            goals_info = fixture['goals']

            # Fixtures without a score are skipped (no request, no pause)
            if goals_info['home'] is None or goals_info['away'] is None:
                continue

            # Basic match details
            match_details = {
                'Fixture ID': fixture_id,
                'Date': fixture_info['date'],
                # `or` also replaces a 'referee' key that is present but null
                'Referee': fixture_info.get('referee') or 'N/A',
                'Venue': fixture_info['venue']['name'],
                'City': fixture_info['venue']['city'],
                'Home Team': teams_info['home']['name'],
                'Away Team': teams_info['away']['name'],
                'Home Team Goals': goals_info['home'],
                'Away Team Goals': goals_info['away']
            }
            match_details['Result'] = _match_result(goals_info['home'], goals_info['away'])

            # Fetch match statistics
            stats_endpoint = f'{base_url}fixtures/statistics'
            stats_params = {'fixture': fixture_id}
            stats_response = requests.get(stats_endpoint, headers=headers, params=stats_params)
            stats_data = stats_response.json()

            home_id = teams_info['home'].get('id')
            for team_stats in stats_data.get('response', []):
                team = team_stats['team']
                # Prefer matching on team id; fall back to the name when an id
                # is missing. A name-only match would label both teams 'Away'
                # if the two endpoints ever spelled the home team differently.
                if home_id is not None and team.get('id') is not None:
                    is_home = team['id'] == home_id
                else:
                    is_home = team['name'] == match_details['Home Team']
                prefix = 'Home' if is_home else 'Away'

                match_details.update(_team_stat_columns(team_stats, prefix))

            # Append match data
            match_data.append(match_details)

            # To avoid hitting API rate limits
            time.sleep(1)

    return match_data

In [50]:
# Download each season in turn (one request per fixture, so this is slow)
match_data_2021 = get_season_data(season_in='2021')
match_data_2022 = get_season_data(season_in='2022')
match_data_2023 = get_season_data(season_in='2023')

Found 380 fixtures. Gathering data...
Found 380 fixtures. Gathering data...
Found 380 fixtures. Gathering data...


In [51]:
# Convert to DataFrame: one row per match of the 2021 season
df_21 = pd.DataFrame(match_data_2021)

# Add, as an all-zero column, any desired statistic that is absent from the
# whole frame (existing columns are left untouched)
for stat in desired_stats:
    for prefix in ['Home', 'Away']:
        col_name = f'{prefix} {stat}'
        if col_name not in df_21.columns:
            df_21[col_name] = 0

# Save to CSV and report the actual destination path
df_21.to_csv(csv_output_file_2021, index=False)
print(f"Data saved to '{csv_output_file_2021}'.")

Data saved to 'csv_output_file_2021'.


In [52]:
# Convert to DataFrame: one row per match of the 2022 season
df_22 = pd.DataFrame(match_data_2022)

# Add, as an all-zero column, any desired statistic that is absent from the
# whole frame (existing columns are left untouched)
for stat in desired_stats:
    for prefix in ['Home', 'Away']:
        col_name = f'{prefix} {stat}'
        if col_name not in df_22.columns:
            df_22[col_name] = 0

# Save to CSV and report the actual destination path
df_22.to_csv(csv_output_file_2022, index=False)
print(f"Data saved to '{csv_output_file_2022}'.")

Data saved to 'csv_output_file_2022'.


In [53]:
# Convert to DataFrame: one row per match of the 2023 season
df_23 = pd.DataFrame(match_data_2023)

# Add, as an all-zero column, any desired statistic that is absent from the
# whole frame (existing columns are left untouched)
for stat in desired_stats:
    for prefix in ['Home', 'Away']:
        col_name = f'{prefix} {stat}'
        if col_name not in df_23.columns:
            df_23[col_name] = 0

# Save to CSV and report the actual destination path
df_23.to_csv(csv_output_file_2023, index=False)
print(f"Data saved to '{csv_output_file_2023}'.")

Data saved to 'csv_output_file_2023'.


In [54]:
# Sanity check: show the first collected 2023 match as a raw dict
print(match_data_2023[0])

{'Fixture ID': 1035037, 'Date': '2023-08-11T19:00:00+00:00', 'Referee': 'C. Pawson', 'Venue': 'Turf Moor', 'City': 'Burnley', 'Home Team': 'Burnley', 'Away Team': 'Manchester City', 'Home Team Goals': 0, 'Away Team Goals': 3, 'Result': 'Away Win', 'Home Shots on Goal': 1, 'Home Shots off Goal': 3, 'Home Total Shots': 6, 'Home Blocked Shots': 2, 'Home Shots insidebox': 5, 'Home Shots outsidebox': 1, 'Home Fouls': 11, 'Home Corner Kicks': 6, 'Home Offsides': 0, 'Home Ball Possession': '34%', 'Home Yellow Cards': 0, 'Home Red Cards': 1, 'Home Goalkeeper Saves': 5, 'Home Total passes': 365, 'Home Passes accurate': 290, 'Home Passes %': '79%', 'Home expected_goals': '0.33', 'Away Shots on Goal': 8, 'Away Shots off Goal': 4, 'Away Total Shots': 17, 'Away Blocked Shots': 5, 'Away Shots insidebox': 14, 'Away Shots outsidebox': 3, 'Away Fouls': 8, 'Away Corner Kicks': 5, 'Away Offsides': 1, 'Away Ball Possession': '66%', 'Away Yellow Cards': 0, 'Away Red Cards': 0, 'Away Goalkeeper Saves': 1, '

## Get stats from seasons prior to 2021

In [None]:
# Output CSV paths, one per season (2014/15 through 2020/21)
(
    csv_output_file_2014,
    csv_output_file_2015,
    csv_output_file_2016,
    csv_output_file_2017,
    csv_output_file_2018,
    csv_output_file_2019,
    csv_output_file_2020,
) = (
    "../data/API-Football/API-Football-Data-2014:15.csv",
    "../data/API-Football/API-Football-Data-2015:16.csv",
    "../data/API-Football/API-Football-Data-2016:17.csv",
    "../data/API-Football/API-Football-Data-2017:18.csv",
    "../data/API-Football/API-Football-Data-2018:19.csv",
    "../data/API-Football/API-Football-Data-2019:20.csv",
    "../data/API-Football/API-Football-Data-2020:21.csv",
)

# Self-assignment: changes nothing, but raises NameError right here if the
# league lookup cell has not been run in this kernel
premier_league_id = premier_league_id

# Statistic types (as named by the API) to keep for each team
desired_stats = [
    "Shots on Goal",
    "Shots off Goal",
    "Total Shots",
    "Blocked Shots",
    "Shots insidebox",
    "Shots outsidebox",
    "Fouls",
    "Corner Kicks",
    "Offsides",
    "Ball Possession",
    "Yellow Cards",
    "Red Cards",
    "Goalkeeper Saves",
    "Total passes",
    "Passes accurate",
    "Passes %",
    "expected_goals",
]

In [11]:
# Download the 2014-2020 seasons in turn (one request per fixture, so this is slow)
match_data_2014 = get_season_data(season_in='2014')
match_data_2015 = get_season_data(season_in='2015')
match_data_2016 = get_season_data(season_in='2016')
match_data_2017 = get_season_data(season_in='2017')
match_data_2018 = get_season_data(season_in='2018')
match_data_2019 = get_season_data(season_in='2019')
match_data_2020 = get_season_data(season_in='2020')

Found 380 fixtures. Gathering data...
Found 380 fixtures. Gathering data...
Found 380 fixtures. Gathering data...
Found 380 fixtures. Gathering data...
Found 380 fixtures. Gathering data...
Found 380 fixtures. Gathering data...
Found 380 fixtures. Gathering data...


In [12]:
def process_season_data(match_data, csv_output_file, stats=None):
    """Turn a season's match records into a DataFrame and write it to CSV.

    Every ``'Home <stat>'`` / ``'Away <stat>'`` column that is absent from the
    whole frame is added and filled with 0, so that all season files share the
    same statistic columns. Existing columns are left untouched.

    Args:
        match_data: List of per-match dicts (as returned by ``get_season_data``).
        csv_output_file: Destination passed to ``DataFrame.to_csv``. When it is
            a filesystem path, missing parent directories are created first.
        stats: Statistic names to guarantee columns for. Defaults to the
            module-level ``desired_stats``.

    Returns:
        None. The CSV is written without the index column.
    """
    import os  # local import keeps this cell self-contained

    if stats is None:
        stats = desired_stats

    df = pd.DataFrame(match_data)

    for stat in stats:
        for prefix in ['Home', 'Away']:
            col_name = f'{prefix} {stat}'
            if col_name not in df.columns:
                df[col_name] = 0

    # to_csv does not create directories; make sure the target folder exists.
    # Skipped for non-path targets such as file-like buffers.
    if isinstance(csv_output_file, (str, os.PathLike)):
        out_dir = os.path.dirname(os.fspath(csv_output_file))
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)

    df.to_csv(csv_output_file, index=False)

In [13]:
# Write one CSV per season for 2014-2020
process_season_data(match_data=match_data_2014, csv_output_file=csv_output_file_2014)
process_season_data(match_data=match_data_2015, csv_output_file=csv_output_file_2015)
process_season_data(match_data=match_data_2016, csv_output_file=csv_output_file_2016)
process_season_data(match_data=match_data_2017, csv_output_file=csv_output_file_2017)
process_season_data(match_data=match_data_2018, csv_output_file=csv_output_file_2018)
process_season_data(match_data=match_data_2019, csv_output_file=csv_output_file_2019)
process_season_data(match_data=match_data_2020, csv_output_file=csv_output_file_2020)

In [14]:
# Output CSV paths, one per season (2010/11 through 2013/14)
(
    csv_output_file_2010,
    csv_output_file_2011,
    csv_output_file_2012,
    csv_output_file_2013,
) = (
    "../data/API-Football/API-Football-Data-2010:11.csv",
    "../data/API-Football/API-Football-Data-2011:12.csv",
    "../data/API-Football/API-Football-Data-2012:13.csv",
    "../data/API-Football/API-Football-Data-2013:14.csv",
)

In [15]:
# Download the 2010-2013 seasons in turn (one request per fixture, so this is slow)
match_data_2010 = get_season_data(season_in='2010')
match_data_2011 = get_season_data(season_in='2011')
match_data_2012 = get_season_data(season_in='2012')
match_data_2013 = get_season_data(season_in='2013')

Found 370 fixtures. Gathering data...
Found 380 fixtures. Gathering data...
Found 380 fixtures. Gathering data...
Found 380 fixtures. Gathering data...


In [16]:
# Write one CSV per season for 2010-2013
process_season_data(match_data=match_data_2010, csv_output_file=csv_output_file_2010)
process_season_data(match_data=match_data_2011, csv_output_file=csv_output_file_2011)
process_season_data(match_data=match_data_2012, csv_output_file=csv_output_file_2012)
process_season_data(match_data=match_data_2013, csv_output_file=csv_output_file_2013)

In [19]:
# 2024 season: choose the output path, download the fixtures, then write the CSV
csv_output_file_2024 = "../data/API-Football/API-Football-Data-2024:25.csv"
match_data_2024 = get_season_data(season_in='2024')
process_season_data(match_data=match_data_2024, csv_output_file=csv_output_file_2024)

Found 380 fixtures. Gathering data...
