In [1]:
# Imports
import os
import sys

import numpy as np
import pandas as pd
import requests

In [3]:
# Get race data starting from 1995

def get_race_data(start_year:int=1995, end_year:int=2023, timeout:float=30.0) -> pd.DataFrame:
    """Download the race calendar of every season in a year range from the Ergast API.

    One request is made per season (``<base_url>/<year>.json``). Every race found
    in the response becomes one row of the result.

    Args:
        start_year: First season to fetch (inclusive).
        end_year: Last season to fetch (inclusive). The default reproduces the
            range that used to be hard-coded in this function (up to 2023).
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        DataFrame with the columns season, round, raceName, date, time,
        circuitId, circuit, location, country, long and lat. ``time`` is the
        string 'N/A' when the API does not provide it. Seasons or races that
        could not be fetched or parsed are reported on stdout and skipped.
    """
    base_url = "http://ergast.com/api/f1"
    all_races = []

    for year in range(start_year, end_year + 1):
        year_url = f"{base_url}/{year}.json"

        # A timeout keeps a stalled connection from hanging the notebook, and a
        # network error for one season must not discard the seasons already fetched.
        try:
            response = requests.get(year_url, timeout=timeout)
        except requests.RequestException as exc:
            print(f"Failed to fetch data for year {year}: {exc}")
            continue

        if response.status_code != 200:
            print(f"Failed to fetch data for year {year}")
            continue

        try:
            races = response.json()['MRData']['RaceTable']['Races']
        except (KeyError, ValueError):  # ValueError: body was not valid JSON
            print(f"Data format error for year {year}")
            continue

        for race in races:
            # Handle malformed entries per race so one bad record does not
            # silently drop the remaining races of the season.
            try:
                circuit = race['Circuit']
                location = circuit['Location']
                race_info = {
                    'season': race['season'],
                    'round': race['round'],
                    'raceName': race['raceName'],
                    'date': race['date'],
                    'time': race.get('time', 'N/A'),  # 'time' might not be available for all races
                    'circuitId': circuit['circuitId'],
                    'circuit': circuit['circuitName'],
                    'location': location['locality'],
                    'country': location['country'],
                    'long': location['long'],
                    'lat': location['lat'],
                }
            except KeyError:
                print(f"Data format error for year {year}")
                continue
            all_races.append(race_info)

    return pd.DataFrame(all_races)

# Fetch the race calendar and persist it as CSV
race_data_df = get_race_data()
# DataFrame.to_csv does not create missing parent directories, so make sure
# the output folder exists before writing (no-op when it is already there).
os.makedirs("1995_data", exist_ok=True)
race_data_df.to_csv("1995_data/Race_Information_1995_2023.csv")

In [None]:
# Fetch Race results for all races
# This step will take time
# Last time this function took: ~15 minutes to complete
def fetch_race_results(race_data_df, timeout=30.0):
    """Fetch the result table of every race listed in ``race_data_df``.

    One request is made per row (``.../<season>/<round>/results.json``).

    Args:
        race_data_df: DataFrame with at least the columns 'season' and 'round'.
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        List with one DataFrame per successfully fetched race. Each frame holds
        the entries of that race's 'Results' list plus 'season' and 'round'
        columns copied from the input row. Races that could not be fetched or
        parsed are reported on stdout and left out of the list.
    """
    race_dfs = []  # List to store individual race DataFrames

    for _, row in race_data_df.iterrows():
        season = row['season']
        round_num = row['round']
        sys.stdout.write(f"\rFetching results for Season: {season}, Round: {round_num}")
        sys.stdout.flush()
        results_url = f"http://ergast.com/api/f1/{season}/{round_num}/results.json"

        # Messages start with a newline so they do not overwrite the "\r" progress line.
        try:
            response = requests.get(results_url, timeout=timeout)
        except requests.RequestException as exc:
            # A single network failure must not throw away the races already fetched.
            print(f"\nFailed to fetch data for season {season}, round {round_num}: {exc}")
            continue

        if response.status_code != 200:
            print(f"\nFailed to fetch data for season {season}, round {round_num}")
            continue

        try:
            results = response.json()['MRData']['RaceTable']['Races'][0]['Results']
        except (KeyError, IndexError, ValueError):
            # IndexError: 'Races' is empty (no result available for this round);
            # ValueError: the body was not valid JSON.
            print(f"\nData format error for season {season}, round {round_num}")
            continue

        results_df = pd.DataFrame(results)
        results_df['season'] = season
        results_df['round'] = round_num
        race_dfs.append(results_df)

    return race_dfs

race_results_data_df_list = fetch_race_results(race_data_df)
# Stack the per-race frames into a single table with a fresh 0..n-1 index
race_results_data_df = pd.concat(race_results_data_df_list, ignore_index=True)
# DataFrame.to_csv does not create missing parent directories, so make sure
# the output folder exists before writing (no-op when it is already there).
os.makedirs("1995_data", exist_ok=True)
race_results_data_df.to_csv("1995_data/Race_Results_1995_2023.csv")

In [None]:
# Fetch driver information 

def fetch_all_f1_drivers(limit: int = 30, timeout: float = 30.0) -> pd.DataFrame:
    """Fetch the full driver list from the Ergast API, page by page.

    Pages are requested with ``?limit=<limit>&offset=<offset>`` until the offset
    reaches the 'total' count reported by the API.

    Args:
        limit: Number of drivers requested per page. Must be positive.
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        DataFrame with one row per entry of the API's 'Drivers' lists. If a
        request fails, the failure is reported on stdout and the drivers
        collected so far are returned.

    Raises:
        ValueError: If ``limit`` is not positive (the offset would never advance).
    """
    if limit <= 0:
        raise ValueError("limit must be a positive integer")

    base_url = "http://ergast.com/api/f1/drivers.json"
    all_drivers = []
    offset = 0

    while True:
        url = f"{base_url}?limit={limit}&offset={offset}"

        # Treat a network error like a non-200 answer: report it and stop,
        # keeping the pages that were already downloaded.
        try:
            response = requests.get(url, timeout=timeout)
        except requests.RequestException as exc:
            print(f"Failed to fetch data at offset {offset}: {exc}")
            break

        if response.status_code != 200:
            print(f"Failed to fetch data at offset {offset}")
            break

        drivers_data = response.json()
        all_drivers.extend(drivers_data['MRData']['DriverTable']['Drivers'])

        # Check if there are more pages of data
        total_drivers = int(drivers_data['MRData']['total'])
        offset += limit
        if offset >= total_drivers:
            break

    return pd.DataFrame(all_drivers)

# Fetch all F1 drivers
all_drivers_df = fetch_all_f1_drivers()
# DataFrame.to_csv does not create missing parent directories, so make sure
# the output folder exists before writing (no-op when it is already there).
os.makedirs("1995_data", exist_ok=True)
all_drivers_df.to_csv("1995_data/drivers_information.csv")

In [None]:
# Fetch circuits information

def fetch_all_f1_circuits(limit: int = 30, timeout: float = 30.0) -> pd.DataFrame:
    """Fetch the full circuit list from the Ergast API, page by page.

    Pages are requested with ``?limit=<limit>&offset=<offset>`` until the offset
    reaches the 'total' count reported by the API.

    Args:
        limit: Number of circuits requested per page. Must be positive.
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        DataFrame with one row per entry of the API's 'Circuits' lists. If a
        request fails, the failure is reported on stdout and the circuits
        collected so far are returned.

    Raises:
        ValueError: If ``limit`` is not positive (the offset would never advance).
    """
    if limit <= 0:
        raise ValueError("limit must be a positive integer")

    base_url = "http://ergast.com/api/f1/circuits.json"
    all_circuits = []
    offset = 0

    while True:
        url = f"{base_url}?limit={limit}&offset={offset}"

        # Treat a network error like a non-200 answer: report it and stop,
        # keeping the pages that were already downloaded.
        try:
            response = requests.get(url, timeout=timeout)
        except requests.RequestException as exc:
            print(f"Failed to fetch data at offset {offset}: {exc}")
            break

        if response.status_code != 200:
            print(f"Failed to fetch data at offset {offset}")
            break

        circuits_data = response.json()
        all_circuits.extend(circuits_data['MRData']['CircuitTable']['Circuits'])

        # Check if there are more pages of data
        total_circuits = int(circuits_data['MRData']['total'])
        offset += limit
        if offset >= total_circuits:
            break

    return pd.DataFrame(all_circuits)

# Fetch all F1 circuits
all_circuits_df = fetch_all_f1_circuits()
# DataFrame.to_csv does not create missing parent directories, so make sure
# the output folder exists before writing (no-op when it is already there).
os.makedirs("1995_data", exist_ok=True)
all_circuits_df.to_csv("1995_data/circuits_information.csv")

In [None]:
# Fetch the driver standings after every race (only the driverStandings endpoint is queried here)

def fetch_all_driver_standings(race_data_df:pd.DataFrame, timeout:float=30.0) -> pd.DataFrame:
    """Fetch the driver standings after every race listed in ``race_data_df``.

    One request is made per row (``.../<season>/<round>/driverStandings.json``).
    The nested response is flattened so that each row of the result describes
    one driver's standing after one race.

    Args:
        race_data_df: DataFrame with at least the columns 'season' and 'round'.
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        DataFrame holding the fields of each standings list (other than
        'DriverStandings'), the fields of each 'DriverStandings' entry and the
        fields of its 'Driver' object. The 'Constructors' value is expanded
        with ``pd.Series`` as well; presumably it is a list, in which case its
        items end up in integer-named columns (0, 1, ...) -- TODO confirm.
        An empty DataFrame is returned when no standings could be fetched.
        Rounds that could not be fetched or parsed are reported on stdout.
    """
    BASE_URL = "http://ergast.com/api/f1"
    # NOTE(review): no explicit ?limit= is sent, so the API's default page size
    # applies; standings with more entries than that may be truncated -- confirm.
    standings_lists = []
    for _, row in race_data_df.iterrows():
        season = row['season']
        round_num = row['round']
        results_url = f"{BASE_URL}/{season}/{round_num}/driverStandings.json"
        sys.stdout.write(f"\rFetching results for Season: {season}, Round: {round_num}.")
        sys.stdout.flush()

        try:
            response = requests.get(results_url, timeout=timeout)
        except requests.RequestException as exc:
            # A single network failure must not throw away the rounds already fetched.
            print(f"\nFailed to fetch data for season {season}, round {round_num}: {exc}")
            continue

        if response.status_code != 200:
            print(f"\nFailed to fetch data for season {season}, round {round_num}")
            continue

        try:
            race_standings = response.json()['MRData']['StandingsTable']['StandingsLists']
        except (KeyError, ValueError):  # ValueError: body was not valid JSON
            print(f"\nData format error for season {season}, round {round_num}")
            continue

        # Flatten right away: rounds without a standings list contribute nothing
        # instead of producing an empty row that breaks the expansion below.
        standings_lists.extend(race_standings)

    if not standings_lists:
        return pd.DataFrame()

    # One row per standings list, then one row per driver entry
    df = pd.DataFrame(standings_lists)
    df = df.explode('DriverStandings', ignore_index=True)

    # Expand the nested objects into their own columns
    df = df.join(df['DriverStandings'].apply(pd.Series))
    df = df.join(df['Driver'].apply(pd.Series))
    df = df.join(df['Constructors'].apply(pd.Series))

    # Drop the nested source columns. The previous version also listed
    # 'StandingsList' here; that label is never a column at this point, so
    # DataFrame.drop raised KeyError after all the data had been downloaded.
    return df.drop(columns=['DriverStandings', 'Driver', 'Constructors'])

driver_standing_df = fetch_all_driver_standings(race_data_df)
# DataFrame.to_csv does not create missing parent directories, so make sure
# the output folder exists before writing (no-op when it is already there).
os.makedirs("1995_data", exist_ok=True)
driver_standing_df.to_csv("1995_data/driver_standings.csv")

In [None]:
# # DRAFT (unfinished, does not run as written): fetch driver and constructor standing data for each season/race
# # This step will take time
# # Last time this function took: __ minutes to complete

# def fetch_all_standings(race_data_df:pd.DataFrame,driver_standings:bool=True) -> list[pd.DataFrame]:
#     base_url = "http://ergast.com/api/f1"
#     all_standings = []
#     for _,xxx row in race_data_df.iterrows():
#         season = row['season']
#         round_num = row['round']
#         if driver_standings:
#             results_url = f"{base_url}/{season}/{round_num}/driverStandings.json"
#         else:
#             results_url = f"{base_url}/{season}/{round_num}/constructorStandings.json"
#         sys.stdout.write(f"\rFetching results for Season: {season}, Round: {round_num}.")
#         sys.stdout.flush()
#         response = requests.get(results_url)

#         if response.status_code == 200:
#             results_data = response.json()
#             try:
#                 standings = results_data['MRData']['StandingsTable']['StandingsLists']
#                 pos_info = {
#                     "season": standings['season']
#                     "round": standings['round']
#                 }
#                 for position in standings:
#                     pos_info['position'] = position['position'],
#                     pos_info['points'] = position['points'],
#                     pos_info['wins'] = position['wins']

#                     driver_dict = position['Driver']
#                     pos_info['driverId'] = driver_dict['driverId'],
#                     pos_info['driverName'] = driver_dict['driverId']['givenName'] + driver_dict['driverId']['familyName'],
#                     pos_info['driverCode'] = driver_dict['code'],
                    
#                     constructor_dict = 
#                     pos_info['constructorId'] = position['Constructors']['constructorId'],
#                     pos_info['constructorName'] = position['Constructors']['name'],
#                     pos_info['constructorNationality'] = position[]
                    
#                     all_standings.append(pos_info)


#                 results_df = pd.DataFrame(results)
#                 results_df['season'] = season
#                 results_df['round'] = round_num
#                 standing_dfs.append(results_df)
#             except KeyError:
#                     print(f"Data format error for season {season}, round {round_num}")
#         else:
#             print(f"Failed to fetch data for season {season}, round {round_num}")

#     return standing_dfs

# # Fetch all driver and constructor standing data
# driver_standing_list = fetch_all_standings(race_data_df)
# driver_standing_df = pd.concat(driver_standing_list)
# driver_standing_df.to_csv("1995_data/driver_standings.csv")

# constructor_standing_list = fetch_all_standings(race_data_df,driver_standings=False)
# constructor_standing_df = pd.concat(constructor_standing_list)
# constructor_standing_df.to_csv("1995_data/constructor_standings.csv")