In [1]:
import requests
import pandas as pd
from tqdm import tqdm

In [2]:
def time_to_seconds(timestr):
    """Convert a lap-time string to total seconds as a float.

    Accepts ``'SS.sss'``, ``'M:SS.sss'`` and ``'H:MM:SS.sss'`` style strings;
    every field before the last is read as a whole number and the last field
    as (possibly fractional) seconds.

    Parameters
    ----------
    timestr : str or None
        Time text such as ``'1:23.456'``. ``None`` and pandas missing values
        (``NaN``/``NaT``/``pd.NA``) are treated as "no time set".

    Returns
    -------
    float or None
        Total seconds, or ``None`` when the value is missing, is not a
        string, or cannot be parsed.
    """
    if timestr is None or pd.isna(timestr):
        return None
    try:
        # The last field is seconds; any leading fields are minutes, hours, ...
        # A time without a colon (under one minute) has no leading fields.
        *whole_fields, seconds_field = timestr.split(':')
        whole_units = 0
        for field in whole_fields:
            whole_units = whole_units * 60 + int(field)
        return whole_units * 60 + float(seconds_field)
    except (AttributeError, TypeError, ValueError):
        # AttributeError: non-string input (no .split); ValueError: malformed
        # text. Anything else is a genuine bug and should surface.
        return None


In [3]:
def _fetch_races(session, url, timeout):
    """GET ``url`` and return the ``MRData.RaceTable.Races`` list from its JSON body."""
    response = session.get(url, timeout=timeout)
    # Surface HTTP errors (e.g. rate limiting) instead of a confusing JSON decode error.
    response.raise_for_status()
    return response.json()['MRData']['RaceTable']['Races']


def get_f1_data(seasons=(2022, 2023, 2024), timeout=30):
    """Download race results and qualifying times, one row per driver per race.

    Parameters
    ----------
    seasons : iterable of int
        Championship years to download.
    timeout : float
        Seconds to wait for each HTTP request before giving up.

    Returns
    -------
    pandas.DataFrame
        Columns: season, round, race_name, circuit, date, driver, driver_id,
        constructor, grid, position, status, points, Q1, Q2, Q3. The Q columns
        are seconds (via ``time_to_seconds``) or ``None`` when no time exists.

    Notes
    -----
    A round that raises (network error, HTTP error, unexpected payload) is
    reported with ``print`` and skipped; the download carries on.
    """
    # NOTE(review): the Ergast service was announced as deprecated after the
    # 2024 season -- confirm this endpoint is still reachable before re-running.
    base_url = "https://ergast.com/api/f1"
    all_data = []

    # One session reuses the underlying connection across all requests.
    with requests.Session() as session:
        for season in tqdm(seasons):
            for round_num in range(1, 25):  # max 24 races per season
                round_url = f"{base_url}/{season}/{round_num}"
                try:
                    races = _fetch_races(session, f"{round_url}/results.json?limit=100", timeout)
                    if not races:
                        # Rounds are numbered consecutively, so the first empty
                        # round means the season has no further results.
                        break
                    race = races[0]

                    # Only ask for qualifying once we know the race exists.
                    qual_races = _fetch_races(session, f"{round_url}/qualifying.json?limit=100", timeout)
                    qual_results = {}
                    if qual_races:
                        qual_results = {
                            q['Driver']['driverId']: q
                            for q in qual_races[0]['QualifyingResults']
                        }

                    for result in race['Results']:
                        driver = result['Driver']
                        constructor = result['Constructor']
                        driver_id = driver['driverId']

                        # Drivers without a qualifying entry get None for every session.
                        q_data = qual_results.get(driver_id, {})

                        all_data.append({
                            'season': season,
                            'round': int(race['round']),
                            'race_name': race['raceName'],
                            'circuit': race['Circuit']['circuitName'],
                            'date': race['date'],
                            'driver': f"{driver['givenName']} {driver['familyName']}",
                            'driver_id': driver_id,
                            'constructor': constructor['name'],
                            'grid': int(result['grid']),
                            'position': int(result['position']),
                            'status': result['status'],
                            'points': float(result['points']),
                            'Q1': time_to_seconds(q_data.get('Q1')),
                            'Q2': time_to_seconds(q_data.get('Q2')),
                            'Q3': time_to_seconds(q_data.get('Q3'))
                        })
                except Exception as e:
                    # Best-effort download: report the failing round and move on.
                    print(f"Error in {season} round {round_num}: {e}")

    return pd.DataFrame(all_data)


In [4]:
# Download the three seasons, persist the untouched result, and preview it.
raw_csv_path = '../data/f1_raw_2022_2024.csv'
df_raw = get_f1_data(seasons=[2022, 2023, 2024])
df_raw.to_csv(raw_csv_path, index=False)
df_raw.head()

100%|████████████████████████████████████████████████████████████████████████████████████| 3/3 [03:44<00:00, 74.87s/it]


Unnamed: 0,season,round,race_name,circuit,date,driver,driver_id,constructor,grid,position,status,points,Q1,Q2,Q3
0,2022,1,Bahrain Grand Prix,Bahrain International Circuit,2022-03-20,Charles Leclerc,leclerc,Ferrari,1,1,Finished,26.0,91.471,90.932,90.558
1,2022,1,Bahrain Grand Prix,Bahrain International Circuit,2022-03-20,Carlos Sainz,sainz,Ferrari,3,2,Finished,18.0,91.567,90.787,90.687
2,2022,1,Bahrain Grand Prix,Bahrain International Circuit,2022-03-20,Lewis Hamilton,hamilton,Mercedes,5,3,Finished,15.0,92.285,91.048,91.238
3,2022,1,Bahrain Grand Prix,Bahrain International Circuit,2022-03-20,George Russell,russell,Mercedes,9,4,Finished,12.0,92.269,91.252,92.216
4,2022,1,Bahrain Grand Prix,Bahrain International Circuit,2022-03-20,Kevin Magnussen,kevin_magnussen,Haas F1 Team,7,5,Finished,10.0,91.955,91.461,91.808


Step 2: Clean the raw data and engineer features

In [6]:
import pandas as pd

# Reload the saved raw download so this step can run without hitting the API again;
# the 'date' column is parsed into datetimes for chronological sorting later on.
raw_csv_path = '../data/f1_raw_2022_2024.csv'
df = pd.read_csv(raw_csv_path, parse_dates=['date'])
df.head()

Unnamed: 0,season,round,race_name,circuit,date,driver,driver_id,constructor,grid,position,status,points,Q1,Q2,Q3
0,2022,1,Bahrain Grand Prix,Bahrain International Circuit,2022-03-20,Charles Leclerc,leclerc,Ferrari,1,1,Finished,26.0,91.471,90.932,90.558
1,2022,1,Bahrain Grand Prix,Bahrain International Circuit,2022-03-20,Carlos Sainz,sainz,Ferrari,3,2,Finished,18.0,91.567,90.787,90.687
2,2022,1,Bahrain Grand Prix,Bahrain International Circuit,2022-03-20,Lewis Hamilton,hamilton,Mercedes,5,3,Finished,15.0,92.285,91.048,91.238
3,2022,1,Bahrain Grand Prix,Bahrain International Circuit,2022-03-20,George Russell,russell,Mercedes,9,4,Finished,12.0,92.269,91.252,92.216
4,2022,1,Bahrain Grand Prix,Bahrain International Circuit,2022-03-20,Kevin Magnussen,kevin_magnussen,Haas F1 Team,7,5,Finished,10.0,91.955,91.461,91.808


In [4]:
# Quick health check of the raw frame: schema, missing values, and value counts.
# DataFrame.info() writes its report itself and returns None, so wrapping it in
# print() only adds a stray "None" line to the output.
df.info()
print(df.isnull().sum())
print(df['status'].value_counts())
print(df['season'].value_counts())

NameError: name 'df' is not defined

In [5]:
# Keep only rows whose status is exactly 'Finished' and that have a finishing position.
# The explicit .copy() makes the result an independent frame, so the column
# assignments in the following cells cannot trigger SettingWithCopyWarning
# or write into a view of the unfiltered data.
# NOTE(review): lapped finishers presumably carry statuses such as '+1 Lap' and
# are dropped by this exact match -- confirm that is intended.
df = df[df['status'] == 'Finished'].dropna(subset=['position']).copy()

NameError: name 'df' is not defined

In [6]:
# Target variable: 1 when the driver finished in the top three, otherwise 0.
finished_on_podium = df['position'] <= 3
df['podium'] = finished_on_podium.astype('int64')

NameError: name 'df' is not defined

In [7]:
# Best qualifying time: the fastest lap across whichever sessions the driver ran
# (missing sessions are skipped by min()).
quali_sessions = ['Q1', 'Q2', 'Q3']
df['qualifying_time'] = df.loc[:, quali_sessions].min(axis='columns')

NameError: name 'df' is not defined

In [8]:
# Encode each circuit name as its integer category code.
circuit_categories = pd.Categorical(df['circuit'])
df['circuit_encoded'] = circuit_categories.codes

NameError: name 'df' is not defined

In [9]:
# Put every driver's races in chronological order so the rolling window looks backwards in time.
df = df.sort_values(by=['driver', 'date'])


def _mean_of_previous_three(positions):
    """Average of the three results before each row; the row's own result is excluded."""
    earlier_results = positions.shift(1)
    return earlier_results.rolling(3).mean()


# Driver form: mean finishing position over the driver's three preceding races.
positions_by_driver = df.groupby('driver')['position']
df['driver_form'] = positions_by_driver.transform(_mean_of_previous_three)

NameError: name 'df' is not defined

In [10]:
# Constructor form: the team's mean finishing position over its three previous races.
#
# Rolling directly over the driver-level rows is wrong here: the frame is ordered
# by driver, so within one constructor the rows are not chronological, and a
# teammate's result from the same (or a later) race can land inside the window.
# Instead, collapse to one value per constructor per race, shift by one race so
# the current race is excluded, roll over races, and map the result back.
race_keys = ['constructor', 'date']

# groupby sorts its keys, so each constructor's races come out in date order.
team_race_avg = df.groupby(race_keys)['position'].mean()

team_form = (
    team_race_avg
    .groupby(level='constructor')
    .transform(lambda x: x.shift(1).rolling(3).mean())
    .rename('constructor_form')
)

# Left join keeps the row order and row count of df (team_form keys are unique),
# so the values can be assigned back positionally.
df['constructor_form'] = (
    df[race_keys].join(team_form, on=race_keys)['constructor_form'].to_numpy()
)

NameError: name 'df' is not defined

In [11]:
# Grid advantage: places gained (positive) or lost (negative) between start and finish.
# NOTE(review): this is derived from the race result itself, so together with
# 'grid' it reveals 'position' -- confirm it is not used as a model input.
df['grid_advantage'] = df['grid'].sub(df['position'])

NameError: name 'df' is not defined

In [12]:
# Columns that must all be present for a row to be kept in the feature set.
required_columns = [
    'podium', 'grid', 'driver_form', 'constructor_form',
    'circuit_encoded', 'grid_advantage', 'qualifying_time',
]
df_clean = df.dropna(subset=required_columns, how='any')

# Persist the engineered features and preview the first rows.
features_csv_path = '../data/f1_features_2022_2024.csv'
df_clean.to_csv(features_csv_path, index=False)
df_clean.head()


NameError: name 'df' is not defined

| Function             | Purpose                                   |
| -------------------- | ----------------------------------------- |
| `groupby('driver')`  | Process each driver's data separately     |
| `transform(...)`     | Return a column the same size as original |
| `x.shift(1)`         | Remove current race from the average      |
| `.rolling(3).mean()` | Average last 3 results (after shift)      |


For each driver, shift their position history by one race (so the current race is excluded), then average the previous 3 results and assign that value as the driver's `driver_form` going into the current race.


In [1]:
# List the circuit names in category order; a name's position is its integer category code.
pd.Categorical(df['circuit']).categories

NameError: name 'df' is not defined