In [1]:
import fastf1
import pandas as pd
import numpy as np
from tqdm import tqdm
from src.utils import get_lap_data, team_mapping
import logging
import warnings
warnings.filterwarnings("ignore")
logging.getLogger('fastf1').setLevel(logging.ERROR)  # This will disable both logs and warnings
fastf1.Cache.enable_cache('./.cache')  # replace with your cache directory
fastf1.Cache.offline_mode(True)

In [2]:
team_driver_performance = pd.read_csv('.cache/hist_data/team_driver_performance.csv')
Team_Mapping = team_mapping
prac_columns = ['DriverNumber_prac','LapTime_prac', 'Sector1Time_prac', 'Sector2Time_prac', 'Sector3Time_prac', 'SpeedI1_prac', 'SpeedI2_prac', 'TyreLife_prac','Position_prac','Driver_prac']
old_qual_columns = ['LapTime_old_qual', 'Position_old_qual','Sector1Time_old_qual', 'Sector2Time_old_qual', 'Sector3Time_old_qual','Driver_old_qual']

In [None]:
def load_data(race,year):
    no_prac = False
    no_qual = False
    e_name = ''
    # Load sessions 
    try:
        session = fastf1.get_session(year, race, 'Q')
        e_name = session.event.EventName
        session.load()
        lap_qual = get_lap_data(session)
    except Exception as e:
        #print(f"Error loading qualifying session for {race} in {year}: {e}")
        lap_qual = None

    try:
        session_old_qual = fastf1.get_session(year-1,race,'Q')
        session_old_qual.load()
        lap_old_qual = get_lap_data(session_old_qual)
        lap_old_qual.columns = [f'{col}_old_qual' if col not in ['Team'] else col for col in lap_old_qual.columns]
    except Exception as e:
        #print(f"Error loading old qualifying session for {race} in {year-1}: {e}")
        lap_old_qual = pd.DataFrame([[-1]*len(old_qual_columns)]*20,columns=old_qual_columns)  # Empty DataFrame if old qualifying session fails
        lap_old_qual['Driver_old_qual'] = lap_qual['Driver']
        no_qual = True

    try:
        try:
            session_p = fastf1.get_session(year,race,'FP3')
            session_p.load()
            lap_prac = get_lap_data(session_p)
            lap_prac.columns = [f'{col}_prac' if col not in ['Team'] else col for col in lap_prac.columns]
        except:
            session_p = fastf1.get_session(year,race,'SQ')
            session_p.load()
            lap_prac = get_lap_data(session_p)
            lap_prac.columns = [f'{col}_prac' if col not in ['Team'] else col for col in lap_prac.columns]
    except Exception as e:
        #print(f"Error loading practice session for {race} in {year}: {e}")
        lap_prac = pd.DataFrame([[-1]*len(prac_columns)]*20,columns=prac_columns)  # Empty DataFrame if practice session fails
        lap_prac['DriverNumber_prac'] = lap_qual['DriverNumber']
        lap_prac['Driver_prac'] = lap_qual['Driver']
        no_prac = True

    if no_prac and no_qual:
        print("Both practice and qualifying data are missing. Skipping")
        return None, None, None
    else:
        lap_old_qual_clean = lap_old_qual[old_qual_columns]
        lap_prac_clean = lap_prac[prac_columns]
        tdp = team_driver_performance[team_driver_performance['Country'] == e_name]
        if len(tdp) == 0:
            tdp = pd.DataFrame([[-1]*len(tdp.columns)]*20,columns=tdp.columns)
            print(f"Team Driver Performance data missing for {e_name}, filling with -1s.")
            final_lap = (
                lap_prac_clean
                .merge(lap_old_qual_clean, left_on='Driver_prac', right_on='Driver_old_qual')
                .drop(columns=['Driver_prac', 'Driver_old_qual'])
                .set_index('DriverNumber_prac')
            )
            for col in tdp.columns:
                if col not in final_lap.columns and col != 'Name':
                    final_lap[col] = -1
        else:
            final_lap = (
                lap_prac_clean
                .merge(lap_old_qual_clean, left_on='Driver_prac', right_on='Driver_old_qual')
                .merge(
                    tdp,
                    left_on='Driver_prac', right_on='Name'
                )
                .drop(columns=['Driver_prac', 'Driver_old_qual','Country'])
                .set_index('DriverNumber_prac')
            )
        final_lap['race_event'] = [race]*len(final_lap)

        final_lap.drop_duplicates(inplace=True)
        final_lap.drop_duplicates(subset =['Name'],keep='last',inplace=True)
        final_lap.drop(columns = ['Name','Unnamed: 0'], errors='ignore', inplace=True)

        if lap_qual is not None:
            y = lap_qual[['LapTime', 'Position', 'DriverNumber']].set_index('DriverNumber').loc[final_lap.index]
            y_laptime = y.pop('LapTime').to_list()
            y_position = y.pop('Position').to_list()
        else:
            y_laptime = [-1]*len(final_lap)
            y_position = [-1]*len(final_lap)
            
        final_lap['laptime'] = y_laptime
        final_lap['position'] = y_position

        return final_lap

In [4]:
X = load_data(1,2025)

In [5]:
X

Unnamed: 0_level_0,LapTime_prac,Sector1Time_prac,Sector2Time_prac,Sector3Time_prac,SpeedI1_prac,SpeedI2_prac,TyreLife_prac,Position_prac,LapTime_old_qual,Position_old_qual,...,Best_Position_Recent,Worst_Position_Recent,Avg_Position_Recent,Last_Position,Avg_Finish_in_this_circuit,Avg_Team_Finish_in_this_circuit,Team,race_event,laptime,position
DriverNumber_prac,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
81,75.921,26.425,17.2,32.296,288.0,321.0,5.0,13,89.683,12,...,1,7,2,-1,11,9,7,1,75.18,14
63,75.96,26.277,17.173,32.51,289.0,320.0,5.0,14,89.485,14,...,1,13,4,-1,6,6,8,1,75.546,15
1,76.002,26.398,17.11,32.494,288.0,327.0,2.0,18,89.179,19,...,1,8,3,-1,1,2,9,1,75.481,19
16,76.188,26.385,17.249,32.554,288.0,322.0,5.0,10,89.165,7,...,2,11,5,-1,4,4,5,1,75.755,11
55,76.252,26.486,17.287,32.479,288.0,322.0,5.0,15,89.507,15,...,5,19,11,-1,3,15,10,1,75.931,16
23,76.258,26.473,17.253,32.532,288.0,323.0,5.0,1,90.221,1,...,5,20,12,-1,12,15,10,1,75.737,1
44,76.378,26.448,17.426,32.504,288.0,318.0,8.0,8,89.71,5,...,4,16,7,-1,7,4,5,1,75.919,8
22,76.455,26.456,17.267,32.732,287.0,325.0,2.0,17,90.129,18,...,6,20,12,-1,11,2,9,1,75.67,18
4,76.597,26.64,17.353,32.604,285.0,318.0,8.0,11,89.614,9,...,1,7,2,-1,7,9,7,1,75.096,12
10,76.719,26.66,17.364,32.695,282.0,320.0,8.0,6,90.948,4,...,5,20,13,-1,12,11,3,1,75.98,6


In [7]:
import os

import concurrent.futures

error_X = []
X_train = []
y_train_lap = []
y_train_pos = []

pairs = []
for y in [2022, 2023, 2024, 2025]:
    rounds = range(1, 18) if y == 2025 else range(1, 25)
    for r in rounds:
        pairs.append((r, y))

acc_X = []
acc_y1 = []
acc_y2 = []

max_workers = 5
with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as ex:
    fut_to_pair = {ex.submit(load_data, r, y): (r, y) for r, y in pairs}
    for fut in tqdm(concurrent.futures.as_completed(fut_to_pair), total=len(fut_to_pair)):
        r, y = fut_to_pair[fut]
        try:
            X = fut.result()
        except Exception as e:
            print(f"Error for {r} {y}: {e}")
            error_X.append((r, y))
            continue

        # treat empty result
        if X is not None:
            if isinstance(X, np.ndarray) and X.size == 0:
                error_X.append((r, y))
                continue
            if len(X) == 0:
                error_X.append((r, y))
                continue

            acc_X.append(X)

 21%|████████████████████████▎                                                                                         | 19/89 [01:22<02:57,  2.54s/it]

Error loading qualifying session for 23 in 2022: Invalid round: 23
Error loading old qualifying session for 23 in 2021: Invalid round: 23
Error for 23 2022: 'NoneType' object is not subscriptable
Error loading qualifying session for 24 in 2022: Invalid round: 24


 22%|█████████████████████████▌                                                                                        | 20/89 [01:23<02:20,  2.03s/it]

Error loading old qualifying session for 24 in 2021: Invalid round: 24
Error for 24 2022: 'NoneType' object is not subscriptable


 48%|███████████████████████████████████████████████████████                                                           | 43/89 [03:18<01:46,  2.32s/it]

Error loading qualifying session for 23 in 2023: Invalid round: 23
Error loading old qualifying session for 23 in 2022: Invalid round: 23
Error for 23 2023: 'NoneType' object is not subscriptable
Error loading qualifying session for 24 in 2023: Invalid round: 24


 49%|████████████████████████████████████████████████████████▎                                                         | 44/89 [03:19<01:18,  1.74s/it]

Error loading old qualifying session for 24 in 2022: Invalid round: 24
Error for 24 2023: 'NoneType' object is not subscriptable


 58%|██████████████████████████████████████████████████████████████████▌                                               | 52/89 [04:07<02:59,  4.84s/it]

Error loading qualifying session for 9 in 2024: Invalid round: 9


 63%|███████████████████████████████████████████████████████████████████████▋                                          | 56/89 [04:28<02:47,  5.08s/it]

Team Driver Performance data missing for , filling with -1s.
Error for 9 2024: Index(['Name'], dtype='object')


 78%|████████████████████████████████████████████████████████████████████████████████████████▍                         | 69/89 [05:37<01:31,  4.55s/it]

Error loading old qualifying session for 24 in 2023: Invalid round: 24
Error loading old qualifying session for 23 in 2023: Invalid round: 23


 79%|█████████████████████████████████████████████████████████████████████████████████████████▋                        | 70/89 [05:40<01:22,  4.33s/it]

Error loading old qualifying session for 1 in 2024: Invalid round: 1
Both practice and qualifying data are missing. Skipping


 98%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████▍  | 87/89 [07:19<00:10,  5.47s/it]

Error for 15 2025: "['18'] not in index"


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 89/89 [07:21<00:00,  4.96s/it]


In [24]:
X = pd.concat([i for i in acc_X if type(i)!=tuple])
X = X[X['Position_prac']!=-1]
y_train_lap = X['laptime']
y_train_pos = X['position']
X.drop(columns=['laptime','position'],inplace=True)
X_train = X.values

In [25]:
X.shape, len(y_train_lap), len(y_train_pos), len(error_X)

((594, 24), 594, 594, 6)

In [26]:
#Save the training data
np.save('.cache/hist_data/X_train.npy',X_train)
np.save('.cache/hist_data/y_train_lap.npy',y_train_lap)
np.save('.cache/hist_data/y_train_pos.npy',y_train_pos)