In [367]:
import numpy as np
import pandas as pd
import fastf1
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.metrics import mean_absolute_error
import os

In [368]:
# os.mkdir('f1_cache')

In [369]:
# enable cache
fastf1.Cache.enable_cache('f1_cache')

# loading 2024 Japan GP Race data
session_2024 = fastf1.get_session(2024, 'Japan', 'R')
session_2024.load()

laps_2024 = session_2024.laps[['Driver', 'LapTime', 'Sector1Time', 'Sector2Time', 'Sector3Time']].copy()

core           INFO 	Loading data for Japanese Grand Prix - Race [v3.5.3]
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info
req            INFO 	Using cached data for session_status_data
req            INFO 	Using cached data for lap_count
req            INFO 	Using cached data for track_status_data
req            INFO 	Using cached data for _extended_timing_data
req            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...
req            INFO 	Using cached data for car_data
req            INFO 	Using cached data for position_data
req            INFO 	Using cached data for weather_data
req            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 20 drivers: ['1', '11', '55', '16', '4', '14', '63', '81', '44', '22', '27', '18', '20', '77', '31', '10', '2', '24', '3', '23']


In [370]:
# loading 2025 Japan Qualifying data
session_2025 = fastf1.get_session(2025, 'Japan', 'Q')
session_2025.load()

quali_2025 = session_2025.laps[['Driver', 'LapTime', 'Sector1Time', 'Sector2Time', 'Sector3Time']]
quali_2025.columns = ['Driver', 'quali_LapTime_2025', 'quali_Sector1Time_2025', 'quali_Sector2Time_2025', 'quali_Sector3Time_2025']

core           INFO 	Loading data for Japanese Grand Prix - Qualifying [v3.5.3]
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info
req            INFO 	Using cached data for session_status_data
req            INFO 	Using cached data for track_status_data
req            INFO 	Using cached data for _extended_timing_data
req            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...
req            INFO 	Using cached data for car_data
req            INFO 	Using cached data for position_data
req            INFO 	Using cached data for weather_data
req            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 20 drivers: ['1', '4', '81', '16', '63', '12', '6', '44', '23', '87', '10', '55', '14', '30', '22', '27', '5', '31', '7', '18']


In [371]:
# converting all the times into seconds
def total_seconds(df,columns):
    for cols in columns:
        df[cols] = pd.to_timedelta(df[cols])
        df[cols] = df[cols].dt.total_seconds()

total_seconds(laps_2024, ['LapTime', 'Sector1Time', 'Sector2Time', 'Sector3Time'])
total_seconds(quali_2025, ['quali_LapTime_2025', 'quali_Sector1Time_2025', 'quali_Sector2Time_2025', 'quali_Sector3Time_2025'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[cols] = pd.to_timedelta(df[cols])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[cols] = df[cols].dt.total_seconds()


In [372]:
laps_2024 = laps_2024.groupby('Driver')[['LapTime', 'Sector1Time', 'Sector2Time', 'Sector3Time']].mean().reset_index()
quali_2025 = quali_2025.groupby('Driver')[['quali_LapTime_2025', 'quali_Sector1Time_2025', 'quali_Sector2Time_2025', 'quali_Sector3Time_2025']].min().reset_index()

In [373]:
# we will also include the 2024 Canadian Grand Prix average lap times as the race was in wet condition which will be helpful in making predictions for this season Japan Grand Prix as there is 90% chance of rain

In [374]:
canadian_gp_2024 = fastf1.get_session(2024, 'Canada', 'R')
canadian_gp_2024.load()
wet_conditions = canadian_gp_2024.laps[['Driver', 'LapTime']]
wet_conditions.columns = ['Driver', 'Wet_LapTime']
total_seconds(wet_conditions, ['Wet_LapTime'])
wet_conditions = wet_conditions.groupby('Driver')[['Wet_LapTime']].mean().reset_index()

core           INFO 	Loading data for Canadian Grand Prix - Race [v3.5.3]
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info


req            INFO 	Using cached data for session_status_data
req            INFO 	Using cached data for lap_count
req            INFO 	Using cached data for track_status_data
req            INFO 	Using cached data for _extended_timing_data
req            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...
req            INFO 	Using cached data for car_data
req            INFO 	Using cached data for position_data
req            INFO 	Using cached data for weather_data
req            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 20 drivers: ['1', '4', '63', '44', '81', '14', '18', '3', '10', '31', '27', '20', '77', '22', '24', '55', '23', '11', '16', '2']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-vi

In [375]:
laps_2024.sample(5)

Unnamed: 0,Driver,LapTime,Sector1Time,Sector2Time,Sector3Time
8,NOR,98.261392,35.65402,43.851736,19.364226
1,ALO,98.639314,35.8362,43.799377,19.556226
13,RUS,98.978902,36.855098,43.947755,19.754679
10,PER,97.903706,35.45464,43.726057,19.352264
0,ALB,,,,


In [376]:
quali_2025.sample(5)

Unnamed: 0,Driver,quali_LapTime_2025,quali_Sector1Time_2025,quali_Sector2Time_2025,quali_Sector3Time_2025
17,STR,89.271,31.034,40.483,17.754
6,GAS,87.822,30.593,39.84,17.389
5,DOO,88.877,30.949,40.186,17.608
15,RUS,87.318,30.376,39.56,17.281
0,ALB,87.615,30.604,39.581,17.399


In [377]:
wet_conditions.sample(5)

Unnamed: 0,Driver,Wet_LapTime
13,RUS,90.746343
1,ALO,90.934814
2,BOT,91.263057
3,GAS,91.113543
12,RIC,91.094271


In [378]:
merged_df = laps_2024.merge(quali_2025, on='Driver', how='left')
merged_df = merged_df.merge(wet_conditions, on='Driver', how='left')

In [379]:
# stimulating target variable assuming that the wet laps are 10% slower
merged_df['assumed_japan_LapTime'] = merged_df['quali_LapTime_2025'] * 1.1

In [380]:
merged_df.dropna(subset=['quali_LapTime_2025'], inplace= True)

In [381]:
X = merged_df.drop(['assumed_japan_LapTime', 'Driver'], axis= 1).fillna(0)
y = merged_df['assumed_japan_LapTime'].fillna(0)

In [382]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [383]:
model = GradientBoostingRegressor(n_estimators=1000, learning_rate=0.01)
model.fit(X_train, y_train)
preds = model.predict(X_test)

# Checking for the mean absolute error
print(f'The MAE is : {mean_absolute_error(y_test,preds)}')

The MAE is : 0.32400728714709714


In [385]:
# train on whole dataset for predictions
model.fit(X, y)
preds = model.predict(X)

merged_df['predicted_japan_LapTime'] = preds

print('The predicted Japan Grand Prix Lap Times areüèéÔ∏è‚è∞:')
merged_df.sort_values(by= 'predicted_japan_LapTime')[['Driver', 'predicted_japan_LapTime']]

The predicted Japan Grand Prix Lap Times areüèéÔ∏è‚è∞:


Unnamed: 0,Driver,predicted_japan_LapTime
18,VER,95.681362
8,NOR,95.694544
11,PIA,95.729732
6,LEC,96.028926
13,RUS,96.04982
4,HAM,96.371013
0,ALB,96.376506
3,GAS,96.604198
14,SAI,96.619594
1,ALO,96.686689
