# F1 Race predictor

Testing a method to predict the F1 race winner

## Calling the packages

In [1]:
# F1 data points
import fastf1 as f1
import fastf1.plotting

In [2]:
# data analysis packages
import pandas as pd
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.options.display.float_format = '{:.6f}'.format
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt

In [3]:
# ml packages
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error

## Data Pre-Processing

### Previous Year's Data

In [4]:
# Load 2024 Bazilian GP race session
session_2024 = fastf1.get_session(2024, "Brazil", "R")
session_2024.load()

core           INFO 	Loading data for S√£o Paulo Grand Prix - Race [v3.5.3]
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info
req            INFO 	Using cached data for session_status_data
req            INFO 	Using cached data for lap_count
req            INFO 	Using cached data for track_status_data
req            INFO 	Using cached data for _extended_timing_data
req            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...
req            INFO 	Using cached data for car_data
req            INFO 	Using cached data for position_data
req            INFO 	Using cached data for weather_data
req            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 20 drivers: ['1', '31', '10', '63', '16', '4', '22', '81', '30', '44', '11', '50', '77', '14', '24', '55', '43', '23', '18', '27']


In [5]:
# Extract lap and sector times
laps_2024 = session_2024.laps[["Driver", "LapTime", "Sector1Time", "Sector2Time", "Sector3Time"]].copy()
laps_2024.dropna(inplace=True)

# Convert times to seconds
for col in ["LapTime", "Sector1Time", "Sector2Time", "Sector3Time"]:
    laps_2024[f"{col} (s)"] = laps_2024[col].dt.total_seconds()

# Aggregate sector data by driver
sector_times_2024 = laps_2024.groupby("Driver").agg({
    "Sector1Time (s)": "mean",
    "Sector2Time (s)": "mean",
    "Sector3Time (s)": "mean"
}).reset_index()

sector_times_2024["TotalSectorTime (s)"] = (
    sector_times_2024["Sector1Time (s)"] +
    sector_times_2024["Sector2Time (s)"] +
    sector_times_2024["Sector3Time (s)"]
)

### Current Year qualifying data

In [6]:
# 2025 Qualifying Data Brazil GP

qualifying_2025 = pd.DataFrame({
    "Driver": ["Lando Norris", "Andrea Kimi Antonelli", "Charles Leclerc", "Oscar Piastri", "Isack Hadjar",
               "George Russell", "Liam Lawson", "Oliver Bearman", "Pierre Gasly", "Nico Hulkenberg",
               "Fernando Alonso", "Alexander Albon", "Lewis Hamilton", "Lance Stroll", "Carlos Sainz Jr.",
               "Max Verstappen", "Esteban Ocon", "Franco Colapinto", "Yuki Tsunoda", "Gabriel Bortoleto"],
"QualifyingTime (s)": [69.511, 69.685, 69.805, 69.886, 69.931,
                       69.942, 69.962, 69.977, 70.002, 70.039,
                       70.001, 70.053, 70.100, 70.161, 70.472,
                       70.403, 70.438, 70.632, 70.711, 70.811]
})

# race = f1.get_session(2025, 'Brazil', 'Q')

# race.load(telemetry=False, weather=False)

# drivers = race.laps.pick_quicklaps().reset_index()[['Driver', 'LapTime']]
# drivers[f"LapTime (s)"] = drivers['LapTime'].dt.total_seconds()
# qualifying_2025 = drivers.groupby('Driver')[['LapTime (s)']].min().copy()

In [7]:
# Map full names to FastF1 3-letter codes
driver_mapping = {
    "Oscar Piastri": "PIA", "George Russell": "RUS", "Lando Norris": "NOR", "Max Verstappen": "VER",
    "Lewis Hamilton": "HAM", "Charles Leclerc": "LEC", "Isack Hadjar": "HAD", "Andrea Kimi Antonelli": "ANT",
    "Yuki Tsunoda": "TSU", "Alexander Albon": "ALB", "Esteban Ocon": "OCO", "Nico H√ºlkenberg": "HUL",
    "Fernando Alonso": "ALO", "Lance Stroll": "STR", "Carlos Sainz Jr.": "SAI", "Pierre Gasly": "GAS",
    "Oliver Bearman": "BEA", "Franco Colapinto": "COL", "Gabriel Bortoleto": "BOR", "Liam Lawson": "LAW"
}

In [8]:
qualifying_2025["Driver"] = qualifying_2025["Driver"].map(driver_mapping)


### Getting the Rain factor of each driver

In [9]:
# the factor can be calculated by taking the percentage of avg speed dueing a lap during rain vs the the lap in clear weather

In [10]:
# wet driver performance factor
driver_wet_performance = {
    "VER": 0.975196, "HAM": 0.976464, "LEC": 0.975862, "NOR": 0.978179, "ALO": 0.972655,
    "RUS": 0.968678, "SAI": 0.978754, "TSU": 0.996338, "OCO": 0.981810, "GAS": 0.978832, "STR": 0.979857
}
qualifying_2025["WetPerformanceFactor"] = qualifying_2025['Driver'].map(driver_wet_performance)

### Getting the Rain possibility and timings

In [11]:
rain_probability =  1 # as it is the Brazilian GP 
# temperature = forecast_data["main"]["temp"] if forecast_data else 20

# wet performance if chance is greater than 75% for rain
if rain_probability >= 0.75:
    qualifying_2025["QualifyingTime"] = qualifying_2025["QualifyingTime (s)"] / qualifying_2025["WetPerformanceFactor"]
else:
    qualifying_2025["QualifyingTime"] = qualifying_2025["QualifyingTime (s)"]

In [12]:
# Merge qualifying data with sector times
merged_data = qualifying_2025.merge(sector_times_2024[["Driver", "TotalSectorTime (s)"]], on="Driver", how="left")
merged_data["RainProbability"] = rain_probability
# merged_data["QualifyingTime"] = merged_data["QualifyingTime"] ** 2

## Training the model

In [13]:
# Define features
merge_data = merged_data[[
    "Driver", "QualifyingTime", "RainProbability", "TotalSectorTime (s)"
]].fillna(0).copy()

X = merge_data[[
    "QualifyingTime", "RainProbability", "TotalSectorTime (s)"
]].fillna(0)
y = laps_2024.groupby("Driver")["LapTime (s)"].mean().reindex(merge_data["Driver"])

clean_data = merge_data.copy()
clean_data["LapTime (s)"] = y.values
clean_data = clean_data.dropna(subset=["LapTime (s)"])

X = clean_data[[
    "QualifyingTime", "RainProbability", "TotalSectorTime (s)"
]]
y = clean_data["LapTime (s)"]

In [14]:
# Train Gradient Boosting Model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=39)
model = GradientBoostingRegressor(n_estimators=300, learning_rate=0.05,  max_depth=5, random_state=39)
model.fit(X_train, y_train)
clean_data["PredictedRaceTime (s)"] = model.predict(X)

## Predicting the winner and error check

In [15]:
# Predict race times using 2025 qualifying and sector data
final_results = clean_data.sort_values("PredictedRaceTime (s)")
print("Predicted 2025 Brazil GP Winner:")
print(final_results[["Driver", "PredictedRaceTime (s)"]])

Predicted 2025 Brazil GP Winner:
   Driver  PredictedRaceTime (s)
15    VER              87.882273
16    OCO              88.233121
8     GAS              88.297000
5     RUS              88.477091
0     NOR              88.550742
2     LEC              88.550742
3     PIA              88.726530
18    TSU              88.726530
12    HAM              88.982530
6     LAW              89.025939
7     BEA              89.363364
10    ALO              89.363364
17    COL              89.942000
14    SAI              90.350971


In [16]:
# Evaluate Model
y_pred = model.predict(X_test)
print(f"\nüîç Model Error (MAE): {mean_absolute_error(y_test, y_pred):.2f} seconds")


üîç Model Error (MAE): 0.08 seconds
