In [59]:
import os
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [None]:
# CSV imports
base_path = '../Data/f1_dataset/'

races = pd.read_csv(base_path + 'races.csv')
drivers = pd.read_csv(base_path + 'drivers.csv')
qualifying = pd.read_csv(base_path + 'qualifying.csv')
circuits = pd.read_csv(base_path + 'circuits.csv')
constructor_results = pd.read_csv(base_path + 'constructor_results.csv')
constructor_standings = pd.read_csv(base_path + 'constructor_standings.csv')
constructors = pd.read_csv(base_path + 'constructors.csv')
driver_standings = pd.read_csv(base_path + 'driver_standings.csv')
lap_times = pd.read_csv(base_path + 'lap_times.csv')
pit_stops = pd.read_csv(base_path + 'pit_stops.csv')
results = pd.read_csv(base_path + 'results.csv')
seasons = pd.read_csv(base_path + 'seasons.csv')
sprint_results = pd.read_csv(base_path + 'sprint_results.csv')
status = pd.read_csv(base_path + 'status.csv')

In [3]:
# Function to convert the given string in mm:ss:sss format to seconds
def time_to_seconds(t):
    if pd.isna(t): return None
    try:
        m, s = t.split(':')
        return int(m) * 60 + float(s)
    except:
        return None

In [5]:
# Function to convert the given string in mm ss sss format to seconds
def time_to_seconds_1(t):
    if pd.isna(t): return None
    try:
        t = str(t).lower().strip()
        minutes = seconds = milliseconds = 0
        m = re.search(r"(\d+)m", t)
        s = re.search(r"(\d+)s", t)
        ms = re.search(r"(\d+)ms", t)
        if m: minutes = int(m.group(1))
        if s: seconds = int(s.group(1))
        if ms: milliseconds = int(ms.group(1))

        return minutes * 60 + seconds + milliseconds / 1000
    except:
        return None

In [6]:
# Function to convert the given time in seconds to mm:ss:sss format
def seconds_to_time_str(s):
    if s is None: return ''
    m = int(s // 60)
    sec = s % 60
    return f"{m:02}:{sec:06.3f}"

To make our data usefull, we have to sort them into driver - circuit combinations for every driver and circuit avaliable. 
We filter out races so that we are only considering races after the Hybrid Era i.e 2014. 
The fastest lap is the minimum of the Q1, Q2 and Q3 timings as it represents the **fastest raw speed acheivable with minimal interference**. 

We have to predict F1 lap times as a function of year — for a given driver and circuit combination. So, this is a **time series regression problem** where:
* Input:
  * Year (numerical, scalar)
  * Teammate’s lap time (continuous)
  * Constructor ID (categorical or encoded)
* Output:
  * Lap time (continuous value, in seconds)

The lap time trends are nonlinear (due technology changes, rule changes, driver form, etc.). So traditional regression (e.g., linear) might miss subtle patterns .

Neural networks are universal function approximators, making them ideal for modeling unknown and complex relationships in a small number of dimensions.

In [36]:
class F1LapTimePredictor(nn.Module):
    def __init__(self):
        super(F1LapTimePredictor, self).__init__()
        self.fc1 = nn.Linear(3, 8)  # input: year, teammate_time, constructorId
        self.prelu1 = nn.PReLU()
        self.fc2 = nn.Linear(8, 6)
        self.prelu2 = nn.PReLU()
        self.fc3 = nn.Linear(6, 1)
        self.tanh = nn.Tanh()

    def forward(self, x):
        x = self.prelu1(self.fc1(x))
        x = self.prelu2(self.fc2(x))
        x = self.tanh(self.fc3(x))
        return x


We keep the neural network small beacause for every year the **dataset per driver-circuit combination is small** (6-10 data points max). A deep or wide network would overfit immediately.

We make an interface (__getitem__) that returns a scaled input vector of length 3 and the scaled target lap time for any sample.

In [42]:
class F1Dataset(Dataset):
    def __init__(self, driver_circuit_file):
        data = pd.read_csv(driver_circuit_file)

        # Drop any rows with missing values
        data = data.dropna(subset=['year', 'lap_time_seconds', 'teammate_time', 'constructorId'])

        self.year_scaler = MinMaxScaler() #Scales years to the range [0, 1].
        self.teammate_scaler = MinMaxScaler() #Scales teammate lap times to [0, 1].
        self.constructor_scaler = MinMaxScaler() #Scales constructor IDs (numeric) to [0, 1]
        self.lap_time_scaler = MinMaxScaler(feature_range=(-1, 1)) #Scales the target lap times to [-1, 1] because your model's output layer uses tanh, which outputs in that range.

        years = data['year'].values.reshape(-1, 1).astype(np.float32)
        teammate_times = data['teammate_time'].values.reshape(-1, 1).astype(np.float32)
        constructor_ids = data['constructorId'].values.reshape(-1, 1).astype(np.float32)
        lap_times = data['lap_time_seconds'].values.reshape(-1, 1).astype(np.float32)

        self.scaled_years = self.year_scaler.fit_transform(years)
        self.scaled_teammates = self.teammate_scaler.fit_transform(teammate_times)
        self.scaled_constructors = self.constructor_scaler.fit_transform(constructor_ids)
        self.scaled_lap_times = self.lap_time_scaler.fit_transform(lap_times)

    def __len__(self):
        return len(self.scaled_years)

    def __getitem__(self, idx):
        x = np.array([
            self.scaled_years[idx][0],
            self.scaled_teammates[idx][0],
            self.scaled_constructors[idx][0]
        ], dtype=np.float32)

        y = self.scaled_lap_times[idx]
        return torch.tensor(x), torch.tensor(y)

 

In [67]:
#function trains a neural network model (F1LapTimePredictor) to predict F1 lap times based on features for a specific driver-circuit combination.
def train_model(driver_circuit_file, epochs=500, learning_rate=0.01, verbose_name=""):
    dataset = F1Dataset(driver_circuit_file)
    dataloader = DataLoader(dataset, batch_size=len(dataset), shuffle=True)

    #instance of model
    model = F1LapTimePredictor()
    criterion = nn.SmoothL1Loss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    model.train()
    for epoch in range(epochs):
        for features, lap_times in dataloader:  # `features` now contains [year, teammate_time, constructorId]
            outputs = model(features)
            loss = criterion(outputs, lap_times)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        # if (epoch + 1) % 100 == 0:
        #     print(f"[{verbose_name}] Epoch [{epoch + 1}/{epochs}], Loss: {loss.item():.6f}")

    # After training completes, returns the trained model and all the scalers used for each input feature and target.
    # Essential for prediction
    return (
        model,
        dataset.year_scaler,
        dataset.lap_time_scaler,
        dataset.teammate_scaler,
        dataset.constructor_scaler
    )



In [43]:
#This function takes a trained model and associated scalers, along with specific input values.
# Itpredicts the lap time in the original scale.

def predict_lap_time(model, year_scaler, lap_time_scaler, teammate_scaler, constructor_scaler,
                     year, teammate_time, constructor_id):
    model.eval()
    with torch.no_grad():
        # Normalize each input feature
        year_norm = year_scaler.transform(np.array([[year]]).astype(np.float32))
        teammate_norm = teammate_scaler.transform(np.array([[teammate_time]]).astype(np.float32))
        constructor_norm = constructor_scaler.transform(np.array([[constructor_id]]).astype(np.float32))

        # Combine into single input tensor
        input_features = np.hstack([year_norm, teammate_norm, constructor_norm])
        input_tensor = torch.tensor(input_features, dtype=torch.float32)

        # Predict and inverse transform the result
        predicted_norm = model(input_tensor)
        predicted = lap_time_scaler.inverse_transform(predicted_norm.numpy())
        
    return float(predicted.squeeze())


We train a separate model for each driver - circuit combination (using the train_model() above) and predict the lap times (usinh predict_lap_time() above). 

If we are predicting for a year which we already have data for we do the error analysis and see how much deviation the prediction has from the actual data. 

It Predicts lap times using year, teammate’s lap time, and constructor ID and saves it in a csv.

In [None]:
prediction_year = 2022
results = []
data_folder = '../Data/driver_circuit_data'

for file_name in os.listdir(data_folder):
    if file_name.endswith(".csv"):
        file_path = os.path.join(data_folder, file_name)
        try:
            model, year_scaler, lap_time_scaler, teammate_scaler, constructor_scaler = train_model(
                file_path,
                verbose_name=file_name.replace(".csv", "")
            )
            data = pd.read_csv(file_path)
            actual_row = data[data['year'] == prediction_year]
            if not actual_row.empty:
                teammate_time = actual_row['teammate_time'].values[0]
                constructor_id = actual_row['constructorId'].values[0]
                actual_time = actual_row['lap_time_seconds'].values[0]
                predicted_time = predict_lap_time(
                    model,
                    year_scaler,
                    lap_time_scaler,
                    teammate_scaler,
                    constructor_scaler,
                    prediction_year,
                    teammate_time,
                    constructor_id
                )
                deviation_pct = abs(predicted_time - actual_time) / actual_time * 100
            else:
                predicted_time = predict_lap_time(
                    model,
                    year_scaler,
                    lap_time_scaler,
                    teammate_scaler,
                    constructor_scaler,
                    prediction_year,
                    0.0,        # default value if missing
                    0           # default constructor ID if missing
                )
                actual_time = None
                deviation_pct = None

            base_name = file_name.replace(".csv", "")
            parts = base_name.split("_")
            driver = f"{parts[0]}_{parts[1]}"
            circuit = "_".join(parts[2:])

            results.append({
                "driver": driver,
                "circuit": circuit,
                f"predicted_lap_time_{prediction_year}": predicted_time,
                f"actual_lap_time_{prediction_year}": actual_time,
                "deviation_percent": deviation_pct
            })

        except Exception as e:
            print(f"Failed for {file_name}: {e}")

results_df = pd.DataFrame(results)
results_df.to_csv(f"../Data/predicted_vs_actual/predicted_vs_actual_{prediction_year}.csv", index=False)
print(f"{prediction_year} prediction complete. Results saved to predicted_vs_actual_{prediction_year}.csv")


2022 prediction complete. Results saved to predicted_vs_actual_2022.csv


so now i have to make predictions for lap times of every driver for every circuit. 

In [None]:
# Use this snippet to get the % deviation when the predicted year is in the training dataset.
df = pd.read_csv(f"..Data/predicted_vs_actual/predicted_vs_actual_{prediction_year}.csv")
df = df.dropna(subset=[f'actual_lap_time_{prediction_year}'])
df_sorted = df.sort_values(by='deviation_percent', ascending=False)
df_sorted[f'predicted_lap_time_{prediction_year}'] = df_sorted[f'predicted_lap_time_{prediction_year}'].round(3)
df_sorted[f'actual_lap_time_{prediction_year}'] = df_sorted[f'actual_lap_time_{prediction_year}'].round(3)
df_sorted['deviation_percent'] = df_sorted['deviation_percent'].round(2)

error_table = df_sorted[['driver', 'circuit', 
                         f'predicted_lap_time_{prediction_year}', 
                         f'actual_lap_time_{prediction_year}', 
                         'deviation_percent']]
display(error_table)

average_deviation = df_sorted['deviation_percent'].mean()
print(f"\n📊 Average Deviation Across All Predictions ({prediction_year}): {average_deviation:.2f}%")



  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


Unnamed: 0,driver,circuit,predicted_lap_time_2022,actual_lap_time_2022,deviation_percent
117,Nicholas_Latifi,Red_Bull_Ring,65.176,67.003,2.73
74,Lewis_Hamilton,Hungaroring,76.564,78.035,1.89
472,Lance_Stroll,Bahrain_International_Circuit,91.423,93.032,1.73
7,George_Russell,Yas_Marina_Circuit,83.214,84.511,1.54
550,Max_Verstappen,Circuit_de_Monaco,70.567,71.666,1.53
...,...,...,...,...,...
358,Guanyu_Zhou,Baku_City_Circuit,103.777,103.777,0.00
47,Sebastian_Vettel,Albert_Park_Grand_Prix_Circuit,,81.149,
206,Pierre_Gasly,Jeddah_Corniche_Circuit,,89.254,
221,Fernando_Alonso,Miami_International_Autodrome,,90.160,



📊 Average Deviation Across All Predictions (2022): 0.16%


In [110]:
# Use this snippet to get the % deviation when the predicted year is in the training dataset.
valid_rows = df[[f'predicted_lap_time_{prediction_year}', f'actual_lap_time_{prediction_year}']].dropna()
y_true = valid_rows[f'actual_lap_time_{prediction_year}']
y_pred = valid_rows[f'predicted_lap_time_{prediction_year}']

# Compute metrics
mae = mean_absolute_error(y_true, y_pred)
rmse = mean_squared_error(y_true, y_pred, squared=False)
r2 = r2_score(y_true, y_pred)
mape = (abs((y_pred - y_true) / y_true) * 100).mean()

# sMAPE function
def smape(y_true, y_pred):
    return 100 * np.mean(2 * np.abs(y_pred - y_true) / (np.abs(y_pred) + np.abs(y_true)))

symmetric_mape = smape(y_true.values, y_pred.values)

# Print metrics
print(f"📏 MAE: {mae:.3f} seconds")
print(f"📏 RMSE: {rmse:.3f} seconds")
print(f"📈 R² Score: {r2:.4f}")
print(f"📊 MAPE: {mape:.2f}%")
print(f"📊 sMAPE: {symmetric_mape:.2f}%")

📏 MAE: 0.137 seconds
📏 RMSE: 0.287 seconds
📈 R² Score: 0.9994
📊 MAPE: 0.16%
📊 sMAPE: 0.16%


In [None]:
# Use this snippet to get the % deviation for unseen data (eg: 2025)

pred_df = pd.read_csv("..Data/predicted_vs_actual/predicted_vs_actual_2025.csv")
actual_df = pd.read_csv("..Data/2025_data/2025_qualifying_data.csv")
if 'actual_lap_time_2025' in pred_df.columns:
    pred_df = pred_df.drop(columns=['actual_lap_time_2025'])
if 'deviation_percent' in pred_df.columns:
    pred_df = pred_df.drop(columns=['deviation_percent'])

actual_df['qualifying_time'] = actual_df['qualifying_time'].astype(str)
actual_df['actual_lap_time_2025'] = actual_df['qualifying_time'].apply(time_to_seconds_1)
actual_df = actual_df[actual_df['actual_lap_time_2025'].notna()]
actual_df = actual_df[actual_df['actual_lap_time_2025'] > 0]
actual_df = actual_df[~actual_df['circuit'].isin(['Bahrain_International_Circuit', 'Circuit_de_BarcelonaCatalunya'])]

actual_df.rename(columns={'driver_name': 'driver'}, inplace=True)
merged_df = pd.merge(
    pred_df,
    actual_df[['driver', 'circuit', 'actual_lap_time_2025']],
    on=['driver', 'circuit'],
    how='inner'
)
merged_df = merged_df[merged_df['predicted_lap_time_2025'].notna()]
merged_df = merged_df[merged_df['predicted_lap_time_2025'] > 0]
merged_df['deviation_percent'] = (
    abs(merged_df['predicted_lap_time_2025'] - merged_df['actual_lap_time_2025']) 
    / merged_df['actual_lap_time_2025'] * 100
)
merged_df['predicted_lap_time_2025'] = merged_df['predicted_lap_time_2025'].round(3)
merged_df['actual_lap_time_2025'] = merged_df['actual_lap_time_2025'].round(3)
merged_df['deviation_percent'] = merged_df['deviation_percent'].round(2)
display(merged_df[['driver', 'circuit', 'predicted_lap_time_2025', 'actual_lap_time_2025', 'deviation_percent']])
overall_avg = merged_df['deviation_percent'].mean()
per_circuit_avg = merged_df.groupby('circuit')['deviation_percent'].mean().sort_values(ascending=False)

print(f"\n📊 Overall Average Deviation for 2025 Predictions: {overall_avg:.2f}%")
print("\n📍 Average Deviation by Circuit:\n")
print(per_circuit_avg.round(2))




Unnamed: 0,driver,circuit,predicted_lap_time_2025,actual_lap_time_2025,deviation_percent
0,Max_Verstappen,Albert_Park_Grand_Prix_Circuit,75.915,75.481,0.57
1,Esteban_Ocon,Autodromo_Enzo_e_Dino_Ferrari,75.117,76.613,1.95
2,Pierre_Gasly,Shanghai_International_Circuit,92.930,91.992,1.02
3,Pierre_Gasly,Circuit_de_Monaco,70.896,71.994,1.53
4,Yuki_Tsunoda,Jeddah_Corniche_Circuit,89.939,87.990,2.22
...,...,...,...,...,...
66,Lewis_Hamilton,Autodromo_Enzo_e_Dino_Ferrari,73.706,75.765,2.72
67,Yuki_Tsunoda,Albert_Park_Grand_Prix_Circuit,79.424,75.670,4.96
68,Lance_Stroll,Shanghai_International_Circuit,93.986,91.773,2.41
69,Lance_Stroll,Autodromo_Enzo_e_Dino_Ferrari,75.138,75.497,0.48



📊 Overall Average Deviation for 2025 Predictions: 1.25%

📍 Average Deviation by Circuit:

circuit
Autodromo_Enzo_e_Dino_Ferrari     1.56
Suzuka_Circuit                    1.53
Miami_International_Autodrome     1.48
Albert_Park_Grand_Prix_Circuit    1.35
Shanghai_International_Circuit    1.34
Circuit_de_Monaco                 0.88
Jeddah_Corniche_Circuit           0.63
Name: deviation_percent, dtype: float64
