In [17]:
# 📘 F1 Prediction Project (2025 Edition)
# FastF1 + XGBoost + SHAP + LSTM (PyTorch) with full normalization

# ----------------------------------------
# 🔹 PART 1: SETUP & DEPENDENCIES
# ----------------------------------------
!pip install fastf1 xgboost shap matplotlib scikit-learn pandas --quiet

# Imports
import os
import fastf1
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import shap
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from sklearn.preprocessing import MinMaxScaler

# ----------------------------------------
# 🔹 PART 2: ENABLE CACHE & LOAD DATA
# ----------------------------------------
os.makedirs('/content/f1_cache', exist_ok=True)
fastf1.Cache.enable_cache('/content/f1_cache')

session = fastf1.get_session(2023, 'Monaco', 'R')
session.load()

laps = session.laps.pick_quicklaps().copy()
print("Sample lap data:")
print(laps[['Driver', 'LapTime', 'Compound', 'TyreLife', 'TrackStatus']].head())

# ----------------------------------------
# 🔹 PART 3: FEATURE ENGINEERING
# ----------------------------------------
columns = ['SpeedI1', 'SpeedI2', 'SpeedFL', 'SpeedST']
sequence_length = 10

laps = laps[['Driver', 'LapNumber', 'LapTime'] + columns].dropna()
laps['LapTimeSec'] = laps['LapTime'].dt.total_seconds()

# Normalize inputs
feature_scaler = MinMaxScaler()
laps[columns] = feature_scaler.fit_transform(laps[columns])

# Normalize labels
label_scaler = MinMaxScaler()
laps['LapTimeSec'] = label_scaler.fit_transform(laps[['LapTimeSec']])

sequences = []
labels = []

for driver in laps['Driver'].unique():
    driver_laps = laps[laps['Driver'] == driver].sort_values('LapNumber')
    for i in range(len(driver_laps) - sequence_length):
        seq = driver_laps.iloc[i:i+sequence_length][columns].values
        label = driver_laps.iloc[i+sequence_length]['LapTimeSec']
        sequences.append(seq)
        labels.append(label)

X = torch.tensor(np.array(sequences), dtype=torch.float32)
y = torch.tensor(labels, dtype=torch.float32).unsqueeze(1)

# Split into train and test
split_idx = int(0.8 * len(X))
X_train, X_test = X[:split_idx], X[split_idx:]
y_train, y_test = y[:split_idx], y[split_idx:]

# ----------------------------------------
# 🔹 PART 4: LSTM MODEL
# ----------------------------------------
class LapTimeDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y
    def __len__(self):
        return len(self.X)
    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

train_loader = DataLoader(LapTimeDataset(X_train, y_train), batch_size=32, shuffle=True)
test_loader = DataLoader(LapTimeDataset(X_test, y_test), batch_size=32, shuffle=False)

class LapPredictor(nn.Module):
    def __init__(self, input_size):
        super().__init__()
        self.norm = nn.BatchNorm1d(input_size)
        self.lstm = nn.LSTM(input_size, hidden_size=64, batch_first=True)
        self.head = nn.Sequential(
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 1)
        )

    def forward(self, x):
        x = self.norm(x.transpose(1, 2)).transpose(1, 2)
        _, (h_n, _) = self.lstm(x)
        return self.head(h_n[-1])

model = LapPredictor(input_size=X.shape[2])
loss_fn = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

# ----------------------------------------
# 🔹 PART 5: TRAINING
# ----------------------------------------
model.train()
for epoch in range(50):
    total_loss = 0
    for xb, yb in train_loader:
        optimizer.zero_grad()
        preds = model(xb)
        loss = loss_fn(preds, yb)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    if (epoch + 1) % 5 == 0:
        print(f"Epoch {epoch+1}, Loss: {total_loss / len(train_loader):.4f}")

# ----------------------------------------
# 🔹 PART 6: EVALUATION
# ----------------------------------------
model.eval()
preds = []
actuals = []

with torch.no_grad():
    for xb, yb in test_loader:
        pred = model(xb)
        preds.extend(pred.squeeze().tolist())
        actuals.extend(yb.squeeze().tolist())

# Inverse transform predictions
preds = label_scaler.inverse_transform(np.array(preds).reshape(-1, 1)).flatten()
actuals = label_scaler.inverse_transform(np.array(actuals).reshape(-1, 1)).flatten()

print("\nSample predictions vs actuals:")
for i in range(5):
    print(f"Predicted: {preds[i]:.2f}s | Actual: {actuals[i]:.2f}s")


[0m

core           INFO 	Loading data for Monaco Grand Prix - Race [v3.5.3]
INFO:fastf1.fastf1.core:Loading data for Monaco Grand Prix - Race [v3.5.3]
req            INFO 	Using cached data for session_info
INFO:fastf1.fastf1.req:Using cached data for session_info
req            INFO 	Using cached data for driver_info
INFO:fastf1.fastf1.req:Using cached data for driver_info
DEBUG:fastf1.ergast:Failed to parse timestamp '-1:53:44.819' in Ergastresponse.
req            INFO 	Using cached data for session_status_data
INFO:fastf1.fastf1.req:Using cached data for session_status_data
req            INFO 	Using cached data for lap_count
INFO:fastf1.fastf1.req:Using cached data for lap_count
req            INFO 	Using cached data for track_status_data
INFO:fastf1.fastf1.req:Using cached data for track_status_data
req            INFO 	Using cached data for _extended_timing_data
INFO:fastf1.fastf1.req:Using cached data for _extended_timing_data
req            INFO 	Using cached data for timing_app_d

Sample lap data:
  Driver                LapTime Compound  TyreLife TrackStatus
1    VER 0 days 00:01:19.367000   MEDIUM       2.0           1
2    VER 0 days 00:01:19.074000   MEDIUM       3.0           1
3    VER 0 days 00:01:18.129000   MEDIUM       4.0           1
4    VER 0 days 00:01:18.019000   MEDIUM       5.0           1
5    VER 0 days 00:01:17.640000   MEDIUM       6.0           1
Epoch 5, Loss: 0.2212
Epoch 10, Loss: 0.0645
Epoch 15, Loss: 0.0281
Epoch 20, Loss: 0.0279
Epoch 25, Loss: 0.0271
Epoch 30, Loss: 0.0248
Epoch 35, Loss: 0.0249
Epoch 40, Loss: 0.0244
Epoch 45, Loss: 0.0238
Epoch 50, Loss: 0.0230

Sample predictions vs actuals:
Predicted: 77.97s | Actual: 78.24s
Predicted: 77.96s | Actual: 78.36s
Predicted: 78.01s | Actual: 78.00s
Predicted: 78.04s | Actual: 78.03s
Predicted: 77.95s | Actual: 78.01s
