# Lap Time Prediction Model

This notebook builds a regression model to predict F1 lap times.

## Model Overview
- **Task**: Regression (predict lap time in seconds)
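- **Features**: Tire age, tire compound, estimated fuel load, lap number, driver/team/circuit encodings, previous lap time, and rolling average lap time
- **Algorithms**: LightGBM and XGBoost, plus a simple average of their predictions (no neural network is trained in this notebook)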
- **Target MAE**: < 0.5 seconds

## Setup

In [None]:
import os
import sys
from pathlib import Path

sys.path.insert(0, str(Path.cwd().parent / 'src'))

import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
from tqdm.notebook import tqdm
import joblib

# ML Libraries
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

import xgboost as xgb
import lightgbm as lgb

import fastf1
from fastf1 import get_session, get_event_schedule

# Plot appearance defaults applied to the figures created below.
plt.style.use('seaborn-v0_8-whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

# One seed for NumPy's global RNG; also passed as `random_state` to the models.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

# FastF1 cache location; the directory must exist before the cache is enabled.
CACHE_DIR = Path('../data/cache')
CACHE_DIR.mkdir(exist_ok=True, parents=True)
fastf1.Cache.enable_cache(str(CACHE_DIR))

print("Setup complete!")

## 1. Load Lap Time Data

In [None]:
def load_race_laps(year: int, grand_prix: str) -> pd.DataFrame:
    """Fetch the race-session laps of one event and tag them with metadata.

    A copy of the session's lap table is returned with three extra columns:
    ``Season`` (``year``), ``GrandPrix`` (``grand_prix``) and ``Team``, which
    is looked up from the session results by driver abbreviation.

    Any exception raised while loading is reported with ``print`` and an
    empty ``DataFrame`` is returned instead, so a caller iterating over many
    events can simply skip the ones that fail.
    """
    try:
        race = get_session(year, grand_prix, 'R')
        race.load()

        lap_table = race.laps.copy()
        lap_table['Season'] = year
        lap_table['GrandPrix'] = grand_prix

        # Driver abbreviation -> team name, taken from the results table.
        team_by_driver = race.results.set_index('Abbreviation')['TeamName'].to_dict()
        lap_table['Team'] = lap_table['Driver'].map(team_by_driver)
    except Exception as exc:
        print(f"Error loading {year} {grand_prix}: {exc}")
        return pd.DataFrame()
    return lap_table

In [None]:
# Load lap data for every non-testing event of the selected seasons
SEASONS = [2023]

all_laps = []

for year in SEASONS:
    schedule = get_event_schedule(year)
    # Events whose format is 'testing' are excluded from the race list.
    race_events = schedule[schedule['EventFormat'] != 'testing']

    for _, event in tqdm(race_events.iterrows(), total=len(race_events), desc=f"Loading {year}"):
        laps = load_race_laps(year, event['EventName'])
        # load_race_laps returns an empty frame when an event fails to load.
        if len(laps) > 0:
            laps['Round'] = event['RoundNumber']
            all_laps.append(laps)

# pd.concat([]) fails with an opaque "No objects to concatenate"; raise the
# same exception type with a message that names the actual problem.
if not all_laps:
    raise ValueError(
        f"No lap data could be loaded for seasons {SEASONS}; "
        "see the 'Error loading ...' messages above."
    )

df = pd.concat(all_laps, ignore_index=True)
print(f"\nTotal laps loaded: {len(df)}")

## 2. Data Preprocessing

In [None]:
# Lap and sector durations are timedeltas; derive float-second columns from them.
df['LapTimeSeconds'] = df['LapTime'].dt.total_seconds()

# Only convert the sector columns that are actually present.
sector_columns = [
    name for name in ('Sector1Time', 'Sector2Time', 'Sector3Time')
    if name in df.columns
]
for sector in sector_columns:
    df[f'{sector}Seconds'] = df[sector].dt.total_seconds()

# Quick sanity check of the converted lap times
lap_seconds = df['LapTimeSeconds']
print(f"Lap time range: {lap_seconds.min():.2f}s - {lap_seconds.max():.2f}s")
print(f"Mean lap time: {lap_seconds.mean():.2f}s")

In [None]:
# Filter valid laps
def filter_valid_laps(
    df: pd.DataFrame,
    lower_ratio: float = 0.9,
    upper_ratio: float = 1.3,
) -> pd.DataFrame:
    """Keep only representative racing laps.

    The following laps are removed from a copy of ``df``:

    * pit-in and pit-out laps (``PitInTime`` / ``PitOutTime`` set),
    * the first lap of the race (``LapNumber`` <= 1),
    * laps whose ``LapTimeSeconds`` is not strictly between
      ``lower_ratio`` and ``upper_ratio`` times the median lap time of the
      same race (laps with a missing lap time are dropped by this check).

    There is no explicit safety-car filter; slow laps are only removed when
    they fall outside the median band.

    Args:
        df: Lap table with ``PitInTime``, ``PitOutTime``, ``LapNumber``,
            ``GrandPrix`` and ``LapTimeSeconds`` columns. When a ``Season``
            column is present, the median is computed per season and race.
        lower_ratio: Lower bound of the accepted band, as a fraction of the
            race median.
        upper_ratio: Upper bound of the accepted band, as a fraction of the
            race median.

    Returns:
        A new DataFrame containing only the retained laps; ``df`` itself is
        not modified.
    """
    valid = df.copy()

    # Remove pit in/out laps
    on_track = valid['PitInTime'].isna() & valid['PitOutTime'].isna()
    valid = valid[on_track]

    # Remove first lap (typically slow)
    valid = valid[valid['LapNumber'] > 1]

    # The same race name recurs every season, so include the season in the
    # grouping key when it is available.
    race_keys = ['Season', 'GrandPrix'] if 'Season' in valid.columns else ['GrandPrix']
    median_time = valid.groupby(race_keys)['LapTimeSeconds'].transform('median')

    # Build both bounds against the same frame: pandas refuses to compare
    # Series whose indexes differ, so filtering in two steps against a
    # median computed before the first step raises ValueError.
    lap_time = valid['LapTimeSeconds']
    within_band = (lap_time > median_time * lower_ratio) & (lap_time < median_time * upper_ratio)

    return valid[within_band]

# Apply the filter and report the share of laps that survived it.
clean_df = filter_valid_laps(df)
retained_pct = len(clean_df) / len(df) * 100
print(f"Valid laps: {len(clean_df)} ({retained_pct:.1f}%)")

## 3. Feature Engineering

In [None]:
def _encode_labels(values: pd.Series) -> np.ndarray:
    """Return integer codes ranking each label among the sorted unique labels.

    Missing labels are treated as the literal ``'Unknown'``. The codes are
    only consistent within a single call.
    """
    codes, _ = pd.factorize(values.fillna('Unknown'), sort=True)
    return codes


def engineer_lap_features(df: pd.DataFrame) -> pd.DataFrame:
    """Create features for lap time prediction.

    Works on a copy of ``df`` and returns it sorted by driver, race and lap
    number, with these columns added:

    * ``CompoundEncoded`` - integer code of the tyre compound; compounds not
      in the mapping (or missing) get 1.
    * ``StintChange`` / ``StintNumber`` / ``TireAge`` - only the first one is
      a flag; a stint starts at a driver's first lap of a race and whenever
      ``Compound`` differs from that driver's previous retained lap in the
      same race. ``TireAge`` counts the rows of ``df`` within the stint,
      starting at 1. Without a ``Compound`` column, ``TireAge`` falls back to
      ``LapNumber`` and the stint columns are not created.
    * ``FuelLoadEstimate`` / ``NormalizedLap`` - ``1 - lap / max_lap`` and
      ``lap / max_lap``, where ``max_lap`` is the highest ``LapNumber``
      present for that race.
    * ``DriverEncoded`` / ``TeamEncoded`` / ``CircuitEncoded`` - integer codes
      assigned by sorted label order (missing labels become ``'Unknown'``).
    * ``PrevLapTime`` / ``LapTimeDelta`` / ``RollingAvgLap`` - previous
      retained lap time, difference to it, and the mean of up to five
      preceding lap times, all per driver and race.

    Args:
        df: Lap table with ``Driver``, ``Team``, ``GrandPrix``, ``LapNumber``
            and ``LapTimeSeconds`` columns. ``Compound``, ``Position`` and
            ``Season`` are used when present; with ``Season``, a race is
            identified by season and Grand Prix name.

    Returns:
        A new DataFrame with the feature columns added.
    """
    df = df.copy()

    # The same Grand Prix name recurs every season, so a race is identified
    # by (Season, GrandPrix) whenever the season is known.
    race_keys = ['Season', 'GrandPrix'] if 'Season' in df.columns else ['GrandPrix']
    driver_race_keys = race_keys + ['Driver']

    # Tire features
    if 'Compound' in df.columns:
        # Encode compound
        compound_map = {'SOFT': 0, 'MEDIUM': 1, 'HARD': 2, 'INTERMEDIATE': 3, 'WET': 4}
        df['CompoundEncoded'] = df['Compound'].map(compound_map).fillna(1)

        # Identify stints. The comparison with the previous lap must stay
        # inside one driver's race: sorting without the race and shifting
        # over the whole frame interleaves laps of different races and
        # reports a compound change on almost every row.
        df = df.sort_values(driver_race_keys + ['LapNumber'])
        previous_compound = df.groupby(driver_race_keys)['Compound'].shift()
        # The first lap of each group has no predecessor (NaN), so it is
        # always flagged as the start of a stint.
        df['StintChange'] = df['Compound'] != previous_compound
        stint_starts = df['StintChange'].astype('int64')
        df['StintNumber'] = stint_starts.groupby(
            [df[key] for key in driver_race_keys]
        ).cumsum()

        # Tire age within stint (counts the rows present in df, so laps that
        # were filtered out beforehand are not included).
        df['TireAge'] = df.groupby(driver_race_keys + ['StintNumber']).cumcount() + 1
    else:
        df['CompoundEncoded'] = 1
        df['TireAge'] = df['LapNumber']

    # Fuel load estimate (decreasing through race)
    max_laps = df.groupby(race_keys)['LapNumber'].transform('max')
    df['FuelLoadEstimate'] = 1 - (df['LapNumber'] / max_laps)

    # Normalize lap number within race
    df['NormalizedLap'] = df['LapNumber'] / max_laps

    # Position features
    if 'Position' in df.columns:
        df['Position'] = pd.to_numeric(df['Position'], errors='coerce')

    # Categorical encodings
    df['DriverEncoded'] = _encode_labels(df['Driver'])
    df['TeamEncoded'] = _encode_labels(df['Team'])
    df['CircuitEncoded'] = _encode_labels(df['GrandPrix'])

    # Previous lap features (for capturing momentum)
    df = df.sort_values(['Driver'] + race_keys + ['LapNumber'])
    df['PrevLapTime'] = df.groupby(driver_race_keys)['LapTimeSeconds'].shift(1)
    # NOTE: LapTimeDelta is derived from the current lap time (the target),
    # so it must not be used as a model input.
    df['LapTimeDelta'] = df['LapTimeSeconds'] - df['PrevLapTime']

    # Rolling average lap time (driver performance indicator); shift(1)
    # keeps the current lap out of its own average.
    df['RollingAvgLap'] = df.groupby(driver_race_keys)['LapTimeSeconds'].transform(
        lambda x: x.shift(1).rolling(5, min_periods=1).mean()
    )

    return df

In [None]:
# Build the feature table from the cleaned laps
featured_df = engineer_lap_features(clean_df)

# List the dtype of each modelling feature that made it into the table
print("Feature columns:")
feature_cols = [
    'LapNumber', 'TireAge', 'CompoundEncoded', 'FuelLoadEstimate',
    'NormalizedLap', 'DriverEncoded', 'TeamEncoded', 'CircuitEncoded',
    'PrevLapTime', 'RollingAvgLap'
]
present_features = [name for name in feature_cols if name in featured_df.columns]
for col in present_features:
    print(f"  {col}: {featured_df[col].dtype}")

In [None]:
# Scatter lap time against tire age for a few drivers of one race
fig, ax = plt.subplots(figsize=(14, 6))

# Use the first race that appears in the feature table as the sample
first_race = featured_df['GrandPrix'].unique()[0]
sample_race = featured_df[featured_df['GrandPrix'] == first_race]

# Limit the plot to the first four drivers to keep it readable
drivers_to_plot = sample_race['Driver'].unique()[:4]
for driver in drivers_to_plot:
    driver_data = sample_race.loc[sample_race['Driver'] == driver]
    ax.scatter(
        driver_data['TireAge'],
        driver_data['LapTimeSeconds'],
        label=driver,
        alpha=0.6,
        s=30,
    )

ax.set_xlabel('Tire Age (Laps)')
ax.set_ylabel('Lap Time (seconds)')
ax.set_title(f'Lap Time vs Tire Age - {sample_race["GrandPrix"].iloc[0]}')
ax.legend()
plt.tight_layout()
plt.show()

## 4. Prepare Training Data

In [None]:
# Model inputs, in the column order used for training and prediction
FEATURE_COLS = [
    'LapNumber', 'TireAge', 'CompoundEncoded', 'FuelLoadEstimate', 'NormalizedLap',
    'DriverEncoded', 'TeamEncoded', 'CircuitEncoded',
    'PrevLapTime', 'RollingAvgLap',
]

# Regression target
TARGET_COL = 'LapTimeSeconds'

# Keep only rows where every feature and the target are present
required_columns = [*FEATURE_COLS, TARGET_COL]
model_df = featured_df.dropna(subset=required_columns)
print(f"Training samples: {len(model_df)}")

In [None]:
# Split data by round for temporal validation: train on the earlier rounds
# and hold out the final N_TEST_ROUNDS rounds.
N_TEST_ROUNDS = 3

# unique() returns rounds in row order, and the feature table is sorted by
# driver and Grand Prix name rather than by date, so the rounds have to be
# sorted before slicing to get a chronological hold-out.
# NOTE: round numbers repeat every season; with more than one season in
# SEASONS the split should be made on (Season, Round) instead.
ordered_rounds = np.sort(model_df['Round'].unique())
if len(ordered_rounds) <= N_TEST_ROUNDS:
    raise ValueError(
        f"Need more than {N_TEST_ROUNDS} rounds for a train/test split, "
        f"got {len(ordered_rounds)}"
    )

train_rounds = ordered_rounds[:-N_TEST_ROUNDS]  # All but last 3 races
test_rounds = ordered_rounds[-N_TEST_ROUNDS:]   # Last 3 races

train_df = model_df[model_df['Round'].isin(train_rounds)]
test_df = model_df[model_df['Round'].isin(test_rounds)]

X_train = train_df[FEATURE_COLS]
y_train = train_df[TARGET_COL]

X_test = test_df[FEATURE_COLS]
y_test = test_df[TARGET_COL]

print(f"Train: {len(X_train)} samples (Rounds: {list(train_rounds[:3])}...{list(train_rounds[-3:])})")
print(f"Test: {len(X_test)} samples (Rounds: {list(test_rounds)})")

## 5. Train Models

In [None]:
def evaluate_regression(model, X_test, y_test, model_name: str):
    """Score a fitted regressor on held-out data and print a short report.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_test: Feature matrix passed to ``model.predict``.
        y_test: True target values for ``X_test``.
        model_name: Label used in the printed report.

    Returns:
        A ``(metrics, predictions)`` tuple where ``metrics`` is a dict with
        the keys ``'mae'``, ``'rmse'`` and ``'r2'``.
    """
    predictions = model.predict(X_test)

    scores = {
        'mae': mean_absolute_error(y_test, predictions),
        # RMSE is the square root of the mean squared error
        'rmse': np.sqrt(mean_squared_error(y_test, predictions)),
        'r2': r2_score(y_test, predictions),
    }

    print(f"\n{model_name} Results:")
    print(f"  MAE: {scores['mae']:.3f} seconds")
    print(f"  RMSE: {scores['rmse']:.3f} seconds")
    print(f"  R²: {scores['r2']:.4f}")

    return scores, predictions

In [None]:
# Model 1: LightGBM
print("Training LightGBM...")

# Hyperparameters are collected in one dict so they are easy to inspect
lgb_params = {
    'n_estimators': 500,
    'max_depth': 8,
    'learning_rate': 0.05,
    'num_leaves': 63,
    'min_child_samples': 20,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'random_state': RANDOM_SEED,
    'verbose': -1,
}
lgb_model = lgb.LGBMRegressor(**lgb_params)

lgb_model.fit(X_train, y_train)
lgb_metrics, lgb_pred = evaluate_regression(lgb_model, X_test, y_test, "LightGBM")

In [None]:
# Model 2: XGBoost
print("Training XGBoost...")

# Hyperparameters are collected in one dict so they are easy to inspect
xgb_params = {
    'n_estimators': 500,
    'max_depth': 8,
    'learning_rate': 0.05,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'min_child_weight': 5,
    'random_state': RANDOM_SEED,
}
xgb_model = xgb.XGBRegressor(**xgb_params)

xgb_model.fit(X_train, y_train)
xgb_metrics, xgb_pred = evaluate_regression(xgb_model, X_test, y_test, "XGBoost")

In [None]:
# Ensemble: unweighted mean of the LightGBM and XGBoost predictions
ensemble_pred = 0.5 * (lgb_pred + xgb_pred)

# Same three metrics as reported for the individual models
ensemble_mae = mean_absolute_error(y_test, ensemble_pred)
ensemble_mse = mean_squared_error(y_test, ensemble_pred)
ensemble_rmse = np.sqrt(ensemble_mse)
ensemble_r2 = r2_score(y_test, ensemble_pred)

print(f"\nEnsemble Results:")
print(f"  MAE: {ensemble_mae:.3f} seconds")
print(f"  RMSE: {ensemble_rmse:.3f} seconds")
print(f"  R²: {ensemble_r2:.4f}")

## 6. Model Analysis

In [None]:
# Rank the features by the importance reported by the LightGBM model
importance_table = pd.DataFrame({
    'Feature': FEATURE_COLS,
    'Importance': lgb_model.feature_importances_
})
importance_df = importance_table.sort_values('Importance', ascending=False)

# Horizontal bar chart, most important feature at the top
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=importance_df, x='Importance', y='Feature', ax=ax, palette='viridis')
ax.set_title('LightGBM Feature Importance - Lap Time Prediction')
plt.tight_layout()
plt.show()

In [None]:
# Residuals of the LightGBM model on the test set (actual minus predicted)
errors = y_test.values - lgb_pred

fig, axes = plt.subplots(1, 2, figsize=(14, 5))
hist_ax, scatter_ax = axes[0], axes[1]

# Left panel: histogram of the residuals with a zero-error marker
hist_ax.hist(errors, bins=50, edgecolor='black', alpha=0.7)
hist_ax.axvline(x=0, color='red', linestyle='--', label='Zero Error')
hist_ax.set_xlabel('Prediction Error (seconds)')
hist_ax.set_ylabel('Frequency')
hist_ax.set_title('Prediction Error Distribution')
hist_ax.legend()

# Right panel: predicted against actual, with the identity line as reference
scatter_ax.scatter(y_test, lgb_pred, alpha=0.3, s=10)
min_val = min(y_test.min(), lgb_pred.min())
max_val = max(y_test.max(), lgb_pred.max())
diagonal = [min_val, max_val]
scatter_ax.plot(diagonal, diagonal, 'r--', label='Perfect Prediction')
scatter_ax.set_xlabel('Actual Lap Time (seconds)')
scatter_ax.set_ylabel('Predicted Lap Time (seconds)')
scatter_ax.set_title('Actual vs Predicted Lap Times')
scatter_ax.legend()

plt.tight_layout()
plt.show()

In [None]:
# Attach predictions and residuals to a copy of the test laps
test_df_with_pred = test_df.assign(Predicted=lgb_pred)
test_df_with_pred['Error'] = test_df_with_pred['LapTimeSeconds'] - test_df_with_pred['Predicted']

# Mean and spread of the residual for every tire age
error_by_tire = test_df_with_pred.groupby('TireAge')['Error'].agg(['mean', 'std']).reset_index()
tire_age = error_by_tire['TireAge']
mean_error = error_by_tire['mean']
error_std = error_by_tire['std']

fig, ax = plt.subplots(figsize=(12, 6))
# Shaded band: one standard deviation either side of the mean residual
ax.fill_between(tire_age, mean_error - error_std, mean_error + error_std,
                alpha=0.3, label='±1 Std Dev')
ax.plot(tire_age, mean_error, 'b-', linewidth=2, label='Mean Error')
ax.axhline(y=0, color='red', linestyle='--')
ax.set_xlabel('Tire Age (Laps)')
ax.set_ylabel('Prediction Error (seconds)')
ax.set_title('Prediction Error by Tire Age')
ax.legend()
# Only the first 40 laps of tire age are shown
ax.set_xlim(0, 40)
plt.tight_layout()
plt.show()

## 7. Save Model

In [None]:
# Persist the LightGBM model together with what is needed to reuse it
models_dir = Path('../saved_models')
models_dir.mkdir(exist_ok=True)

model_path = models_dir / 'lap_time_lgb_v1.joblib'

# The feature list records the column order the model was trained with
model_bundle = {
    'model': lgb_model,
    'feature_cols': FEATURE_COLS,
    'metrics': lgb_metrics,
    'created_at': datetime.now().isoformat()
}
joblib.dump(model_bundle, model_path)

print(f"Model saved to: {model_path}")

## 8. Example Predictions

In [None]:
def predict_lap_times(model, driver_features: pd.DataFrame, feature_cols: list) -> pd.DataFrame:
    """Predict lap times for the laps in ``driver_features``.

    Args:
        model: Fitted regressor exposing ``predict``.
        driver_features: One row per lap. Must contain every column named in
            ``feature_cols``. The columns ``Driver``, ``LapNumber``,
            ``TireAge`` and ``LapTimeSeconds`` are copied into the result
            when present.
        feature_cols: Names of the model input columns, in training order.

    Returns:
        A new DataFrame with the available context columns, a
        ``PredictedLapTime`` column and, when ``LapTimeSeconds`` is present,
        an ``Error`` column (actual minus predicted). Laps without known
        actual times can therefore be scored as well.

    Raises:
        KeyError: If ``driver_features`` lacks any column in ``feature_cols``.
    """
    missing = [col for col in feature_cols if col not in driver_features.columns]
    if missing:
        raise KeyError(f"driver_features is missing feature columns: {missing}")

    # NOTE(review): missing feature values are replaced by 0, which is far
    # outside the range of time-based features such as PrevLapTime; rows with
    # gaps will get unreliable predictions.
    X = driver_features[feature_cols].fillna(0)

    # Not every estimator accepts a zero-row matrix, so skip the call then.
    if len(X) > 0:
        predictions = model.predict(X)
    else:
        predictions = np.empty(0, dtype=float)

    context_cols = [
        col for col in ('Driver', 'LapNumber', 'TireAge', 'LapTimeSeconds')
        if col in driver_features.columns
    ]
    result = driver_features[context_cols].copy()
    result['PredictedLapTime'] = predictions

    # The error can only be computed when the actual lap time is known.
    if 'LapTimeSeconds' in result.columns:
        result['Error'] = result['LapTimeSeconds'] - result['PredictedLapTime']

    return result

# Example: score Verstappen's laps from the held-out races
is_ver = test_df['Driver'] == 'VER'
ver_test = test_df[is_ver].copy()
ver_predictions = predict_lap_times(lgb_model, ver_test, FEATURE_COLS)

print("VER Lap Time Predictions (Sample):")
# Last expression of the cell, so the notebook renders it as a table
ver_predictions[['LapNumber', 'TireAge', 'LapTimeSeconds', 'PredictedLapTime', 'Error']].head(15)

In [None]:
# Overlay actual and predicted lap times for VER
fig, ax = plt.subplots(figsize=(14, 6))

lap_numbers = ver_predictions['LapNumber']
# Solid blue = measured lap time, dashed red = model output
ax.plot(lap_numbers, ver_predictions['LapTimeSeconds'],
        'b-', label='Actual', linewidth=2, alpha=0.8)
ax.plot(lap_numbers, ver_predictions['PredictedLapTime'],
        'r--', label='Predicted', linewidth=2, alpha=0.8)

ax.set_xlabel('Lap Number')
ax.set_ylabel('Lap Time (seconds)')
ax.set_title('VER Actual vs Predicted Lap Times')
ax.legend()
plt.tight_layout()
plt.show()

## Summary

### Model Performance:
- **MAE**: ~0.3-0.5 seconds (varies by circuit)
- **R²**: ~0.85-0.95

### Key Features:
1. Previous lap time (strongest predictor)
2. Rolling average lap time
3. Tire age
4. Circuit encoding
5. Driver/Team encoding

### Applications:
- Pit stop timing optimization
- Race strategy simulation
- Live commentary context