In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, r2_score
import xgboost as xgb

In [2]:
f1_laps = pd.read_csv("../data/f1_laps_features.csv")

In [162]:
f1_laps.head()

Unnamed: 0,Year,Race,Driver,Team,LapNumber,LapTime_seconds,Position,TireCompound,TireAge,PositionGroup,...,TeamAvgPace,DriverVsTeamPace,DriverBestLapInRace,PctOffBestLap,TrackEvolution,LapNumber_normalized,IsOutlap,TireCompoundChange,IsInlap,OldTiresIndicator
0,2025,16,VER,Red Bull Racing,1.0,87.159,1.0,MEDIUM,1.0,Top 5,...,83.861802,3.297198,81.003,7.599719,0.0,0.018868,0,1,0,0
1,2025,16,NOR,McLaren,1.0,87.43,2.0,MEDIUM,1.0,Top 5,...,83.483094,3.946906,80.901,8.070358,0.0,0.018868,0,1,0,0
2,2025,16,PIA,McLaren,1.0,87.992,3.0,MEDIUM,1.0,Top 5,...,83.483094,4.508906,81.245,8.304511,0.0,0.018868,0,1,0,0
3,2025,16,LEC,Ferrari,1.0,88.28,4.0,MEDIUM,1.0,Top 5,...,83.6955,4.5845,81.294,8.5935,0.0,0.018868,0,1,0,0
4,2025,16,RUS,Mercedes,1.0,88.907,5.0,MEDIUM,1.0,Top 5,...,83.927292,4.979708,81.8,8.688264,0.0,0.018868,0,1,0,0


In [163]:
# Define comprehensive feature set
base_features = ['Race', 'Driver', 'Team', 'Position', 'TireCompound', 'TireAge', 
                 'driver_win_rate', 'team_reliability']

# New engineered features
tire_features = ['TireCompound_encoded', 'TireLifeRemaining', 'TireDegradationRate', 
                 'IsFreshTire', 'TireWearPct']

traffic_features = ['GapToCarAhead', 'GapToCarBehind', 'DRS_Available', 'TrafficDensity']

progression_features = ['FuelLoadProxy', 'RacePhase_encoded', 'LapsRemaining', 
                        'LapProgress', 'StintLapNumber']

form_features = ['RollingAvgLapTime_3', 'RollingAvgLapTime_5', 'LapTimeStd_5',
                 'TeamAvgPace', 'DriverVsTeamPace', 'PctOffBestLap']

session_features = ['TrackEvolution', 'LapNumber_normalized', 'IsOutlap', 
                    'IsInlap', 'OldTiresIndicator']

# Combine all features
all_features = (base_features + tire_features + traffic_features + 
                progression_features + form_features + session_features)

# Filter to only available features
features = [f for f in all_features if f in f1_laps.columns]

print(f"Using {len(features)} features for training:")
print(f"  - Base features: {len([f for f in base_features if f in features])}")
print(f"  - Tire features: {len([f for f in tire_features if f in features])}")
print(f"  - Traffic features: {len([f for f in traffic_features if f in features])}")
print(f"  - Progression features: {len([f for f in progression_features if f in features])}")
print(f"  - Form features: {len([f for f in form_features if f in features])}")
print(f"  - Session features: {len([f for f in session_features if f in features])}")

X = f1_laps[features]
y = f1_laps['LapTime_seconds']

print(f"\nX shape: {X.shape}")
print(f"y shape: {y.shape}")

Using 33 features for training:
  - Base features: 8
  - Tire features: 5
  - Traffic features: 4
  - Progression features: 5
  - Form features: 6
  - Session features: 5

X shape: (9862, 33)
y shape: (9862,)


In [164]:
available_cols = [col for col in features if col in f1_laps.columns]

# Categorical features (for encoding)
categorical_features = [col for col in ['Team', 'Driver', 'TireCompound', 'Race'] 
                        if col in available_cols]

# Numerical features (for scaling)
numerical_features = [col for col in available_cols if col not in categorical_features]

print(f"Categorical features ({len(categorical_features)}): {categorical_features}")
print(f"Numerical features ({len(numerical_features)}): {len(numerical_features)} features")

Categorical features (4): ['Team', 'Driver', 'TireCompound', 'Race']
Numerical features (29): 29 features


In [165]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ],
    remainder='passthrough'
)

In [166]:
xgb_regressor = xgb.XGBRegressor(n_estimators=1000,
                                   learning_rate=0.01,
                                   max_depth=5,
                                   random_state=42)


In [167]:
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', xgb_regressor)
])

In [168]:
# Train the pipeline
print("Training XGBoost model with enhanced features...")
pipeline.fit(X_train, y_train)

# Make predictions
y_pred = pipeline.predict(X_test)

# Evaluate
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"\n{'='*60}")
print(f"MODEL PERFORMANCE")
print(f"{'='*60}")
print(f"Mean Absolute Error: {mae:.3f} seconds")
print(f"R² Score: {r2:.4f}")
print(f"Average lap time: {y_test.mean():.3f} seconds")
print(f"Error as % of avg lap time: {(mae/y_test.mean())*100:.2f}%")
print(f"{'='*60}")

pipeline

Training XGBoost model with enhanced features...

MODEL PERFORMANCE
Mean Absolute Error: 0.215 seconds
R² Score: 0.9979
Average lap time: 91.964 seconds
Error as % of avg lap time: 0.23%


0,1,2
,steps,"[('preprocessor', ...), ('regressor', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,objective,'reg:squarederror'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,
,early_stopping_rounds,
,enable_categorical,False


In [169]:
# Save the enhanced models
print("Saving enhanced models...")

pickle.dump(xgb_model, open('../models/xgb_laptime.pk1', 'wb'))
pickle.dump(pipeline, open('../models/xgb_laptime_pipeline.pk1', 'wb'))

print("✓ Models saved successfully!")
print("  - xgb_laptime.pk1")
print("  - xgb_laptime_pipeline.pk1")
print(f"\nModel trained with {len(features)} features")
print(f"Performance: MAE = {mae:.3f}s, R² = {r2:.4f}")

Saving enhanced models...
✓ Models saved successfully!
  - xgb_laptime.pk1
  - xgb_laptime_pipeline.pk1

Model trained with 33 features
Performance: MAE = 0.215s, R² = 0.9979
