In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Lap-level feature table. The path is relative, so it resolves against the
# kernel's working directory.
# NOTE(review): the preview of this frame shows an `Unnamed: 0` column, i.e.
# the CSV carries a saved index; it is not among the model features.
laps_csv_path = "../data/f1_laps_features.csv"
f1_laps = pd.read_csv(laps_csv_path)

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, r2_score
import xgboost as xgb

In [4]:
# Preview the first five rows to sanity-check the load.
f1_laps.head(n=5)

Unnamed: 0,Year,Race,Driver,Team,LapNumber,LapTime_seconds,Position,TireCompound,TireAge,PositionGroup,driver_win_rate,start_position,positions_gained,tire_degradation,race_phase,team_reliability
0,2024,1,VER,Red Bull Racing,1.0,97.284,1.0,1.0,4.0,Top 5,0.0,1.0,0.0,-0.016,Early,83.333333
1,2024,1,VER,Red Bull Racing,2.0,96.296,1.0,1.0,5.0,Top 5,0.0,1.0,0.0,-0.988,Early,83.333333
2,2024,1,VER,Red Bull Racing,3.0,96.753,1.0,1.0,6.0,Top 5,0.0,1.0,0.0,0.457,Early,83.333333
3,2024,1,VER,Red Bull Racing,4.0,96.647,1.0,1.0,7.0,Top 5,0.0,1.0,0.0,-0.106,Early,83.333333
4,2024,1,VER,Red Bull Racing,5.0,97.173,1.0,1.0,8.0,Top 5,0.0,1.0,0.0,0.526,Early,83.333333


In [5]:
# Predictor columns handed to the model, one per line for easy diffing.
features = [
    'Race',
    'Driver',
    'Team',
    'Position',
    'TireCompound',
    'TireAge',
    'driver_win_rate',
    'team_reliability',
]

# Regression target: the lap time column.
target_column = 'LapTime_seconds'

X = f1_laps[features]
y = f1_laps[target_column]

In [27]:
# How many laps carry each distinct team_reliability value.
team_reliability_values = f1_laps['team_reliability']
team_reliability_values.value_counts()

team_reliability
41.666667    5662
98.333333    3541
90.000000    3470
91.666667    3432
25.000000    3324
65.000000    3288
83.333333    3260
55.000000    3226
40.677966    2903
75.000000     582
Name: count, dtype: int64

In [16]:
# Print column dtypes and non-null counts for the feature matrix.
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32688 entries, 0 to 32687
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Race              32688 non-null  int64  
 1   Driver            32688 non-null  object 
 2   Team              32688 non-null  object 
 3   Position          32688 non-null  float64
 4   TireCompound      32688 non-null  float64
 5   TireAge           32688 non-null  float64
 6   driver_win_rate   32688 non-null  float64
 7   team_reliability  32688 non-null  float64
dtypes: float64(5), int64(1), object(2)
memory usage: 2.0+ MB


In [6]:
# Requested features that are actually present in the loaded frame
# (order follows `features`).
available_cols = [name for name in features if name in f1_laps.columns]

# Columns to one-hot encode; everything else that is available is treated
# as numerical and scaled.
categorical_candidates = ('Team', 'Driver', 'TireCompound')
categorical_features = [
    name for name in categorical_candidates if name in available_cols
]
numerical_features = [
    name for name in available_cols if name not in categorical_features
]

In [7]:
# Numerical columns are standardised; categorical columns are one-hot encoded,
# with categories unseen at fit time ignored at transform time.
numeric_step = ('num', StandardScaler(), numerical_features)
categorical_step = (
    'cat',
    OneHotEncoder(handle_unknown='ignore'),
    categorical_features,
)

# Any column not claimed by the two steps above is passed through unchanged.
preprocessor = ColumnTransformer(
    [numeric_step, categorical_step],
    remainder='passthrough',
)

In [8]:
# Gradient-boosted regressor: 1000 trees of depth 5 with a 0.01 learning rate,
# seeded for repeatable fits.
xgb_params = {
    'n_estimators': 1000,
    'learning_rate': 0.01,
    'max_depth': 5,
    'random_state': 42,
}
xgb_regressor = xgb.XGBRegressor(**xgb_params)


In [9]:
# Chain preprocessing and the regressor so raw feature frames can be passed
# straight to fit/predict.
pipeline_steps = [
    ('preprocessor', preprocessor),
    ('regressor', xgb_regressor),
]
pipeline = Pipeline(pipeline_steps)

In [10]:
# Fit on the complete feature table; no rows are held out in this cell.
# The fitted pipeline is the cell's displayed value.
pipeline.fit(X=X, y=y)

0,1,2
,steps,"[('preprocessor', ...), ('regressor', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,objective,'reg:squarederror'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,
,early_stopping_rounds,
,enable_categorical,False


In [11]:
import pickle

# Persist the bare regressor and the full pipeline. The context managers
# guarantee each file handle is flushed and closed even if pickling fails
# part-way; a bare `open(...)` passed to `pickle.dump` is never closed
# explicitly.
# The pipeline contains the regressor as its final step, so the standalone
# regressor file is only usable on already-preprocessed features.
# NOTE(review): the '.pk1' suffix (digit one) looks like a typo for '.pkl';
# it is kept unchanged so existing consumers of these paths still find the files.
with open('../models/xgb_laptime.pk1', 'wb') as regressor_file:
    pickle.dump(xgb_regressor, regressor_file)

with open('../models/xgb_laptime_pipeline.pk1', 'wb') as pipeline_file:
    pickle.dump(pipeline, pipeline_file)