In [1]:
import pandas as pd
import numpy as np
import os, joblib

from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor

from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

pd.set_option('display.max_columns', 200)


In [2]:
df = pd.read_csv('data/f1_ml_laps_dataset.csv')
df = df.sort_values(['gp_name','driver','lap_number']).reset_index(drop=True)
df['lap_time']=df['lap_time'].astype(float)
df.head()

Unnamed: 0,season,gp_name,session_name,session_type,driver,team,lap_number,lap_time,sector_1_time,sector_2_time,sector_3_time,position,track_status,is_pit_lap,compound,stint,tyre_life,fresh_tyre,speed_mean,speed_max,throttle_mean,brake_mean,rpm_mean,rpm_max,drs_activations,air_temp,track_temp,humidity,wind_speed,wind_dir,pressure
0,2023,Abu Dhabi Grand Prix,Race,Race,ALB,Williams,1,100.625,,40.189,35.489,17,1,False,MEDIUM,1,2,False,185.275862,322,54.944297,0.236074,9975.607427,12120,377,27.1,33.6,49,2.0,345,1014.7
1,2023,Abu Dhabi Grand Prix,Race,Race,ALB,Williams,2,93.56,18.892,39.357,35.311,18,1,False,MEDIUM,1,3,False,200.488636,320,60.139205,0.178977,10090.39205,12144,351,27.0,33.4,51,1.7,359,1014.9
2,2023,Abu Dhabi Grand Prix,Race,Race,ALB,Williams,3,91.768,18.588,38.312,34.868,18,1,False,MEDIUM,1,4,False,203.83237,339,59.49711,0.190751,10092.02312,12532,102,27.1,33.7,49,1.7,344,1014.9
3,2023,Abu Dhabi Grand Prix,Race,Race,ALB,Williams,4,91.591,18.657,38.211,34.723,18,1,False,MEDIUM,1,5,False,202.244318,339,57.352273,0.21875,10222.7017,12414,103,27.0,33.2,50,1.8,262,1014.9
4,2023,Abu Dhabi Grand Prix,Race,Race,ALB,Williams,5,91.422,18.605,38.328,34.489,18,1,False,MEDIUM,1,6,False,204.781977,339,59.453488,0.186047,10243.92442,12383,99,27.0,33.1,50,2.4,273,1014.7


In [3]:
group = ['gp_name', 'driver']

# Lag features
df["prev_lap_time"]  = df.groupby(group)["lap_time"].shift(1)
df["prev2_lap_time"] = df.groupby(group)["lap_time"].shift(2)
df["prev3_lap_time"] = df.groupby(group)["lap_time"].shift(3)

# Rolling features using transform() → SAFE INDEX
df["rolling_mean_3"] = df.groupby(group)["lap_time"].transform(lambda x: x.rolling(3).mean())
df["rolling_mean_5"] = df.groupby(group)["lap_time"].transform(lambda x: x.rolling(5).mean())
df["rolling_std_3"]  = df.groupby(group)["lap_time"].transform(lambda x: x.rolling(3).std())

# Weather deltas using transform() → SAFE
weather_cols = [c for c in ['air_temp','track_temp','humidity','wind_speed','pressure','wind_dir'] if c in df.columns]
for col in weather_cols:
    df[f"{col}_change"] = df.groupby(group)[col].transform(lambda x: x.diff())

# Target next lap
df["target_next_lap"] = df.groupby(group)["lap_time"].shift(-1)

# Final cleaned dataset
df_model = df.dropna(subset=[
    "target_next_lap","prev_lap_time","prev2_lap_time","prev3_lap_time"
]).copy()

df_model.head()


Unnamed: 0,season,gp_name,session_name,session_type,driver,team,lap_number,lap_time,sector_1_time,sector_2_time,sector_3_time,position,track_status,is_pit_lap,compound,stint,tyre_life,fresh_tyre,speed_mean,speed_max,throttle_mean,brake_mean,rpm_mean,rpm_max,drs_activations,air_temp,track_temp,humidity,wind_speed,wind_dir,pressure,prev_lap_time,prev2_lap_time,prev3_lap_time,rolling_mean_3,rolling_mean_5,rolling_std_3,air_temp_change,track_temp_change,humidity_change,wind_speed_change,pressure_change,wind_dir_change,target_next_lap
3,2023,Abu Dhabi Grand Prix,Race,Race,ALB,Williams,4,91.591,18.657,38.211,34.723,18,1,False,MEDIUM,1,5,False,202.244318,339,57.352273,0.21875,10222.7017,12414,103,27.0,33.2,50,1.8,262,1014.9,91.768,93.56,100.625,92.306333,,1.089308,-0.1,-0.5,1.0,0.1,0.0,-82.0,91.422
4,2023,Abu Dhabi Grand Prix,Race,Race,ALB,Williams,5,91.422,18.605,38.328,34.489,18,1,False,MEDIUM,1,6,False,204.781977,339,59.453488,0.186047,10243.92442,12383,99,27.0,33.1,50,2.4,273,1014.7,91.591,91.768,93.56,91.593667,93.7932,0.173015,0.0,-0.1,0.0,0.6,-0.2,11.0,91.491
5,2023,Abu Dhabi Grand Prix,Race,Race,ALB,Williams,6,91.491,18.696,38.325,34.47,18,1,False,MEDIUM,1,7,False,207.383721,336,58.534884,0.186047,10280.73256,12312,99,27.0,33.0,50,1.7,264,1014.9,91.422,91.591,91.768,91.501333,91.9664,0.084973,0.0,-0.1,0.0,-0.7,0.2,-9.0,91.438
6,2023,Abu Dhabi Grand Prix,Race,Race,ALB,Williams,7,91.438,18.537,38.553,34.348,18,1,False,MEDIUM,1,8,False,205.221212,334,58.836364,0.181818,10133.9303,12325,69,27.0,33.2,51,2.0,276,1014.9,91.491,91.422,91.591,91.450333,91.542,0.036116,0.0,0.2,1.0,0.3,0.0,12.0,91.58
7,2023,Abu Dhabi Grand Prix,Race,Race,ALB,Williams,8,91.58,18.473,38.845,34.262,17,1,False,MEDIUM,1,9,False,205.627219,317,60.760355,0.180473,10119.16864,11758,1,26.9,33.0,51,2.0,242,1014.9,91.438,91.491,91.422,91.503,91.5044,0.071757,-0.1,-0.2,0.0,0.0,0.0,-34.0,91.644


In [4]:
numeric_features=[c for c in [
 'lap_number','prev_lap_time','prev2_lap_time','prev3_lap_time','rolling_mean_3','rolling_mean_5','rolling_std_3',
 'tyre_life','stint','fresh_tyre','is_pit_lap','position',
 'air_temp','track_temp','humidity','wind_speed','pressure','wind_dir'
] if c in df_model.columns]

numeric_features += [c for c in df_model.columns if c.endswith('_change')]

categorical_features=[c for c in ['driver','team','compound','track_status','session_name','gp_name'] if c in df_model.columns]

feature_cols = numeric_features + categorical_features

df_model['gp_name'].unique()

array(['Abu Dhabi Grand Prix', 'Australian Grand Prix',
       'Austrian Grand Prix', 'Azerbaijan Grand Prix',
       'Bahrain Grand Prix', 'Belgian Grand Prix', 'British Grand Prix',
       'Canadian Grand Prix', 'Dutch Grand Prix', 'Hungarian Grand Prix',
       'Italian Grand Prix', 'Japanese Grand Prix',
       'Las Vegas Grand Prix', 'Mexico City Grand Prix',
       'Miami Grand Prix', 'Monaco Grand Prix', 'Qatar Grand Prix',
       'Saudi Arabian Grand Prix', 'Singapore Grand Prix',
       'Spanish Grand Prix', 'São Paulo Grand Prix',
       'United States Grand Prix'], dtype=object)

In [5]:
test_gp = df_model['gp_name'].unique()[-1]
print('Using test GP:', test_gp)

X = df_model[feature_cols]
y = df_model['target_next_lap']

mask = df_model['gp_name'] == test_gp

X_train_raw = X[~mask].copy()
X_test_raw  = X[mask].copy()
y_train = y[~mask].copy()
y_test  = y[mask].copy()

X_train = pd.get_dummies(X_train_raw, columns=categorical_features, drop_first=True)
X_test  = pd.get_dummies(X_test_raw,  columns=categorical_features, drop_first=True)
X_test  = X_test.reindex(columns=X_train.columns, fill_value=0)

feature_columns = X_train.columns.tolist()
print(len(feature_columns))

Using test GP: United States Grand Prix
101


In [6]:
models={
 'RandomForest': RandomForestRegressor(n_estimators=400,max_depth=12,n_jobs=-1,random_state=42),
 'XGBoost': XGBRegressor(n_estimators=300,max_depth=8,learning_rate=0.05,subsample=0.9,random_state=42)
}

results=[]

for name,model in models.items():
    print('Training:',name)
    model.fit(X_train,y_train)
    pred=model.predict(X_test)
    
    mae=mean_absolute_error(y_test,pred)
    mse=mean_squared_error(y_test,pred)
    rmse=np.sqrt(mse)
    r2=r2_score(y_test,pred)
    mape=np.mean(np.abs((y_test-pred)/y_test))*100
    
    results.append([name,mae,rmse,mse,r2,mape])

results_df=pd.DataFrame(results,columns=['Model','MAE','RMSE','MSE','R2','MAPE'])
results_df

Training: RandomForest
Training: XGBoost


Unnamed: 0,Model,MAE,RMSE,MSE,R2,MAPE
0,RandomForest,2.306287,4.595621,21.119733,-0.211591,2.133569
1,XGBoost,1.275763,1.940535,3.765676,0.783972,1.215298


In [7]:
# choose best model = lowest MAE
best_row = results_df.sort_values('MAE').iloc[0]
best_model_name = best_row['Model']

print("Best model:", best_model_name)

best_model = models[best_model_name]

os.makedirs('models',exist_ok=True)
joblib.dump(best_model,'models/best_next_lap_model.pkl')

meta={
 'feature_columns': feature_columns,
 'categorical_features': categorical_features,
 'best_model_name': best_model_name,
 'test_gp': test_gp
}

joblib.dump(meta,'models/best_next_lap_model_meta.pkl')

print("Saved best model and metadata.")

Best model: XGBoost
Saved best model and metadata.


In [8]:
# Example usage
loaded=joblib.load('models/best_next_lap_model.pkl')
meta=joblib.load('models/best_next_lap_model_meta.pkl')

print(meta)

sample_idx = X_test_raw.index[5]
sample = X_test_raw.loc[[sample_idx]]

sample_enc = pd.get_dummies(sample, columns=categorical_features, drop_first=True)
sample_enc = sample_enc.reindex(columns=meta['feature_columns'],fill_value=0)

pred = loaded.predict(sample_enc)[0]
actual = df_model.loc[sample_idx,'target_next_lap']

print("Predicted:",pred,"Actual:",actual)

{'feature_columns': ['lap_number', 'prev_lap_time', 'prev2_lap_time', 'prev3_lap_time', 'rolling_mean_3', 'rolling_mean_5', 'rolling_std_3', 'tyre_life', 'stint', 'fresh_tyre', 'is_pit_lap', 'position', 'air_temp', 'track_temp', 'humidity', 'wind_speed', 'pressure', 'wind_dir', 'air_temp_change', 'track_temp_change', 'humidity_change', 'wind_speed_change', 'pressure_change', 'wind_dir_change', 'driver_ALO', 'driver_BOT', 'driver_DEV', 'driver_GAS', 'driver_HAM', 'driver_HUL', 'driver_LAW', 'driver_LEC', 'driver_MAG', 'driver_NOR', 'driver_OCO', 'driver_PER', 'driver_PIA', 'driver_RIC', 'driver_RUS', 'driver_SAI', 'driver_SAR', 'driver_STR', 'driver_TSU', 'driver_VER', 'driver_ZHO', 'team_AlphaTauri', 'team_Alpine', 'team_Aston Martin', 'team_Ferrari', 'team_Haas F1 Team', 'team_McLaren', 'team_Mercedes', 'team_Red Bull Racing', 'team_Williams', 'compound_INTERMEDIATE', 'compound_MEDIUM', 'compound_SOFT', 'compound_WET', 'track_status_4', 'track_status_6', 'track_status_12', 'track_stat

In [9]:
# Load model + metadata
loaded = joblib.load("models/best_next_lap_model.pkl")
meta   = joblib.load("models/best_next_lap_model_meta.pkl")

feature_columns      = meta["feature_columns"]
categorical_features = meta["categorical_features"]
test_gp              = meta["test_gp"]

# Filter test GP rows
test_rows = df_model[df_model["gp_name"] == test_gp]
ver_rows  = test_rows[test_rows["driver"] == "VER"].sort_values("lap_number")

print("Available VER laps:", ver_rows["lap_number"].tolist())

# Use lap 16 → predict lap 17
lap_to_predict_from = 16

sample_idx = ver_rows[ver_rows["lap_number"] == lap_to_predict_from].index[0]
sample      = X_test_raw.loc[[sample_idx]]

# Encode
sample_enc = pd.get_dummies(sample, columns=categorical_features, drop_first=True)
sample_enc = sample_enc.reindex(columns=feature_columns, fill_value=0)

predicted_next_lap = loaded.predict(sample_enc)[0]
actual_next_lap    = df_model.loc[sample_idx, "target_next_lap"]

print(f"\nDriver: VER   Lap: {lap_to_predict_from} → {lap_to_predict_from+1}")
print("Predicted Lap 16:", predicted_next_lap)
print("Actual Lap 16:   ", actual_next_lap)


Available VER laps: [4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55]

Driver: VER   Lap: 16 → 17
Predicted Lap 16: 119.64335
Actual Lap 16:    120.701


In [10]:
# Filter test GP rows
alb_rows = test_rows[test_rows["driver"] == "VER"].sort_values("lap_number")

# Correct tyre change detection: when STINT increases
alb_rows["tyre_changed"] = alb_rows["stint"].diff() == 1

# Get the laps where tyre change actually occurred
tyre_change_laps = alb_rows[alb_rows["tyre_changed"] == True]["lap_number"].tolist()

print("Actual ALB tyre-change laps in test GP:", tyre_change_laps)

# Edge case: no tyre changes detected
if len(tyre_change_laps) == 0:
    raise ValueError("No real tyre changes detected for ALB in the test GP!")

# Use first tyre change lap
lap_num = tyre_change_laps[0]

print(f"\nSelected tyre change lap for VER: {lap_num}")

# Prepare sample for prediction
sample_idx = alb_rows[alb_rows["lap_number"] == lap_num].index[0]
sample = X_test_raw.loc[[sample_idx]].copy()

print("\nTyre state at actual pitstop lap:")
print(sample[["stint","compound","tyre_life","fresh_tyre"]])

# Encode
sample_enc = pd.get_dummies(sample, columns=categorical_features, drop_first=True)
sample_enc = sample_enc.reindex(columns=feature_columns, fill_value=0)

predicted_next_lap  = loaded.predict(sample_enc)[0]
actual_next_lap     = df_model.loc[sample_idx, "target_next_lap"]

print(f"\nDriver: VER   Lap: {lap_num} → {lap_num+1}")
print("Predicted next lap (after REAL tyre change):", predicted_next_lap)
print("Actual next lap:                          ", actual_next_lap)


Actual ALB tyre-change laps in test GP: [17, 36]

Selected tyre change lap for VER: 17

Tyre state at actual pitstop lap:
       stint compound  tyre_life  fresh_tyre
23888      2   MEDIUM          1        True

Driver: VER   Lap: 17 → 18
Predicted next lap (after REAL tyre change): 102.109924
Actual next lap:                           100.989


In [11]:
def compare_pit_vs_no_pit_with_actual(driver, lap):
    rows = test_rows[test_rows["driver"] == driver].sort_values("lap_number")
    idx = rows[rows["lap_number"] == lap].index[0]
    
    base = X_test_raw.loc[[idx]].copy()
    pit  = base.copy()

    # Simulate real pit stop
    pit["tyre_life"]  = 0
    pit["fresh_tyre"] = 1
    pit["compound"]   = "SOFT"

    # Encode
    def encode(df):
        enc = pd.get_dummies(df, columns=categorical_features, drop_first=True)
        return enc.reindex(columns=feature_columns, fill_value=0)

    pred_no_pit = loaded.predict(encode(base))[0]
    pred_pit    = loaded.predict(encode(pit))[0]

    # REAL next lap time
    actual_next = df_model.loc[idx, "target_next_lap"]

    return {
        "Pred_No_Pit": pred_no_pit,
        "Pred_Pit": pred_pit,
        "Actual": actual_next
    }

print(compare_pit_vs_no_pit_with_actual("ALB", 15))


{'Pred_No_Pit': 102.30966, 'Pred_Pit': 98.67317, 'Actual': 103.131}


In [12]:
def simulate_track_temp_change_with_actual(driver, lap, delta_temp):
    rows = test_rows[test_rows["driver"] == driver].sort_values("lap_number")
    idx = rows[rows["lap_number"] == lap].index[0]

    sample = X_test_raw.loc[[idx]].copy()

    sample["track_temp"] += delta_temp
    sample["track_temp_change"] = delta_temp

    enc = pd.get_dummies(sample, columns=categorical_features, drop_first=True)
    enc = enc.reindex(columns=feature_columns, fill_value=0)

    predicted = loaded.predict(enc)[0]
    actual    = df_model.loc[idx, "target_next_lap"]

    return predicted, actual

print(simulate_track_temp_change_with_actual("HAM", 20, +5))


(118.24258, 123.034)


In [13]:
def simulate_compound_with_actual(driver, lap, new_compound):
    rows = test_rows[test_rows["driver"] == driver].sort_values("lap_number")
    idx = rows[rows["lap_number"] == lap].index[0]

    sample = X_test_raw.loc[[idx]].copy()

    sample["compound"] = new_compound
    sample["fresh_tyre"] = 1
    sample["tyre_life"] = 0

    enc = pd.get_dummies(sample, columns=categorical_features, drop_first=True)
    enc = enc.reindex(columns=feature_columns, fill_value=0)

    predicted = loaded.predict(enc)[0]
    actual    = df_model.loc[idx, "target_next_lap"]

    return predicted, actual

print(simulate_compound_with_actual("LEC", 18, "SOFT"))


(100.612946, 103.074)


In [14]:
def simulate_vsc_with_actual(driver, lap):
    rows = test_rows[test_rows["driver"] == driver].sort_values("lap_number")
    idx = rows[rows["lap_number"] == lap].index[0]

    sample = X_test_raw.loc[[idx]].copy()

    sample["prev_lap_time"]  += 15
    sample["prev2_lap_time"] += 10
    sample["prev3_lap_time"] += 5

    enc = pd.get_dummies(sample, columns=categorical_features, drop_first=True)
    enc = enc.reindex(columns=feature_columns, fill_value=0)

    predicted = loaded.predict(enc)[0]
    actual    = df_model.loc[idx, "target_next_lap"]

    return predicted, actual

print(simulate_vsc_with_actual("VER", 20))


(99.110146, 100.98)


In [15]:
def simulate_degradation_with_actual(driver, lap, wear_factor):
    rows = test_rows[test_rows["driver"] == driver].sort_values("lap_number")
    idx = rows[rows["lap_number"] == lap].index[0]

    sample = X_test_raw.loc[[idx]].copy()

    sample["tyre_life"] *= wear_factor

    enc = pd.get_dummies(sample, columns=categorical_features, drop_first=True)
    enc = enc.reindex(columns=feature_columns, fill_value=0)

    predicted = loaded.predict(enc)[0]
    actual    = df_model.loc[idx, "target_next_lap"]

    return predicted, actual

print(simulate_degradation_with_actual("SAI", 22, 1.3))


(102.19631, 101.683)
