In [28]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from pathlib import Path

sns.set(style="whitegrid")
pd.set_option('display.max_columns', None)

df = pd.read_csv("data/final_f1_ml_laps_dataset.csv")
print("Raw shape:", df.shape)

# Clean
df = df[(df["lap_time"] > 30) & (df["lap_time"] < 200)]
df = df.dropna(subset=["sector_1_time", "sector_2_time", "sector_3_time"])

df.head()

Raw shape: (23983, 48)


Unnamed: 0,season,gp_name,session_name,session_type,driver,team,lap_number,lap_time,sector_1_time,sector_2_time,sector_3_time,position,track_status,is_pit_lap,compound,stint,tyre_life,fresh_tyre,speed_mean,speed_max,throttle_mean,brake_mean,rpm_mean,rpm_max,drs_activations,air_temp,track_temp,humidity,wind_speed,wind_dir,pressure,speed_min,speed_std,speed_q1,speed_q3,speed_range,throttle_max,throttle_min,throttle_std,throttle_pct_full,brake_std,brake_pct_braking,rpm_std,gear_changes,gear_min,gear_max,drs_active_pct,drs_time_seconds
1,2023,Bahrain Grand Prix,Race,Race,VER,Red Bull Racing,2,97.974,31.342,42.504,24.128,1,12,False,SOFT,1,5,False,195.407713,298,60.225895,0.203857,9773.179063,12063,363,27.3,31.1,22,0.0,208,1016.7,64.0,63.263189,145.5,242.5,234.0,100.0,0.0,43.626538,0.432507,0.40342,0.203857,1637.445568,43.0,2.0,8.0,1.0,97.750285
2,2023,Bahrain Grand Prix,Race,Race,VER,Red Bull Racing,3,98.006,31.388,42.469,24.149,1,1,False,SOFT,1,6,False,198.095109,298,62.290761,0.184783,9756.970109,11795,0,27.3,31.2,22,0.6,170,1016.7,65.0,64.896809,147.0,253.0,233.0,100.0,0.0,43.299096,0.470109,0.38865,0.184783,1747.054894,36.0,3.0,8.0,0.0,0.0
3,2023,Bahrain Grand Prix,Race,Race,VER,Red Bull Racing,4,97.976,31.271,42.642,24.063,1,1,False,SOFT,1,7,False,195.786842,299,62.092105,0.2,9723.621053,11960,0,27.2,31.1,22,0.8,236,1016.9,65.0,64.320556,143.5,245.75,234.0,100.0,0.0,43.281912,0.471053,0.400527,0.2,1726.074671,39.0,3.0,8.0,0.0,0.0
4,2023,Bahrain Grand Prix,Race,Race,VER,Red Bull Racing,5,98.035,31.244,42.724,24.067,1,1,False,SOFT,1,8,False,197.120879,301,60.884615,0.206044,9710.263736,11911,0,27.2,31.0,22,1.0,206,1016.7,65.0,64.729756,145.0,249.25,236.0,100.0,0.0,43.7514,0.450549,0.405019,0.206044,1732.516039,36.0,3.0,8.0,0.0,0.0
5,2023,Bahrain Grand Prix,Race,Race,VER,Red Bull Racing,6,97.986,31.341,42.632,24.013,1,1,False,SOFT,1,9,False,196.887363,301,60.942308,0.200549,9767.299451,11852,0,27.1,31.0,22,0.6,175,1016.9,62.0,65.702778,143.0,250.75,239.0,100.0,0.0,43.439385,0.450549,0.400963,0.200549,1660.594957,42.0,2.0,8.0,0.0,0.0


In [29]:
tyre_base = df.copy()

# Remove pit laps if available
if "is_pit_lap" in tyre_base.columns:
    tyre_base = tyre_base[tyre_base["is_pit_lap"] == 0]

# Remove non-green laps if track_status exists
if "track_status" in tyre_base.columns:
    tyre_base = tyre_base[tyre_base["track_status"] == 1]

# Driver pace offset
driver_means = tyre_base.groupby("driver")["lap_time"].mean()
global_mean = tyre_base["lap_time"].mean()
driver_offset_map = (driver_means - global_mean).to_dict()

tyre_base["driver_pace_offset"] = tyre_base["driver"].map(driver_offset_map)
tyre_base["driver_pace_offset"] -= tyre_base["driver_pace_offset"].mean()


In [30]:
tyre_cat_features = ["gp_name", "compound"]

tyre_num_features = [
    "tyre_life", "stint", "lap_number",
    "track_temp", "air_temp", "humidity",
    "wind_speed", "wind_dir", "pressure",
    "driver_pace_offset"
]

tyre_target = "lap_time"

cols_needed = tyre_cat_features + tyre_num_features + [tyre_target]

tyre_df = tyre_base.dropna(subset=cols_needed).copy()
print("Tyre dataset:", tyre_df.shape)


Tyre dataset: (20541, 49)


In [31]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

X_t = tyre_df[tyre_cat_features + tyre_num_features]
y_t = tyre_df[tyre_target]

X_t_train, X_t_test, y_t_train, y_t_test = train_test_split(
    X_t, y_t, test_size=0.2, random_state=42
)

num_t = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

cat_t = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])

pre_tyre = ColumnTransformer([
    ("num", num_t, tyre_num_features),
    ("cat", cat_t, tyre_cat_features)
])

tyre_model = LGBMRegressor(
    n_estimators=500,
    learning_rate=0.05,
    subsample=0.9,
    colsample_bytree=0.9,
    random_state=42,
    n_jobs=-1
)

tyre_pipeline = Pipeline([
    ("pre", pre_tyre),
    ("model", tyre_model)
])

tyre_pipeline.fit(X_t_train, y_t_train)

y_t_pred = tyre_pipeline.predict(X_t_test)

print("Tyre Model Performance:")
print(" MAE :", mean_absolute_error(y_t_test, y_t_pred))
print(" RMSE:", mean_squared_error(y_t_test, y_t_pred, squared=False))
print(" R2  :", r2_score(y_t_test, y_t_pred))


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000744 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1048
[LightGBM] [Info] Number of data points in the train set: 16432, number of used features: 37
[LightGBM] [Info] Start training from score 88.854356
Tyre Model Performance:
 MAE : 0.48254171841456933
 RMSE: 1.1486896191153733
 R2  : 0.9888824010259125




In [32]:
def get_driver_offset(driver_name):
    return float(driver_offset_map.get(driver_name, 0.0))

def predict_tyre_laptime(
    gp_name, compound, tyre_life, stint, lap_number,
    track_temp, air_temp, humidity,
    wind_speed, wind_dir, pressure,
    driver=None
):
    pace = get_driver_offset(driver) if driver else 0.0

    row = {
        "gp_name": gp_name,
        "compound": compound,
        "tyre_life": tyre_life,
        "stint": stint,
        "lap_number": lap_number,
        "track_temp": track_temp,
        "air_temp": air_temp,
        "humidity": humidity,
        "wind_speed": wind_speed,
        "wind_dir": wind_dir,
        "pressure": pressure,
        "driver_pace_offset": pace
    }

    return float(tyre_pipeline.predict(pd.DataFrame([row]))[0])


In [33]:
def estimate_pit_loss(df_full, gp_name, driver=None):
    temp = df_full[df_full["gp_name"] == gp_name]

    if driver:
        temp = temp[temp["driver"] == driver]

    if "is_pit_lap" not in temp.columns:
        return 20.0

    pit = temp[temp["is_pit_lap"] == 1]["lap_time"]
    normal = temp[temp["is_pit_lap"] == 0]["lap_time"]

    if pit.empty or normal.empty:
        return 20.0

    return float(np.clip(pit.median() - normal.median(), 10, 35))


In [34]:
def simulate_tyre_strategy(strategy, total_laps, pit_loss, gp_name, weather_dict, driver=None):
    laps = []
    total_time = 0
    lap_num = 1
    stint_idx = 0

    for compound, stint_len in strategy:
        stint_idx += 1
        for life in range(1, stint_len + 1):
            if lap_num > total_laps:
                break

            lt = predict_tyre_laptime(
                gp_name, compound, life, stint_idx, lap_num,
                weather_dict["track_temp"], weather_dict["air_temp"],
                weather_dict["humidity"], weather_dict["wind_speed"],
                weather_dict["wind_dir"], weather_dict["pressure"],
                driver=driver
            )

            total_time += lt
            laps.append({"lap": lap_num, "compound": compound, "lap_time_pred": lt})
            lap_num += 1

        if lap_num <= total_laps:
            total_time += pit_loss

    return total_time, pd.DataFrame(laps)


In [35]:
def possible_stint_lengths(total_laps):
    return list(range(10, min(30, total_laps - 10) + 1))

def generate_strategies_fast(total_laps, compounds):
    lengths = possible_stint_lengths(total_laps)
    strategies = []

    # ---------------------
    # 1-stop (2 stints)
    # ---------------------
    for c1 in compounds:
        for c2 in compounds:
            if c1 == c2:
                continue

            for s1 in lengths:
                s2 = total_laps - s1
                if s2 < 10 or s2 > 30:
                    continue
                strategies.append([(c1, s1), (c2, s2)])

    # ---------------------
    # 2-stop (3 stints)
    # ---------------------
    for c1 in compounds:
        for c2 in compounds:
            for c3 in compounds:
                if len({c1, c2, c3}) < 2:
                    continue

                for s1 in lengths:
                    for s2 in lengths:
                        s3 = total_laps - (s1 + s2)
                        if 10 <= s3 <= 30:
                            strategies.append([(c1, s1), (c2, s2), (c3, s3)])

    return strategies


In [36]:
def find_best_strategies(
    gp_name, total_laps, weather_dict, driver=None, top_k=5
):
    compounds = sorted(df["compound"].unique())
    pit_loss = estimate_pit_loss(df, gp_name, driver)

    strategies = generate_strategies_fast(total_laps, compounds)

    results = []
    for strat in strategies:
        tt, _ = simulate_tyre_strategy(
            strat, total_laps, pit_loss, gp_name, weather_dict, driver
        )
        results.append({"strategy": strat, "total_time": tt})

    return pd.DataFrame(results).sort_values("total_time").head(top_k)


In [37]:
print("=== Tyre Model: Existing Driver (VER) ===")
ver_time = predict_tyre_laptime(
    gp_name="Bahrain Grand Prix", compound="Medium",
    tyre_life=7, stint=1, lap_number=7,
    track_temp=36, air_temp=28, humidity=40,
    wind_speed=3.2, wind_dir=140, pressure=1006,
    driver="VER"
)
print("Predicted:", ver_time)


=== Tyre Model: Existing Driver (VER) ===
Predicted: 95.27486661838127


In [38]:
print("=== Tyre Model: New Driver (General Model) ===")
new_time = predict_tyre_laptime(
    gp_name="Bahrain Grand Prix", compound="Medium",
    tyre_life=7, stint=1, lap_number=7,
    track_temp=36, air_temp=28, humidity=40,
    wind_speed=3.2, wind_dir=140, pressure=1006,
    driver=None
)
print("Predicted:", new_time)


=== Tyre Model: New Driver (General Model) ===
Predicted: 96.25269298494379


In [39]:
weather = {
    "track_temp": 36,
    "air_temp": 28,
    "humidity": 40,
    "wind_speed": 3.2,
    "wind_dir": 140,
    "pressure": 1006,
}

strategy = [("Medium", 15), ("Hard", 20)]

total, df_strat = simulate_tyre_strategy(
    strategy, total_laps=35, pit_loss=22,
    gp_name="Bahrain Grand Prix",
    weather_dict=weather,
    driver=None
)
print("Total time:", total)
df_strat.head()


Total time: 3378.545870856079


Unnamed: 0,lap,compound,lap_time_pred
0,1,Medium,96.808298
1,2,Medium,96.808298
2,3,Medium,96.410751
3,4,Medium,96.344112
4,5,Medium,96.224357


In [40]:
weather = {
    "track_temp": 36,
    "air_temp": 28,
    "humidity": 40,
    "wind_speed": 3.2,
    "wind_dir": 140,
    "pressure": 1006,
}

# total laps for the race
race_laps = 57  # example for Bahrain

best = find_best_strategies(
    gp_name="Bahrain Grand Prix",
    total_laps=race_laps,
    weather_dict=weather,
    driver=None,   # general strategy model (new driver)
    top_k=5
)

best

KeyboardInterrupt: 