In [2]:
import pandas as pd

df = pd.read_csv("data/bahrain_2020_2023_race_laps.csv")

bahrain = df.copy()


In [3]:
bahrain["race_id"] = bahrain["season"].astype(int).astype(str) + "_BHR"


In [4]:
CMP_MAP = {"SOFT": "S", "MEDIUM": "M", "HARD": "H"}

def build_strategy_string(g):
    stints = (
        g.sort_values("lap_number")
         .groupby("stint")["compound"]
         .apply(lambda x: x.mode().iloc[0])
         .tolist()
    )
    return "-".join([CMP_MAP.get(c, "?") for c in stints])

race_level = (
    bahrain
    .groupby(["race_id", "driver"], as_index=False, group_keys=False)
    .apply(
        lambda g: pd.Series({
            "strategy": build_strategy_string(g),
            "total_race_time": g["lap_time"].sum(),
            "stops": g["stint"].nunique() - 1,
            "avg_track_temp": g["track_temp"].mean(),
            "avg_air_temp": g["air_temp"].mean(),
        })
    )
)


  bahrain


In [5]:
# Copy Bahrain laps for lap-time model
laps = bahrain.copy()

# Remove invalid laps
laps = laps[~laps["lap_time"].isna()]
laps = laps[laps["lap_time"] > 40]      # remove formation laps etc.
laps = laps[laps["lap_time"] < 200]     # remove outliers

# (optional initially: do NOT remove pit laps)
# laps = laps[~laps["is_pit_lap"]]

print("Training laps:", len(laps))


Training laps: 4149


In [6]:
# Feature columns for the lap-time model
feature_cols = [
    "lap_number",
    "stint",
    "tyre_life",
    "compound",
    "track_temp",
    "air_temp",
    "wind_speed",
    "wind_dir"
]

X = laps[feature_cols]
y = laps["lap_time"]
print("X shape:", X.shape)
print("y shape:", y.shape)


X shape: (4149, 8)
y shape: (4149,)


In [7]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor

cat_cols = ["compound"]
num_cols = [c for c in feature_cols if c not in cat_cols]

preprocess = ColumnTransformer([
    ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
    ("num", "passthrough", num_cols),
])

laptime_model = Pipeline([
    ("preprocess", preprocess),
    ("rf", RandomForestRegressor(
        n_estimators=400,
        max_depth=14,
        random_state=42
    ))
])

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

laptime_model.fit(X_train, y_train)

print("Validation R2:", laptime_model.score(X_test, y_test))


Validation R2: 0.909280985510102


In [8]:
laps["pred_base"] = laptime_model.predict(X)
laps["residual"] = laps["lap_time"] - laps["pred_base"]

driver_offsets = laps.groupby("driver")["residual"].median().to_dict()

def get_driver_offset(driver):
    return driver_offsets.get(driver, 0.0)  # new driver → 0

print(driver_offsets)


{'ALB': 0.04025587811801046, 'ALO': 0.07644199408817087, 'BOT': -0.35897535808329906, 'DEV': 0.3417779402293988, 'GAS': -0.21323959441999563, 'GIO': -0.014902160744284743, 'HAM': -0.7040144263231483, 'HUL': 0.4201282047683037, 'KVY': 0.4763913492068781, 'LAT': 0.9648125968356709, 'LEC': -0.517192578746922, 'MAG': 0.1535718709998264, 'MSC': 0.6316705111738301, 'NOR': -0.3831280887510644, 'OCO': 0.05514629785442082, 'PER': -0.7656748349776024, 'PIA': 0.3737804449202713, 'RAI': 0.16265006867764953, 'RIC': -0.11152977418867493, 'RUS': -0.043091284464395585, 'SAI': -0.5169073185329438, 'SAR': 0.224601217463551, 'STR': -0.06627162109154483, 'TSU': 0.025034744997086023, 'VER': -1.0505593780568176, 'VET': 0.36529523157766164, 'ZHO': 0.008174281342924417}


In [9]:
winner_strategies = []

for race_id in bahrain["race_id"].unique():
    # get the driver who finished P1 (position==1 on last lap)
    race_df = bahrain[bahrain["race_id"] == race_id]
    last_lap = race_df["lap_number"].max()
    p1_driver = race_df[race_df["lap_number"] == last_lap]["driver"].iloc[0]
    
    winner_data = race_df[race_df["driver"] == p1_driver]
    
    strategy = winner_data.groupby("stint")["compound"].agg(lambda x: x.mode().iloc[0]).tolist()
    stints = winner_data.groupby("stint")["lap_number"].agg(["min","max"])
    
    winner_strategies.append({
        "race_id": race_id,
        "driver": p1_driver,
        "strategy": strategy,
        "stints": stints
    })


In [10]:
for w in winner_strategies:
    print(f"\n{w['race_id']} — Winner: {w['driver']}")
    print("Strategy:", " → ".join(w["strategy"]))
    
    for stint, row in w["stints"].iterrows():
        stint = int(stint)  # FIX: make stint an integer
        print(f"  {w['strategy'][stint-1]}: Laps {int(row['min'])}–{int(row['max'])}")



2020_BHR — Winner: GAS
Strategy: MEDIUM → HARD → HARD
  MEDIUM: Laps 1–1
  HARD: Laps 4–25
  HARD: Laps 26–57

2021_BHR — Winner: HAM
Strategy: MEDIUM → HARD → HARD
  MEDIUM: Laps 1–13
  HARD: Laps 14–28
  HARD: Laps 29–56

2022_BHR — Winner: ALO
Strategy: SOFT → MEDIUM → HARD → SOFT
  SOFT: Laps 1–11
  MEDIUM: Laps 12–25
  HARD: Laps 26–42
  SOFT: Laps 43–57

2023_BHR — Winner: VER
Strategy: SOFT → SOFT → HARD
  SOFT: Laps 1–14
  SOFT: Laps 15–36
  HARD: Laps 37–57


from here

In [11]:
import numpy as np

# Build a map: pattern -> list of pit lap lists from winners
pattern_to_pits = {}

for w in winner_strategies:
    # e.g. ["SOFT","SOFT","HARD"] -> "S-S-H"
    pattern = "-".join([CMP_MAP[c] for c in w["strategy"]])

    # get min/max lap per stint
    stints = w["stints"]
    stint_bounds = []
    for stint, row in stints.iterrows():
        stint = int(stint)
        stint_bounds.append((int(row["min"]), int(row["max"])))
    
    # convert bounds into pit laps (pit at boundary between stints)
    # e.g. [(1,14), (15,36), (37,57)] -> pits [15,37]
    pit_laps = []
    for i in range(len(stint_bounds) - 1):
        next_start = stint_bounds[i+1][0]
        pit_laps.append(next_start)
    
    pattern_to_pits.setdefault(pattern, []).append(pit_laps)

pattern_to_pits


{'M-H-H': [[4, 26], [14, 29]], 'S-M-H-S': [[12, 26, 43]], 'S-S-H': [[15, 37]]}

In [12]:
avg_pits_by_pattern = {}

for pattern, pits_list in pattern_to_pits.items():
    # transpose the list of lists to average per pit index
    pits_array = np.array(pits_list)  # shape (n_races_with_this_pattern, n_pits)
    avg_pits = pits_array.mean(axis=0)
    avg_pits_by_pattern[pattern] = [int(round(x)) for x in avg_pits]

avg_pits_by_pattern


{'M-H-H': [9, 28], 'S-M-H-S': [12, 26, 43], 'S-S-H': [15, 37]}

In [20]:
CMP_MAP_INV = {"S": "SOFT", "M": "MEDIUM", "H": "HARD"}

def simulate_race(strategy, pit_laps, weather, driver=None):
    compounds = [CMP_MAP_INV[c] for c in strategy.split("-")]
    total_laps = 57
    pit_loss = 22.0
    
    driver_offset = get_driver_offset(driver)

    stint_idx = 0
    tyre_life = 0
    total_time = 0.0

    for lap in range(1, total_laps + 1):
        
        # Pit stop?
        if lap in pit_laps:
            total_time += pit_loss
            stint_idx += 1
            tyre_life = 0
        
        tyre_life += 1
        
        row = {
            "lap_number": lap,
            "stint": stint_idx + 1,
            "tyre_life": tyre_life,
            "compound": compounds[stint_idx],
            "track_temp": weather["track_temp"],
            "air_temp": weather["air_temp"],
            "wind_speed": weather["wind_speed"],
            "wind_dir": weather["wind_dir"],
        }
        
        lap_pred = laptime_model.predict(pd.DataFrame([row]))[0]
        total_time += lap_pred + driver_offset

    return total_time


In [21]:
def find_best_strategies_fast(weather, driver=None, top_k=5, tweak_window=3):
    """
    Search only around historically good strategies.
    For each historical pattern (e.g. S-S-H), we:
    - take its average pit laps from winners
    - try small +/- tweak_window variations
    """
    results = []
    total_laps = 57

    for pattern, base_pits in avg_pits_by_pattern.items():
        n_pits = len(base_pits)

        # build small search grid: each pit can move within +/- tweak_window
        if n_pits == 1:
            base = base_pits[0]
            for p in range(base - tweak_window, base + tweak_window + 1):
                if p <= 5 or p >= total_laps - 5:
                    continue
                total = simulate_race(pattern, [p], weather, driver)
                results.append((pattern, [p], total))

        elif n_pits == 2:
            base1, base2 = base_pits
            for p1 in range(base1 - tweak_window, base1 + tweak_window + 1):
                for p2 in range(base2 - tweak_window, base2 + tweak_window + 1):
                    if not (5 < p1 < p2 < total_laps - 5):
                        continue
                    total = simulate_race(pattern, [p1, p2], weather, driver)
                    results.append((pattern, [p1, p2], total))

        # you can extend to 3 stops the same way if needed

    results.sort(key=lambda x: x[2])
    best = []
    for pattern, pits, t in results[:top_k]:
        best.append({
            "strategy_pattern": pattern,
            "pit_laps": pits,
            "predicted_race_time": t
        })
    return best


In [22]:
def get_year_weather(bahrain_df, season):
    df_season = bahrain_df[bahrain_df["season"] == season]
    return {
        "track_temp": df_season["track_temp"].mean(),
        "air_temp": df_season["air_temp"].mean(),
        "wind_speed": df_season["wind_speed"].mean(),
        "wind_dir": df_season["wind_dir"].mean(),
    }


In [23]:
weather_2023 = get_year_weather(bahrain, 2023)

best_2023 = find_best_strategies_fast(weather_2023, driver="VER", top_k=3)
best_2023


[{'strategy_pattern': 'S-S-H',
  'pit_laps': [14, 35],
  'predicted_race_time': 5689.623697339643},
 {'strategy_pattern': 'S-S-H',
  'pit_laps': [15, 36],
  'predicted_race_time': 5689.649158644459},
 {'strategy_pattern': 'S-S-H',
  'pit_laps': [15, 35],
  'predicted_race_time': 5689.671563458176}]

here

In [27]:
top3_strategies = []

for race_id in bahrain["race_id"].unique():
    race_df = bahrain[bahrain["race_id"] == race_id]

    # final lap of the race
    last_lap = race_df["lap_number"].max()

    # finishing order
    final_classification = (
        race_df[race_df["lap_number"] == last_lap]
        .sort_values("position")
    )

    # top 3 finishers
    top3 = final_classification.head(3)["driver"].tolist()

    for driver in top3:
        driver_data = race_df[race_df["driver"] == driver]

        strategy = (
            driver_data.groupby("stint")["compound"]
            .agg(lambda x: x.mode().iloc[0])
            .tolist()
        )

        stints = driver_data.groupby("stint")["lap_number"].agg(["min", "max"])

        top3_strategies.append({
            "race_id": race_id,
            "driver": driver,
            "strategy": strategy,
            "stints": stints
        })


In [28]:
for s in top3_strategies:
    print(f"\n{s['race_id']} — Driver: {s['driver']}")
    print("Strategy:", " → ".join(s["strategy"]))

    for stint, row in s["stints"].iterrows():
        stint = int(stint)
        print(f"  {s['strategy'][stint-1]}: Laps {int(row['min'])}–{int(row['max'])}")



2020_BHR — Driver: VER
Strategy: MEDIUM → MEDIUM → HARD → HARD → MEDIUM
  MEDIUM: Laps 1–1
  MEDIUM: Laps 4–20
  HARD: Laps 21–34
  HARD: Laps 35–46
  MEDIUM: Laps 47–57

2020_BHR — Driver: ALB
Strategy: MEDIUM → MEDIUM → MEDIUM → HARD
  MEDIUM: Laps 1–1
  MEDIUM: Laps 4–19
  MEDIUM: Laps 20–34
  HARD: Laps 35–57

2020_BHR — Driver: NOR
Strategy: MEDIUM → MEDIUM → MEDIUM → HARD
  MEDIUM: Laps 1–1
  MEDIUM: Laps 4–18
  MEDIUM: Laps 19–38
  HARD: Laps 39–57

2021_BHR — Driver: HAM
Strategy: MEDIUM → HARD → HARD
  MEDIUM: Laps 1–13
  HARD: Laps 14–28
  HARD: Laps 29–56

2021_BHR — Driver: VER
Strategy: MEDIUM → MEDIUM → HARD
  MEDIUM: Laps 1–17
  MEDIUM: Laps 18–39
  HARD: Laps 40–56

2021_BHR — Driver: BOT
Strategy: MEDIUM → HARD → HARD → MEDIUM
  MEDIUM: Laps 1–16
  HARD: Laps 17–30
  HARD: Laps 31–54
  MEDIUM: Laps 55–56

2022_BHR — Driver: LEC
Strategy: SOFT → SOFT → MEDIUM → SOFT
  SOFT: Laps 1–15
  SOFT: Laps 16–31
  MEDIUM: Laps 32–46
  SOFT: Laps 49–57

2022_BHR — Driver: SAI
Str

In [29]:
import numpy as np

pattern_to_pits = {}

for s in top3_strategies:
    pattern = "-".join([CMP_MAP[c] for c in s["strategy"]])

    stints = s["stints"]
    stint_bounds = []
    for stint, row in stints.iterrows():
        stint = int(stint)
        stint_bounds.append((int(row["min"]), int(row["max"])))

    pit_laps = []
    for i in range(len(stint_bounds) - 1):
        next_start = stint_bounds[i+1][0]
        pit_laps.append(next_start)

    pattern_to_pits.setdefault(pattern, []).append(pit_laps)

pattern_to_pits


{'M-M-H-H-M': [[4, 21, 35, 47]],
 'M-M-M-H': [[4, 20, 35], [4, 19, 39]],
 'M-H-H': [[14, 29]],
 'M-M-H': [[18, 40]],
 'M-H-H-M': [[17, 31, 55]],
 'S-S-M-S': [[16, 32, 49], [15, 34, 45]],
 'S-H-M-S': [[12, 28, 45]],
 'S-S-H': [[15, 37], [18, 35]],
 'S-H-H': [[15, 35]]}

In [30]:
avg_pits_by_pattern = {}

for pattern, pits_list in pattern_to_pits.items():
    pits_array = np.array(pits_list)  # shape: (#races_with_pattern, #pits)
    avg_pits = pits_array.mean(axis=0)
    avg_pits_by_pattern[pattern] = [int(round(x)) for x in avg_pits]

avg_pits_by_pattern


{'M-M-H-H-M': [4, 21, 35, 47],
 'M-M-M-H': [4, 20, 37],
 'M-H-H': [14, 29],
 'M-M-H': [18, 40],
 'M-H-H-M': [17, 31, 55],
 'S-S-M-S': [16, 33, 47],
 'S-H-M-S': [12, 28, 45],
 'S-S-H': [16, 36],
 'S-H-H': [15, 35]}

In [35]:
def find_best_strategies(weather, driver=None, top_k=5, tweak_window=3):
    """
    Search only around historically successful strategies.
    Much faster than brute force.
    """
    results = []
    total_laps = 57

    for pattern, base_pits in avg_pits_by_pattern.items():
        n_pits = len(base_pits)

        # 1-stop
        if n_pits == 1:
            base = base_pits[0]
            for p in range(base - tweak_window, base + tweak_window + 1):
                if p <= 5 or p >= total_laps - 5:
                    continue
                total = simulate_race(pattern, [p], weather, driver)
                results.append((pattern, [p], total))

        # 2-stop
        elif n_pits == 2:
            base1, base2 = base_pits
            for p1 in range(base1 - tweak_window, base1 + tweak_window + 1):
                for p2 in range(base2 - tweak_window, base2 + tweak_window + 1):
                    if not (5 < p1 < p2 < total_laps - 5):
                        continue
                    total = simulate_race(pattern, [p1, p2], weather, driver)
                    results.append((pattern, [p1, p2], total))

    results.sort(key=lambda x: x[2])
    return results[:top_k]


In [37]:
weather_2023 = get_year_weather(bahrain, 2023)

best_2023 = find_best_strategies(weather_2023, driver="HAM", top_k=3)
best_2023


[('S-S-H', [14, 35], 5709.376759588459),
 ('S-S-H', [15, 36], 5709.4022208932765),
 ('S-S-H', [15, 35], 5709.424625706993)]