In [1]:
from google.colab import drive
drive.mount('/content/drive')

import os

# Create the cache directory if it doesn't exist
cache_dir = '/content/drive/MyDrive/FastF1_cache'
os.makedirs(cache_dir, exist_ok=True)
!pip install optuna joblib

!pip install fastf1
import fastf1 as f1

# Enable caching on Google Drive
f1.Cache.enable_cache(cache_dir)

Mounted at /content/drive
Collecting optuna
  Downloading optuna-4.2.1-py3-none-any.whl.metadata (17 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.15.2-py3-none-any.whl.metadata (7.3 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Downloading optuna-4.2.1-py3-none-any.whl (383 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m383.6/383.6 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.15.2-py3-none-any.whl (231 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m231.9/231.9 kB[0m [31m18.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Installing collected packages: colorlog, alembic, optuna
Successfully installed alembic-1.15.2 colorlog-6.9.0 optuna-4.2.1
Collecting fastf1
  Downloading fastf1-3.5.3-py3-none-any.whl.metadata (4.6 kB)
Collecting rapidfuzz (from fastf1)
  Downloading rapidfuzz-3.12.2-cp311-cp311-many

In [2]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import mean_squared_error
import optuna
import joblib

# Load combined pit stop preprocessed data (2021-2023)
data_path = '/content/drive/MyDrive/ML_Project/Combined_PitStop_Preprocessed_2021_2023.csv'
df = pd.read_csv(data_path)

# Drop rows with missing key columns
df = df.dropna(subset=["PitStopLapTimeRatio", "PitStopCount", "AvgPitDuration", "TotalPitDuration",
                         "AvgTyreLife", "AvgLapTime", "AvgPrevLapTime", "AvgNextLapTime"])

# Define features and target
features = ["PitStopCount", "AvgPitDuration", "TotalPitDuration", "AvgTyreLife",
            "AvgLapTime", "AvgPrevLapTime", "AvgNextLapTime"]
target = "PitStopLapTimeRatio"
X = df[features]
y = df[target]

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Objective function for Optuna hyperparameter tuning
def objective(trial):
    params = {
        "objective": "reg:squarederror",
        "eval_metric": "rmse",
        "tree_method": "hist",
        "booster": trial.suggest_categorical("booster", ["gbtree", "dart"]),
        "max_depth": trial.suggest_int("max_depth", 3, 10),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "lambda": trial.suggest_float("lambda", 1e-3, 10.0, log=True),
        "alpha": trial.suggest_float("alpha", 1e-3, 10.0, log=True)
    }

    cv = KFold(n_splits=5, shuffle=True, random_state=42)
    rmses = []
    for train_idx, valid_idx in cv.split(X_train):
        X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[valid_idx]
        y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[valid_idx]
        model = xgb.XGBRegressor(**params, random_state=42, n_estimators=1000)
        try:
            model.fit(X_tr, y_tr, eval_set=[(X_val, y_val)], early_stopping_rounds=50, verbose=False)
        except TypeError:
            model.fit(X_tr, y_tr, eval_set=[(X_val, y_val)], verbose=False)
        preds = model.predict(X_val)
        rmse = np.sqrt(mean_squared_error(y_val, preds))
        rmses.append(rmse)
    return np.mean(rmses)

study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=50, timeout=600)

print("Best trial:")
trial = study.best_trial
print("  RMSE: {:.4f}".format(trial.value))
print("  Params:")
for key, value in trial.params.items():
    print(f"    {key}: {value}")

best_params = trial.params
best_params["objective"] = "reg:squarederror"
best_params["eval_metric"] = "rmse"
best_params["tree_method"] = "hist"

final_model = xgb.XGBRegressor(**best_params, random_state=42, n_estimators=1000)
try:
    final_model.fit(X_train, y_train, eval_set=[(X_test, y_test)], early_stopping_rounds=50, verbose=True)
except TypeError:
    final_model.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=True)

y_pred = final_model.predict(X_test)
final_rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("Final RMSE on test set:", final_rmse)

model_path = "/content/drive/MyDrive/ML_Project/pitstop_model_xgb.pkl"
joblib.dump(final_model, model_path)
print("Model saved to", model_path)


[I 2025-04-02 18:39:47,585] A new study created in memory with name: no-name-c12864da-8507-4363-8bda-96ea75ae77e1
[I 2025-04-02 18:39:49,898] Trial 0 finished with value: 0.7138056172173909 and parameters: {'booster': 'gbtree', 'max_depth': 7, 'learning_rate': 0.09907366238069121, 'subsample': 0.6947262988345622, 'colsample_bytree': 0.5568864114161325, 'lambda': 9.635172035713985, 'alpha': 4.069039634795697}. Best is trial 0 with value: 0.7138056172173909.
[I 2025-04-02 18:46:40,784] Trial 1 finished with value: 0.6986171694354211 and parameters: {'booster': 'dart', 'max_depth': 10, 'learning_rate': 0.012415556876745677, 'subsample': 0.5784058240387313, 'colsample_bytree': 0.5480803925189823, 'lambda': 0.5519387396593203, 'alpha': 8.318882513380567}. Best is trial 1 with value: 0.6986171694354211.
[I 2025-04-02 18:53:38,665] Trial 2 finished with value: 0.43906478354157114 and parameters: {'booster': 'dart', 'max_depth': 6, 'learning_rate': 0.010298172984615541, 'subsample': 0.51835927

Best trial:
  RMSE: 0.4391
  Params:
    booster: dart
    max_depth: 6
    learning_rate: 0.010298172984615541
    subsample: 0.5183592757718707
    colsample_bytree: 0.852231805918072
    lambda: 2.7487961055314374
    alpha: 2.2089403151924363
[0]	validation_0-rmse:3.66258
[1]	validation_0-rmse:3.63493
[2]	validation_0-rmse:3.60965
[3]	validation_0-rmse:3.58896
[4]	validation_0-rmse:3.56089
[5]	validation_0-rmse:3.53248
[6]	validation_0-rmse:3.51226
[7]	validation_0-rmse:3.48759
[8]	validation_0-rmse:3.46173
[9]	validation_0-rmse:3.44185
[10]	validation_0-rmse:3.41476
[11]	validation_0-rmse:3.38952
[12]	validation_0-rmse:3.36563
[13]	validation_0-rmse:3.34518
[14]	validation_0-rmse:3.32657
[15]	validation_0-rmse:3.30285
[16]	validation_0-rmse:3.27703
[17]	validation_0-rmse:3.25406
[18]	validation_0-rmse:3.24978
[19]	validation_0-rmse:3.22729
[20]	validation_0-rmse:3.20913
[21]	validation_0-rmse:3.18581
[22]	validation_0-rmse:3.16772
[23]	validation_0-rmse:3.14272
[24]	validation_0-r

In [14]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import mean_squared_error, mean_absolute_error
import joblib
from IPython.display import display
import ipywidgets as widgets

# Create an output widget for our report
out = widgets.Output()
display(out)

with out:
    # Load 2024 preprocessed pit stop data
    df_2024 = pd.read_csv('/content/drive/MyDrive/ML_Project/2024_Pitstop_Preprocessed.csv')
    df_2024 = df_2024.dropna(subset=["PitStopLapTimeRatio", "PitStopCount", "AvgPitDuration",
                                       "TotalPitDuration", "AvgTyreLife", "AvgLapTime",
                                       "AvgPrevLapTime", "AvgNextLapTime"])

    features = ["PitStopCount", "AvgPitDuration", "TotalPitDuration", "AvgTyreLife",
                "AvgLapTime", "AvgPrevLapTime", "AvgNextLapTime"]
    target = "PitStopLapTimeRatio"
    X_2024 = df_2024[features]
    y_2024 = df_2024[target]

    # Load the trained model
    model_path = "/content/drive/MyDrive/ML_Project/pitstop_model_xgb.pkl"
    model = joblib.load(model_path)

    # Generate predictions
    y_pred = model.predict(X_2024)

    # Clip predictions to a maximum threshold (e.g., 6.5) to eliminate extreme outliers
    clip_threshold = 6.5
    y_pred_clipped = np.clip(y_pred, None, clip_threshold)
    df_2024["Predicted_PitStopLapTimeRatio"] = y_pred_clipped

    # Compute residuals using clipped predictions
    df_2024["Residual"] = df_2024["PitStopLapTimeRatio"] - df_2024["Predicted_PitStopLapTimeRatio"]

    # Evaluate model performance using RMSE and MAE on clipped predictions
    rmse_2024 = np.sqrt(mean_squared_error(y_2024, df_2024["Predicted_PitStopLapTimeRatio"]))
    mae_2024 = mean_absolute_error(y_2024, df_2024["Predicted_PitStopLapTimeRatio"])
    print("2024 RMSE (clipped):", rmse_2024)
    print("2024 MAE (clipped):", mae_2024)

    # Generate strategy recommendations based on a threshold for predicted PitStopLapTimeRatio
    # Here, if the clipped predicted ratio > threshold, recommend improvement.
    threshold = 1.0  # Adjust threshold based on domain insights
    df_2024["Recommended_Strategy"] = df_2024["Predicted_PitStopLapTimeRatio"].apply(
        lambda x: "Improve Pit Efficiency" if x > threshold else "Maintain Strategy"
    )

    # Create a detailed report grouped by Event, Team, and Driver
    report = df_2024.groupby(["Event", "Team", "Driver"]).agg({
        "PitStopCount": "mean",
        "AvgPitDuration": "mean",
        "TotalPitDuration": "mean",
        "AvgTyreLife": "mean",
        "AvgLapTime": "mean",
        "AvgPrevLapTime": "mean",
        "AvgNextLapTime": "mean",
        "PitStopLapTimeRatio": "mean",
        "Predicted_PitStopLapTimeRatio": "mean",
        "Recommended_Strategy": lambda x: x.mode()[0] if len(x.mode()) > 0 else "N/A"
    }).reset_index()

    # Rename aggregated columns for clarity
    report = report.rename(columns={
        "PitStopLapTimeRatio": "Avg_Actual_PitStopLapTimeRatio",
        "Predicted_PitStopLapTimeRatio": "Avg_Predicted_PitStopLapTimeRatio"
    })
    report["Difference"] = report["Avg_Actual_PitStopLapTimeRatio"] - report["Avg_Predicted_PitStopLapTimeRatio"]

    # Filter report to include only cases where recommended strategy is "Improve Pit Efficiency"
    optimization_report = report[report["Recommended_Strategy"] == "Improve Pit Efficiency"]

    # Save the detailed report as CSV
    report_path = "/content/drive/MyDrive/ML_Project/PitStop_Strategy_Report_2024.csv"
    optimization_report.to_csv(report_path, index=False)
    print("Detailed pit stop strategy report (optimization needed) saved to", report_path)

    # Visualize model performance: Scatter Plot of Actual vs Predicted
    plt.figure(figsize=(8,6))
    sns.scatterplot(x=df_2024["PitStopLapTimeRatio"], y=df_2024["Predicted_PitStopLapTimeRatio"])
    plt.xlabel("Actual PitStopLapTimeRatio")
    plt.ylabel("Predicted PitStopLapTimeRatio")
    plt.title("2024: Actual vs Predicted PitStopLapTimeRatio (Clipped)")
    plt.plot([df_2024["PitStopLapTimeRatio"].min(), df_2024["PitStopLapTimeRatio"].max()],
             [df_2024["PitStopLapTimeRatio"].min(), df_2024["PitStopLapTimeRatio"].max()], 'r--')
    plt.show()

    # Visualize residual distribution
    plt.figure(figsize=(8,6))
    sns.histplot(df_2024["Residual"], bins=30, kde=True)
    plt.xlabel("Residuals")
    plt.title("2024 Residual Distribution (Clipped)")
    plt.show()

    # Visualize strategy recommendations by Event
    plt.figure(figsize=(10,6))
    sns.countplot(x="Event", hue="Recommended_Strategy", data=df_2024)
    plt.xticks(rotation=45)
    plt.title("Strategy Recommendations by Event (Clipped Predictions)")
    plt.show()

    print("Detailed Report (Optimization Needed) - First 10 Rows:")
    display(optimization_report.head(10))

    print("\nInterpretation:")
    print("For each event, team, and driver, the report shows the average actual vs. predicted PitStopLapTimeRatio.")
    print("We applied clipping to the predictions to remove extreme outliers (values above 2.0).")
    print("A high predicted ratio (above the threshold of 1.0) suggests that the team should focus on improving pit stop efficiency")
    print("by speeding up tire changes, optimizing pit lane speed, and enhancing pit crew coordination.")


Output()

#Interpretation:
####The report shows, for each event, team, and driver, the average actual and predicted PitStopLapTimeRatio.
####A high predicted ratio suggests that a team should focus on speeding up tire changes, optimizing pit lane speed,
####and enhancing pit crew coordination to reduce time loss. The report lists exactly which events require optimization.