In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
import numpy as np

# ------------------------------------------------------------------
# Hold-out check of the first-pit-lap imputation model.
# Fit on rows whose first pit lap is observed, score on a 25% split.
# ------------------------------------------------------------------

# Master table as it stands before any imputation has been applied.
df = pd.read_csv("cleanedData/master_planA.csv")

# Only rows with an observed target can be used to train or score the model.
has_pit_lap = df["first_pit_lap"].notna()
known = df.loc[has_pit_lap].copy()
print("Total known pit rows:", len(known))

# Predictor columns handed to the model.
features = [
    "year",
    "circuitId",
    "grid",
    "constructorId",
    "driverId",
    "round",
    "laps",
]
X = known[features]
y = known["first_pit_lap"]

# Hold out a quarter of the known rows; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)
print("Train size:", len(X_train))
print("Test size:", len(X_test))

# fit() returns the estimator itself, so construction and training can be chained.
model = RandomForestRegressor(n_estimators=200, random_state=42).fit(X_train, y_train)

# Score the held-out rows.
y_pred = model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)  # RMSE is the square root of the mean squared error

print("\n========== IMPUTATION MODEL PERFORMANCE ==========")
print("MAE (mean absolute error):", round(mae, 3))
print("RMSE:", round(rmse, 3))

# Side-by-side view of the first ten held-out targets and their predictions.
print("\nSample Predictions (Actual vs Predicted):")
sample = pd.DataFrame({"actual": y_test.values[:10], "predicted": y_pred[:10]})
print(sample)


Total known pit rows: 4079
Train size: 3059
Test size: 1020

MAE (mean absolute error): 6.046
RMSE: 8.478

Sample Predictions (Actual vs Predicted):
   actual  predicted
0    20.0     16.240
1    18.0     19.005
2    13.0     13.080
3    25.0     17.630
4    30.0     28.345
5    20.0     20.225
6    11.0     19.560
7    17.0     24.295
8    20.0     10.335
9    17.0     12.495


In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
import numpy as np

# --------------------------------------
# LOAD MASTER BEFORE IMPUTATION
# --------------------------------------
df = pd.read_csv("cleanedData/master_planA.csv")
track = pd.read_csv("cleanedData/track_features.csv")

# Attach per-circuit track features to every master row.
# validate="many_to_one" makes pandas raise a MergeError if circuitId is not
# unique in `track`; without it a duplicated circuit would silently duplicate
# master rows, inflating the evaluation set and placing identical rows on both
# sides of the train/test split.
df = df.merge(track, on="circuitId", how="left", validate="many_to_one")

# Keep only rows where a real first pit lap was recorded: these are the only
# rows that can be used to train and score the model.
known = df[df["first_pit_lap"].notna()].copy()

print("Total known pit rows:", len(known))

# --------------------------------------
# FEATURES TO TEST THE IMPUTATION MODEL
# --------------------------------------
# The first seven columns match the earlier evaluation cell; the last four are
# the additions under test (presumably supplied by track_features.csv --
# TODO confirm against that file's header).
features = [
    "year", "circuitId", "grid", "constructorId", "driverId",
    "round", "laps",
    "tyre_deg_level",      # NEW
    "pit_lane_loss",       # NEW
    "track_length_km",     # NEW
    "race_distance_km"     # NEW
]

# Exclude rows that lack a value for any predictor (e.g. a circuit with no
# match in the track table after the left merge).
known = known.dropna(subset=features)

X = known[features]
y = known["first_pit_lap"]

# --------------------------------------
# TRAIN/TEST SPLIT
# --------------------------------------
# 25% hold-out; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

print("Train size:", len(X_train))
print("Test size:", len(X_test))

# --------------------------------------
# TRAIN IMPUTATION MODEL
# --------------------------------------
model = RandomForestRegressor(
    n_estimators=300,
    random_state=42
)
model.fit(X_train, y_train)

# --------------------------------------
# PREDICT ON TEST SET
# --------------------------------------
y_pred = model.predict(X_test)

# --------------------------------------
# METRICS
# --------------------------------------
mae = mean_absolute_error(y_test, y_pred)
# RMSE is the square root of the mean squared error.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

print("\n========== IMPUTER MODEL PERFORMANCE ==========")
print("MAE:", round(mae, 3))
print("RMSE:", round(rmse, 3))

# --------------------------------------
# SHOW SAMPLE PREDICTIONS
# --------------------------------------
# First ten held-out targets next to the model's predictions for them.
print("\nSample Predictions (Actual vs Predicted):")
sample = pd.DataFrame({
    "actual": y_test.values[:10],
    "predicted": y_pred[:10]
})
print(sample)


Total known pit rows: 4079
Train size: 3059
Test size: 1020

MAE: 5.984
RMSE: 8.469

Sample Predictions (Actual vs Predicted):
   actual  predicted
0    20.0  16.336667
1    18.0  19.230000
2    13.0  12.750000
3    25.0  17.566667
4    30.0  28.640000
5    20.0  22.706667
6    11.0  18.796667
7    17.0  23.903333
8    20.0  10.640000
9    17.0  12.516667
