In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor

def complete_feature_mask(frame: pd.DataFrame, features: list) -> pd.Series:
    """Return a boolean mask of the rows of *frame* with no NaN in *features*.

    Args:
        frame: DataFrame containing every column named in *features*.
        features: Column labels that must all be non-null for a row to count.

    Returns:
        Boolean Series aligned to ``frame.index``; ``True`` where every listed
        column is populated.
    """
    return frame[list(features)].notna().all(axis=1)


# Jupyter kernels and ``python <file>`` both run with __name__ == "__main__",
# so the pipeline below still executes there and still leaves df, known,
# missing, model and final_df defined at module level. The guard only keeps a
# plain ``import`` of this code free of file I/O and model training.
if __name__ == "__main__":
    target = "first_pit_lap"

    # Load datasets
    df = pd.read_csv("cleanedData/master_planA.csv")
    track = pd.read_csv("cleanedData/track_features.csv")

    # Merge track features into master data. The left join keeps every master
    # row; circuits absent from the track file get NaN track columns.
    df = df.merge(track, on="circuitId", how="left")

    # -------------------------------
    # 1. Split known vs missing pits
    # -------------------------------
    known = df[df[target].notna()].copy()
    missing = df[df[target].isna()].copy()

    print("Known pit rows:", len(known))
    print("Missing pit rows:", len(missing))

    # -------------------------------
    # 2. Select features for imputation
    # -------------------------------
    impute_features = [
        "year", "circuitId", "grid", "constructorId", "driverId",
        "round", "laps",
        # The remaining columns are presumably supplied by the track-features
        # merge above -- confirm against track_features.csv.
        "tyre_deg_level",
        "pit_lane_loss",
        "track_length_km",
        "race_distance_km",
    ]

    # Rows with an incomplete feature vector cannot be used by the model, but
    # they must NOT be removed from `known` / `missing`: those frames are what
    # gets written out, and dropping rows here would silently shrink the
    # dataset (including rows whose pit lap is already known).
    trainable = complete_feature_mask(known, impute_features)
    predictable = complete_feature_mask(missing, impute_features)

    # Prepare training data
    X_train = known.loc[trainable, impute_features]
    y_train = known.loc[trainable, target]

    # -------------------------------
    # 3. Train model
    # -------------------------------
    model = RandomForestRegressor(
        n_estimators=300,
        random_state=42,  # fixed seed so repeated runs give the same imputations
    )
    model.fit(X_train, y_train)

    # -------------------------------
    # 4. Predict missing pit laps
    # -------------------------------
    X_missing = missing.loc[predictable, impute_features]

    # predict() rejects a zero-row input, so skip it when nothing is imputable.
    if len(X_missing) > 0:
        missing.loc[predictable, target] = model.predict(X_missing)

    # Only report when something was actually left out, so a clean dataset
    # produces the same console output as before.
    n_untrained = int((~trainable).sum())
    n_unimputed = int((~predictable).sum())
    if n_untrained:
        print("Known rows excluded from training (incomplete features):",
              n_untrained)
    if n_unimputed:
        print("Missing rows left un-imputed (incomplete features):",
              n_unimputed)

    # -------------------------------
    # 5. Combine known + imputed
    # -------------------------------
    # Known rows first, then the (partly) imputed ones, with a fresh index.
    final_df = pd.concat([known, missing], ignore_index=True)

    # -------------------------------
    # 6. Save final full dataset
    # -------------------------------
    final_df.to_csv("cleanedData/new_master_planA_imputed.csv", index=False)

    print("Imputation done → cleanedData/new_master_planA_imputed.csv")

Known pit rows: 4079
Missing pit rows: 4012
Imputation done → cleanedData/new_master_planA_imputed.csv
