## Data Cleaning

In [3]:
import pandas as pd
import os

# Data-cleaning cell: restrict the raw tables to seasons from 2000 onwards,
# attach per-driver pit-stop summaries to the race results, drop rows that
# fall outside the analysis scope, and write the cleaned tables to
# cleanedData/.

# Output directory for the cleaned CSVs (no error if it already exists).
os.makedirs("cleanedData", exist_ok=True)

# ---------- Load the raw tables ----------
races = pd.read_csv("/workspaces/group-project-team-omega/data/races.csv")
results = pd.read_csv("/workspaces/group-project-team-omega/data/results.csv")
pit_stops = pd.read_csv("/workspaces/group-project-team-omega/data/pit_stops.csv")
drivers = pd.read_csv("/workspaces/group-project-team-omega/data/drivers.csv")
constructors = pd.read_csv("/workspaces/group-project-team-omega/data/constructors.csv")

# ---------- STEP 1: races held in 2000 or later ----------
from_2000_onwards = races["year"] >= 2000
races_clean = races.loc[from_2000_onwards].copy()
valid_raceIds = races_clean["raceId"].unique()

# ---------- STEPS 2-3: keep only results / pit stops of those races ----------
results_clean = results.loc[results["raceId"].isin(valid_raceIds)].copy()
pit_stops_clean = pit_stops.loc[pit_stops["raceId"].isin(valid_raceIds)].copy()

# ---------- STEPS 4-5: per (race, driver) pit-stop summaries ----------
# Both summaries are built from the same grouped "lap" column:
#   first_pit_lap  -> smallest lap number with a recorded stop
#   pit_stop_count -> number of recorded stops
laps_by_race_driver = pit_stops_clean.groupby(["raceId", "driverId"])["lap"]
first_pit = laps_by_race_driver.min().rename("first_pit_lap").reset_index()
pit_count = laps_by_race_driver.count().rename("pit_stop_count").reset_index()

# ---------- STEP 6: attach the summaries to the results ----------
# Left joins keep every result row; drivers without any pit-stop row end up
# with NaN in both new columns.
results_clean = (
    results_clean
    .merge(first_pit, on=["raceId", "driverId"], how="left")
    .merge(pit_count, on=["raceId", "driverId"], how="left")
)

# A missing count is treated as "no pit stops"; first_pit_lap stays NaN.
# NOTE(review): this makes "no pit-stop rows exist for this race" look the
# same as "the driver never stopped" - confirm pit_stops.csv covers every
# race from 2000 onwards.
results_clean["pit_stop_count"] = results_clean["pit_stop_count"].fillna(0)

# ---------- STEP 7: drop entries with more than two pit stops ----------
at_most_two_stops = results_clean["pit_stop_count"].le(2)
results_clean = results_clean.loc[at_most_two_stops]

# ---------- STEP 8: drop early retirements that never pitted ----------
# "Early" means fewer than 3 laps completed with zero recorded stops.
early_dnf = results_clean["pit_stop_count"].eq(0) & results_clean["laps"].lt(3)
results_clean = results_clean.loc[~early_dnf]

# ---------- SAVE CLEANED FILES ----------
# drivers and constructors are written out unchanged (no filtering applied).
for frame, target in (
    (results_clean, "cleanedData/clean_results.csv"),
    (races_clean, "cleanedData/clean_races.csv"),
    (drivers, "cleanedData/clean_drivers.csv"),
    (constructors, "cleanedData/clean_constructors.csv"),
):
    frame.to_csv(target, index=False)

print("Data cleaning completed → saved in cleanedData/")


Data cleaning completed → saved in cleanedData/
