In [3]:
import os
import numpy as np
import pandas as pd
import fastf1
from tqdm import tqdm


# ========================
# CONFIG
# ========================
# FastF1 raises NotADirectoryError when asked to cache into a directory that
# does not exist yet, so create it before enabling the cache.
CACHE_DIR = "cache"
os.makedirs(CACHE_DIR, exist_ok=True)
fastf1.Cache.enable_cache(CACHE_DIR)

# Seasons whose race is downloaded, in processing order.
SEASONS = [2020, 2021, 2022, 2023]
# Event name handed to fastf1.get_session for every season.
GP_NAME = "Bahrain Grand Prix"
# NOTE(review): not read anywhere in the visible code; main() requests "R"
# directly. Kept so existing references elsewhere keep working.
SESSION_TYPES = ["R"]

# Destination of the combined per-lap CSV written by main().
OUTPUT_PATH = "data/bahrain_2020_2023_race_laps.csv"


# ========================
# UTILITIES
# ========================

def safe_timedelta_to_sec(val):
    """Convert a timedelta-like value to seconds as a float.

    Args:
        val: Object exposing ``total_seconds()``, or a missing value
            (anything ``pd.isna`` reports as missing, e.g. ``None``/``NaT``).

    Returns:
        The duration in seconds, or ``np.nan`` when ``val`` is missing.
    """
    missing = pd.isna(val)
    return np.nan if missing else float(val.total_seconds())


def extract_telemetry_features(lap):
    """Summarise a lap's car telemetry as a flat dict of scalar features.

    Args:
        lap: Object exposing ``get_car_data()``, which is expected to return
            a DataFrame-like table of telemetry samples.

    Returns:
        dict with the keys ``speed_mean``, ``speed_max``, ``throttle_mean``,
        ``brake_mean``, ``rpm_mean``, ``rpm_max`` and ``drs_activations``.
        A value stays NaN when telemetry cannot be fetched, is empty, or
        lacks the corresponding column.
    """
    features = {
        "speed_mean": np.nan,
        "speed_max": np.nan,
        "throttle_mean": np.nan,
        "brake_mean": np.nan,
        "rpm_mean": np.nan,
        "rpm_max": np.nan,
        "drs_activations": np.nan,
    }

    try:
        # No feature below reads a distance channel, so the raw car data is
        # used directly instead of paying for an extra integration step.
        tel = lap.get_car_data()
    except Exception:
        # Best effort: a lap without usable telemetry keeps NaN features.
        # Catching Exception (not a bare except) lets KeyboardInterrupt and
        # SystemExit still stop a long-running extraction.
        return features

    if tel is None or tel.empty:
        return features

    columns = tel.columns

    if "Speed" in columns:
        speed = tel["Speed"]
        features["speed_mean"] = float(speed.mean())
        features["speed_max"] = float(speed.max())

    if "Throttle" in columns:
        features["throttle_mean"] = float(tel["Throttle"].mean())

    if "Brake" in columns:
        features["brake_mean"] = float(tel["Brake"].mean())

    if "RPM" in columns:
        rpm = tel["RPM"]
        features["rpm_mean"] = float(rpm.mean())
        features["rpm_max"] = float(rpm.max())

    if "DRS" in columns:
        # NOTE(review): this is the number of telemetry samples with a
        # non-zero DRS value, not the number of distinct activation events;
        # confirm which DRS codes should count before relying on the name.
        features["drs_activations"] = int((tel["DRS"] > 0).sum())

    return features


def extract_weather_for_lap(lap, weather_df):
    """Return the weather sample closest in time to the middle of a lap.

    Args:
        lap: Mapping-like lap record exposing ``.get``; its ``LapStartTime``
            and ``LapTime`` entries are read.
        weather_df: Table of weather samples with a ``Time`` column that is
            comparable with ``LapStartTime``, or ``None``.

    Returns:
        dict with the keys ``air_temp``, ``track_temp``, ``humidity``,
        ``wind_speed``, ``wind_dir`` and ``pressure`` as floats. All values
        are NaN when there is no weather data, the lap timing is missing, or
        the lookup fails; a single value is NaN when only its column is
        absent.
    """
    # Output key -> weather column it is read from.
    columns = {
        "air_temp": "AirTemp",
        "track_temp": "TrackTemp",
        "humidity": "Humidity",
        "wind_speed": "WindSpeed",
        "wind_dir": "WindDirection",
        "pressure": "Pressure",
    }
    default = dict.fromkeys(columns, np.nan)

    if weather_df is None or weather_df.empty:
        return default

    lap_start = lap.get("LapStartTime")
    lap_time = lap.get("LapTime")

    if pd.isna(lap_start) or pd.isna(lap_time):
        return default

    try:
        lap_mid = lap_start + (lap_time / 2)
        # Pick the sample whose timestamp is nearest to the lap midpoint.
        nearest = (weather_df["Time"] - lap_mid).abs().idxmin()
        sample = weather_df.loc[nearest]
        return {
            key: float(sample.get(column, np.nan))
            for key, column in columns.items()
        }
    except Exception:
        # Best effort: an unusable weather table must not cost us the lap.
        # Catching Exception (not a bare except) keeps Ctrl-C working.
        return default


def extract_lap_row(lap, session, season):
    """Assemble one flat output row for a single lap.

    Args:
        lap: Mapping-like lap record exposing ``.get``; also passed on to the
            telemetry and weather helpers.
        session: Session object; its ``weather_data``, ``event``, ``name``
            and ``session_info`` attributes are read.
        season: Value stored in the row's ``season`` field.

    Returns:
        dict holding session identity, driver/team, lap timing (in seconds),
        pit flag and tyre fields, followed by the telemetry features and then
        the weather features.
    """
    telemetry = extract_telemetry_features(lap)
    weather = extract_weather_for_lap(lap, session.weather_data)

    pit_in = lap.get("PitInTime")
    pit_out = lap.get("PitOutTime")
    # A lap counts as a pit lap when either pit timestamp is present.
    touched_pit_lane = not (pd.isna(pit_in) and pd.isna(pit_out))

    row = {
        "season": season,
        "gp_name": session.event.get("EventName", "na"),
        "session_name": session.name,
        "session_type": session.session_info.get("Type", "NA"),
    }
    row.update({
        "driver": lap.get("Driver", "na"),
        "team": lap.get("Team", "na"),
    })
    row.update({
        "lap_number": lap.get("LapNumber", np.nan),
        "lap_time": safe_timedelta_to_sec(lap.get("LapTime")),
        "sector_1_time": safe_timedelta_to_sec(lap.get("Sector1Time")),
        "sector_2_time": safe_timedelta_to_sec(lap.get("Sector2Time")),
        "sector_3_time": safe_timedelta_to_sec(lap.get("Sector3Time")),
        "position": lap.get("Position", np.nan),
        "track_status": lap.get("TrackStatus", np.nan),
        "is_pit_lap": touched_pit_lane,
    })
    row.update({
        "compound": lap.get("Compound", "na"),
        "stint": lap.get("Stint", np.nan),
        "tyre_life": lap.get("TyreLife", np.nan),
        "fresh_tyre": lap.get("FreshTyre", np.nan),
    })
    # Telemetry first, weather last: on a key clash the later update wins.
    row.update(telemetry)
    row.update(weather)
    return row


# ========================
# MAIN
# ========================

def main():
    """Download every configured season's race and write one CSV of laps.

    For each season in ``SEASONS`` the race session of ``GP_NAME`` is loaded,
    laps without a ``LapTime`` are dropped, and every remaining lap is turned
    into a row via ``extract_lap_row``. Seasons that fail to load and laps
    that fail to extract are reported and skipped. The combined rows are
    written to ``OUTPUT_PATH``; nothing is written when no row was collected.
    """
    rows = []

    for season in SEASONS:
        print(f"\n📂 Loading {season} – {GP_NAME} – Race")

        try:
            session = fastf1.get_session(season, GP_NAME, "R")
            session.load()
            # Read inside the try: a session whose lap data did not load
            # should skip this season instead of aborting the whole run.
            laps = session.laps
        except Exception as e:
            print(f"❌ Cannot load session: {e}")
            continue

        # Keep only laps that have a recorded lap time.
        laps = laps[laps["LapTime"].notna()]

        progress = tqdm(
            laps.iterrows(),
            total=len(laps),
            desc=f"   🏎 {season} Bahrain",
        )
        for _, lap in progress:
            try:
                rows.append(extract_lap_row(lap, session, season))
            except Exception as e:
                print(f"⚠️ Skipped lap: {e}")

    if not rows:
        print("⚠️ No data collected.")
        return

    df = pd.DataFrame(rows)

    # dirname is "" for a bare filename, and os.makedirs("") raises.
    output_dir = os.path.dirname(OUTPUT_PATH)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    df.to_csv(OUTPUT_PATH, index=False)

    print(f"\n✅ Saved {len(df)} laps → {OUTPUT_PATH}")
    print(f"   Columns: {df.columns.tolist()}")


if __name__ == "__main__":
    main()


core           INFO 	Loading data for Bahrain Grand Prix - Race [v3.5.3]
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info
req            INFO 	Using cached data for session_status_data



📂 Loading 2020 – Bahrain Grand Prix – Race


req            INFO 	Using cached data for lap_count
req            INFO 	Using cached data for track_status_data
req            INFO 	Using cached data for _extended_timing_data
req            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...
req            INFO 	Using cached data for car_data
req            INFO 	Using cached data for position_data
req            INFO 	Using cached data for weather_data
req            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 20 drivers: ['44', '33', '23', '4', '55', '10', '3', '77', '31', '16', '26', '63', '5', '6', '7', '99', '20', '11', '18', '8']
   🏎 2020 Bahrain: 100%|██████████| 958/958 [00:12<00:00, 77.78it/s]
core           INFO 	Loading data for Bahrain Grand Prix - Race [v3.5.3]
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info
req            INFO 	Using cached 


📂 Loading 2021 – Bahrain Grand Prix – Race


req            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...
req            INFO 	Using cached data for car_data
req            INFO 	Using cached data for position_data
req            INFO 	Using cached data for weather_data
req            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 20 drivers: ['44', '33', '77', '4', '11', '16', '3', '55', '22', '18', '7', '99', '31', '63', '5', '47', '10', '6', '14', '9']
   🏎 2021 Bahrain: 100%|██████████| 1018/1018 [00:09<00:00, 104.92it/s]
core           INFO 	Loading data for Bahrain Grand Prix - Race [v3.5.3]
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info
req            INFO 	Using cached data for session_status_data
req            INFO 	Using cached data for lap_count
req            INFO 	Using cached data for track_status_data
req            INFO 	Using cached 


📂 Loading 2022 – Bahrain Grand Prix – Race


req            INFO 	Using cached data for car_data
req            INFO 	Using cached data for position_data
req            INFO 	Using cached data for weather_data
req            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 20 drivers: ['16', '55', '44', '63', '20', '77', '31', '22', '14', '24', '47', '18', '23', '3', '4', '6', '27', '11', '1', '10']
   🏎 2022 Bahrain: 100%|██████████| 1118/1118 [00:09<00:00, 121.38it/s]
core           INFO 	Loading data for Bahrain Grand Prix - Race [v3.5.3]
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info



📂 Loading 2023 – Bahrain Grand Prix – Race


req            INFO 	Using cached data for session_status_data
req            INFO 	Using cached data for lap_count
req            INFO 	Using cached data for track_status_data
req            INFO 	Using cached data for _extended_timing_data
req            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...
req            INFO 	Using cached data for car_data
req            INFO 	Using cached data for position_data
req            INFO 	Using cached data for weather_data
req            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 20 drivers: ['1', '11', '14', '55', '44', '18', '63', '77', '10', '23', '22', '2', '20', '21', '27', '24', '4', '31', '16', '81']
   🏎 2023 Bahrain: 100%|██████████| 1055/1055 [00:10<00:00, 101.47it/s]



✅ Saved 4149 laps → data/bahrain_2020_2023_race_laps.csv
   Columns: ['season', 'gp_name', 'session_name', 'session_type', 'driver', 'team', 'lap_number', 'lap_time', 'sector_1_time', 'sector_2_time', 'sector_3_time', 'position', 'track_status', 'is_pit_lap', 'compound', 'stint', 'tyre_life', 'fresh_tyre', 'speed_mean', 'speed_max', 'throttle_mean', 'brake_mean', 'rpm_mean', 'rpm_max', 'drs_activations', 'air_temp', 'track_temp', 'humidity', 'wind_speed', 'wind_dir', 'pressure']
