# Setup

In [8]:
import fastf1

from f1_etl import DriverLabelEncoder, DataConfig, SessionConfig, create_safety_car_dataset
from f1_etl import DataAggregator, RawDataExtractor
from f1_etl.config import create_season_configs

# Qatar Grand Prix (2024)

- Nico Hülkenberg (HUL, #27) – Involved in a Turn 1 collision on Lap 1 (with Ocon and Colapinto), which brought out a Safety Car. Later in the race, Hülkenberg spun into the gravel on Lap 40, contributing to a VSC that transitioned into a full Safety Car.

- Esteban Ocon (OCO, #31) – Caught in the Lap 1 collision (with Hülkenberg and Colapinto) that triggered the opening-lap Safety Car. Ocon was eliminated from the race in this incident.
- Franco Colapinto (COL, #43) – Also involved in the Lap 1 multi-car collision with Hülkenberg and Ocon, which resulted in an immediate Safety Car. Colapinto was eliminated from the race due to this crash.

- Alex Albon (ALB, #23) – Lost a wing mirror on the main straight during Lap 35. The debris was left on track and subsequently hit by another car, leading to a full Safety Car as marshals cleared the shattered mirror. (Albon’s incident led directly to an SC deployment, not just a VSC.)

- Valtteri Bottas (BOT, #77) – Ran over Albon’s detached mirror on Lap 35, which scattered carbon-fibre debris across the track. This worsening of the hazard (and resulting punctures on other cars) prompted the Safety Car deployment to allow cleanup of the debris.

- Sergio Pérez (PER, #11) – Spun off on Lap 40 of his own accord and became stranded on track just as the previous Safety Car period was ending. This, combined with Hülkenberg’s simultaneous spin, initially brought out a Virtual Safety Car which then escalated to a full Safety Car. (Pérez’s incident was a VSC → SC trigger.)

In [2]:
# Fetch the 2024 Qatar Grand Prix race ("R") session and load its data
# (timing, car, position, weather and race-control data per the log below).
session_qa = fastf1.get_session(year=2024, gp="Qatar Grand Prix", identifier="R")
session_qa.load()

core           INFO 	Loading data for Qatar Grand Prix - Race [v3.5.3]
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info
req            INFO 	Using cached data for session_status_data
req            INFO 	Using cached data for lap_count
req            INFO 	Using cached data for track_status_data
req            INFO 	Using cached data for _extended_timing_data
req            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...
req            INFO 	Using cached data for car_data
req            INFO 	Using cached data for position_data
req            INFO 	Using cached data for weather_data
req            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 20 drivers: ['1', '16', '81', '63', '10', '55', '14', '24', '20', '4', '77', '44', '22', '30', '23', '27', '11', '18', '43', '31']


In [4]:
# Fit a driver label encoder on the loaded Qatar race session.
driver_enc_qa = DriverLabelEncoder()
driver_enc_qa.fit_session(session_qa)

# Display the fitted mapping from driver abbreviation to car-number string;
# these number strings are what the `drivers=[...]` lists in the configs use.
driver_enc_qa.driver_to_number

{'VER': '1',
 'LEC': '16',
 'PIA': '81',
 'RUS': '63',
 'GAS': '10',
 'SAI': '55',
 'ALO': '14',
 'ZHO': '24',
 'MAG': '20',
 'NOR': '4',
 'BOT': '77',
 'HAM': '44',
 'TSU': '22',
 'LAW': '30',
 'ALB': '23',
 'HUL': '27',
 'PER': '11',
 'STR': '18',
 'COL': '43',
 'OCO': '31'}

In [None]:
# Qatar GP race only, restricted to the six drivers described in the
# incident notes above. "43" (Colapinto) was missing from the original list
# even though he was part of the Lap 1 collision and appears in every other
# Qatar driver list in this notebook.
data_config = DataConfig(
    sessions=[
        SessionConfig(2024, "Qatar Grand Prix", "R")
    ],
    # HUL, OCO, COL, ALB, BOT, PER
    drivers=["27", "31", "43", "23", "77", "11"]
)

# TrackStatus Encoder

In [9]:
from f1_etl import FixedVocabTrackStatusEncoder

# Instantiate the fixed-vocabulary track-status encoder with default settings.
# Note: the cells below build their own `label_encoder` instance rather than
# reusing this one.
track_status_enc = FixedVocabTrackStatusEncoder()

In [10]:
# Manual walk-through of the ETL steps: extract -> aggregate -> encode labels.
config = data_config

# Resolve the driver filter for each session. The result is overwritten on
# every iteration, so only the last session's value remains in
# `effective_drivers` after the loop; nothing below reads it.
for session in config.sessions:
    effective_drivers = config.get_effective_drivers(session)

# Step 1: Extract raw data (one entry per configured session)
extractor = RawDataExtractor(config.cache_dir)
sessions_data = [
    extractor.extract_session(session_config) for session_config in config.sessions
]

# Step 2: Aggregate data with per-session driver filtering
aggregator = DataAggregator()
telemetry_data = aggregator.aggregate_telemetry_data(
    sessions_data, config, config.sessions
)

# Validate the aggregated frame BEFORE touching the label column. Previously
# the column check ran after `telemetry_data["TrackStatus"]` had already been
# read, so a missing column surfaced as a KeyError instead of this ValueError.
if telemetry_data.empty:
    raise ValueError("No telemetry data extracted")

if "TrackStatus" not in telemetry_data.columns:
    raise ValueError("TrackStatus column not found in telemetry data")

# Step 3: Setup fixed vocabulary encoder for track status
# (use_onehot=False -> integer labels, as reported in the output below)
label_encoder = FixedVocabTrackStatusEncoder(use_onehot=False)

# Analyze distributions before encoding (optional but useful)
label_encoder.analyze_data(telemetry_data["TrackStatus"], "training_data")

# Fit and transform
encoded_labels = label_encoder.fit_transform(telemetry_data["TrackStatus"])

core           INFO 	Loading data for Qatar Grand Prix - Race [v3.5.3]
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info
req            INFO 	Using cached data for session_status_data
req            INFO 	Using cached data for lap_count
req            INFO 	Using cached data for track_status_data
req            INFO 	Using cached data for _extended_timing_data
req            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...


Loading session: 2024 Qatar Grand Prix R


req            INFO 	Using cached data for car_data
req            INFO 	Using cached data for position_data
req            INFO 	Using cached data for weather_data
req            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 20 drivers: ['1', '16', '81', '63', '10', '55', '14', '24', '20', '4', '77', '44', '22', '30', '23', '27', '11', '18', '43', '31']
core           INFO 	Loading data for Chinese Grand Prix - Race [v3.5.3]
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info
req            INFO 	Using cached data for session_status_data
req            INFO 	Using cached data for lap_count
req            INFO 	Using cached data for track_status_data
req            INFO 	Using cached data for _extended_timing_data
req            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...


Loading session: 2024 Chinese Grand Prix R


req            INFO 	Using cached data for car_data
req            INFO 	Using cached data for position_data
req            INFO 	Using cached data for weather_data
req            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 20 drivers: ['1', '4', '11', '16', '55', '63', '14', '81', '44', '27', '31', '23', '10', '24', '18', '20', '2', '3', '22', '77']
core           INFO 	Loading data for Mexico City Grand Prix - Race [v3.5.3]
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info
req            INFO 	Using cached data for session_status_data
req            INFO 	Using cached data for lap_count
req            INFO 	Using cached data for track_status_data
req            INFO 	Using cached data for _extended_timing_data
req            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...


Loading session: 2024 Mexico City Grand Prix R


req            INFO 	Using cached data for car_data
req            INFO 	Using cached data for position_data
req            INFO 	Using cached data for weather_data
req            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 20 drivers: ['55', '4', '16', '44', '63', '1', '20', '81', '27', '10', '18', '43', '31', '77', '24', '30', '11', '14', '23', '22']



📊 Track Status Analysis (training_data):
   green       : 1425374 samples ( 82.2%)
   safety_car  : 146082 samples (  8.4%)
   vsc         :  2337 samples (  0.1%)
   yellow      : 159368 samples (  9.2%)
   Missing classes: [np.str_('red'), np.str_('unknown'), np.str_('vsc_ending')]
✅ FixedVocabTrackStatusEncoder fitted
   Classes seen: ['green', 'safety_car', 'vsc', 'yellow']
   Total classes: 7
   Output mode: integer labels


In [13]:
# Display the encoder's full class-name -> integer-index vocabulary.
label_encoder.class_to_idx

{np.str_('green'): 0,
 np.str_('red'): 1,
 np.str_('safety_car'): 2,
 np.str_('unknown'): 3,
 np.str_('vsc'): 4,
 np.str_('vsc_ending'): 5,
 np.str_('yellow'): 6}

In [15]:
# Look up the encoded integer index assigned to the 'safety_car' class.
label_encoder.class_to_idx['safety_car']

2

# All Grand Prix (2024)

In [None]:
# All 2024 races with safety-car / VSC activity, filtered by ONE global
# driver list. f1_etl logs this field as "Global drivers", so it is
# presumably applied to every session alike (TODO confirm) -- it does NOT
# filter per race. Use per-session `drivers=` on SessionConfig (Method 1
# below) when each race needs its own driver set.
data_config = DataConfig(
    sessions=[
        SessionConfig(2024, "Qatar Grand Prix", "R"),
        SessionConfig(2024, "Chinese Grand Prix", "R"),
        SessionConfig(2024, "Canadian Grand Prix", "R"),
        SessionConfig(2024, "Mexico City Grand Prix", "R"),
        SessionConfig(2024, "São Paulo Grand Prix", "R"),
        SessionConfig(2024, "Miami Grand Prix", "R"),
        SessionConfig(2024, "Saudi Arabian Grand Prix", "R"),
        SessionConfig(2024, "United States Grand Prix", "R"),
        SessionConfig(2024, "Monaco Grand Prix", "R"),
    ],
    # Union of the drivers involved in incidents at any of the races above,
    # each listed once (the original list repeated several numbers). The
    # trailing comments note the race where a driver first appears.
    drivers=[
        "27", "31", "43", "23", "77", "11",  # Qatar GP
        "18", "3", "20", "22",               # Chinese GP
        "2", "55",                           # Canadian GP
        "1",                                 # Miami GP
        "44",                                # United States GP
        # Mexico City, São Paulo, Saudi Arabian and Monaco GP drivers are
        # already covered by the entries above.
    ],
    include_weather=False,
)

# Session Filtering By Driver

## Method 1: Using per-session drivers in SessionConfig

In [17]:
# Method 1: attach the involved drivers to each SessionConfig so that every
# race is filtered by its own driver list (no global `drivers=` on DataConfig).
safety_car_sessions = [
    SessionConfig(2024, event_name, "R", drivers=involved_drivers)
    for event_name, involved_drivers in (
        ("Qatar Grand Prix", ["27", "31", "43", "23", "77", "11"]),
        ("Chinese Grand Prix", ["77", "18", "3", "20", "22"]),
        ("Canadian Grand Prix", ["2", "55", "23"]),
        ("Mexico City Grand Prix", ["22", "23"]),
        ("São Paulo Grand Prix", ["27", "43", "55"]),
        ("Miami Grand Prix", ["1", "20", "2"]),
        ("Saudi Arabian Grand Prix", ["18"]),
        ("United States Grand Prix", ["44"]),
        ("Monaco Grand Prix", ["11", "20", "27"]),
    )
]

config = DataConfig(include_weather=False, sessions=safety_car_sessions)

# Encoded class indices for the safety-car and VSC labels, taken from the
# encoder fitted in an earlier cell. They are kept for reference only and are
# not passed to the call below.
safety_car_class_num, vsc_class_num = (
    label_encoder.class_to_idx[status_name] for status_name in ("safety_car", "vsc")
)

dataset = create_safety_car_dataset(
    config=config,
    window_size=50,
    prediction_horizon=100,
    normalize=True,
    target_column="TrackStatus",
    resampling_strategy="smote",
    # No resampling_config is supplied here (a per-class config keyed by the
    # indices above was sketched but left disabled); the run logged below
    # reports "Sampling strategy: minority".
)

2025-07-06 20:39:23,512 - f1_etl - INFO - Preprocessing configuration:
2025-07-06 20:39:23,513 - f1_etl - INFO -   Missing values: enabled (forward_fill)
2025-07-06 20:39:23,514 - f1_etl - INFO -   Normalization: enabled (standard)
2025-07-06 20:39:23,514 - f1_etl - INFO -   Resampling: smote
2025-07-06 20:39:23,514 - f1_etl - INFO - Driver configuration:
2025-07-06 20:39:23,515 - f1_etl - INFO -   Global drivers: None
2025-07-06 20:39:23,515 - f1_etl - INFO -   Qatar Grand Prix: ['27', '31', '43', '23', '77', '11']
2025-07-06 20:39:23,515 - f1_etl - INFO -   Chinese Grand Prix: ['77', '18', '3', '20', '22']
2025-07-06 20:39:23,516 - f1_etl - INFO -   Canadian Grand Prix: ['2', '55', '23']
2025-07-06 20:39:23,516 - f1_etl - INFO -   Mexico City Grand Prix: ['22', '23']
2025-07-06 20:39:23,517 - f1_etl - INFO -   São Paulo Grand Prix: ['27', '43', '55']
2025-07-06 20:39:23,517 - f1_etl - INFO -   Miami Grand Prix: ['1', '20', '2']
2025-07-06 20:39:23,517 - f1_etl - INFO -   Saudi Arabia

Loading session: 2024 Qatar Grand Prix R


req            INFO 	Using cached data for car_data
req            INFO 	Using cached data for position_data
req            INFO 	Using cached data for weather_data
req            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 20 drivers: ['1', '16', '81', '63', '10', '55', '14', '24', '20', '4', '77', '44', '22', '30', '23', '27', '11', '18', '43', '31']
core           INFO 	Loading data for Chinese Grand Prix - Race [v3.5.3]
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info
req            INFO 	Using cached data for session_status_data
req            INFO 	Using cached data for lap_count
req            INFO 	Using cached data for track_status_data
req            INFO 	Using cached data for _extended_timing_data
req            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...


Loading session: 2024 Chinese Grand Prix R


req            INFO 	Using cached data for car_data
req            INFO 	Using cached data for position_data
req            INFO 	Using cached data for weather_data
req            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 20 drivers: ['1', '4', '11', '16', '55', '63', '14', '81', '44', '27', '31', '23', '10', '24', '18', '20', '2', '3', '22', '77']
core           INFO 	Loading data for Canadian Grand Prix - Race [v3.5.3]
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info
req            INFO 	Using cached data for session_status_data
req            INFO 	Using cached data for lap_count
req            INFO 	Using cached data for track_status_data
req            INFO 	Using cached data for _extended_timing_data
req            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...


Loading session: 2024 Canadian Grand Prix R


req            INFO 	Using cached data for car_data
req            INFO 	Using cached data for position_data
req            INFO 	Using cached data for weather_data
req            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 20 drivers: ['1', '4', '63', '44', '81', '14', '18', '3', '10', '31', '27', '20', '77', '22', '24', '55', '23', '11', '16', '2']
core           INFO 	Loading data for Mexico City Grand Prix - Race [v3.5.3]
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info
req            INFO 	Using cached data for session_status_data
req            INFO 	Using cached data for lap_count
req            INFO 	Using cached data for track_status_data
req            INFO 	Using cached data for _extended_timing_data
req            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...


Loading session: 2024 Mexico City Grand Prix R


req            INFO 	Using cached data for car_data
req            INFO 	Using cached data for position_data
req            INFO 	Using cached data for weather_data
req            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 20 drivers: ['55', '4', '16', '44', '63', '1', '20', '81', '27', '10', '18', '43', '31', '77', '24', '30', '11', '14', '23', '22']
core           INFO 	Loading data for São Paulo Grand Prix - Race [v3.5.3]
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info
req            INFO 	Using cached data for session_status_data
req            INFO 	Using cached data for lap_count
req            INFO 	Using cached data for track_status_data
req            INFO 	Using cached data for _extended_timing_data
req            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...


Loading session: 2024 São Paulo Grand Prix R


req            INFO 	Using cached data for car_data
req            INFO 	Using cached data for position_data
req            INFO 	Using cached data for weather_data
req            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 20 drivers: ['1', '31', '10', '63', '16', '4', '22', '81', '30', '44', '11', '50', '77', '14', '24', '55', '43', '23', '18', '27']
core           INFO 	Loading data for Miami Grand Prix - Race [v3.5.3]
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info
req            INFO 	Using cached data for session_status_data
req            INFO 	Using cached data for lap_count
req            INFO 	Using cached data for track_status_data
req            INFO 	Using cached data for _extended_timing_data
req            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...


Loading session: 2024 Miami Grand Prix R


req            INFO 	Using cached data for car_data
req            INFO 	Using cached data for position_data
req            INFO 	Using cached data for weather_data
req            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 20 drivers: ['4', '1', '16', '11', '55', '44', '22', '63', '14', '31', '27', '10', '81', '24', '3', '77', '18', '23', '20', '2']
core           INFO 	Loading data for Saudi Arabian Grand Prix - Race [v3.5.3]
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info
req            INFO 	Using cached data for session_status_data
req            INFO 	Using cached data for lap_count
req            INFO 	Using cached data for track_status_data
req            INFO 	Using cached data for _extended_timing_data
req            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...


Loading session: 2024 Saudi Arabian Grand Prix R


req            INFO 	Using cached data for car_data
req            INFO 	Using cached data for position_data
req            INFO 	Using cached data for weather_data
req            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 20 drivers: ['1', '11', '16', '81', '14', '63', '38', '4', '44', '27', '23', '20', '31', '2', '22', '3', '77', '24', '18', '10']
core           INFO 	Loading data for United States Grand Prix - Race [v3.5.3]
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info
req            INFO 	Using cached data for session_status_data
req            INFO 	Using cached data for lap_count
req            INFO 	Using cached data for track_status_data
req            INFO 	Using cached data for _extended_timing_data
req            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...


Loading session: 2024 United States Grand Prix R


req            INFO 	Using cached data for car_data
req            INFO 	Using cached data for position_data
req            INFO 	Using cached data for weather_data
req            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 20 drivers: ['16', '55', '1', '4', '81', '63', '11', '27', '30', '43', '20', '10', '14', '22', '18', '23', '77', '31', '24', '44']
core           INFO 	Loading data for Monaco Grand Prix - Race [v3.5.3]
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info
req            INFO 	Using cached data for session_status_data
req            INFO 	Using cached data for lap_count
req            INFO 	Using cached data for track_status_data
req            INFO 	Using cached data for _extended_timing_data
req            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...


Loading session: 2024 Monaco Grand Prix R


req            INFO 	Using cached data for car_data
req            INFO 	Using cached data for position_data
req            INFO 	Using cached data for weather_data
req            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 20 drivers: ['16', '81', '55', '4', '63', '1', '44', '22', '23', '10', '14', '3', '77', '18', '2', '24', '31', '11', '27', '20']
2025-07-06 20:39:43,908 - f1_etl - INFO - Creating new fixed vocabulary encoder



📊 Track Status Analysis (training_data):
   green       : 1651547 samples ( 83.6%)
   red         : 75876 samples (  3.8%)
   safety_car  : 173787 samples (  8.8%)
   vsc         : 13317 samples (  0.7%)
   vsc_ending  :   555 samples (  0.0%)
   yellow      : 60370 samples (  3.1%)
   Missing classes: [np.str_('unknown')]
✅ FixedVocabTrackStatusEncoder fitted
   Classes seen: ['green', 'red', 'safety_car', 'vsc', 'vsc_ending', 'yellow']
   Total classes: 7
   Output mode: integer labels


2025-07-06 20:39:44,438 - f1_etl - INFO - Applying smote resampling at session/driver level
2025-07-06 20:39:44,439 - f1_etl - INFO - Sampling strategy: minority
2025-07-06 20:39:49,280 - f1_etl - INFO - Resampling complete: 1975452 -> 3593677 samples
2025-07-06 20:39:49,287 - f1_etl - INFO - Class distribution before resampling:
2025-07-06 20:39:49,287 - f1_etl - INFO -   1: 1651547
2025-07-06 20:39:49,288 - f1_etl - INFO -   4: 173787
2025-07-06 20:39:49,288 - f1_etl - INFO -   5: 75876
2025-07-06 20:39:49,288 - f1_etl - INFO -   2: 60370
2025-07-06 20:39:49,288 - f1_etl - INFO -   6: 13317
2025-07-06 20:39:49,289 - f1_etl - INFO -   7: 555
2025-07-06 20:39:49,289 - f1_etl - INFO - Class distribution after resampling:
2025-07-06 20:39:49,289 - f1_etl - INFO -   1: 1651547
2025-07-06 20:39:49,289 - f1_etl - INFO -   6: 657548
2025-07-06 20:39:49,289 - f1_etl - INFO -   4: 621347
2025-07-06 20:39:49,290 - f1_etl - INFO -   7: 404748
2025-07-06 20:39:49,290 - f1_etl - INFO -   2: 182611

## Method 2: Focus on specific drivers across multiple races

In [None]:
# Example: Nico Hülkenberg (#27) - involved in Qatar, São Paulo, and Monaco incidents
# Method 2: follow a single driver -- Nico Hülkenberg (#27) -- across the
# races in which he was involved in an incident (Qatar, São Paulo, Monaco).
hulkenberg_config = DataConfig(
    sessions=[
        SessionConfig(2024, "Qatar Grand Prix", "R", drivers=["27"]),      # Turn 1 collision + Lap 40 spin
        SessionConfig(2024, "São Paulo Grand Prix", "R", drivers=["27"]),  # Lap 28 VSC incident
        SessionConfig(2024, "Monaco Grand Prix", "R", drivers=["27"]),     # Lap 1 multi-car crash
    ],
    include_weather=False,
)
dataset_hulk_incidents = create_safety_car_dataset(
    config=hulkenberg_config,
    window_size=50,
    prediction_horizon=100,
    normalize=True,
    target_column="TrackStatus",
    resampling_strategy="smote",
    resampling_config={
        # The resampler's logged class labels are the raw TrackStatus codes
        # (1=green, 2=yellow, 4=safety car, 5=red, 6=VSC, 7=VSC ending), not
        # the encoder indices. The original key "2" is the *encoded* index of
        # 'safety_car' but the *raw* code for yellow, so "4" is used to
        # target safety-car samples.
        # NOTE(review): assumes resampling_config keys are matched against
        # those raw-code labels -- confirm against f1_etl.
        "4": 0.5
    }
)

## Method 3: Study specific race incidents

In [None]:
# Qatar GP - most complex with multiple safety car periods
# Method 3: a single race studied in isolation. Qatar is used because it is
# the most complex case here, with multiple safety-car periods.
qatar_config = DataConfig(
    include_weather=False,
    sessions=[
        SessionConfig(
            2024,
            "Qatar Grand Prix",
            "R",
            # Every driver involved in an SC/VSC trigger at this race
            drivers=["27", "31", "43", "23", "77", "11"],
        ),
    ],
)

## Method 4: Using enhanced create_season_configs for targeted analysis


In [None]:
# Method 4: let create_season_configs build the SessionConfig list for the
# season, supplying per-race driver filters and an explicit exclusion list.
safety_car_drivers_per_race = {
    "Qatar Grand Prix": ["27", "31", "43", "23", "77", "11"],      # Multiple incidents
    "Chinese Grand Prix": ["77", "18", "3", "20", "22"],           # Engine failure + restart chaos
    "Canadian Grand Prix": ["2", "55", "23"],                      # Wet weather crashes
    "Mexico City Grand Prix": ["22", "23"],                        # Turn 1 contact
    "São Paulo Grand Prix": ["27", "43", "55"],                    # Rain-related incidents
    "Miami Grand Prix": ["1", "20", "2"],                          # Bollard + collision
    "Saudi Arabian Grand Prix": ["18"],                            # Solo crash
    "United States Grand Prix": ["44"],                            # Hamilton spin
    "Monaco Grand Prix": ["11", "20", "27"],                       # Multi-car crash
}

# Create sessions only for the races keyed above.
# NOTE(review): this assumes create_season_configs enumerates the full 2024
# calendar and drops the names in `exclude_events` -- confirm. Under that
# assumption every race NOT keyed above must be listed here; the original
# list omitted the Azerbaijan Grand Prix, which would then have been included
# with no per-race driver filter.
safety_car_sessions = create_season_configs(
    year=2024,
    session_types=['R'],
    drivers_per_session=safety_car_drivers_per_race,
    exclude_events=[  # Every 2024 race without an entry in the dict above
        "Bahrain Grand Prix",
        "Japanese Grand Prix",
        "Australian Grand Prix",
        "Emilia Romagna Grand Prix",
        "Spanish Grand Prix",
        "Austrian Grand Prix",
        "British Grand Prix",
        "Hungarian Grand Prix",
        "Belgian Grand Prix",
        "Dutch Grand Prix",
        "Italian Grand Prix",
        "Azerbaijan Grand Prix",
        "Singapore Grand Prix",
        "Las Vegas Grand Prix",
        "Abu Dhabi Grand Prix"
    ]
)

config = DataConfig(sessions=safety_car_sessions, include_weather=False)

## Method 5: Comparative analysis - high incident drivers vs. clean drivers

In [None]:
# Drivers with multiple safety car involvements vs. those with none
frequent_incident_drivers = ["27", "23", "18"]  # Hülkenberg, Albon, Stroll
clean_drivers = ["1", "44", "55"]               # Verstappen, Hamilton, Sainz (for comparison)

comparison_config = DataConfig(
    sessions=[
        # Include both incident-prone and clean drivers across all safety car races
        SessionConfig(2024, "Qatar Grand Prix", "R", 
                     drivers=frequent_incident_drivers + clean_drivers),
        SessionConfig(2024, "Chinese Grand Prix", "R", 
                     drivers=frequent_incident_drivers + clean_drivers),
        SessionConfig(2024, "Canadian Grand Prix", "R", 
                     drivers=frequent_incident_drivers + clean_drivers),
    ],
    include_weather=True,  # Include weather for incident analysis
)

## Method 6: Rain-related incidents focus

In [None]:
# São Paulo and Canadian GP had wet weather incidents
wet_weather_config = DataConfig(
    sessions=[
        SessionConfig(2024, "São Paulo Grand Prix", "R", drivers=["27", "43", "55"]),
        SessionConfig(2024, "Canadian Grand Prix", "R", drivers=["2", "55", "23"]),
    ],
    include_weather=True,  # Weather data crucial for rain analysis
)

# Example dataset creation with resampling for safety car prediction
dataset = create_safety_car_dataset(
    config=config,
    window_size=100,         # 100 data points before incident
    prediction_horizon=50,   # Predict 50 steps ahead
    normalize=True,
    target_column="TrackStatus",
    resampling_strategy="borderline_smote",  # Good for imbalanced time series
    resampling_config={
        "safety_car": 0.4,   # Increase safety car samples to 40% of majority
        "vsc": 0.3,          # Increase VSC samples to 30% of majority  
        "yellow": 0.2,       # Increase yellow flag samples to 20% of majority
    },
    enable_debug=True
)