# V/SC Predictive Signal Fishing

## Setup

In [None]:
%%capture
!pip install -q fastf1

In [5]:
import fastf1 as f1
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

print("FastF1 version:", f1.__version__)

f1.Cache._enable_default_cache()

FastF1 version: 3.5.3


In [None]:
f1.Cache.dis

In [6]:
season = 2024
schedule = f1.get_event_schedule(season)

In [None]:
session = f1.get_session(season, 'Saudi Arabian Grand Prix', 'R')
session.load()

core           INFO 	Loading data for Saudi Arabian Grand Prix - Race [v3.5.3]
INFO:fastf1.fastf1.core:Loading data for Saudi Arabian Grand Prix - Race [v3.5.3]
req            INFO 	Using cached data for session_info
INFO:fastf1.fastf1.req:Using cached data for session_info
req            INFO 	Using cached data for driver_info
INFO:fastf1.fastf1.req:Using cached data for driver_info
req            INFO 	Using cached data for session_status_data
INFO:fastf1.fastf1.req:Using cached data for session_status_data
req            INFO 	Using cached data for lap_count
INFO:fastf1.fastf1.req:Using cached data for lap_count
req            INFO 	Using cached data for track_status_data
INFO:fastf1.fastf1.req:Using cached data for track_status_data
req            INFO 	Using cached data for _extended_timing_data
INFO:fastf1.fastf1.req:Using cached data for _extended_timing_data
req            INFO 	Using cached data for timing_app_data
INFO:fastf1.fastf1.req:Using cached data for timing_app_data
c

In [None]:
session.event

Unnamed: 0,2
RoundNumber,2
Country,Saudi Arabia
Location,Jeddah
OfficialEventName,FORMULA 1 STC SAUDI ARABIAN GRAND PRIX 2024
EventDate,2024-03-09 00:00:00
EventName,Saudi Arabian Grand Prix
EventFormat,conventional
Session1,Practice 1
Session1Date,2024-03-07 16:30:00+03:00
Session1DateUtc,2024-03-07 13:30:00


In [None]:
track_status_df = session.track_status

track_status_df['is_sc'] = track_status_df['Status'] == '4'
track_status_df['is_vsc'] = track_status_df['Status'] == '6'

display(track_status_df)

Unnamed: 0,Time,Status,Message,is_sc,is_vsc
0,0 days 00:00:00,2,Yellow,False,False
1,0 days 00:08:45.692000,1,AllClear,False,False
2,0 days 01:09:15.917000,2,Yellow,False,False
3,0 days 01:09:49.924000,4,SCDeployed,True,False
4,0 days 01:16:35.527000,1,AllClear,False,False


In [None]:
laps = session.laps

In [None]:
import pandas as pd
from typing import List, Tuple

def compute_sc_durations(track_status_df: pd.DataFrame) -> List[Tuple[pd.Timestamp, pd.Timestamp, pd.Timedelta]]:
    """
    Compute safety car deployment durations.

    Args:
        track_status_df: DataFrame with columns 'Time', 'Status', 'is_sc'

    Returns:
        List of tuples: (start_time, end_time, duration)
    """
    durations = []
    i = 0

    while i < len(track_status_df):
        # Find start of SC deployment
        if track_status_df.iloc[i]['is_sc']:
            start_time = track_status_df.iloc[i]['Time']
            start_idx = i

            # Look for end of SC deployment (Status == '1')
            j = i + 1
            while j < len(track_status_df):
                if track_status_df.iloc[j]['Status'] == '1':
                    end_time = track_status_df.iloc[j]['Time']
                    duration = end_time - start_time
                    durations.append((start_time, end_time, duration))
                    i = j  # Continue from end point
                    break
                j += 1
            else:
                # SC deployment extends to end of data
                i = len(track_status_df)
        else:
            i += 1

    return durations

def add_sc_flag_to_laps(laps_df: pd.DataFrame, track_status_df: pd.DataFrame) -> pd.DataFrame:
    """
    Add a safety car flag column to the laps DataFrame.

    Args:
        laps_df: DataFrame with 'Time' and 'LapNumber' columns
        track_status_df: DataFrame with 'Time', 'Status', 'is_sc' columns

    Returns:
        laps_df with added 'under_sc' boolean column
    """
    # Get SC deployment periods
    sc_periods = compute_sc_durations(track_status_df)

    # Initialize under_sc column as False
    laps_df = laps_df.copy()
    laps_df['under_sc'] = False

    # For each SC period, mark laps that fall within it
    for start_time, end_time, _ in sc_periods:
        mask = (laps_df['Time'] >= start_time) & (laps_df['Time'] <= end_time)
        laps_df.loc[mask, 'under_sc'] = True

    return laps_df

In [None]:
laps_with_sc = add_sc_flag_to_laps(laps, track_status_df)

laps_with_sc.head(10)

Unnamed: 0,Time,Driver,DriverNumber,LapTime,LapNumber,Stint,PitOutTime,PitInTime,Sector1Time,Sector2Time,...,Team,LapStartTime,LapStartDate,TrackStatus,Position,Deleted,DeletedReason,FastF1Generated,IsAccurate,under_sc
0,0 days 01:01:19.630000,VER,1,0 days 00:01:35.505000,1.0,1.0,NaT,NaT,NaT,0 days 00:00:29.707000,...,Red Bull Racing,0 days 00:59:43.836000,2024-03-09 17:03:59.056,1,1.0,False,,False,False,False
1,0 days 01:02:53.835000,VER,1,0 days 00:01:34.205000,2.0,1.0,NaT,NaT,0 days 00:00:34.720000,0 days 00:00:29.628000,...,Red Bull Racing,0 days 01:01:19.630000,2024-03-09 17:05:34.850,1,1.0,False,,False,True,False
2,0 days 01:04:27.878000,VER,1,0 days 00:01:34.043000,3.0,1.0,NaT,NaT,0 days 00:00:34.530000,0 days 00:00:29.621000,...,Red Bull Racing,0 days 01:02:53.835000,2024-03-09 17:07:09.055,1,1.0,False,,False,True,False
3,0 days 01:06:01.632000,VER,1,0 days 00:01:33.754000,4.0,1.0,NaT,NaT,0 days 00:00:34.416000,0 days 00:00:29.541000,...,Red Bull Racing,0 days 01:04:27.878000,2024-03-09 17:08:43.098,1,1.0,False,,False,True,False
4,0 days 01:07:35.587000,VER,1,0 days 00:01:33.955000,5.0,1.0,NaT,NaT,0 days 00:00:34.457000,0 days 00:00:29.590000,...,Red Bull Racing,0 days 01:06:01.632000,2024-03-09 17:10:16.852,1,1.0,False,,False,True,False
5,0 days 01:09:09.263000,VER,1,0 days 00:01:33.676000,6.0,1.0,NaT,NaT,0 days 00:00:34.355000,0 days 00:00:29.545000,...,Red Bull Racing,0 days 01:07:35.587000,2024-03-09 17:11:50.807,1,1.0,False,,False,True,False
6,0 days 01:11:12.182000,VER,1,0 days 00:02:02.919000,7.0,1.0,NaT,0 days 01:11:11.053000,0 days 00:00:34.467000,0 days 00:00:41.316000,...,Red Bull Racing,0 days 01:09:09.263000,2024-03-09 17:13:24.483,124,1.0,False,,False,False,True
7,0 days 01:14:21.360000,VER,1,NaT,8.0,2.0,0 days 01:11:31.721000,NaT,0 days 00:01:13.947000,0 days 00:00:57.475000,...,Red Bull Racing,0 days 01:11:12.182000,2024-03-09 17:15:27.402,4,2.0,False,,False,False,True
8,0 days 01:17:02.249000,VER,1,NaT,9.0,2.0,NaT,NaT,0 days 00:00:56.642000,0 days 00:00:49.042000,...,Red Bull Racing,0 days 01:14:21.360000,2024-03-09 17:18:36.580,41,2.0,False,,False,False,False
9,0 days 01:18:37.273000,VER,1,0 days 00:01:35.024000,10.0,2.0,NaT,NaT,0 days 00:00:35.648000,0 days 00:00:29.514000,...,Red Bull Racing,0 days 01:17:02.249000,2024-03-09 17:21:17.469,1,2.0,False,,False,True,False


In [None]:
# prepare the data for joining
laps_with_sc = laps_with_sc.reset_index(drop=True)

weather_data = session.laps.get_weather_data()
weather_data = weather_data.reset_index(drop=True)

# exclude the  'Time' column from weather data when joining
joined = pd.concat([laps, weather_data.loc[:, ~(weather_data.columns == 'Time')]], axis=1)
display(joined.head())

Unnamed: 0,Time,Driver,DriverNumber,LapTime,LapNumber,Stint,PitOutTime,PitInTime,Sector1Time,Sector2Time,...,DeletedReason,FastF1Generated,IsAccurate,AirTemp,Humidity,Pressure,Rainfall,TrackTemp,WindDirection,WindSpeed
0,0 days 01:01:19.630000,VER,1,0 days 00:01:35.505000,1.0,1.0,NaT,NaT,NaT,0 days 00:00:29.707000,...,,False,False,25.4,62.0,1012.5,False,31.5,358,1.3
1,0 days 01:02:53.835000,VER,1,0 days 00:01:34.205000,2.0,1.0,NaT,NaT,0 days 00:00:34.720000,0 days 00:00:29.628000,...,,False,True,25.4,62.0,1012.6,False,31.5,301,1.2
2,0 days 01:04:27.878000,VER,1,0 days 00:01:34.043000,3.0,1.0,NaT,NaT,0 days 00:00:34.530000,0 days 00:00:29.621000,...,,False,True,25.4,62.0,1012.6,False,31.7,172,1.0
3,0 days 01:06:01.632000,VER,1,0 days 00:01:33.754000,4.0,1.0,NaT,NaT,0 days 00:00:34.416000,0 days 00:00:29.541000,...,,False,True,25.5,61.0,1012.6,False,31.7,342,0.8
4,0 days 01:07:35.587000,VER,1,0 days 00:01:33.955000,5.0,1.0,NaT,NaT,0 days 00:00:34.457000,0 days 00:00:29.590000,...,,False,True,25.6,61.0,1012.6,False,31.7,9,1.3


In [None]:
# joined = joined.reset_index(drop=True)

car_data = session.laps.iloc[0].get_car_data()

car_data

Unnamed: 0,Date,RPM,Speed,nGear,Throttle,Brake,DRS,Source,Time,SessionTime
0,2024-03-09 17:03:59.140,10085.0,0.0,1,15.0,True,1,car,0 days 00:00:00.084000,0 days 00:59:43.920000
1,2024-03-09 17:03:59.500,9420.0,0.0,1,15.0,False,1,car,0 days 00:00:00.444000,0 days 00:59:44.280000
2,2024-03-09 17:03:59.860,6900.0,11.0,1,15.0,False,1,car,0 days 00:00:00.804000,0 days 00:59:44.640000
3,2024-03-09 17:04:00.099,5220.0,22.0,1,15.0,False,1,car,0 days 00:00:01.043000,0 days 00:59:44.879000
4,2024-03-09 17:04:00.459,3743.0,32.0,1,16.0,False,1,car,0 days 00:00:01.403000,0 days 00:59:45.239000
...,...,...,...,...,...,...,...,...,...,...
369,2024-03-09 17:05:33.260,11170.0,282.0,7,100.0,False,1,car,0 days 00:01:34.204000,0 days 01:01:18.040000
370,2024-03-09 17:05:33.700,11326.0,285.0,7,100.0,False,1,car,0 days 00:01:34.644000,0 days 01:01:18.480000
371,2024-03-09 17:05:34.060,11373.0,288.0,7,100.0,False,1,car,0 days 00:01:35.004000,0 days 01:01:18.840000
372,2024-03-09 17:05:34.379,11419.0,291.0,7,100.0,False,1,car,0 days 00:01:35.323000,0 days 01:01:19.159000


## Compute SC Summary

In [None]:
"""
Get a summary DataFrame of safety car deployments.
"""
durations = compute_sc_durations(track_status_df)

if not durations:
    sc_summary = pd.DataFrame(columns=['start_time', 'end_time', 'duration_seconds'])
else:
  sc_summary = pd.DataFrame([
      {
          'start_time': start,
          'end_time': end,
          'duration_seconds': duration.total_seconds()
      }
      for start, end, duration in durations
  ])

print(f"Total SC deployments: {len(sc_summary)}")
print(f"Total SC time: {sc_summary['duration_seconds'].sum():.1f} seconds")

Total SC deployments: 1
Total SC time: 405.6 seconds


In [None]:
mean_lap_time = laps['LapTime'].mean()
print(f"Mean lap time: {mean_lap_time}")

Mean lap time: 0 days 00:01:35.919241695


## Compute SC Durations

In [None]:
import pandas as pd
from typing import List, Tuple

def compute_sc_durations(track_status_df: pd.DataFrame) -> List[Tuple[pd.Timestamp, pd.Timestamp, pd.Timedelta]]:
    """
    Compute safety car deployment durations.

    Args:
        track_status_df: DataFrame with columns 'Time', 'Status', 'is_sc'

    Returns:
        List of tuples: (start_time, end_time, duration)
    """
    durations = []
    i = 0

    while i < len(track_status_df):
        # Find start of SC deployment
        if track_status_df.iloc[i]['is_sc']:
            start_time = track_status_df.iloc[i]['Time']
            start_idx = i

            # Look for end of SC deployment (Status == '1')
            j = i + 1
            while j < len(track_status_df):
                if track_status_df.iloc[j]['Status'] == '1':
                    end_time = track_status_df.iloc[j]['Time']
                    duration = end_time - start_time
                    durations.append((start_time, end_time, duration))
                    i = j  # Continue from end point
                    break
                j += 1
            else:
                # SC deployment extends to end of data
                i = len(track_status_df)
        else:
            i += 1

    return durations

def get_sc_summary(track_status_df: pd.DataFrame) -> pd.DataFrame:
    """
    Get a summary DataFrame of safety car deployments.

    Returns:
        DataFrame with columns: start_time, end_time, duration_seconds
    """
    durations = compute_sc_durations(track_status_df)

    if not durations:
        return pd.DataFrame(columns=['start_time', 'end_time', 'duration_seconds'])

    return pd.DataFrame([
        {
            'start_time': start,
            'end_time': end,
            'duration_seconds': duration.total_seconds()
        }
        for start, end, duration in durations
    ])

# Example usage:
# sc_summary = get_sc_summary(track_status_df)
# print(f"Total SC deployments: {len(sc_summary)}")
# print(f"Total SC time: {sc_summary['duration_seconds'].sum():.1f} seconds")

## Compute correlations between SC deployment and other features

In [None]:
from typing import Optional

def analyze_sc_correlations(laps_df: pd.DataFrame, track_status_df: pd.DataFrame, features: Optional[List[str]] = None):
    """
    Analyze correlations between features and safety car deployments.
    """

    # if 'under_sc' column is missing add it
    if 'under_sc' not in laps_df.columns:
      laps_with_sc = add_sc_flag_to_laps(laps_df, track_status_df)
    else:
      laps_with_sc = laps_df

    if features is None:
        # Default numeric features
        numeric_cols = laps_with_sc.select_dtypes(include=[np.number]).columns
        features = [col for col in numeric_cols if col not in ['LapNumber', 'Time']]

    print("Correlation Analysis: Features vs Safety Car Deployment")
    print("=" * 55)

    correlations = []
    for feature in features:
        if feature in laps_with_sc.columns:
            # Calculate correlation with under_sc flag
            corr = laps_with_sc[feature].corr(laps_with_sc['under_sc'].astype(int))
            if not pd.isna(corr):
                correlations.append((feature, corr))

    # Sort by absolute correlation
    correlations.sort(key=lambda x: abs(x[1]), reverse=True)

    for feature, corr in correlations:
        print(f"{feature:20} | Correlation: {corr:6.3f}")

    return correlations

correlations = analyze_sc_correlations(joined, track_status_df)

Correlation Analysis: Features vs Safety Car Deployment
SpeedI2              | Correlation: -0.837
SpeedFL              | Correlation: -0.718
SpeedI1              | Correlation: -0.689
SpeedST              | Correlation: -0.658
LapTime              | Correlation: -0.551
PitOutTime           | Correlation:  0.522
PitInTime            | Correlation:  0.508
AirTemp              | Correlation:  0.378
Pressure             | Correlation: -0.355
LapStartTime         | Correlation: -0.261
Humidity             | Correlation: -0.253
TyreLife             | Correlation: -0.219
WindDirection        | Correlation:  0.218
TrackTemp            | Correlation:  0.178
Stint                | Correlation: -0.149
WindSpeed            | Correlation:  0.013
Sector3Time          | Correlation:  0.007
Sector2Time          | Correlation:  0.007
Sector3SessionTime   | Correlation:  0.007
Sector2SessionTime   | Correlation:  0.007
Sector1Time          | Correlation: -0.004
Position             | Correlation: -0.00

In [8]:
first_lap = laps.iloc[0]

display(first_lap.head())

# car data
display(first_lap.get_car_data().head())

# weather data
display(first_lap.get_weather_data())

# telemetry data
display(first_lap.get_pos_data().head())

NameError: name 'session' is not defined

In [None]:
# run correlation for specific lap telemetry
# the safety car is deployed on lap 7
# let's look at lap 0-7

laps_before_sc = laps.iloc[:7]

# for each lap, get the telemetry
for lap in laps_before_sc:
  car_data = lap.get_car_data()
  weather_data = lap.get_weather_data()
  telemetry_data = lap.get_pos_data()


# 2025-Jun-05 Working Session

## Data Prep

In [None]:
import pandas as pd
import numpy as np
from typing import Dict, List, Tuple
import warnings
warnings.filterwarnings('ignore')

def extract_pre_sc_data(laps_df: pd.DataFrame, sc_lap: int = 7) -> Dict:
    """
    Extract and aggregate data from laps leading up to safety car deployment.

    Args:
        laps_df: DataFrame with lap data
        sc_lap: Lap number when safety car was deployed

    Returns:
        Dictionary containing aggregated features for ML training
    """

    # Get pre-safety car laps (1 to sc_lap-1)
    pre_sc_laps = laps_df[laps_df['LapNumber'] < sc_lap].copy()

    print(f"Extracting data from laps 1-{sc_lap-1} (pre-safety car)")
    print(f"Total laps in dataset: {len(pre_sc_laps)}")
    print(f"Unique drivers: {pre_sc_laps['Driver'].nunique()}")

    # Initialize results dictionary
    features = {}

    # Process each lap to get detailed data
    lap_features = []

    for idx, lap_row in pre_sc_laps.iterrows():
        try:
            lap_data = {}
            lap_data['LapNumber'] = lap_row['LapNumber']
            lap_data['Driver'] = lap_row['Driver']
            lap_data['LapTime'] = lap_row['LapTime']

            # Get car data
            try:
                car_data = lap_row.get_car_data()
                if not car_data.empty:
                    lap_data['avg_speed'] = car_data['Speed'].mean()
                    lap_data['speed_std'] = car_data['Speed'].std()
                    lap_data['max_speed'] = car_data['Speed'].max()
                    lap_data['min_speed'] = car_data['Speed'].min()
                    lap_data['throttle_avg'] = car_data['Throttle'].mean()
                    lap_data['brake_frequency'] = car_data['Brake'].sum() / len(car_data)
                    lap_data['rpm_avg'] = car_data['RPM'].mean()
                    lap_data['rpm_std'] = car_data['RPM'].std()
                else:
                    print(f"Empty car data for lap {lap_row['LapNumber']}, driver {lap_row['Driver']}")
            except Exception as e:
                print(f"Error getting car data for lap {lap_row['LapNumber']}: {e}")

            # Get position data
            try:
                pos_data = lap_row.get_pos_data()
                if not pos_data.empty:
                    # Check for off-track incidents
                    lap_data['off_track_count'] = (pos_data['Status'] == 'OffTrack').sum()
                    lap_data['position_changes'] = len(pos_data)  # More samples = more position changes
                else:
                    print(f"Empty position data for lap {lap_row['LapNumber']}, driver {lap_row['Driver']}")
            except Exception as e:
                print(f"Error getting position data for lap {lap_row['LapNumber']}: {e}")

            # Get weather data (once per lap)
            try:
                weather_data = lap_row.get_weather_data()
                if weather_data is not None and len(weather_data) > 0:
                    # Handle different weather data formats
                    if isinstance(weather_data, pd.Series):
                        lap_data['air_temp'] = weather_data.get('AirTemp', np.nan)
                        lap_data['track_temp'] = weather_data.get('TrackTemp', np.nan)
                        lap_data['humidity'] = weather_data.get('Humidity', np.nan)
                        lap_data['rainfall'] = weather_data.get('Rainfall', np.nan)
                        lap_data['wind_speed'] = weather_data.get('WindSpeed', np.nan)
                    elif isinstance(weather_data, pd.DataFrame) and not weather_data.empty:
                        weather_row = weather_data.iloc[0]
                        lap_data['air_temp'] = weather_row.get('AirTemp', np.nan)
                        lap_data['track_temp'] = weather_row.get('TrackTemp', np.nan)
                        lap_data['humidity'] = weather_row.get('Humidity', np.nan)
                        lap_data['rainfall'] = weather_row.get('Rainfall', np.nan)
                        lap_data['wind_speed'] = weather_row.get('WindSpeed', np.nan)
                else:
                    # Set default NaN values if no weather data
                    lap_data['air_temp'] = np.nan
                    lap_data['track_temp'] = np.nan
                    lap_data['humidity'] = np.nan
                    lap_data['rainfall'] = np.nan
                    lap_data['wind_speed'] = np.nan
            except Exception as e:
                # Set default NaN values on error
                lap_data['air_temp'] = np.nan
                lap_data['track_temp'] = np.nan
                lap_data['humidity'] = np.nan
                lap_data['rainfall'] = np.nan
                lap_data['wind_speed'] = np.nan

            lap_features.append(lap_data)

        except Exception as e:
            print(f"Error processing lap {lap_row['LapNumber']}: {e}")
            continue

    # Convert to DataFrame for easier manipulation
    lap_features_df = pd.DataFrame(lap_features)

    print(f"\nSuccessfully processed {len(lap_features_df)} laps")
    print(f"Features collected: {list(lap_features_df.columns)}")

    return lap_features_df

def aggregate_temporal_features(lap_features_df: pd.DataFrame) -> pd.DataFrame:
    """
    Create aggregated features that capture temporal patterns and risk escalation.
    """

    # Convert LapTime to seconds for calculations
    lap_features_df = lap_features_df.copy()
    lap_features_df['LapTime_seconds'] = lap_features_df['LapTime'].dt.total_seconds()

    # Group by lap number to get cross-driver statistics
    lap_stats = []

    for lap_num in sorted(lap_features_df['LapNumber'].unique()):
        lap_data = lap_features_df[lap_features_df['LapNumber'] == lap_num]

        stats = {'LapNumber': lap_num}

        # Speed-related risk indicators
        if 'avg_speed' in lap_data.columns:
            stats['speed_variance_across_drivers'] = lap_data['avg_speed'].var()
            stats['speed_range_across_drivers'] = lap_data['max_speed'].max() - lap_data['min_speed'].min()
            stats['avg_speed_std_within_laps'] = lap_data['speed_std'].mean()

        # Braking patterns (potential close calls)
        if 'brake_frequency' in lap_data.columns:
            stats['total_brake_events'] = lap_data['brake_frequency'].sum()
            stats['avg_brake_frequency'] = lap_data['brake_frequency'].mean()

        # Off-track incidents
        if 'off_track_count' in lap_data.columns:
            stats['total_off_track'] = lap_data['off_track_count'].sum()
            stats['drivers_off_track'] = (lap_data['off_track_count'] > 0).sum()

        # Weather conditions (handle NaN values)
        for weather_col in ['air_temp', 'track_temp', 'humidity', 'rainfall']:
            if weather_col in lap_data.columns:
                if weather_col == 'rainfall':
                    stats[f'total_{weather_col}'] = lap_data[weather_col].sum()
                else:
                    stats[f'avg_{weather_col}'] = lap_data[weather_col].mean()

        # Lap time variance (driver consistency) - now using seconds
        if 'LapTime_seconds' in lap_data.columns:
            stats['laptime_variance'] = lap_data['LapTime_seconds'].var()
            stats['avg_laptime'] = lap_data['LapTime_seconds'].mean()

        lap_stats.append(stats)

    lap_stats_df = pd.DataFrame(lap_stats)

    # Add temporal trend features (escalation indicators)
    for col in ['speed_variance_across_drivers', 'total_brake_events', 'total_off_track']:
        if col in lap_stats_df.columns:
            # Rolling average and trend
            lap_stats_df[f'{col}_rolling_avg'] = lap_stats_df[col].rolling(window=3, min_periods=1).mean()
            lap_stats_df[f'{col}_trend'] = lap_stats_df[col].diff()  # Change from previous lap

    return lap_stats_df

In [None]:
print("Starting data extraction...")
lap_features_df = extract_pre_sc_data(laps, sc_lap=7)

Starting data extraction...
Extracting data from laps 1-6 (pre-safety car)
Total laps in dataset: 115
Unique drivers: 20

Successfully processed 115 laps
Features collected: ['LapNumber', 'Driver', 'LapTime', 'avg_speed', 'speed_std', 'max_speed', 'min_speed', 'throttle_avg', 'brake_frequency', 'rpm_avg', 'rpm_std', 'off_track_count', 'position_changes', 'air_temp', 'track_temp', 'humidity', 'rainfall', 'wind_speed']


In [None]:
lap_features_df

Unnamed: 0,LapNumber,Driver,LapTime,avg_speed,speed_std,max_speed,min_speed,throttle_avg,brake_frequency,rpm_avg,rpm_std,off_track_count,position_changes,air_temp,track_temp,humidity,rainfall,wind_speed
0,1.0,VER,0 days 00:01:35.505000,222.302139,66.633986,305.0,0.0,75.532086,0.141711,10371.475936,1144.995966,0,384,25.4,31.5,62.0,False,1.3
1,2.0,VER,0 days 00:01:34.205000,234.300813,59.425685,308.0,81.0,77.040650,0.143631,10441.962060,1055.125868,0,358,25.4,31.5,62.0,False,1.2
2,3.0,VER,0 days 00:01:34.043000,233.223757,58.152700,306.0,81.0,76.008287,0.135359,10412.773481,1069.052696,0,365,25.4,31.7,62.0,False,1.0
3,4.0,VER,0 days 00:01:33.754000,234.980447,57.880802,308.0,83.0,76.695531,0.142458,10451.765363,998.214600,0,364,25.5,31.7,61.0,False,0.8
4,5.0,VER,0 days 00:01:33.955000,233.067227,57.965613,308.0,83.0,75.697479,0.142857,10410.621849,1038.743001,0,364,25.6,31.7,61.0,False,1.3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
110,2.0,PIA,0 days 00:01:35.082000,233.228495,61.363694,317.0,72.0,76.513441,0.129032,10580.282258,1007.075314,0,361,25.4,31.5,62.0,False,1.2
111,3.0,PIA,0 days 00:01:34.379000,230.722992,60.616424,309.0,83.0,75.700831,0.130194,10498.041551,1127.071481,0,366,25.4,31.7,62.0,False,1.0
112,4.0,PIA,0 days 00:01:34.589000,231.027473,61.070066,309.0,84.0,75.541209,0.129121,10474.942308,1154.401029,0,370,25.5,31.7,61.0,False,0.8
113,5.0,PIA,0 days 00:01:34.557000,231.019553,60.881454,308.0,81.0,75.421788,0.131285,10482.365922,1104.050527,0,363,25.6,31.7,61.0,False,1.3


In [None]:
print("\nCreating temporal aggregations...")
temporal_features_df = aggregate_temporal_features(lap_features_df)


Creating temporal aggregations...


In [None]:
temporal_features_df

Unnamed: 0,LapNumber,speed_variance_across_drivers,speed_range_across_drivers,avg_speed_std_within_laps,total_brake_events,avg_brake_frequency,total_off_track,drivers_off_track,avg_air_temp,avg_track_temp,avg_humidity,total_rainfall,laptime_variance,avg_laptime,speed_variance_across_drivers_rolling_avg,speed_variance_across_drivers_trend,total_brake_events_rolling_avg,total_brake_events_trend,total_off_track_rolling_avg,total_off_track_trend
0,1.0,74.322372,322.0,70.001153,3.781702,0.189085,0,0,25.4,31.5,62.0,0,28.885629,102.61635,74.322372,,3.781702,,0.0,
1,2.0,15.861338,282.0,65.516504,3.268202,0.172011,0,0,25.4,31.5,62.0,0,2.473859,96.572579,45.091855,-58.461033,3.524952,-0.513499,0.0,0.0
2,3.0,3.106731,260.0,64.754599,3.120925,0.164259,0,0,25.4,31.7,62.0,0,1.080742,95.627789,31.096814,-12.754607,3.390276,-0.147277,0.0,0.0
3,4.0,4.718975,261.0,64.661317,3.050071,0.16053,0,0,25.5,31.7,61.0,0,0.668745,95.467263,7.895681,1.612243,3.146399,-0.070855,0.0,0.0
4,5.0,5.320882,261.0,64.325567,2.975269,0.156593,0,0,25.6,31.7,61.736842,0,0.522659,95.304053,4.382196,0.601908,3.048755,-0.074802,0.0,0.0
5,6.0,7.799857,331.0,64.029051,2.996351,0.157703,0,0,25.7,32.2,61.0,0,1.063806,95.485889,5.946571,2.478975,3.00723,0.021082,0.0,0.0


## Feature Engineering

### Escalation Pattern Analysis

The `analyze_escalation_patterns` function addresses a fundamental question in predictive modeling: **"What patterns emerge in the laps leading up to a safety car deployment that could have predicted it?"** This approach differs significantly from correlation analysis because we're seeking warning signals that build up over time, rather than features that simply correlate with safety car laps after they occur.

#### How the Analysis Works

The escalation analysis operates through two complementary approaches:

**Trend Analysis**: This method calculates correlation coefficients between lap numbers (1, 2, 3, 4, 5, 6) and feature values across the pre-safety car period. A positive correlation indicates that a risk factor is increasing as we approach the safety car deployment. The analysis also identifies whether lap 6 (immediately before the safety car) represents a peak value, which would suggest the risk factor reached critical levels just before the incident.

**Early vs Late Comparison**: This approach divides the pre-safety car period into early phase (laps 1-3) and late phase (laps 4-6), then calculates percentage increases between these periods. Significant increases suggest that risk factors are escalating as the race progresses toward the safety car deployment.

### Key Findings and Interpretation

#### Promising Signals Identified

The trend analysis revealed two features with strong escalation patterns:

- **Speed Variance Across Drivers Trend**: Shows a 0.456 correlation with lap progression, with peak values occurring near the safety car deployment
- **Total Brake Events Trend**: Demonstrates a 0.407 correlation with lap progression, also peaking near the safety car deployment

Both features exhibited peak values in lap 6, suggesting these risk indicators reached critical levels immediately before the safety car deployment.

#### The Missing Gradual Escalation

Interestingly, the early versus late comparison revealed no features with significant increases when averaging laps 1-3 against laps 4-6. This finding suggests an important insight about the nature of racing incidents: rather than gradually building risk over multiple laps, we appear to be observing sudden changes or threshold effects.

### The Logic of Sudden Changes vs Gradual Escalation

This pattern actually aligns well with the realities of Formula 1 racing and human behavior in high-stakes environments. Consider why gradual escalation might be less common:

**Driver and Team Awareness**: Professional racing teams continuously monitor telemetry data, track conditions, and driver behavior. If risk factors were gradually escalating over multiple laps, experienced drivers, race engineers, and strategists would likely notice these trends and take corrective action. They might adjust driving style, change strategy, or make setup modifications to address emerging issues.

**Threshold Effects in Racing**: Many racing incidents occur when conditions or behaviors cross critical thresholds rather than building gradually. For example, tire degradation might remain manageable for several laps before suddenly reaching a point where grip loss becomes dangerous. Similarly, track conditions or competitive pressure might create sudden changes in driver behavior or car performance.

**Reactive vs Predictive Responses**: The human element in racing means that participants are constantly adapting to changing conditions. This adaptive behavior would naturally suppress gradual escalation patterns, leaving sudden, unexpected changes as the primary predictive signals.

### Implications for Feature Engineering

These findings suggest our modeling approach should focus on detecting sudden changes and threshold effects rather than gradual trends. This insight guides us toward feature engineering strategies that emphasize:

- **Change detection**: Features that identify when values deviate significantly from recent baselines
- **Volatility measures**: Metrics that capture sudden increases in variability or instability
- **Threshold crossing indicators**: Features that detect when values exceed historical norms or safe operating ranges
- **Recent trend sensitivity**: Models that weight very recent observations more heavily than longer-term patterns

The fact that our trend features (which capture lap-to-lap changes) showed the strongest signals supports this interpretation. These features are essentially measuring the rate of change rather than absolute levels, which appears to be more predictive for safety car deployments.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score

"""
Analyze temporal patterns that might predict safety car deployment.
Focus on escalation signals rather than associative correlations.
"""

sc_lap = 7

print("=== PREDICTIVE SIGNAL ANALYSIS ===")
print(f"Analyzing escalation patterns in laps leading to SC deployment on lap {sc_lap}")
print(f"Available features: {list(temporal_features_df.columns)}")
print()

# Create binary target: 1 if next lap has SC, 0 otherwise
temporal_features_df = temporal_features_df.copy()
temporal_features_df['sc_next_lap'] = 0
temporal_features_df.loc[temporal_features_df['LapNumber'] == sc_lap - 1, 'sc_next_lap'] = 1

# 1. ESCALATION ANALYSIS - Look for increasing trends
print("1. ESCALATION PATTERN ANALYSIS")
print("=" * 40)

risk_features = [col for col in temporal_features_df.columns
                if any(keyword in col.lower() for keyword in
                      ['variance', 'brake', 'off_track', 'range', 'std'])]

escalation_scores = {}

for feature in risk_features:
    if feature in temporal_features_df.columns and temporal_features_df[feature].notna().sum() > 0:
        # Calculate trend (slope) from lap 1 to lap 6
        x = temporal_features_df['LapNumber'].values
        y = temporal_features_df[feature].fillna(0).values

        if len(x) > 1 and np.var(x) > 0:
            # Simple linear trend
            trend = np.corrcoef(x, y)[0, 1] if not np.isnan(np.corrcoef(x, y)[0, 1]) else 0

            # Peak detection - is lap 6 highest value?
            if len(y) >= 6:
                lap6_percentile = (y[-1] > np.percentile(y, 75)) if np.var(y) > 0 else False
            else:
                lap6_percentile = False

            escalation_scores[feature] = {
                'trend': trend,
                'peak_near_sc': lap6_percentile,
                'final_value': y[-1] if len(y) > 0 else 0,
                'value_range': np.max(y) - np.min(y) if len(y) > 0 else 0
            }

# Print escalation analysis
print("Features with positive escalation trends (higher values before SC):")
for feature, scores in sorted(escalation_scores.items(),
                              key=lambda x: x[1]['trend'], reverse=True):
    if scores['trend'] > 0.1:  # Meaningful positive trend
        print(f"  {feature:30} | Trend: {scores['trend']:6.3f} | Peak near SC: {scores['peak_near_sc']}")

print("\n" + "="*50)

# 2. COMPARATIVE ANALYSIS - Compare early vs late laps
print("2. EARLY vs LATE LAP COMPARISON")
print("=" * 40)

# Split into early (laps 1-3) vs late (laps 4-6) periods
early_laps = temporal_features_df[temporal_features_df['LapNumber'] <= 3]
late_laps = temporal_features_df[temporal_features_df['LapNumber'] >= 4]

comparison_results = {}

for feature in risk_features:
    if feature in temporal_features_df.columns:
        early_mean = early_laps[feature].mean()
        late_mean = late_laps[feature].mean()

        if not np.isnan(early_mean) and not np.isnan(late_mean) and early_mean != 0:
            pct_increase = ((late_mean - early_mean) / early_mean) * 100
            comparison_results[feature] = {
                'early_mean': early_mean,
                'late_mean': late_mean,
                'pct_increase': pct_increase
            }

print("Features showing significant increase from early to late laps:")
for feature, stats in sorted(comparison_results.items(),
                            key=lambda x: x[1]['pct_increase'], reverse=True):
    if stats['pct_increase'] > 10:  # >10% increase
        print(f"  {feature:30} | +{stats['pct_increase']:5.1f}% | Early: {stats['early_mean']:.3f} → Late: {stats['late_mean']:.3f}")

print("\n" + "="*50)

=== PREDICTIVE SIGNAL ANALYSIS ===
Analyzing escalation patterns in laps leading to SC deployment on lap 7
Available features: ['LapNumber', 'speed_variance_across_drivers', 'speed_range_across_drivers', 'avg_speed_std_within_laps', 'total_brake_events', 'avg_brake_frequency', 'total_off_track', 'drivers_off_track', 'avg_air_temp', 'avg_track_temp', 'avg_humidity', 'total_rainfall', 'laptime_variance', 'avg_laptime', 'speed_variance_across_drivers_rolling_avg', 'speed_variance_across_drivers_trend', 'total_brake_events_rolling_avg', 'total_brake_events_trend', 'total_off_track_rolling_avg', 'total_off_track_trend']

1. ESCALATION PATTERN ANALYSIS
Features with positive escalation trends (higher values before SC):
  speed_variance_across_drivers_trend | Trend:  0.456 | Peak near SC: True
  total_brake_events_trend       | Trend:  0.407 | Peak near SC: True

2. EARLY vs LATE LAP COMPARISON
Features showing significant increase from early to late laps:



In [None]:
import pandas as pd
import numpy as np
from scipy import stats
import warnings
warnings.filterwarnings('ignore')

def create_sudden_change_features(temporal_features_df: pd.DataFrame, lap_features_df: pd.DataFrame):
    """
    Create features designed to detect sudden changes and threshold effects
    rather than gradual escalation patterns.

    The key insight: Racing incidents are more likely caused by sudden, unexpected
    changes rather than gradual escalation that teams would notice and address.
    """

    print("=== SUDDEN CHANGE FEATURE ENGINEERING ===")
    print("Focus: Detecting unexpected deviations and threshold effects")
    print()

    # Work with copies to avoid modifying original data
    temporal_df = temporal_features_df.copy()

    # Define our risk-related base features
    risk_features = [
        'speed_variance_across_drivers', 'speed_range_across_drivers',
        'total_brake_events', 'total_off_track', 'laptime_variance'
    ]

    # Remove existing trend features that might be noise
    base_features = [col for col in temporal_df.columns
                    if not any(suffix in col for suffix in ['_trend', '_rolling', '_avg'])]

    temporal_clean = temporal_df[base_features].copy()

    # 1. SUDDEN SPIKE DETECTION
    print("1. Creating sudden spike detection features...")

    for feature in risk_features:
        if feature in temporal_clean.columns:
            values = temporal_clean[feature].fillna(0)

            # Spike detection: Is current value much higher than recent average?
            recent_baseline = values.rolling(window=3, min_periods=1).mean().shift(1)  # Exclude current lap
            temporal_clean[f'{feature}_spike_ratio'] = values / (recent_baseline + 0.001)  # Avoid division by zero

            # Z-score spike: How many standard deviations above the running mean?
            running_mean = values.expanding().mean().shift(1)
            running_std = values.expanding().std().shift(1)
            temporal_clean[f'{feature}_zscore'] = (values - running_mean) / (running_std + 0.001)

            # Sudden jump: Large increase from previous lap
            temporal_clean[f'{feature}_jump'] = values.diff()
            temporal_clean[f'{feature}_jump_pct'] = values.pct_change().fillna(0)

    # 2. THRESHOLD CROSSING INDICATORS
    print("2. Creating threshold crossing indicators...")

    for feature in risk_features:
        if feature in temporal_clean.columns:
            values = temporal_clean[feature].fillna(0)

            # Historical percentile: Where does current value rank historically?
            temporal_clean[f'{feature}_percentile'] = values.rank(pct=True)

            # Threshold flags: Does current value exceed historical norms?
            p75 = values.quantile(0.75) if len(values) > 1 else values.iloc[0] if len(values) > 0 else 0
            p90 = values.quantile(0.90) if len(values) > 1 else values.iloc[0] if len(values) > 0 else 0

            temporal_clean[f'{feature}_above_p75'] = (values > p75).astype(int)
            temporal_clean[f'{feature}_above_p90'] = (values > p90).astype(int)

    # 3. VOLATILITY AND INSTABILITY MEASURES
    print("3. Creating volatility and instability measures...")

    for feature in risk_features:
        if feature in temporal_clean.columns:
            values = temporal_clean[feature].fillna(0)

            # Recent volatility: Standard deviation of last 3 laps
            temporal_clean[f'{feature}_recent_volatility'] = values.rolling(3).std()

            # Coefficient of variation: Volatility relative to mean
            rolling_mean = values.rolling(3).mean()
            rolling_std = values.rolling(3).std()
            temporal_clean[f'{feature}_cv'] = rolling_std / (rolling_mean + 0.001)

            # Direction changes: How often does the trend reverse?
            direction = values.diff().apply(lambda x: 1 if x > 0 else (-1 if x < 0 else 0))
            direction_changes = (direction.diff() != 0).rolling(3).sum()
            temporal_clean[f'{feature}_direction_changes'] = direction_changes

    # 4. MULTI-FEATURE ANOMALY INDICATORS
    print("4. Creating multi-feature anomaly indicators...")

    # Create composite risk score: How many features are simultaneously elevated?
    risk_flags = []
    for feature in risk_features:
        if f'{feature}_above_p75' in temporal_clean.columns:
            risk_flags.append(f'{feature}_above_p75')

    if risk_flags:
        temporal_clean['concurrent_risk_factors'] = temporal_clean[risk_flags].sum(axis=1)
        temporal_clean['multiple_risks_active'] = (temporal_clean['concurrent_risk_factors'] >= 2).astype(int)

    # Sudden deterioration index: Are multiple factors getting worse simultaneously?
    spike_features = [col for col in temporal_clean.columns if '_spike_ratio' in col]
    if spike_features:
        # Count how many features are spiking (>1.5x recent average)
        spike_flags = temporal_clean[spike_features] > 1.5
        temporal_clean['simultaneous_spikes'] = spike_flags.sum(axis=1)

    # 5. DRIVER-LEVEL SUDDEN CHANGE INDICATORS
    print("5. Creating driver-level sudden change indicators...")

    # Calculate per-lap driver behavior anomalies
    driver_anomalies_per_lap = []

    for lap_num in sorted(lap_features_df['LapNumber'].unique()):
        lap_data = lap_features_df[lap_features_df['LapNumber'] == lap_num].copy()

        anomaly_stats = {'LapNumber': lap_num}

        # Speed anomalies: How many drivers have unusual speed patterns?
        if 'avg_speed' in lap_data.columns and len(lap_data) > 1:
            speed_z_scores = np.abs(stats.zscore(lap_data['avg_speed'].fillna(lap_data['avg_speed'].mean())))
            anomaly_stats['drivers_with_speed_anomalies'] = (speed_z_scores > 1.5).sum()

        # Brake anomalies: Unusual braking behavior
        if 'brake_frequency' in lap_data.columns and len(lap_data) > 1:
            brake_z_scores = np.abs(stats.zscore(lap_data['brake_frequency'].fillna(lap_data['brake_frequency'].mean())))
            anomaly_stats['drivers_with_brake_anomalies'] = (brake_z_scores > 1.5).sum()

        # Consistency anomalies: Drivers struggling with car control
        if 'speed_std' in lap_data.columns and len(lap_data) > 1:
            consistency_z_scores = np.abs(stats.zscore(lap_data['speed_std'].fillna(lap_data['speed_std'].mean())))
            anomaly_stats['drivers_struggling_consistency'] = (consistency_z_scores > 1.5).sum()

        driver_anomalies_per_lap.append(anomaly_stats)

    driver_anomalies_df = pd.DataFrame(driver_anomalies_per_lap)

    # Merge driver anomaly data
    temporal_enhanced = temporal_clean.merge(driver_anomalies_df, on='LapNumber', how='left')

    # 6. RECENT CHANGE ACCELERATION
    print("6. Creating change acceleration features...")

    # Focus on the most recent changes (lap 5 to lap 6 vs lap 4 to lap 5)
    for feature in risk_features:
        if feature in temporal_enhanced.columns:
            # First derivative (change)
            change = temporal_enhanced[feature].diff()
            # Second derivative (acceleration of change)
            acceleration = change.diff()
            temporal_enhanced[f'{feature}_acceleration'] = acceleration

            # Is the rate of change accelerating in the most recent lap?
            temporal_enhanced[f'{feature}_accelerating'] = (acceleration > 0).astype(int)

    # Create target variable
    temporal_enhanced['sc_next_lap'] = 0
    temporal_enhanced.loc[temporal_enhanced['LapNumber'] == 6, 'sc_next_lap'] = 1

    print(f"\nFeature engineering complete!")
    print(f"Created {len(temporal_enhanced.columns)} total features")
    print(f"Original features: {len(temporal_features_df.columns)}")
    print(f"New features added: {len(temporal_enhanced.columns) - len(temporal_features_df.columns)}")

    return temporal_enhanced

def analyze_sudden_change_signals(enhanced_features_df: pd.DataFrame):
    """
    Analyze which sudden change features show the strongest predictive signals.
    """

    print("\n=== SUDDEN CHANGE SIGNAL ANALYSIS ===")

    # Focus on the final lap before safety car (lap 6)
    lap6_data = enhanced_features_df[enhanced_features_df['LapNumber'] == 6].iloc[0]

    print("LAP 6 (Pre-Safety Car) ANOMALY INDICATORS:")
    print("=" * 50)

    # Check threshold crossing indicators
    threshold_features = [col for col in enhanced_features_df.columns if '_above_p' in col]
    active_thresholds = []
    for feature in threshold_features:
        if lap6_data[feature] == 1:
            active_thresholds.append(feature)

    print(f"Active threshold warnings: {len(active_thresholds)}")
    for feature in active_thresholds:
        print(f"  ✓ {feature}")

    # Check spike indicators
    spike_features = [col for col in enhanced_features_df.columns if '_spike_ratio' in col]
    significant_spikes = []
    for feature in spike_features:
        if lap6_data[feature] > 1.5:  # 50% above recent average
            significant_spikes.append((feature, lap6_data[feature]))

    print(f"\nSignificant spikes (>1.5x recent average): {len(significant_spikes)}")
    for feature, ratio in significant_spikes:
        print(f"  ⚠ {feature}: {ratio:.2f}x above recent average")

    # Check z-score anomalies
    zscore_features = [col for col in enhanced_features_df.columns if '_zscore' in col]
    extreme_zscores = []
    for feature in zscore_features:
        if abs(lap6_data[feature]) > 1.5:  # More than 1.5 standard deviations
            extreme_zscores.append((feature, lap6_data[feature]))

    print(f"\nExtreme z-scores (>1.5 std dev): {len(extreme_zscores)}")
    for feature, zscore in extreme_zscores:
        print(f"  📊 {feature}: {zscore:.2f} std dev from mean")

    # Check multi-feature indicators
    if 'concurrent_risk_factors' in enhanced_features_df.columns:
        concurrent_risks = lap6_data['concurrent_risk_factors']
        print(f"\nConcurrent risk factors active: {concurrent_risks}")

    if 'simultaneous_spikes' in enhanced_features_df.columns:
        simultaneous_spikes = lap6_data['simultaneous_spikes']
        print(f"Simultaneous feature spikes: {simultaneous_spikes}")

    return active_thresholds, significant_spikes, extreme_zscores

# Execute the enhanced feature engineering
print("Creating sudden change detection features...")
enhanced_features = create_sudden_change_features(temporal_features_df, lap_features_df)

print("\nAnalyzing sudden change signals...")
active_thresholds, significant_spikes, extreme_zscores = analyze_sudden_change_signals(enhanced_features)

Creating sudden change detection features...
=== SUDDEN CHANGE FEATURE ENGINEERING ===
Focus: Detecting unexpected deviations and threshold effects

1. Creating sudden spike detection features...
2. Creating threshold crossing indicators...
3. Creating volatility and instability measures...
4. Creating multi-feature anomaly indicators...
5. Creating driver-level sudden change indicators...
6. Creating change acceleration features...

Feature engineering complete!
Created 81 total features
Original features: 21
New features added: 60

Analyzing sudden change signals...

=== SUDDEN CHANGE SIGNAL ANALYSIS ===
LAP 6 (Pre-Safety Car) ANOMALY INDICATORS:
  ✓ speed_range_across_drivers_above_p75
  ✓ speed_range_across_drivers_above_p90

Significant spikes (>1.5x recent average): 1
  ⚠ speed_variance_across_drivers_spike_ratio: 1.78x above recent average

Extreme z-scores (>1.5 std dev): 1
  📊 speed_range_across_drivers_zscore: 2.02 std dev from mean

Concurrent risk factors active: 1.0
Simult

The analysis shows a focused anomaly centered around speed range across drivers, which is exactly the type of signal we hoped to find. Think of this metric as measuring how spread out the field becomes in terms of speed - when some drivers are going much faster or slower than others on the same lap, it indicates the field is becoming unstable.
The fact that this single factor triggered multiple types of anomaly detection gives us confidence in the signal. We're seeing the same underlying phenomenon - unusual speed dispersion - being detected through different analytical lenses. The speed range across drivers crossed into both the 75th and 90th percentile thresholds, meaning lap 6 had one of the most extreme speed disparities in the entire pre-safety car period.

The spike ratio of 1.78 tells us that the speed range in lap 6 was nearly double what we would expect based on the recent trend from laps 3-5. This suggests something changed dramatically in that final lap before the safety car. Rather than a gradual build-up of problems, we're seeing evidence of a sudden shift in field dynamics.

The z-score of 2.02 standard deviations is particularly meaningful because it places lap 6's speed range in extremely rare territory statistically. In a normal distribution, only about 2% of observations would be this extreme. This mathematical validation supports our hypothesis that we're detecting genuine anomalies rather than normal racing variation.


## Toy Models

In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import IsolationForest
from sklearn.svm import OneClassSVM
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve
from sklearn.model_selection import cross_val_score
import matplotlib.pyplot as plt
import seaborn as sns
from typing import Tuple, Dict, Any
import warnings
warnings.filterwarnings('ignore')

def prepare_model_features(enhanced_features_df: pd.DataFrame) -> Tuple[pd.DataFrame, pd.Series, list]:
    """
    Prepare features for model training with focus on sudden change indicators.

    The key insight here is that we want to prioritize features that showed
    strong signals in our lap 6 analysis, particularly those related to
    speed variance and threshold crossing indicators.
    """

    print("=== MODEL FEATURE PREPARATION ===")

    # Create a copy to avoid modifying original data
    df = enhanced_features_df.copy()

    # Define feature categories based on our findings
    # These categories reflect different types of sudden change detection
    priority_features = [
        # Speed-related anomalies (our strongest signals)
        'speed_range_across_drivers', 'speed_variance_across_drivers',
        'speed_range_across_drivers_spike_ratio', 'speed_variance_across_drivers_spike_ratio',
        'speed_range_across_drivers_zscore', 'speed_variance_across_drivers_zscore',
        'speed_range_across_drivers_above_p75', 'speed_range_across_drivers_above_p90',

        # Multi-feature risk indicators
        'concurrent_risk_factors', 'simultaneous_spikes',

        # Driver behavior anomalies
        'drivers_with_speed_anomalies', 'drivers_with_brake_anomalies', 'drivers_struggling_consistency'
    ]

    # Secondary features (other sudden change indicators)
    secondary_features = [col for col in df.columns if any(keyword in col for keyword in [
        '_spike_ratio', '_zscore', '_jump', '_above_p', '_recent_volatility',
        '_acceleration', '_accelerating', '_direction_changes'
    ]) and col not in priority_features]

    # Base features (for context)
    base_features = [
        'total_brake_events', 'total_off_track', 'laptime_variance'
    ]

    # Combine all features, prioritizing those that showed strong signals
    all_model_features = priority_features + secondary_features + base_features

    # Filter to only include features that exist in our dataset
    available_features = [f for f in all_model_features if f in df.columns]

    print(f"Priority features available: {len([f for f in priority_features if f in df.columns])}")
    print(f"Secondary features available: {len([f for f in secondary_features if f in df.columns])}")
    print(f"Base features available: {len([f for f in base_features if f in df.columns])}")
    print(f"Total modeling features: {len(available_features)}")

    # Prepare feature matrix and target
    X = df[available_features].fillna(0)  # Fill NaN with 0 for missing data
    y = df['sc_next_lap']

    # Remove any remaining infinite values
    X = X.replace([np.inf, -np.inf], 0)

    print(f"Feature matrix shape: {X.shape}")
    print(f"Target distribution: {y.value_counts().to_dict()}")

    return X, y, available_features

def train_anomaly_detection_models(X: pd.DataFrame, y: pd.Series) -> Dict[str, Any]:
    """
    Train multiple anomaly detection models to identify pre-safety car conditions.

    We're using anomaly detection because we're looking for rare, unusual patterns
    that deviate from normal racing conditions. This is more appropriate than
    traditional classification when dealing with sudden change detection.
    """

    print("\n=== ANOMALY DETECTION MODEL TRAINING ===")

    models = {}

    # Scale features for anomaly detection algorithms
    # Using RobustScaler because it's less sensitive to outliers than StandardScaler
    scaler = RobustScaler()
    X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns, index=X.index)

    print("Training anomaly detection models...")

    # 1. Isolation Forest
    # This algorithm isolates anomalies by randomly selecting features and split values
    # Anomalies are easier to isolate, so they require fewer splits
    print("  - Isolation Forest: Isolates anomalies through random feature splits")
    iso_forest = IsolationForest(
        contamination=0.2,  # Expect ~20% of laps to be somewhat anomalous
        random_state=42,
        n_estimators=100
    )
    iso_forest.fit(X_scaled)

    # Get anomaly scores (lower scores = more anomalous)
    iso_scores = iso_forest.decision_function(X_scaled)
    iso_predictions = iso_forest.predict(X_scaled)  # -1 for anomaly, 1 for normal

    models['isolation_forest'] = {
        'model': iso_forest,
        'scores': iso_scores,
        'predictions': iso_predictions,
        'scaler': scaler
    }

    # 2. One-Class SVM
    # This finds a boundary that encloses the normal data points
    # Points outside this boundary are considered anomalous
    print("  - One-Class SVM: Finds boundary around normal racing conditions")
    oc_svm = OneClassSVM(
        kernel='rbf',  # Radial basis function for non-linear boundaries
        gamma='scale',
        nu=0.2  # Expected fraction of anomalies
    )
    oc_svm.fit(X_scaled)

    svm_scores = oc_svm.decision_function(X_scaled)
    svm_predictions = oc_svm.predict(X_scaled)

    models['one_class_svm'] = {
        'model': oc_svm,
        'scores': svm_scores,
        'predictions': svm_predictions,
        'scaler': scaler
    }

    # 3. Logistic Regression (for comparison)
    # Traditional supervised learning approach
    print("  - Logistic Regression: Traditional supervised baseline")
    log_reg = LogisticRegression(random_state=42, max_iter=1000, class_weight='balanced')
    log_reg.fit(X_scaled, y)

    log_reg_probs = log_reg.predict_proba(X_scaled)[:, 1]  # Probability of safety car
    log_reg_predictions = log_reg.predict(X_scaled)

    models['logistic_regression'] = {
        'model': log_reg,
        'scores': log_reg_probs,
        'predictions': log_reg_predictions,
        'scaler': scaler
    }

    return models

def evaluate_models(models: Dict[str, Any], X: pd.DataFrame, y: pd.Series,
                   enhanced_features_df: pd.DataFrame) -> None:
    """
    Evaluate how well each model identifies the pre-safety car lap (lap 6).

    For anomaly detection, we're particularly interested in whether the models
    correctly identify lap 6 as anomalous while treating earlier laps as normal.
    """

    print("\n=== MODEL EVALUATION ===")

    # Get the index of lap 6 (our target lap)
    lap6_idx = enhanced_features_df[enhanced_features_df['LapNumber'] == 6].index[0]
    lap6_position = X.index.get_loc(lap6_idx)

    print(f"Evaluating models on lap 6 detection (position {lap6_position} in dataset)")
    print("="*60)

    for model_name, model_data in models.items():
        print(f"\n{model_name.upper().replace('_', ' ')}:")
        print("-" * 40)

        scores = model_data['scores']
        predictions = model_data['predictions']

        # For anomaly detection models, convert predictions to binary
        if model_name in ['isolation_forest', 'one_class_svm']:
            # -1 = anomaly, 1 = normal, so we convert to 0/1 format
            binary_preds = (predictions == -1).astype(int)

            # Check if lap 6 was identified as anomalous
            lap6_is_anomaly = binary_preds[lap6_position] == 1
            print(f"  Lap 6 identified as anomaly: {'✓ YES' if lap6_is_anomaly else '✗ NO'}")

            # Show anomaly score for lap 6
            lap6_score = scores[lap6_position]
            print(f"  Lap 6 anomaly score: {lap6_score:.3f}")

            # Compare to other laps
            other_laps_scores = np.concatenate([scores[:lap6_position], scores[lap6_position+1:]])
            percentile_rank = (scores[lap6_position] < other_laps_scores).mean() * 100
            print(f"  Lap 6 score percentile: {percentile_rank:.1f}% (lower = more anomalous)")

        else:  # Logistic regression
            lap6_prob = scores[lap6_position]
            lap6_pred = predictions[lap6_position]
            print(f"  Lap 6 safety car probability: {lap6_prob:.3f}")
            print(f"  Lap 6 prediction: {'✓ SAFETY CAR' if lap6_pred == 1 else '✗ NORMAL'}")

        # Show predictions for all laps
        print(f"  Predictions by lap:")
        for i, lap_num in enumerate(enhanced_features_df['LapNumber']):
            if model_name in ['isolation_forest', 'one_class_svm']:
                pred_symbol = "🚨" if (predictions == -1)[i] else "✓"
                score_val = scores[i]
                print(f"    Lap {int(lap_num)}: {pred_symbol} (score: {score_val:.3f})")
            else:
                prob_val = scores[i]
                pred_symbol = "🚨" if predictions[i] == 1 else "✓"
                print(f"    Lap {int(lap_num)}: {pred_symbol} (prob: {prob_val:.3f})")

def analyze_feature_importance(models: Dict[str, Any], X: pd.DataFrame,
                              available_features: list) -> None:
    """
    Analyze which features are most important for each model.

    This helps us understand what the models are learning and whether
    they're focusing on the right signals for safety car prediction.
    """

    print("\n=== FEATURE IMPORTANCE ANALYSIS ===")

    # Only logistic regression provides direct feature importance
    if 'logistic_regression' in models:
        log_reg_model = models['logistic_regression']['model']
        feature_importance = pd.DataFrame({
            'feature': available_features,
            'importance': np.abs(log_reg_model.coef_[0])
        }).sort_values('importance', ascending=False)

        print("LOGISTIC REGRESSION - Top 10 Most Important Features:")
        print("-" * 55)
        for idx, row in feature_importance.head(10).iterrows():
            print(f"  {row['feature'][:40]:40} | {row['importance']:.3f}")

    # For anomaly detection models, we can analyze which features
    # contribute most to the anomaly scores for lap 6
    print(f"\nFEATURE VALUES FOR LAP 6 (Pre-Safety Car):")
    print("-" * 50)

    # Get lap 6 data
    lap6_data = X.iloc[-1]  # Assuming lap 6 is the last row

    # Show the most extreme feature values
    feature_values = []
    for feature in available_features:
        value = lap6_data[feature]
        if not np.isnan(value) and value != 0:
            # Calculate how extreme this value is compared to other laps
            other_values = X[feature].iloc[:-1]  # All laps except lap 6
            if len(other_values) > 0 and other_values.std() > 0:
                z_score = (value - other_values.mean()) / other_values.std()
                feature_values.append((feature, value, z_score))

    # Sort by absolute z-score (most extreme values first)
    feature_values.sort(key=lambda x: abs(x[2]), reverse=True)

    print("Most extreme feature values in lap 6:")
    for feature, value, z_score in feature_values[:10]:
        direction = "↑" if z_score > 0 else "↓"
        print(f"  {feature[:35]:35} | {value:8.3f} | {direction} {abs(z_score):5.2f} std dev")

# Execute the model development pipeline
print("Starting model development pipeline...")

# Prepare features for modeling
X, y, available_features = prepare_model_features(enhanced_features)

# Train anomaly detection models
models = train_anomaly_detection_models(X, y)

# Evaluate model performance
evaluate_models(models, X, y, enhanced_features)

# Analyze feature importance
analyze_feature_importance(models, X, available_features)

Starting model development pipeline...
=== MODEL FEATURE PREPARATION ===
Priority features available: 13
Secondary features available: 44
Base features available: 3
Total modeling features: 60
Feature matrix shape: (6, 60)
Target distribution: {0: 5, 1: 1}

=== ANOMALY DETECTION MODEL TRAINING ===
Training anomaly detection models...
  - Isolation Forest: Isolates anomalies through random feature splits
  - One-Class SVM: Finds boundary around normal racing conditions
  - Logistic Regression: Traditional supervised baseline

=== MODEL EVALUATION ===
Evaluating models on lap 6 detection (position 5 in dataset)

ISOLATION FOREST:
----------------------------------------
  Lap 6 identified as anomaly: ✗ NO
  Lap 6 anomaly score: 0.000
  Lap 6 score percentile: 80.0% (lower = more anomalous)
  Predictions by lap:
    Lap 1: 🚨 (score: -0.006)
    Lap 2: ✓ (score: 0.045)
    Lap 3: ✓ (score: 0.012)
    Lap 4: ✓ (score: 0.125)
    Lap 5: ✓ (score: 0.107)
    Lap 6: ✓ (score: 0.000)

ONE CLASS

## Validating the approach against another race

### Setup

In [None]:
session = f1.get_session(season, 'São Paulo Grand Prix', 'R')
session.load()

core           INFO 	Loading data for São Paulo Grand Prix - Race [v3.5.3]
INFO:fastf1.fastf1.core:Loading data for São Paulo Grand Prix - Race [v3.5.3]
req            INFO 	Using cached data for session_info
INFO:fastf1.fastf1.req:Using cached data for session_info
req            INFO 	Using cached data for driver_info
INFO:fastf1.fastf1.req:Using cached data for driver_info
req            INFO 	Using cached data for session_status_data
INFO:fastf1.fastf1.req:Using cached data for session_status_data
req            INFO 	Using cached data for lap_count
INFO:fastf1.fastf1.req:Using cached data for lap_count
req            INFO 	Using cached data for track_status_data
INFO:fastf1.fastf1.req:Using cached data for track_status_data
req            INFO 	Using cached data for _extended_timing_data
INFO:fastf1.fastf1.req:Using cached data for _extended_timing_data
req            INFO 	Using cached data for timing_app_data
INFO:fastf1.fastf1.req:Using cached data for timing_app_data
core     

In [None]:
laps = session.laps

display(session.track_status)

In [None]:
track_status_df = session.track_status

track_status_df['is_sc'] = track_status_df['Status'] == '4'
track_status_df['is_vsc'] = track_status_df['Status'] == '6'

laps_with_sc = add_sc_flag_to_laps(laps, track_status_df)

# prepare the data for joining
laps_with_sc = laps_with_sc.reset_index(drop=True)

weather_data = session.laps.get_weather_data()
weather_data = weather_data.reset_index(drop=True)

# exclude the 'Time' column from weather data when joining
joined = pd.concat([laps, weather_data.loc[:, ~(weather_data.columns == 'Time')]], axis=1)

display(joined.head())
display(len(joined))

Unnamed: 0,Time,Driver,DriverNumber,LapTime,LapNumber,Stint,PitOutTime,PitInTime,Sector1Time,Sector2Time,...,DeletedReason,FastF1Generated,IsAccurate,AirTemp,Humidity,Pressure,Rainfall,TrackTemp,WindDirection,WindSpeed
0,0 days 01:13:34.207000,VER,1,0 days 00:01:39.161000,1.0,1.0,NaT,NaT,NaT,0 days 00:00:49.176000,...,,False,False,22.5,78.0,926.6,True,25.1,259,0.9
1,0 days 01:15:01.341000,VER,1,0 days 00:01:27.134000,2.0,1.0,NaT,NaT,0 days 00:00:21.981000,0 days 00:00:46.412000,...,,False,True,22.4,78.0,926.6,True,24.6,265,0.7
2,0 days 01:16:27.581000,VER,1,0 days 00:01:26.240000,3.0,1.0,NaT,NaT,0 days 00:00:21.716000,0 days 00:00:45.980000,...,,False,True,22.4,78.0,926.7,True,24.6,72,1.2
3,0 days 01:17:54.283000,VER,1,0 days 00:01:26.702000,4.0,1.0,NaT,NaT,0 days 00:00:22.151000,0 days 00:00:46.150000,...,,False,True,22.2,77.0,926.6,True,24.4,188,1.1
4,0 days 01:19:19.677000,VER,1,0 days 00:01:25.394000,5.0,1.0,NaT,NaT,0 days 00:00:21.773000,0 days 00:00:45.180000,...,,False,True,22.2,78.0,926.6,True,24.6,201,0.7


1135

In [None]:
car_data = session.laps.iloc[0].get_car_data()

car_data

Unnamed: 0,Date,RPM,Speed,nGear,Throttle,Brake,DRS,Source,Time,SessionTime
0,2024-11-03 15:49:57.625,10179.0,0.0,2,16.0,True,1,car,0 days 00:00:00.109000,0 days 01:11:54.879000
1,2024-11-03 15:49:57.905,10108.0,0.0,2,16.0,True,1,car,0 days 00:00:00.389000,0 days 01:11:55.159000
2,2024-11-03 15:49:58.146,8708.0,1.0,2,16.0,False,1,car,0 days 00:00:00.630000,0 days 01:11:55.400000
3,2024-11-03 15:49:58.506,6048.0,13.0,2,16.0,False,1,car,0 days 00:00:00.990000,0 days 01:11:55.760000
4,2024-11-03 15:49:58.826,4423.0,20.0,2,16.0,False,1,car,0 days 00:00:01.310000,0 days 01:11:56.080000
...,...,...,...,...,...,...,...,...,...,...
366,2024-11-03 15:51:35.985,10884.0,311.0,8,100.0,False,1,car,0 days 00:01:38.469000,0 days 01:13:33.239000
367,2024-11-03 15:51:36.225,10932.0,311.0,8,100.0,False,1,car,0 days 00:01:38.709000,0 days 01:13:33.479000
368,2024-11-03 15:51:36.585,10929.0,311.0,8,100.0,False,1,car,0 days 00:01:39.069000,0 days 01:13:33.839000
369,2024-11-03 15:51:36.785,10935.0,312.0,8,100.0,False,1,car,0 days 00:01:39.269000,0 days 01:13:34.039000


### Evaluating

In [None]:
import pandas as pd
import numpy as np
from typing import List, Dict, Tuple, Optional
import warnings
warnings.filterwarnings('ignore')

def create_sliding_windows(laps_df: pd.DataFrame, track_status_df: pd.DataFrame,
                          window_size: int = 5, min_gap: int = 3) -> List[Dict]:
    """
    Create sliding windows for safety car prediction training.

    This function identifies all safety car deployments in a race and creates
    training windows for each one. Each window examines the laps leading up
    to a safety car deployment to learn predictive patterns.

    Args:
        laps_df: DataFrame containing lap data
        track_status_df: DataFrame containing track status information
        window_size: Number of laps to examine before each safety car (default: 5)
        min_gap: Minimum laps between safety car deployments to avoid overlap (default: 3)

    Returns:
        List of dictionaries, each containing window data and metadata
    """

    print("=== CREATING SLIDING WINDOWS FOR SAFETY CAR ANALYSIS ===")

    # Find all safety car deployments
    sc_deployments = track_status_df[track_status_df['is_sc'] == True].copy()

    if sc_deployments.empty:
        print("No safety car deployments found in track status data")
        return []

    print(f"Found {len(sc_deployments)} safety car deployment(s)")

    # Map safety car times to lap numbers
    sc_laps = []
    for _, sc_row in sc_deployments.iterrows():
        sc_time = sc_row['Time']

        # Find the lap that was active when safety car was deployed
        # We look for laps that started before the SC and ended after (or were ongoing)
        potential_laps = laps_df[
            (laps_df['LapStartTime'] <= sc_time) &
            ((laps_df['LapStartTime'] + laps_df['LapTime']) >= sc_time)
        ]

        if not potential_laps.empty:
            # Take the earliest matching lap (most conservative approach)
            sc_lap_num = potential_laps['LapNumber'].min()
            sc_laps.append(sc_lap_num)
            print(f"  Safety car deployed during lap {sc_lap_num} at time {sc_time}")
        else:
            # Fallback: find the closest lap by start time
            time_diffs = np.abs(laps_df['LapStartTime'] - sc_time)
            closest_lap = laps_df.loc[time_diffs.idxmin(), 'LapNumber']
            sc_laps.append(closest_lap)
            print(f"  Safety car deployment mapped to closest lap {closest_lap} (fallback method)")

    # Remove duplicate lap numbers and sort
    sc_laps = sorted(list(set(sc_laps)))
    print(f"Unique safety car laps: {sc_laps}")

    # Create training windows
    windows = []

    for sc_lap in sc_laps:
        # Define the analysis window (laps before the safety car)
        window_start = max(1, sc_lap - window_size)  # Don't go below lap 1
        window_end = sc_lap - 1  # Last lap before safety car

        # Check if we have enough laps for a meaningful window
        if window_end < window_start:
            print(f"  Skipping SC lap {sc_lap}: insufficient preceding laps")
            continue

        # Check for conflicts with other safety car deployments
        conflicting_sc = [other_sc for other_sc in sc_laps
                         if other_sc != sc_lap and
                         window_start <= other_sc <= window_end]

        if conflicting_sc:
            print(f"  Skipping SC lap {sc_lap}: conflicts with other SC deployments {conflicting_sc}")
            continue

        # Extract laps in this window
        window_laps = laps_df[
            (laps_df['LapNumber'] >= window_start) &
            (laps_df['LapNumber'] <= window_end)
        ].copy()

        if len(window_laps) == 0:
            print(f"  Skipping SC lap {sc_lap}: no lap data found for window")
            continue

        # Create window metadata
        window_info = {
            'sc_lap': sc_lap,
            'window_start': window_start,
            'window_end': window_end,
            'actual_laps': sorted(window_laps['LapNumber'].unique()),
            'laps_data': window_laps,
            'num_drivers': window_laps['Driver'].nunique(),
            'total_laps_in_window': len(window_laps)
        }

        windows.append(window_info)
        print(f"  Created window for SC lap {sc_lap}: analyzing laps {window_start}-{window_end}")
        print(f"    - {len(window_laps)} total lap records from {window_info['num_drivers']} drivers")

    print(f"\nSuccessfully created {len(windows)} training windows")
    return windows

def process_window_features(window: Dict, feature_engineering_func) -> pd.DataFrame:
    """
    Apply feature engineering to a single training window.

    This function takes a window of lap data and applies our sudden change
    detection feature engineering to create model-ready features. Each window
    becomes one training example for our machine learning model.

    Args:
        window: Dictionary containing window data and metadata
        feature_engineering_func: Function to apply feature engineering

    Returns:
        DataFrame with engineered features for this window
    """

    print(f"Processing features for SC lap {window['sc_lap']} window...")

    # Extract the lap data from this window
    window_laps = window['laps_data']

    # Apply the same feature engineering pipeline we used before
    # First, we need to create temporal features (aggregated by lap number)
    lap_features = []

    for lap_num in sorted(window_laps['LapNumber'].unique()):
        lap_data = window_laps[window_laps['LapNumber'] == lap_num]

        # Try to get detailed data for each lap (same as our original approach)
        lap_feature_dict = {'LapNumber': lap_num}

        # Since we might not have access to get_car_data() etc. in this context,
        # we'll use the basic lap data that's available
        try:
            # Use available basic features
            lap_feature_dict['avg_laptime'] = lap_data['LapTime'].mean().total_seconds() if not lap_data['LapTime'].isna().all() else np.nan
            lap_feature_dict['laptime_std'] = lap_data['LapTime'].std().total_seconds() if not lap_data['LapTime'].isna().all() else np.nan
            lap_feature_dict['num_drivers'] = len(lap_data)

            # Add any other available features from the lap data
            numeric_cols = lap_data.select_dtypes(include=[np.number]).columns
            for col in numeric_cols:
                if col not in ['LapNumber']:
                    lap_feature_dict[f'{col}_mean'] = lap_data[col].mean()
                    lap_feature_dict[f'{col}_std'] = lap_data[col].std()
                    lap_feature_dict[f'{col}_max'] = lap_data[col].max()
                    lap_feature_dict[f'{col}_min'] = lap_data[col].min()

        except Exception as e:
            print(f"    Warning: Error processing lap {lap_num}: {e}")
            continue

        lap_features.append(lap_feature_dict)

    if not lap_features:
        print(f"    Error: No features could be extracted for window")
        return pd.DataFrame()

    # Convert to DataFrame
    lap_features_df = pd.DataFrame(lap_features)

    # Create temporal aggregations (similar to our original approach)
    temporal_features = create_window_temporal_features(lap_features_df)

    # Apply sudden change feature engineering
    if len(temporal_features) > 0:
        enhanced_features = apply_sudden_change_features(temporal_features)

        # Add target variable (1 for the final lap in the window, 0 for others)
        enhanced_features['sc_next_lap'] = 0
        enhanced_features.loc[enhanced_features['LapNumber'] == window['window_end'], 'sc_next_lap'] = 1

        # Add metadata about this window
        enhanced_features['source_window'] = f"SC_lap_{window['sc_lap']}"
        enhanced_features['sc_lap'] = window['sc_lap']

        return enhanced_features
    else:
        print(f"    Warning: No temporal features created for window")
        return pd.DataFrame()

def create_window_temporal_features(lap_features_df: pd.DataFrame) -> pd.DataFrame:
    """
    Create temporal aggregations for a single window.
    This is a simplified version of our original temporal feature creation.
    """

    if len(lap_features_df) == 0:
        return pd.DataFrame()

    # Create basic temporal features
    temporal_features = lap_features_df.copy()

    # Add lap-to-lap changes for key metrics
    numeric_cols = temporal_features.select_dtypes(include=[np.number]).columns
    numeric_cols = [col for col in numeric_cols if col != 'LapNumber']

    for col in numeric_cols:
        if col in temporal_features.columns:
            temporal_features[f'{col}_change'] = temporal_features[col].diff()
            temporal_features[f'{col}_pct_change'] = temporal_features[col].pct_change().fillna(0)

    return temporal_features

def apply_sudden_change_features(temporal_features_df: pd.DataFrame) -> pd.DataFrame:
    """
    Apply sudden change detection features to a window.
    This is a simplified version of our original sudden change feature engineering.
    """

    if len(temporal_features_df) == 0:
        return pd.DataFrame()

    enhanced_df = temporal_features_df.copy()

    # Find numeric columns that could indicate risk
    risk_columns = [col for col in enhanced_df.columns
                   if any(keyword in col.lower() for keyword in
                         ['time', 'speed', 'std', 'change', 'mean'])
                   and col not in ['LapNumber', 'sc_next_lap']]

    # Apply spike detection to key features
    for col in risk_columns:
        if col in enhanced_df.columns and enhanced_df[col].notna().sum() > 1:
            values = enhanced_df[col].fillna(0)

            # Spike ratio: current value vs recent average
            recent_avg = values.rolling(window=3, min_periods=1).mean().shift(1)
            enhanced_df[f'{col}_spike_ratio'] = values / (recent_avg + 0.001)

            # Z-score: how extreme is this value?
            if values.std() > 0:
                enhanced_df[f'{col}_zscore'] = (values - values.mean()) / values.std()

            # Percentile rank
            enhanced_df[f'{col}_percentile'] = values.rank(pct=True)

            # Threshold indicators
            if len(values) > 1:
                p75 = values.quantile(0.75)
                enhanced_df[f'{col}_above_p75'] = (values > p75).astype(int)

    return enhanced_df

def analyze_multi_window_patterns(all_windows_features: List[pd.DataFrame]) -> None:
    """
    Analyze patterns across multiple training windows to identify consistent
    predictive signals that appear before different safety car deployments.
    """

    print("\n=== MULTI-WINDOW PATTERN ANALYSIS ===")

    if not all_windows_features:
        print("No windows to analyze")
        return

    print(f"Analyzing patterns across {len(all_windows_features)} windows")

    # Combine all pre-SC laps (where sc_next_lap = 1)
    pre_sc_laps = []
    normal_laps = []

    for window_df in all_windows_features:
        if not window_df.empty:
            pre_sc = window_df[window_df['sc_next_lap'] == 1]
            normal = window_df[window_df['sc_next_lap'] == 0]

            if not pre_sc.empty:
                pre_sc_laps.append(pre_sc.iloc[0])  # Take the pre-SC lap

            for _, normal_lap in normal.iterrows():
                normal_laps.append(normal_lap)

    if not pre_sc_laps:
        print("No pre-safety car laps found")
        return

    pre_sc_df = pd.DataFrame(pre_sc_laps)
    normal_df = pd.DataFrame(normal_laps) if normal_laps else pd.DataFrame()

    print(f"Found {len(pre_sc_df)} pre-safety car laps and {len(normal_df)} normal laps")

    # Identify features that are consistently extreme across pre-SC laps
    feature_cols = [col for col in pre_sc_df.columns
                   if col not in ['LapNumber', 'sc_next_lap', 'source_window', 'sc_lap']]

    print("\nFeatures showing consistent elevation in pre-SC laps:")
    print("-" * 60)

    consistent_signals = []

    for feature in feature_cols:
        if feature in pre_sc_df.columns and not pre_sc_df[feature].isna().all():
            pre_sc_values = pre_sc_df[feature].dropna()

            if len(pre_sc_values) > 0:
                # Check if this feature shows consistently high values
                if len(normal_df) > 0 and feature in normal_df.columns:
                    normal_values = normal_df[feature].dropna()
                    if len(normal_values) > 0:
                        # Compare means
                        pre_sc_mean = pre_sc_values.mean()
                        normal_mean = normal_values.mean()

                        if normal_mean != 0:
                            ratio = pre_sc_mean / normal_mean
                            if ratio > 1.5:  # Pre-SC laps show 50% higher values
                                consistent_signals.append((feature, ratio, pre_sc_mean, normal_mean))

                # Also check for high percentile features (above_p75, above_p90)
                if 'above_p' in feature:
                    activation_rate = pre_sc_values.mean()
                    if activation_rate >= 0.5:  # Activated in at least 50% of pre-SC laps
                        print(f"  {feature[:50]:50} | Active in {activation_rate:.1%} of pre-SC laps")

    # Sort by ratio and show top signals
    consistent_signals.sort(key=lambda x: x[1], reverse=True)

    for feature, ratio, pre_sc_mean, normal_mean in consistent_signals[:10]:
        print(f"  {feature[:50]:50} | {ratio:.2f}x higher | Pre-SC: {pre_sc_mean:.3f}, Normal: {normal_mean:.3f}")

    return consistent_signals

# Example usage function that ties everything together
def run_sliding_window_analysis(laps_df: pd.DataFrame, track_status_df: pd.DataFrame,
                               window_size: int = 5) -> Tuple[List[pd.DataFrame], List]:
    """
    Complete pipeline for sliding window safety car analysis.

    This function orchestrates the entire process: finding safety car deployments,
    creating training windows, engineering features, and analyzing patterns.
    """

    print("Starting sliding window safety car analysis...")

    # Step 1: Create sliding windows
    windows = create_sliding_windows(laps_df, track_status_df, window_size)

    if not windows:
        print("No valid training windows could be created")
        return [], []

    # Step 2: Process features for each window
    all_window_features = []

    for i, window in enumerate(windows):
        print(f"\nProcessing window {i+1}/{len(windows)}:")
        window_features = process_window_features(window, None)  # We'll implement the function inline

        if not window_features.empty:
            all_window_features.append(window_features)
            print(f"  Created {len(window_features)} feature rows for this window")
        else:
            print(f"  Failed to create features for this window")

    # Step 3: Analyze patterns across windows
    if all_window_features:
        consistent_signals = analyze_multi_window_patterns(all_window_features)
    else:
        consistent_signals = []

    return all_window_features, consistent_signals

# The user can call this function with their new race data:
# all_windows, signals = run_sliding_window_analysis(new_race_laps, new_race_track_status)

In [None]:
# Assuming you have the lap data and track status for the new race
all_windows, consistent_signals = run_sliding_window_analysis(
    laps,
    track_status_df,
    window_size=5
)

Starting sliding window safety car analysis...
=== CREATING SLIDING WINDOWS FOR SAFETY CAR ANALYSIS ===
Found 2 safety car deployment(s)
  Safety car deployed during lap 29.0 at time 0 days 01:54:50.200000
  Safety car deployed during lap 39.0 at time 0 days 02:35:12.045000
Unique safety car laps: [29.0, 39.0]
  Created window for SC lap 29.0: analyzing laps 24.0-28.0
    - 90 total lap records from 18 drivers
  Created window for SC lap 39.0: analyzing laps 34.0-38.0
    - 80 total lap records from 16 drivers

Successfully created 2 training windows

Processing window 1/2:
Processing features for SC lap 29.0 window...


DataError: No numeric types to aggregate

In [None]:
import pandas as pd
import numpy as np
from typing import List, Dict, Tuple, Optional
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score
import warnings
warnings.filterwarnings('ignore')

from fastf1.core import Session

class SafetyCarPredictor:
    """
    A clean, robust system for predicting safety car deployments in Formula 1 races.

    This class handles the entire pipeline from raw lap data to trained models,
    with careful attention to the real-world complexities of F1 data including
    timing inconsistencies, missing telemetry, and varying data formats.
    """

    def __init__(self, window_size: int = 5):
        """
        Initialize the safety car prediction system.

        Args:
            window_size: Number of laps to analyze before each potential safety car deployment
        """
        self.window_size = window_size
        self.model = None
        self.scaler = None
        self.feature_names = []
        self.training_windows = []

    def find_safety_car_laps(self, laps_df: pd.DataFrame, track_status_df: pd.DataFrame) -> List[int]:
        """
        Identify the lap numbers when safety cars were deployed.

        This method carefully maps safety car deployment times to lap numbers,
        handling the complexity that different drivers complete laps at different times.

        Args:
            laps_df: DataFrame containing lap data with LapNumber, Driver, and timing columns
            track_status_df: DataFrame containing track status with Time and Status columns

        Returns:
            List of lap numbers when safety cars were deployed
        """
        print("=== FINDING SAFETY CAR DEPLOYMENTS ===")

        # Find all safety car deployments (Status == '4' indicates safety car)
        sc_deployments = track_status_df[track_status_df['Status'] == '4'].copy()

        if sc_deployments.empty:
            print("No safety car deployments found")
            return []

        print(f"Found {len(sc_deployments)} safety car deployment(s)")

        sc_laps = []

        for idx, sc_row in sc_deployments.iterrows():
            sc_time = sc_row['Time']
            print(f"\nAnalyzing safety car deployment at time: {sc_time}")

            # Strategy: Find the lap that most drivers were on when SC was deployed
            # We'll look at all laps that were "active" during the SC deployment time

            # Convert LapTime to timedelta if it isn't already
            if 'LapTime' in laps_df.columns:
                if not pd.api.types.is_timedelta64_dtype(laps_df['LapTime']):
                    # Try to convert if it's not already a timedelta
                    try:
                        laps_with_time = laps_df.copy()
                        laps_with_time['LapTime'] = pd.to_timedelta(laps_with_time['LapTime'])
                    except:
                        laps_with_time = laps_df.copy()
                        laps_with_time['LapTime'] = pd.to_timedelta(laps_with_time['LapTime'], errors='coerce')
                else:
                    laps_with_time = laps_df.copy()
            else:
                print("Warning: No LapTime column found, using approximation")
                laps_with_time = laps_df.copy()

            # Find laps that were potentially active during SC deployment
            candidate_laps = []

            # Method 1: If we have LapStartTime, use it directly
            if 'LapStartTime' in laps_with_time.columns:
                # Find laps where SC time falls between lap start and estimated lap end
                for _, lap_row in laps_with_time.iterrows():
                    lap_start = lap_row['LapStartTime']

                    # Estimate lap end time
                    if pd.notna(lap_row.get('LapTime')):
                        try:
                            lap_end = lap_start + lap_row['LapTime']
                            if lap_start <= sc_time <= lap_end:
                                candidate_laps.append(lap_row['LapNumber'])
                        except:
                            # Fallback: just check if SC time is close to lap start
                            time_diff = abs((sc_time - lap_start).total_seconds())
                            if time_diff < 180:  # Within 3 minutes (reasonable lap time)
                                candidate_laps.append(lap_row['LapNumber'])
                    else:
                        # No lap time available, use proximity to start time
                        time_diff = abs((sc_time - lap_start).total_seconds())
                        if time_diff < 180:
                            candidate_laps.append(lap_row['LapNumber'])

            # Method 2: Fallback - find the most common lap number around the SC time
            if not candidate_laps:
                print("Using fallback method: closest lap by time")
                # Find laps with timestamps close to SC deployment
                if 'LapStartTime' in laps_with_time.columns:
                    time_diffs = abs(laps_with_time['LapStartTime'] - sc_time).dt.total_seconds()
                    closest_indices = time_diffs.nsmallest(10).index  # Take 10 closest laps
                    candidate_laps = laps_with_time.loc[closest_indices, 'LapNumber'].tolist()

            # Determine the most likely lap number
            if candidate_laps:
                # Take the most common lap number among candidates
                from collections import Counter
                lap_counts = Counter(candidate_laps)
                most_common_lap = lap_counts.most_common(1)[0][0]
                sc_laps.append(most_common_lap)
                print(f"  Safety car mapped to lap {most_common_lap}")
                print(f"  Candidate laps: {sorted(set(candidate_laps))}")
            else:
                print("  Warning: Could not map safety car to a specific lap")

        # Remove duplicates and sort
        unique_sc_laps = sorted(list(set(sc_laps)))
        print(f"\nFinal safety car laps: {unique_sc_laps}")
        return unique_sc_laps

    def create_training_windows(self, laps_df: pd.DataFrame, sc_laps: List[int]) -> List[Dict]:
        """
        Create training windows for each safety car deployment.

        Each window contains the laps leading up to a safety car deployment,
        providing the data needed to learn predictive patterns.

        Args:
            laps_df: DataFrame containing lap data
            sc_laps: List of lap numbers where safety cars were deployed

        Returns:
            List of training window dictionaries
        """
        print("\n=== CREATING TRAINING WINDOWS ===")

        windows = []

        for sc_lap in sc_laps:
            # Define the analysis window
            window_start = max(1, sc_lap - self.window_size)
            window_end = sc_lap - 1  # Last lap before safety car

            print(f"\nCreating window for safety car on lap {sc_lap}")
            print(f"  Analyzing laps {window_start} to {window_end}")

            # Check if we have enough laps
            if window_end < window_start:
                print(f"  Skipping: Not enough preceding laps")
                continue

            # Extract laps in this window
            window_laps = laps_df[
                (laps_df['LapNumber'] >= window_start) &
                (laps_df['LapNumber'] <= window_end)
            ].copy()

            if len(window_laps) == 0:
                print(f"  Skipping: No lap data found in window")
                continue

            # Check for conflicts with other safety car deployments
            conflicting_sc = [other_sc for other_sc in sc_laps
                             if other_sc != sc_lap and window_start <= other_sc <= window_end]

            if conflicting_sc:
                print(f"  Skipping: Conflicts with other safety cars at laps {conflicting_sc}")
                continue

            # Create window metadata
            window_info = {
                'sc_lap': sc_lap,
                'window_start': window_start,
                'window_end': window_end,
                'laps_data': window_laps,
                'num_drivers': window_laps['Driver'].nunique() if 'Driver' in window_laps.columns else 0,
                'total_laps': len(window_laps)
            }

            windows.append(window_info)
            print(f"  Created window: {len(window_laps)} lap records from {window_info['num_drivers']} drivers")

        self.training_windows = windows
        print(f"\nSuccessfully created {len(windows)} training windows")
        return windows

    def extract_basic_features(self, window_laps: pd.DataFrame) -> pd.DataFrame:
        """
        Extract basic features from a window of lap data.

        This method creates robust features that work with the core data available
        in all F1 datasets, handling missing columns and data type issues gracefully.

        Args:
            window_laps: DataFrame containing laps in the analysis window

        Returns:
            DataFrame with features aggregated by lap number
        """

        # Convert LapTime to seconds for calculations
        window_laps = window_laps.copy()

        if 'LapTime' in window_laps.columns:
            # Handle different LapTime formats
            if pd.api.types.is_timedelta64_dtype(window_laps['LapTime']):
                window_laps['LapTime_seconds'] = window_laps['LapTime'].dt.total_seconds()
            else:
                # Try to convert to timedelta first
                try:
                    window_laps['LapTime'] = pd.to_timedelta(window_laps['LapTime'])
                    window_laps['LapTime_seconds'] = window_laps['LapTime'].dt.total_seconds()
                except:
                    # If conversion fails, try to extract numeric value
                    try:
                        window_laps['LapTime_seconds'] = pd.to_numeric(window_laps['LapTime'], errors='coerce')
                    except:
                        window_laps['LapTime_seconds'] = np.nan

        # Group by lap number to create aggregated features
        lap_features = []

        for lap_num in sorted(window_laps['LapNumber'].unique()):
            lap_data = window_laps[window_laps['LapNumber'] == lap_num]

            features = {'LapNumber': lap_num}

            # Basic lap time statistics
            if 'LapTime_seconds' in lap_data.columns:
                laptime_clean = lap_data['LapTime_seconds'].dropna()
                if len(laptime_clean) > 0:
                    features['laptime_mean'] = laptime_clean.mean()
                    features['laptime_std'] = laptime_clean.std() if len(laptime_clean) > 1 else 0
                    features['laptime_min'] = laptime_clean.min()
                    features['laptime_max'] = laptime_clean.max()
                    features['laptime_range'] = laptime_clean.max() - laptime_clean.min()

            # Driver count and field characteristics
            features['num_drivers'] = len(lap_data)
            features['drivers_with_valid_times'] = lap_data['LapTime_seconds'].notna().sum() if 'LapTime_seconds' in lap_data.columns else 0

            # Position-related features (if available)
            if 'Position' in lap_data.columns:
                positions = lap_data['Position'].dropna()
                if len(positions) > 0:
                    features['position_spread'] = positions.max() - positions.min()
                    features['avg_position'] = positions.mean()

            # Tire age features (if available)
            if 'TyreLife' in lap_data.columns:
                tyre_life = lap_data['TyreLife'].dropna()
                if len(tyre_life) > 0:
                    features['avg_tyre_life'] = tyre_life.mean()
                    features['max_tyre_life'] = tyre_life.max()
                    features['tyre_life_spread'] = tyre_life.max() - tyre_life.min()

            # Stint-related features (if available)
            if 'Stint' in lap_data.columns:
                stint_data = lap_data['Stint'].dropna()
                if len(stint_data) > 0:
                    features['avg_stint'] = stint_data.mean()
                    features['stint_variety'] = stint_data.nunique()

            # Speed-related features (if available)
            speed_columns = [col for col in lap_data.columns if 'Speed' in col]
            for speed_col in speed_columns:
                speed_data = lap_data[speed_col].dropna()
                if len(speed_data) > 0:
                    features[f'{speed_col.lower()}_mean'] = speed_data.mean()
                    features[f'{speed_col.lower()}_std'] = speed_data.std() if len(speed_data) > 1 else 0
                    features[f'{speed_col.lower()}_range'] = speed_data.max() - speed_data.min()

            lap_features.append(features)

        return pd.DataFrame(lap_features)

    def create_predictive_features(self, basic_features_df: pd.DataFrame, target_lap: int) -> pd.DataFrame:
        """
        Create predictive features focused on sudden change detection.

        This method applies our key insight that racing incidents are more likely
        caused by sudden, unexpected changes rather than gradual escalation.

        Args:
            basic_features_df: DataFrame with basic features per lap
            target_lap: The lap number that we're trying to predict (safety car lap)

        Returns:
            DataFrame with sudden change detection features
        """

        df = basic_features_df.copy()

        # Get numeric columns for feature engineering
        numeric_cols = df.select_dtypes(include=[np.number]).columns
        risk_features = [col for col in numeric_cols if col not in ['LapNumber']]

        # 1. Sudden spike detection
        for feature in risk_features:
            if feature in df.columns:
                values = df[feature].fillna(0)

                # Spike ratio: current value vs recent 3-lap average
                recent_baseline = values.rolling(window=3, min_periods=1).mean().shift(1)
                df[f'{feature}_spike_ratio'] = values / (recent_baseline + 0.001)

                # Lap-to-lap jump
                df[f'{feature}_jump'] = values.diff()
                df[f'{feature}_jump_pct'] = values.pct_change().fillna(0)

                # Z-score (how extreme is current value?)
                if values.std() > 0:
                    df[f'{feature}_zscore'] = (values - values.mean()) / values.std()
                else:
                    df[f'{feature}_zscore'] = 0

        # 2. Threshold crossing indicators
        for feature in risk_features:
            if feature in df.columns:
                values = df[feature].fillna(0)

                # Percentile ranking
                df[f'{feature}_percentile'] = values.rank(pct=True)

                # Above historical thresholds
                if len(values) > 1:
                    p75 = values.quantile(0.75)
                    p90 = values.quantile(0.90)
                    df[f'{feature}_above_p75'] = (values > p75).astype(int)
                    df[f'{feature}_above_p90'] = (values > p90).astype(int)

        # 3. Multi-feature risk indicators
        spike_features = [col for col in df.columns if '_spike_ratio' in col]
        if spike_features:
            # Count simultaneous spikes (>1.5x recent average)
            spike_flags = df[spike_features] > 1.5
            df['simultaneous_spikes'] = spike_flags.sum(axis=1)

        threshold_features = [col for col in df.columns if '_above_p75' in col]
        if threshold_features:
            # Count concurrent risk factors
            df['concurrent_risk_factors'] = df[threshold_features].sum(axis=1)

        # 4. Create target variable
        df['sc_next_lap'] = 0
        final_lap = target_lap - 1  # Last lap before safety car
        df.loc[df['LapNumber'] == final_lap, 'sc_next_lap'] = 1

        return df

    def train_model(self, training_data: List[pd.DataFrame]) -> None:
        """
        Train a safety car prediction model on multiple training windows.

        Args:
            training_data: List of DataFrames, each containing features for one window
        """
        print("\n=== TRAINING SAFETY CAR PREDICTION MODEL ===")

        if not training_data:
            print("No training data available")
            return

        # Combine all training windows
        all_features = pd.concat(training_data, ignore_index=True)

        # Prepare features and target
        exclude_cols = ['LapNumber', 'sc_next_lap']
        feature_cols = [col for col in all_features.columns if col not in exclude_cols]

        X = all_features[feature_cols].fillna(0)
        y = all_features['sc_next_lap']

        # Handle infinite values
        X = X.replace([np.inf, -np.inf], 0)

        print(f"Training data shape: {X.shape}")
        print(f"Target distribution: {y.value_counts().to_dict()}")
        print(f"Features: {len(feature_cols)}")

        # Scale features
        self.scaler = StandardScaler()
        X_scaled = self.scaler.fit_transform(X)

        # Train logistic regression model
        self.model = LogisticRegression(
            random_state=42,
            max_iter=1000,
            class_weight='balanced'  # Handle class imbalance
        )

        self.model.fit(X_scaled, y)
        self.feature_names = feature_cols

        # Evaluate training performance
        y_pred = self.model.predict(X_scaled)
        y_prob = self.model.predict_proba(X_scaled)[:, 1]

        print(f"\nTraining Performance:")
        print(f"Accuracy: {(y_pred == y).mean():.3f}")

        if len(np.unique(y)) > 1:
            print(f"ROC AUC: {roc_auc_score(y, y_prob):.3f}")

        # Show feature importance
        feature_importance = pd.DataFrame({
            'feature': feature_cols,
            'importance': np.abs(self.model.coef_[0])
        }).sort_values('importance', ascending=False)

        print(f"\nTop 10 Most Important Features:")
        for _, row in feature_importance.head(10).iterrows():
            print(f"  {row['feature'][:40]:40} | {row['importance']:.3f}")

    def analyze_predictions(self, training_data: List[pd.DataFrame]) -> None:
        """
        Analyze model predictions on training data to understand what it learned.

        Args:
            training_data: List of DataFrames containing training windows
        """
        print("\n=== PREDICTION ANALYSIS ===")

        if self.model is None:
            print("No trained model available")
            return

        for i, window_df in enumerate(training_data):
            sc_lap = window_df[window_df['sc_next_lap'] == 1]['LapNumber'].iloc[0] + 1 if (window_df['sc_next_lap'] == 1).any() else "Unknown"

            print(f"\nWindow {i+1} (Safety Car on lap {sc_lap}):")
            print("-" * 40)

            # Prepare features
            exclude_cols = ['LapNumber', 'sc_next_lap']
            feature_cols = [col for col in window_df.columns if col not in exclude_cols and col in self.feature_names]

            if not feature_cols:
                print("  No matching features found")
                continue

            X_window = window_df[feature_cols].fillna(0).replace([np.inf, -np.inf], 0)
            X_scaled = self.scaler.transform(X_window)

            # Get predictions
            probabilities = self.model.predict_proba(X_scaled)[:, 1]
            predictions = self.model.predict(X_scaled)

            # Show results by lap
            for j, (_, row) in enumerate(window_df.iterrows()):
                lap_num = row['LapNumber']
                prob = probabilities[j]
                pred = predictions[j]
                actual = row['sc_next_lap']

                status = "🚨 SC PREDICTED" if pred == 1 else "✓ Normal"
                actual_status = "(ACTUAL SC NEXT)" if actual == 1 else ""

                print(f"  Lap {int(lap_num):2d}: {status} | Probability: {prob:.3f} {actual_status}")

def run_complete_analysis(session: Session, window_size: int = 5):
    """
    Run the complete safety car prediction analysis pipeline.

    This function orchestrates the entire process from finding safety car deployments
    to training and evaluating prediction models.

    Args:
        session: FastF1 Session object
        window_size: Number of laps to analyze before each safety car

    Returns:
        Trained SafetyCarPredictor instance
    """

    print("Starting complete safety car prediction analysis...")
    print("="*60)

    # Initialize predictor
    predictor = SafetyCarPredictor(window_size=window_size)

    laps_df = session.laps
    track_status_df = session.track_status

    # Step 1: Find safety car deployments
    sc_laps = predictor.find_safety_car_laps(laps_df, track_status_df)

    if not sc_laps:
        print("No safety car deployments found. Analysis cannot continue.")
        return predictor

    # Step 2: Create training windows
    windows = predictor.create_training_windows(laps_df, sc_laps)

    if not windows:
        print("No valid training windows created. Analysis cannot continue.")
        return predictor

    # Step 3: Extract features for each window
    training_data = []

    for window in windows:
        print(f"\nProcessing window for safety car on lap {window['sc_lap']}...")

        # Extract basic features
        basic_features = predictor.extract_basic_features(window['laps_data'])

        if basic_features.empty:
            print("  No features extracted, skipping window")
            continue

        # Create predictive features
        predictive_features = predictor.create_predictive_features(basic_features, window['sc_lap'])

        if not predictive_features.empty:
            training_data.append(predictive_features)
            print(f"  Created {len(predictive_features)} feature rows")
        else:
            print("  No predictive features created, skipping window")

    if not training_data:
        print("No training data available. Analysis cannot continue.")
        return predictor

    # Step 4: Train model
    predictor.train_model(training_data)

    # Step 5: Analyze predictions
    predictor.analyze_predictions(training_data)

    print("\n" + "="*60)
    print("Analysis complete!")

    return predictor

# Example usage:
# predictor = run_complete_analysis(session, window_size=5)

In [None]:
session_sp = f1.get_session(season, 'São Paulo Grand Prix', 'R')
session_sp.load()

In [None]:
session_sp.track_status

Unnamed: 0,Time,Status,Message,is_sc,is_vsc
0,0 days 00:00:00,1,AllClear,False,False
1,0 days 00:52:58.758000,2,Yellow,False,False
2,0 days 00:58:32.359000,1,AllClear,False,False
3,0 days 01:13:10.258000,2,Yellow,False,False
4,0 days 01:13:19.484000,1,AllClear,False,False
5,0 days 01:17:25.352000,2,Yellow,False,False
6,0 days 01:17:28.803000,1,AllClear,False,False
7,0 days 01:27:37.971000,2,Yellow,False,False
8,0 days 01:27:48.625000,1,AllClear,False,False
9,0 days 01:47:45.888000,2,Yellow,False,False


In [None]:
# expected SC deployed on lap 15 and lap 25
predictor = run_complete_analysis(session_sp, window_size=5)

Starting complete safety car prediction analysis...
=== FINDING SAFETY CAR DEPLOYMENTS ===
Found 2 safety car deployment(s)

Analyzing safety car deployment at time: 0 days 01:54:50.200000
  Safety car mapped to lap 30.0
  Candidate laps: [28.0, 29.0, 30.0]

Analyzing safety car deployment at time: 0 days 02:35:12.045000
  Safety car mapped to lap 39.0
  Candidate laps: [39.0]

Final safety car laps: [30.0, 39.0]

=== CREATING TRAINING WINDOWS ===

Creating window for safety car on lap 30.0
  Analyzing laps 25.0 to 29.0
  Created window: 90 lap records from 18 drivers

Creating window for safety car on lap 39.0
  Analyzing laps 34.0 to 38.0
  Created window: 80 lap records from 16 drivers

Successfully created 2 training windows

Processing window for safety car on lap 30.0...
  Created 5 feature rows

Processing window for safety car on lap 39.0...
  Created 5 feature rows

=== TRAINING SAFETY CAR PREDICTION MODEL ===
Training data shape: (10, 210)
Target distribution: {0: 8, 1: 2}
Fe

In [None]:
session_sa = f1.get_session(season, 'Saudi Arabian Grand Prix', 'R')
session_sa.load()

predictor = run_complete_analysis(session_sa, window_size=5)

core           INFO 	Loading data for Saudi Arabian Grand Prix - Race [v3.5.3]
INFO:fastf1.fastf1.core:Loading data for Saudi Arabian Grand Prix - Race [v3.5.3]
req            INFO 	Using cached data for session_info
INFO:fastf1.fastf1.req:Using cached data for session_info
req            INFO 	Using cached data for driver_info
INFO:fastf1.fastf1.req:Using cached data for driver_info
req            INFO 	Using cached data for session_status_data
INFO:fastf1.fastf1.req:Using cached data for session_status_data
req            INFO 	Using cached data for lap_count
INFO:fastf1.fastf1.req:Using cached data for lap_count
req            INFO 	Using cached data for track_status_data
INFO:fastf1.fastf1.req:Using cached data for track_status_data
req            INFO 	Using cached data for _extended_timing_data
INFO:fastf1.fastf1.req:Using cached data for _extended_timing_data
req            INFO 	Using cached data for timing_app_data
INFO:fastf1.fastf1.req:Using cached data for timing_app_data
c

Starting complete safety car prediction analysis...
=== FINDING SAFETY CAR DEPLOYMENTS ===
Found 1 safety car deployment(s)

Analyzing safety car deployment at time: 0 days 01:09:49.924000
  Safety car mapped to lap 7.0
  Candidate laps: [6.0, 7.0, 8.0]

Final safety car laps: [7.0]

=== CREATING TRAINING WINDOWS ===

Creating window for safety car on lap 7.0
  Analyzing laps 2.0 to 6.0
  Created window: 95 lap records from 19 drivers

Successfully created 1 training windows

Processing window for safety car on lap 7.0...
  Created 5 feature rows

=== TRAINING SAFETY CAR PREDICTION MODEL ===
Training data shape: (5, 210)
Target distribution: {0: 4, 1: 1}
Features: 210

Training Performance:
Accuracy: 1.000
ROC AUC: 1.000

Top 10 Most Important Features:
  avg_position_jump_pct                    | 0.075
  avg_position_jump                        | 0.075
  avg_position_zscore                      | 0.075
  drivers_with_valid_times_jump_pct        | 0.075
  drivers_with_valid_times_zscor