In [None]:
import fastf1

from f1_etl import DriverLabelEncoder, DataConfig, SessionConfig
from f1_etl import DataAggregator, RawDataExtractor

In [None]:
# Resolve the 2024 Saudi Arabian and Monaco race ('R') sessions, in that order.
session_sa, session_mon = (
    fastf1.get_session(2024, grand_prix, 'R')
    for grand_prix in ("Saudi Arabian Grand Prix", "Monaco Grand Prix")
)

# Load the data for both sessions, then fit one driver encoder per session.
session_sa.load()
session_mon.load()

driver_enc_sa, driver_enc_mon = DriverLabelEncoder(), DriverLabelEncoder()
driver_enc_sa.fit_session(session_sa)
driver_enc_mon.fit_session(session_mon)

In [None]:
# Display the driver mapping held by the encoder fitted on the Saudi Arabian GP session.
driver_enc_sa.driver_to_number

In [None]:
# Display the driver mapping held by the encoder fitted on the Monaco GP session,
# for comparison with the Saudi Arabian one.
driver_enc_mon.driver_to_number

In [2]:
# 2024 Qatar Grand Prix race ("R") session. load() fetches the session data;
# the log output of this cell shows it being served from the local cache.
session_qa = fastf1.get_session(2024, "Qatar Grand Prix", "R")
session_qa.load()

core           INFO 	Loading data for Qatar Grand Prix - Race [v3.5.3]
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info
req            INFO 	Using cached data for session_status_data
req            INFO 	Using cached data for lap_count
req            INFO 	Using cached data for track_status_data
req            INFO 	Using cached data for _extended_timing_data
req            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...
req            INFO 	Using cached data for car_data
req            INFO 	Using cached data for position_data
req            INFO 	Using cached data for weather_data
req            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 20 drivers: ['1', '16', '81', '63', '10', '55', '14', '24', '20', '4', '77', '44', '22', '30', '23', '27', '11', '18', '43', '31']


In [4]:
# Fit a driver encoder on the already-loaded Qatar race session.
driver_enc_qa = DriverLabelEncoder()
driver_enc_qa.fit_session(session_qa)

# Show the fitted mapping: driver abbreviation -> car number (stored as a string).
driver_enc_qa.driver_to_number

{'VER': '1',
 'LEC': '16',
 'PIA': '81',
 'RUS': '63',
 'GAS': '10',
 'SAI': '55',
 'ALO': '14',
 'ZHO': '24',
 'MAG': '20',
 'NOR': '4',
 'BOT': '77',
 'HAM': '44',
 'TSU': '22',
 'LAW': '30',
 'ALB': '23',
 'HUL': '27',
 'PER': '11',
 'STR': '18',
 'COL': '43',
 'OCO': '31'}

In [None]:
# Pipeline configuration: a single race session, restricted to five car numbers
# (per the Qatar driver mapping displayed earlier: HUL, OCO, ALB, BOT, PER).
# NOTE(review): the saved output of the pipeline cell further down also logs the
# Chinese and Mexico City Grands Prix being loaded, and this cell has no execution
# count, so that output was presumably produced with a different `sessions` list.
# Restart the kernel and run all cells to confirm which configuration is intended.
data_config = DataConfig(
    sessions=[
        SessionConfig(2024, "Qatar Grand Prix", "R")
    ],
    drivers=["27", "31", "23", "77", "11"]
)

# TrackStatus Encoder

In [9]:
# NOTE(review): this import sits mid-notebook; consider moving it into the
# import cell at the top so dependencies are visible in one place.
from f1_etl import FixedVocabTrackStatusEncoder

# Default-constructed track-status encoder.
# NOTE(review): `track_status_enc` is not referenced by any cell visible here
# (the pipeline cell builds its own `label_encoder`) — confirm it is still needed.
track_status_enc = FixedVocabTrackStatusEncoder()

In [10]:
# Run the ETL pipeline for `data_config` and integer-encode the TrackStatus labels.
config = data_config

# NOTE(review): the per-session effective driver lists are computed here but
# nothing below reads them. The loop is kept so that `session` and
# `effective_drivers` stay defined for any later cell that may rely on them.
for session in config.sessions:
    effective_drivers = config.get_effective_drivers(session)

# Step 1: Extract raw data
extractor = RawDataExtractor(config.cache_dir)
sessions_data = [
    extractor.extract_session(session_config) for session_config in config.sessions
]

# Step 2: Aggregate data with per-session driver filtering
aggregator = DataAggregator()
telemetry_data = aggregator.aggregate_telemetry_data(
    sessions_data, config, config.sessions
)

# Validate the aggregated frame BEFORE touching the TrackStatus column.
# Previously the column was read (for the distribution analysis) ahead of the
# existence check, so a missing column surfaced as a bare KeyError and the
# descriptive ValueError below could never be raised.
if telemetry_data.empty:
    raise ValueError("No telemetry data extracted")

if "TrackStatus" not in telemetry_data.columns:
    raise ValueError("TrackStatus column not found in telemetry data")

# Step 3: Setup fixed vocabulary encoder for track status
# use_onehot=False -> the fit summary printed by this cell reports
# "Output mode: integer labels".
label_encoder = FixedVocabTrackStatusEncoder(use_onehot=False)

# Analyze distributions before encoding (optional but useful)
label_encoder.analyze_data(telemetry_data["TrackStatus"], "training_data")

# Fit and transform
encoded_labels = label_encoder.fit_transform(telemetry_data["TrackStatus"])

core           INFO 	Loading data for Qatar Grand Prix - Race [v3.5.3]
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info
req            INFO 	Using cached data for session_status_data
req            INFO 	Using cached data for lap_count
req            INFO 	Using cached data for track_status_data
req            INFO 	Using cached data for _extended_timing_data
req            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...


Loading session: 2024 Qatar Grand Prix R


req            INFO 	Using cached data for car_data
req            INFO 	Using cached data for position_data
req            INFO 	Using cached data for weather_data
req            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 20 drivers: ['1', '16', '81', '63', '10', '55', '14', '24', '20', '4', '77', '44', '22', '30', '23', '27', '11', '18', '43', '31']
core           INFO 	Loading data for Chinese Grand Prix - Race [v3.5.3]
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info
req            INFO 	Using cached data for session_status_data
req            INFO 	Using cached data for lap_count
req            INFO 	Using cached data for track_status_data
req            INFO 	Using cached data for _extended_timing_data
req            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...


Loading session: 2024 Chinese Grand Prix R


req            INFO 	Using cached data for car_data
req            INFO 	Using cached data for position_data
req            INFO 	Using cached data for weather_data
req            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 20 drivers: ['1', '4', '11', '16', '55', '63', '14', '81', '44', '27', '31', '23', '10', '24', '18', '20', '2', '3', '22', '77']
core           INFO 	Loading data for Mexico City Grand Prix - Race [v3.5.3]
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info
req            INFO 	Using cached data for session_status_data
req            INFO 	Using cached data for lap_count
req            INFO 	Using cached data for track_status_data
req            INFO 	Using cached data for _extended_timing_data
req            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...


Loading session: 2024 Mexico City Grand Prix R


req            INFO 	Using cached data for car_data
req            INFO 	Using cached data for position_data
req            INFO 	Using cached data for weather_data
req            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 20 drivers: ['55', '4', '16', '44', '63', '1', '20', '81', '27', '10', '18', '43', '31', '77', '24', '30', '11', '14', '23', '22']



📊 Track Status Analysis (training_data):
   green       : 1425374 samples ( 82.2%)
   safety_car  : 146082 samples (  8.4%)
   vsc         :  2337 samples (  0.1%)
   yellow      : 159368 samples (  9.2%)
   Missing classes: [np.str_('red'), np.str_('unknown'), np.str_('vsc_ending')]
✅ FixedVocabTrackStatusEncoder fitted
   Classes seen: ['green', 'safety_car', 'vsc', 'yellow']
   Total classes: 7
   Output mode: integer labels


In [13]:
# Inspect the encoder's full class -> integer index table. It lists all vocabulary
# classes, including ones absent from the fitted data (e.g. 'red', 'unknown').
label_encoder.class_to_idx

{np.str_('green'): 0,
 np.str_('red'): 1,
 np.str_('safety_car'): 2,
 np.str_('unknown'): 3,
 np.str_('vsc'): 4,
 np.str_('vsc_ending'): 5,
 np.str_('yellow'): 6}

In [15]:
# Look up the integer label assigned to the 'safety_car' class.
label_encoder.class_to_idx['safety_car']

2