In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset

In [2]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

Using device: cuda


In [3]:
import fastf1 
from fastf1 import get_session
import matplotlib.pyplot as plt
import seaborn as sns

from f1_etl import (
    DataConfig,
    SessionConfig,
    create_safety_car_dataset,
)
from f1_etl.train import (
    ModelEvaluationSuite,
    create_metadata_from_f1_dataset,
    prepare_data_with_validation,
    create_model_metadata,
    train_and_validate_model,
    evaluate_on_external_dataset,
    compare_performance_across_datasets,
)

fastf1.Cache.enable_cache('E:\School Stuff\F1cache')

In [7]:
drivers=('16' '81' '55' '4' '63' '1' '44' '22' '23' '10' '14' '3' '77' '18' '2' '24' '31' '11' '27' '20')

data_config = DataConfig(
        sessions=[
            SessionConfig(2024, "Qatar Grand Prix", "R"),
            SessionConfig(2024, "Chinese Grand Prix", "R"),
            SessionConfig(2024, "Mexico City Grand Prix", "R"),
            SessionConfig(2024, "São Paulo Grand Prix", "R"),
            SessionConfig(2024, "Miami Grand Prix", "R"),
            SessionConfig(2024, "United States Grand Prix", "R"),
            SessionConfig(2024, "Monaco Grand Prix", "R"),
        ],
        drivers=drivers,
        include_weather=False,
    )

dataset = create_safety_car_dataset(
    config=data_config,
    window_size=50,
    prediction_horizon=100,
    normalize=True,
    target_column="TrackStatus",
    resampling_strategy="smote",
)

# 2. Create metadata
dataset_metadata = create_metadata_from_f1_dataset(
    data_config=data_config,
    dataset=dataset,
    features_used="multivariate_all_9_features",
)

2025-07-08 17:23:17,175 - f1_etl - INFO - Preprocessing configuration:
2025-07-08 17:23:17,176 - f1_etl - INFO -   Missing values: enabled (forward_fill)
2025-07-08 17:23:17,177 - f1_etl - INFO -   Normalization: enabled (standard)
2025-07-08 17:23:17,177 - f1_etl - INFO -   Resampling: smote
2025-07-08 17:23:17,178 - f1_etl - INFO - Driver configuration:
2025-07-08 17:23:17,179 - f1_etl - INFO -   Global drivers: 168155463144222310143771822431112720
2025-07-08 17:23:17,179 - f1_etl - INFO -   Qatar Grand Prix: 168155463144222310143771822431112720
2025-07-08 17:23:17,179 - f1_etl - INFO -   Chinese Grand Prix: 168155463144222310143771822431112720
2025-07-08 17:23:17,180 - f1_etl - INFO -   Mexico City Grand Prix: 168155463144222310143771822431112720
2025-07-08 17:23:17,180 - f1_etl - INFO -   São Paulo Grand Prix: 168155463144222310143771822431112720
2025-07-08 17:23:17,181 - f1_etl - INFO -   Miami Grand Prix: 168155463144222310143771822431112720
2025-07-08 17:23:17,181 - f1_etl - INF

Loading session: 2024 Qatar Grand Prix R


core           INFO 	Loading data for Qatar Grand Prix - Race [v3.5.3]
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info
req            INFO 	Using cached data for session_status_data
req            INFO 	Using cached data for lap_count
req            INFO 	Using cached data for track_status_data
req            INFO 	Using cached data for _extended_timing_data
req            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...
req            INFO 	Using cached data for car_data
req            INFO 	Using cached data for position_data
req            INFO 	Using cached data for weather_data
req            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 20 drivers: ['1', '16', '81', '63', '10', '55', '14', '24', '20', '4', '77', '44', '22', '30', '23', '27', '11', '18', '43', '31']
core           INFO 	Loading data for Chinese Grand Prix - Rac

Loading session: 2024 Chinese Grand Prix R


req            INFO 	Using cached data for car_data
req            INFO 	Using cached data for position_data
req            INFO 	Using cached data for weather_data
req            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 20 drivers: ['1', '4', '11', '16', '55', '63', '14', '81', '44', '27', '31', '23', '10', '24', '18', '20', '2', '3', '22', '77']
core           INFO 	Loading data for Mexico City Grand Prix - Race [v3.5.3]
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info
req            INFO 	Using cached data for session_status_data
req            INFO 	Using cached data for lap_count
req            INFO 	Using cached data for track_status_data
req            INFO 	Using cached data for _extended_timing_data
req            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...


Loading session: 2024 Mexico City Grand Prix R


req            INFO 	Using cached data for car_data
req            INFO 	Using cached data for position_data
req            INFO 	Using cached data for weather_data
req            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 20 drivers: ['55', '4', '16', '44', '63', '1', '20', '81', '27', '10', '18', '43', '31', '77', '24', '30', '11', '14', '23', '22']
core           INFO 	Loading data for São Paulo Grand Prix - Race [v3.5.3]
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info
req            INFO 	Using cached data for session_status_data
req            INFO 	Using cached data for lap_count
req            INFO 	Using cached data for track_status_data
req            INFO 	Using cached data for _extended_timing_data
req            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...


Loading session: 2024 São Paulo Grand Prix R


req            INFO 	Using cached data for car_data
req            INFO 	Using cached data for position_data
req            INFO 	Using cached data for weather_data
req            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 20 drivers: ['1', '31', '10', '63', '16', '4', '22', '81', '30', '44', '11', '50', '77', '14', '24', '55', '43', '23', '18', '27']
core           INFO 	Loading data for Miami Grand Prix - Race [v3.5.3]
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info
req            INFO 	Using cached data for session_status_data
req            INFO 	Using cached data for lap_count
req            INFO 	Using cached data for track_status_data
req            INFO 	Using cached data for _extended_timing_data
req            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...


Loading session: 2024 Miami Grand Prix R


req            INFO 	Using cached data for car_data
req            INFO 	Using cached data for position_data
req            INFO 	Using cached data for weather_data
req            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 20 drivers: ['4', '1', '16', '11', '55', '44', '22', '63', '14', '31', '27', '10', '81', '24', '3', '77', '18', '23', '20', '2']
core           INFO 	Loading data for United States Grand Prix - Race [v3.5.3]
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info
req            INFO 	Using cached data for session_status_data
req            INFO 	Using cached data for lap_count
req            INFO 	Using cached data for track_status_data
req            INFO 	Using cached data for _extended_timing_data
req            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...


Loading session: 2024 United States Grand Prix R


req            INFO 	Using cached data for car_data
req            INFO 	Using cached data for position_data
req            INFO 	Using cached data for weather_data
req            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 20 drivers: ['16', '55', '1', '4', '81', '63', '11', '27', '30', '43', '20', '10', '14', '22', '18', '23', '77', '31', '24', '44']
core           INFO 	Loading data for Monaco Grand Prix - Race [v3.5.3]
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info
req            INFO 	Using cached data for session_status_data
req            INFO 	Using cached data for lap_count
req            INFO 	Using cached data for track_status_data
req            INFO 	Using cached data for _extended_timing_data
req            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...


Loading session: 2024 Monaco Grand Prix R


req            INFO 	Using cached data for car_data
req            INFO 	Using cached data for position_data
req            INFO 	Using cached data for weather_data
req            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 20 drivers: ['16', '81', '55', '4', '63', '1', '44', '22', '23', '10', '14', '3', '77', '18', '2', '24', '31', '11', '27', '20']
2025-07-08 17:24:16,157 - f1_etl - INFO - Creating new fixed vocabulary encoder



📊 Track Status Analysis (training_data):
   green       : 8530965 samples ( 84.4%)
   red         : 513060 samples (  5.1%)
   safety_car  : 654091 samples (  6.5%)
   vsc         : 66622 samples (  0.7%)
   vsc_ending  :  3448 samples (  0.0%)
   yellow      : 345367 samples (  3.4%)
   Missing classes: ['unknown']
✅ FixedVocabTrackStatusEncoder fitted
   Classes seen: ['green', 'red', 'safety_car', 'vsc', 'vsc_ending', 'yellow']
   Total classes: 7
   Output mode: integer labels


2025-07-08 17:24:21,100 - f1_etl - INFO - Original class distribution before resampling: {'green': 8530965, 'red': 513060, 'safety_car': 654091, 'unknown': 0, 'vsc': 66622, 'vsc_ending': 3448, 'yellow': 345367}
2025-07-08 17:24:21,101 - f1_etl - INFO - Applying smote resampling at session/driver level
2025-07-08 17:24:21,102 - f1_etl - INFO - Sampling strategy: minority
2025-07-08 17:24:57,379 - f1_etl - INFO - Resampling complete: 10113553 -> 18456642 samples
2025-07-08 17:24:57,380 - f1_etl - INFO - Class distribution before resampling:
2025-07-08 17:24:57,381 - f1_etl - INFO -   1: 8530965
2025-07-08 17:24:57,381 - f1_etl - INFO -   4: 654091
2025-07-08 17:24:57,382 - f1_etl - INFO -   5: 513060
2025-07-08 17:24:57,382 - f1_etl - INFO -   2: 345367
2025-07-08 17:24:57,382 - f1_etl - INFO -   6: 66622
2025-07-08 17:24:57,383 - f1_etl - INFO -   7: 3448
2025-07-08 17:24:57,383 - f1_etl - INFO - Class distribution after resampling:
2025-07-08 17:24:57,384 - f1_etl - INFO -   1: 8530965

In [27]:
print(splits['X_train'].shape)

(516709, 9, 50)


In [8]:
# 3. Prepare data
splits = prepare_data_with_validation(dataset, val_size=0.0, test_size=0.3)
class_names = list(dataset["label_encoder"].class_to_idx.keys())


=== DATA SPLIT SUMMARY ===
Total samples: 738,157
Train: 516,709 (70.0%)
Val:   None (skipped)
Test:  221,348 (30.0%) - removed 100 samples

Train class distribution:
  Class 0: 240,881 (46.6%)
  Class 1: 14,547 (2.8%)
  Class 2: 120,280 (23.3%)
  Class 4: 68,628 (13.3%)
  Class 5: 63,574 (12.3%)
  Class 6: 8,799 (1.7%)

Test class distribution:
  Class 0: 99,825 (45.1%)
  Class 1: 6,082 (2.7%)
  Class 2: 44,521 (20.1%)
  Class 4: 26,021 (11.8%)
  Class 5: 39,921 (18.0%)
  Class 6: 4,978 (2.2%)


In [24]:
print(np.unique(splits['y_train']))

[0 1 2 4 5 6]


In [14]:
from sklearn.base import BaseEstimator, ClassifierMixin
from datetime import datetime

class PyTorchGRUWrapper(BaseEstimator, ClassifierMixin):
    def __init__(self, input_dim, hidden_dim=96, num_layers=2, output_dim=1, 
                 dropout=0.2, lr=0.001, weight_decay=1e-05, epochs=30, batch_size=64):
        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.output_dim = output_dim
        self.dropout = dropout
        self.lr = lr
        self.weight_decay = weight_decay
        self.epochs = epochs
        self.batch_size = batch_size
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.model = None
        
    def fit(self, X, y):
        # Create model
        self.model = GRUClassifier(
            self.input_dim, self.hidden_dim, self.num_layers, self.output_dim, self.dropout
        ).to(self.device)
        
        # Convert to PyTorch tensors and create DataLoader
        X_tensor = torch.tensor(X, dtype=torch.float32)
        y_tensor = torch.tensor(y, dtype=torch.float32)
        train_dataset = TensorDataset(X_tensor, y_tensor)
        train_loader = DataLoader(train_dataset, batch_size=self.batch_size, shuffle=True)
        
        # Calculate class weights
        pos_count = np.sum(y)
        neg_count = len(y) - pos_count
        pos_weight = torch.tensor([neg_count / pos_count]).to(self.device)
        
        # Setup training
        criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight)
        optimizer = torch.optim.Adam(self.model.parameters(), lr=self.lr, weight_decay=self.weight_decay)
        
        # Training loop
        best_loss = float('inf')
        for epoch in range(self.epochs):
            self.model.train()
            total_loss = 0
            num_batches = 0
            
            for X_batch, y_batch in train_loader:
                X_batch = X_batch.to(self.device)
                y_batch = y_batch.to(self.device)
                
                optimizer.zero_grad()
                outputs = self.model(X_batch)
                loss = criterion(outputs, y_batch.unsqueeze(1))
                loss.backward()
                optimizer.step()
                
                total_loss += loss.item()
                num_batches += 1
            
            avg_loss = total_loss / num_batches
            if avg_loss < best_loss:
                best_loss = avg_loss
        
        return self
    
    def predict(self, X):
        if self.model is None:
            raise ValueError("Model must be fitted before making predictions")
            
        self.model.eval()
        X_tensor = torch.tensor(X, dtype=torch.float32).to(self.device)
        
        with torch.no_grad():
            outputs = self.model(X_tensor)
            predictions = torch.sigmoid(outputs).cpu().numpy()
        
        return (predictions > 0.5).astype(int).flatten()
    
    def predict_proba(self, X):
        if self.model is None:
            raise ValueError("Model must be fitted before making predictions")
            
        self.model.eval()
        X_tensor = torch.tensor(X, dtype=torch.float32).to(self.device)
        
        with torch.no_grad():
            outputs = self.model(X_tensor)
            probs = torch.sigmoid(outputs).cpu().numpy().flatten()
        
        # Return probabilities for both classes [negative_class, positive_class]
        return np.column_stack([1 - probs, probs])



# 4. Create sklearn-compatible model
model_name = f'gru_driver_{"_".join(drivers)}'  # Fixed syntax
model = PyTorchGRUWrapper(
    input_dim=dataset['X'].shape[2],
    hidden_dim=96,
    num_layers=2,
    epochs=30,
    batch_size=64
)

# 5. Your existing evaluation code
run_id = f"{datetime.now().strftime('%Y%m%d_%H%M%S')}_{model_name}"

evaluator = ModelEvaluationSuite(
    output_dir="evaluation_results",
    run_id=run_id,
)

model_metadata = create_model_metadata(
    model_name=model_name,
    model=model,
)

training_results = train_and_validate_model(
    model=model,
    splits=splits,
    class_names=class_names,
    evaluator=evaluator,
    dataset_metadata=dataset_metadata,
    model_metadata=model_metadata,
)

# 6. Evaluate on external dataset
external_config = DataConfig(
    sessions=[
        SessionConfig(2024, "Canadian Grand Prix", "R"),
        SessionConfig(2024, "Saudi Arabian Grand Prix", "R"),
    ],
    drivers=drivers,
    include_weather=False,
)

external_results = evaluate_on_external_dataset(
    trained_model=training_results["model"],
    external_config=external_config,
    original_dataset_metadata=dataset_metadata,
    model_metadata=model_metadata,
    class_names=class_names,
    evaluator=evaluator,
    resampling_strategy=dataset_metadata.resampling_strategy,
    resampling_config=dataset_metadata.resampling_config,
)

# 7. Compare results
compare_performance_across_datasets(training_results, external_results)


TRAINING WITH TEST: gru_driver_1_6_8_1_5_5_4_6_3_1_4_4_2_2_2_3_1_0_1_4_3_7_7_1_8_2_2_4_3_1_1_1_2_7_2_0
Training on train set...


RuntimeError: input.size(-1) must be equal to input_size. Expected 9, got 50

In [18]:
compare_performance_across_datasets(training_results, external_results)

NameError: name 'training_results' is not defined