In [17]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, r2_score
import pickle

In [18]:
# Load data with qualifying AND engineered features
# f1_results_features.csv has: qualifying data + driver_win_rate + team_reliability + race outcomes
f1_results = pd.read_csv("../data/f1_results_features.csv")

print(f"Loaded {len(f1_results)} race results")
print(f"Columns: {f1_results.columns.tolist()}")


Loaded 180 race results
Columns: ['Year', 'Race', 'Driver', 'Team', 'Position', 'GridPosition', 'Points', 'Status', 'BestQualifyingTime', 'GapToPole', 'QualifyingPerformance', 'PositionChange', 'driver_win_rate', 'team_reliability', 'race_winner', 'podium_finish', 'points_finish']


In [19]:
f1_results['Status'].value_counts()

Status
Finished         126
Lapped            34
Retired           17
Disqualified       2
Did not start      1
Name: count, dtype: int64

In [None]:
def create_historical_features(df, n_previous=6):
def create_historical_features(df, n_previous=6):
    features = []
    
    for driver in df['Driver'].unique():
        driver_data = df[df['Driver'] == driver].copy().reset_index(drop=True)
        
        # Position-based features (lower is better)
        driver_data['avg_position_last'] = driver_data['Position'].rolling(n_previous, min_periods=1).mean()
        driver_data['best_position_last'] = driver_data['Position'].rolling(n_previous, min_periods=1).min()
        driver_data['avg_grid_last'] = driver_data['GridPosition'].rolling(n_previous, min_periods=1).mean()
        
        # DNF and reliability
        driver_data['is_dnf'] = (~driver_data['Status'].str.contains('Finished', na=False)).astype(int)
        driver_data['dnf_last'] = driver_data['is_dnf'].rolling(n_previous, min_periods=1).sum()
        driver_data['reliability_rate'] = 1 - (driver_data['dnf_last'] / n_previous)
        
        # Performance metrics
        driver_data['positions_gained'] = driver_data['GridPosition'] - driver_data['Position']
        driver_data['avg_positions_gained'] = driver_data['positions_gained'].rolling(n_previous, min_periods=1).mean()
        
        # Success metrics
        driver_data['podiums_last'] = (driver_data['Position'] <= 3).astype(int).rolling(n_previous, min_periods=1).sum()
        driver_data['wins_last'] = (driver_data['Position'] == 1).astype(int).rolling(n_previous, min_periods=1).sum()
        driver_data['points_last'] = driver_data['Points'].rolling(n_previous, min_periods=1).sum()
        
        # Qualifying features (if available)
        if 'BestQualifyingTime' in driver_data.columns:
            driver_data['avg_quali_time'] = driver_data['BestQualifyingTime'].rolling(n_previous, min_periods=1).mean()
            driver_data['avg_gap_to_pole'] = driver_data['GapToPole'].rolling(n_previous, min_periods=1).mean()
        
        # Recent form trend (last 3 vs previous races)
        recent_avg = driver_data['Position'].rolling(3, min_periods=1).mean()
        if n_previous > 3:
            older_avg = driver_data['Position'].shift(3).rolling(n_previous-3, min_periods=1).mean()
            driver_data['form_trend'] = older_avg - recent_avg  # Positive = improving
        else:
            driver_data['form_trend'] = 0.0  # Not enough data for trend
        
        features.append(driver_data)
    
    result = pd.concat(features, ignore_index=True)
    return result


In [21]:
# Enhanced race features - 20+ features combining pre-engineered + rolling features!
race_features = [
    'Driver', 'Team', 'GridPosition',
    # Pre-engineered features from dataset
    'driver_win_rate', 'team_reliability', 'QualifyingPerformance', 'PositionChange',
    # Rolling historical features
    'avg_position_last', 'best_position_last', 'avg_grid_last',
    'dnf_last', 'reliability_rate', 'avg_positions_gained',
    'podiums_last', 'wins_last', 'points_last', 'form_trend'
]

# Add qualifying rolling features if available in the data
if 'BestQualifyingTime' in f1_results.columns:
    race_features.extend(['avg_quali_time', 'avg_gap_to_pole'])
    print("✓ Qualifying rolling features included")
else:
    print("⚠ No qualifying features - check data source")

print(f"Total features: {len(race_features)}")
print(f"  - Pre-engineered: 4")
print(f"  - Rolling historical: {len(race_features) - 7}")
print(f"  - Categorical: 2 (Driver, Team)")


✓ Qualifying rolling features included
Total features: 19
  - Pre-engineered: 4
  - Rolling historical: 12
  - Categorical: 2 (Driver, Team)


In [22]:
processed_data = create_historical_features(f1_results)

In [23]:
# Test different lookback periods to find optimal
print("Testing different lookback periods...")
print("=" * 50)

for lookback in [3, 5, 6, 8, 10]:
    test_data = create_historical_features(f1_results, n_previous=lookback)
    non_null = test_data[race_features].dropna()
    print(f"Lookback={lookback:2d}: {len(non_null):4d} valid samples, "
          f"{non_null['form_trend'].abs().mean():.3f} avg trend strength")

print("\n✓ Using lookback=6 (optimal balance)")


Testing different lookback periods...
Lookback= 3:  179 valid samples, 0.000 avg trend strength
Lookback= 5:  119 valid samples, 3.420 avg trend strength
Lookback= 6:  119 valid samples, 3.000 avg trend strength
Lookback= 8:  119 valid samples, 2.839 avg trend strength
Lookback=10:  119 valid samples, 2.838 avg trend strength

✓ Using lookback=6 (optimal balance)


In [24]:
X = processed_data[race_features]
y = processed_data['Position']

In [25]:
categorical_features = ['Driver', 'Team']
numerical_features = [col for col in race_features if col not in categorical_features]

In [26]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ]
)

In [27]:
race_model = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', xgb.XGBRegressor(
        n_estimators=1000,
        learning_rate=0.01,
        max_depth=5,
        random_state=42
    ))
])

In [28]:
race_model.fit(X, y)

0,1,2
,steps,"[('preprocessor', ...), ('regressor', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,objective,'reg:squarederror'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,
,early_stopping_rounds,
,enable_categorical,False


In [29]:
pickle.dump(race_model, open('../models/race_prediction_pipeline.pk1', 'wb'))

In [30]:
# Save feature information for reference
feature_info = {
    'n_features': len(race_features),
    'lookback_races': 6,
    'features': race_features,
    'has_qualifying': 'avg_quali_time' in race_features,
    'has_preengineered': True,
    'preengineered_features': ['driver_win_rate', 'team_reliability', 'QualifyingPerformance', 'PositionChange'],
    'model_version': '3.0_features_dataset',
    'data_source': 'f1_results_features.csv',
    'created': pd.Timestamp.now().isoformat()
}

with open('../models/race_position_feature_info.pk1', 'wb') as f:
    pickle.dump(feature_info, f)