# F1 Feature Store

This notebook develops a comprehensive feature store for F1 predictions, including:
- Weather data integration (simulated)
- Momentum indicators
- Track characteristics
- Team strategy patterns
- Advanced performance metrics

The feature store provides a centralized, reusable set of features for all models.

In [None]:
import pandas as pd
import numpy as np
from scipy import stats
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
import json
import warnings
warnings.filterwarnings('ignore')

# Set plotting style.
# The legacy plt.style.use('seaborn-darkgrid') call is not used because it may
# not work on all systems. Instead: prefer seaborn's theme, fall back to
# matplotlib's 'ggplot' style, and finally to the matplotlib defaults.
# Only ordinary errors are caught, so KeyboardInterrupt and SystemExit are
# never swallowed by the fallbacks.
try:
    import seaborn as sns
    sns.set_theme()  # Modern seaborn initialization
except Exception:
    try:
        plt.style.use('ggplot')  # Fallback style
    except Exception:
        pass  # Use default style
sns.set_palette('husl')

In [None]:
# Setup imports with robust path handling
import sys
import os
from pathlib import Path

# Locate the 'advanced' notebooks directory and put it at the front of
# sys.path so that the project modules imported below can be found.
try:
    # In Jupyter notebooks __file__ might not be defined, so the current
    # working directory is used as the starting point instead.
    notebook_dir = Path.cwd()

    if 'advanced' in notebook_dir.parts:
        # The working directory is (inside) a directory literally named
        # 'advanced'. Whole path components are compared so that names which
        # merely contain the word (e.g. 'advanced_user') do not match.
        module_dir = notebook_dir
    else:
        # Candidate locations of the advanced directory, in priority order
        possible_paths = [
            notebook_dir / 'notebooks' / 'advanced',  # From workspace root
            notebook_dir / 'advanced',  # From notebooks directory
            notebook_dir.parent / 'advanced',  # If we're in a sibling directory
            notebook_dir.parent / 'notebooks' / 'advanced',  # From other locations
        ]

        # The first candidate that exists wins, whether or not it is already
        # on sys.path. This keeps a re-run of the cell from inserting a
        # different, lower-priority directory in front of it.
        module_dir = next(
            (candidate for candidate in possible_paths if candidate.exists()),
            None,
        )

    if module_dir is not None and str(module_dir) not in sys.path:
        sys.path.insert(0, str(module_dir))

except Exception as e:
    # Best effort only: report the problem and fall back to the current dir
    print(f"Path setup warning: {e}")
    # Fallback to simple path addition
    sys.path.append('.')

# Import the required modules
from f1db_data_loader import load_f1db_data
from f1_ml import fix_column_mappings, merge_race_data

# Import feature engineering functions from f1_ml package
from f1_ml.features import (
    F1FeatureStore,
    create_track_features,
    simulate_weather_features,
    create_momentum_features,
    create_strategy_features,
    create_advanced_metrics
)

In [None]:
# Load F1 data
print("Loading F1DB data...")
data = load_f1db_data()

# Apply column mappings to ensure compatibility
print("\nApplying column mappings...")
data = fix_column_mappings(data)

# Extract individual dataframes for compatibility with existing code.
# A table that is missing from `data` is replaced by an empty DataFrame.
results = data.get('results', pd.DataFrame())
races = data.get('races', pd.DataFrame())
drivers = data.get('drivers', pd.DataFrame())
constructors = data.get('constructors', pd.DataFrame())
circuits = data.get('circuits', pd.DataFrame())
pit_stops = data.get('pit_stops', pd.DataFrame())
lap_times = data.get('lap_times', pd.DataFrame())
driver_standings = data.get('driver_standings', pd.DataFrame())

# Print summary: one line per table with its record count
print("\nData loaded successfully:")
for table_label, table in [
    ('Results', results),
    ('Races', races),
    ('Drivers', drivers),
    ('Constructors', constructors),
    ('Circuits', circuits),
    ('Pit stops', pit_stops),
    ('Lap times', lap_times),
    ('Driver standings', driver_standings),
]:
    print(f"  {table_label}: {len(table)} records")

# Create base dataframe with all race results.
# merge_race_data performs the merging automatically (recommended); for more
# control, results can instead be merged by hand with races, drivers,
# constructors and circuits.
df_base = merge_race_data(data)

# Convert dates if not already done by merge_race_data. Anything that is not
# already a datetime column is parsed; checking only for the 'object' dtype
# would skip pandas string dtypes and leave the dates as text.
for date_col in ('date', 'dob'):
    if date_col in df_base.columns and not pd.api.types.is_datetime64_any_dtype(df_base[date_col]):
        df_base[date_col] = pd.to_datetime(df_base[date_col])

# Calculate driver age (in years, at race date) if not already present
if 'driver_age' not in df_base.columns and 'date' in df_base.columns and 'dob' in df_base.columns:
    df_base['driver_age'] = (df_base['date'] - df_base['dob']).dt.days / 365.25

# Sort by date
df_base = df_base.sort_values(['date', 'raceId', 'positionOrder'])

print(f"Base dataframe shape: {df_base.shape}")
print(f"Date range: {df_base['date'].min()} to {df_base['date'].max()}")

# Show available columns
print(f"\nAvailable columns: {sorted(df_base.columns.tolist())}")

In [None]:
# Build the base dataframe by hand: join the race results with race, driver,
# constructor and circuit details. This reassigns df_base.
race_cols = ['raceId', 'year', 'round', 'circuitId', 'date']
driver_cols = ['driverId', 'driverRef', 'surname', 'code', 'dob']
constructor_cols = ['constructorId', 'constructorRef', 'name']
circuit_cols = ['circuitId', 'circuitRef', 'location', 'country', 'lat', 'lng']

df_base = (
    results
    .merge(races[race_cols], on='raceId')
    .merge(drivers[driver_cols], on='driverId')
    .merge(constructors[constructor_cols], on='constructorId',
           suffixes=('_race', '_constructor'))
    .merge(circuits[circuit_cols], on='circuitId')
)

# Parse both date columns, then derive the driver's age in years on race day
for date_col in ('date', 'dob'):
    df_base[date_col] = pd.to_datetime(df_base[date_col])
age_at_race = df_base['date'] - df_base['dob']
df_base['driver_age'] = age_at_race.dt.days / 365.25

# Chronological order, then race, then finishing order
df_base = df_base.sort_values(by=['date', 'raceId', 'positionOrder'])

first_race_date = df_base['date'].min()
last_race_date = df_base['date'].max()
print(f"Base dataframe shape: {df_base.shape}")
print(f"Date range: {first_race_date} to {last_race_date}")

## 2. Track Characteristics

In [None]:
# Build the track features with the imported helper
track_features = create_track_features(df_base, circuits)

# Preview the first ten rows of the selected track columns
track_preview_columns = [
    'circuitRef',
    'is_street_circuit',
    'is_high_speed',
    'is_technical',
    'overtaking_difficulty',
]
track_preview = track_features[track_preview_columns].head(10)

print("Track features created:")
print(track_preview)

## 3. Weather Features (Simulated)

In [None]:
# Generate weather features using imported function
weather_features = simulate_weather_features(df_base)

# Visualize weather distribution on a 2x2 grid of charts
fig, axes = plt.subplots(2, 2, figsize=(12, 8))

# Top-left: histogram of rain probability
axes[0, 0].hist(weather_features['rain_probability'], bins=20, edgecolor='black')
axes[0, 0].set_title('Rain Probability Distribution')
axes[0, 0].set_xlabel('Probability')

# Top-right: histogram of temperature
axes[0, 1].hist(weather_features['temperature'], bins=20, edgecolor='black')
axes[0, 1].set_title('Temperature Distribution')
axes[0, 1].set_xlabel('Temperature (°C)')

# Bottom-left: temperature against humidity, coloured by the wet-race flag
axes[1, 0].scatter(weather_features['temperature'], weather_features['humidity'],
                  alpha=0.5, c=weather_features['is_wet_race'], cmap='coolwarm')
axes[1, 0].set_xlabel('Temperature (°C)')
axes[1, 0].set_ylabel('Humidity (%)')
axes[1, 0].set_title('Temperature vs Humidity (color = wet race)')

# Bottom-right: share of wet races within five equal-width rain-probability
# bins. observed=False keeps every bin, including empty ones, independent of
# the default used by the installed pandas version.
rain_bins = pd.cut(weather_features['rain_probability'], bins=5)
wet_race_pct = weather_features.groupby(rain_bins, observed=False)['is_wet_race'].mean()
bin_positions = list(range(len(wet_race_pct)))
# The mean of the wet-race flag is a fraction; scale it to match the '%' label
axes[1, 1].bar(bin_positions, wet_race_pct.values * 100)
# Label each bar with the probability interval it represents
axes[1, 1].set_xticks(bin_positions)
axes[1, 1].set_xticklabels(
    [f"{interval.left:.2f}-{interval.right:.2f}" for interval in wet_race_pct.index],
    rotation=45,
)
axes[1, 1].set_xlabel('Rain Probability Bins')
axes[1, 1].set_ylabel('Actual Wet Race %')
axes[1, 1].set_title('Rain Probability vs Actual Wet Races')

plt.tight_layout()
plt.show()

print(f"\nWeather features generated for {len(weather_features)} races")
print(f"Wet race percentage: {weather_features['is_wet_race'].mean():.1%}")

## 4. Momentum and Form Features

In [None]:
# Build the momentum features with the imported helper
df_momentum = create_momentum_features(
    df_base,
    windows=[3, 5, 10],
    driver_standings=driver_standings,
)

# Restrict to 2020 onwards and rank drivers by total points in that period
recent_data = df_momentum[df_momentum['year'] >= 2020]
points_by_driver = recent_data.groupby('driverId')['points'].sum()
top_drivers = points_by_driver.nlargest(10).index

fig, ax = plt.subplots(figsize=(14, 8))

# Only the first five of the ranked drivers are drawn
for driver_id in top_drivers[:5]:
    is_this_driver = recent_data['driverId'] == driver_id
    driver_data = recent_data[is_this_driver].sort_values('date')
    driver_name = driver_data['surname'].iloc[0]

    # Smooth the 5-race momentum with a further 3-race rolling mean
    smoothed_momentum = driver_data['points_momentum_5'].rolling(3).mean()
    ax.plot(driver_data['date'], smoothed_momentum,
            label=driver_name, linewidth=2)

ax.set_title('Driver Momentum Trends (Top 5 Drivers)')
ax.set_xlabel('Date')
ax.set_ylabel('Points Momentum (5-race average)')
ax.legend()
ax.grid(True, alpha=0.3)

plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

# List every column whose name mentions momentum or trend
momentum_cols = [
    col for col in df_momentum.columns
    if any(keyword in col for keyword in ('momentum', 'trend'))
]
print("\nMomentum features created:")
print(f"Total momentum features: {len(momentum_cols)}")
print(f"Sample features: {momentum_cols[:5]}")

## 5. Strategy Pattern Features

In [None]:
# Create strategy features using imported function
df_strategy = create_strategy_features(df_momentum, pit_stops, lap_times)

print("\nStrategy features created")
strategy_cols = ['n_pit_stops', 'avg_pit_time', 'constructor_avg_pit_time',
                'lap_consistency_score', 'strategy_type']
# Report only the expected columns that are actually present
available_cols = [col for col in strategy_cols if col in df_strategy.columns]
print(f"Available strategy features: {available_cols}")

# Visualize strategy preferences if available
if 'strategy_type' in df_strategy.columns:
    # Per constructor: share of rows falling into each strategy type
    # (each row of strategy_prefs sums to 1)
    strategy_prefs = df_strategy.groupby(['constructorId', 'strategy_type']).size().unstack(fill_value=0)
    strategy_prefs = strategy_prefs.div(strategy_prefs.sum(axis=1), axis=0)

    # The ten constructors with the most rows in df_strategy
    top_constructors = df_strategy.groupby('constructorId').size().nlargest(10).index

    # groupby drops rows with a missing strategy_type, so a top constructor
    # without any known strategy is absent from strategy_prefs. Select only
    # the constructors that are present to avoid a KeyError.
    plotted_constructors = [c for c in top_constructors if c in strategy_prefs.index]
    strategy_prefs_top = strategy_prefs.loc[plotted_constructors]

    if strategy_prefs_top.empty:
        print("No strategy preferences available to plot")
    else:
        # Visualize strategy preferences
        fig, ax = plt.subplots(figsize=(10, 6))

        # Shares are fractions; scale to percent to match the axis label
        (strategy_prefs_top * 100).plot(kind='bar', stacked=True, ax=ax)
        ax.set_xlabel('Constructor ID')
        ax.set_ylabel('Strategy Preference %')
        ax.set_title('Pit Stop Strategy Preferences by Constructor')
        ax.legend(title='Strategy Type')

        plt.tight_layout()
        plt.show()

## 6. Advanced Performance Metrics

In [None]:
# Create advanced metrics using imported function
df_advanced, h2h_records = create_advanced_metrics(df_strategy, drivers)

print("\nAdvanced metrics created:")
advanced_cols = ['era_adjusted_points', 'era_adjusted_position', 'teammate_position_diff',
                'clutch_factor', 'start_performance']
print(f"New advanced features: {[col for col in advanced_cols if col in df_advanced.columns]}")

# Surname lookup shared by the charts below, built once from the drivers
# table (first row wins if a driverId is repeated). Ids that are missing from
# the table are labelled 'Driver<id>' instead of raising an error.
driver_names = drivers.drop_duplicates('driverId').set_index('driverId')['surname'].to_dict()

# Visualize advanced metrics
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# Era-adjusted performance over time
era_performance = df_advanced.groupby('year')['era_adjusted_points'].mean()
axes[0, 0].plot(era_performance.index, era_performance.values)
axes[0, 0].set_xlabel('Year')
axes[0, 0].set_ylabel('Era-Adjusted Points')
axes[0, 0].set_title('Average Era-Adjusted Performance Over Time')
axes[0, 0].grid(True, alpha=0.3)

# Clutch factor distribution
clutch_stats = df_advanced.groupby('driverId').agg({
    'clutch_points': 'mean',
    'points': 'mean'
})
# The 0.1 in the denominator avoids division by zero for scoreless drivers
clutch_stats['clutch_factor'] = clutch_stats['clutch_points'] / (clutch_stats['points'] + 0.1)
axes[0, 1].hist(clutch_stats['clutch_factor'].dropna(), bins=20, edgecolor='black')
axes[0, 1].set_xlabel('Clutch Factor')
axes[0, 1].set_ylabel('Number of Drivers')
axes[0, 1].set_title('Distribution of Driver Clutch Factors')
axes[0, 1].grid(True, alpha=0.3)

# Teammate performance comparison: the ten smallest average differences
recent_teammate_diff = df_advanced[df_advanced['year'] >= 2020].groupby('driverId')['teammate_position_diff'].mean()
top_teammates = recent_teammate_diff.nsmallest(10)

axes[1, 0].barh(range(len(top_teammates)), top_teammates.values)
axes[1, 0].set_yticks(range(len(top_teammates)))
axes[1, 0].set_yticklabels([driver_names.get(d, f'Driver{d}')
                            for d in top_teammates.index])
axes[1, 0].set_xlabel('Average Position Difference vs Teammate')
axes[1, 0].set_title('Top 10 Drivers vs Teammates (2020+)')
axes[1, 0].grid(True, alpha=0.3)

# Head-to-head matrix
if not h2h_records.empty:
    # Create matrix for visualization: rows are driver1, columns are driver2
    h2h_pivot = h2h_records.pivot_table(
        values='driver1_wins',
        index='driver1',
        columns='driver2',
        aggfunc='sum'
    )

    # Replace ids with surnames on both axes. The lookup covers all drivers,
    # so drivers that appear only as columns are named as well.
    h2h_pivot.index = [driver_names.get(d, f'Driver{d}') for d in h2h_pivot.index]
    h2h_pivot.columns = [driver_names.get(d, f'Driver{d}') for d in h2h_pivot.columns]

    sns.heatmap(h2h_pivot.fillna(0), annot=True, fmt='.0f', cmap='RdYlGn',
               ax=axes[1, 1], cbar_kws={'label': 'Wins'})
    axes[1, 1].set_title('Head-to-Head Records (Top Drivers)')

plt.tight_layout()
plt.show()

## 7. Feature Store Assembly

In [None]:
# F1FeatureStore comes from the f1_ml package
from f1_ml.features import F1FeatureStore

# Instantiate the store and build it from the engineered frames
feature_store = F1FeatureStore()
all_features = feature_store.build_feature_store(
    df_advanced,
    track_features,
    weather_features,
)

# Headline numbers for the assembled store
store_first_date = all_features['date'].min()
store_last_date = all_features['date'].max()

print("\nFeature Store Summary:")
print("=" * 50)
print(f"Total records: {len(all_features):,}")
print(f"Total features: {len(all_features.columns)}")
print(f"Date range: {store_first_date} to {store_last_date}")

# One line per feature group with the number of features it holds
print("\nFeature groups:")
for group, metadata in feature_store.feature_metadata.items():
    feature_count = metadata['count']
    print(f"  {group}: {feature_count} features")

# Persist the store to disk
feature_store.save_feature_store('f1_feature_store.parquet')

## 8. Feature Quality Analysis

In [None]:
def _compute_feature_stats(df, target='positionOrder'):
    """Return one row of summary statistics per non-ID numeric column of *df*."""
    id_columns = {'raceId', 'driverId', 'constructorId', 'circuitId'}
    has_target = target in df.columns

    columns = ['feature', 'missing_pct', 'unique_values', 'std_dev', 'skewness', 'kurtosis']
    if has_target:
        columns.append('correlation_with_position')

    rows = []
    for feature in df.select_dtypes(include=[np.number]).columns:
        if feature in id_columns:  # Skip IDs
            continue

        values = df[feature]
        row = {
            'feature': feature,
            'missing_pct': values.isna().mean() * 100,
            'unique_values': values.nunique(),
            'std_dev': values.std(),
            'skewness': values.skew(),
            'kurtosis': values.kurtosis(),
        }

        # Correlation with target (position)
        if has_target:
            row['correlation_with_position'] = values.corr(df[target])

        rows.append(row)

    # Passing the column list keeps the schema stable even with zero rows
    return pd.DataFrame(rows, columns=columns)


def _print_quality_report(feature_quality_df, ranked):
    """Print high-missing, low-variance and top-correlated features."""
    # High missing data
    high_missing = feature_quality_df[feature_quality_df['missing_pct'] > 20]
    if not high_missing.empty:
        print("\nFeatures with >20% missing data:")
        print(high_missing[['feature', 'missing_pct']].round(1))

    # Low variance features
    low_variance = feature_quality_df[feature_quality_df['std_dev'] < 0.01]
    if not low_variance.empty:
        print("\nLow variance features (might not be useful):")
        print(low_variance[['feature', 'std_dev']])

    # Highly correlated with target
    if 'correlation_with_position' in ranked.columns:
        high_corr = ranked.nlargest(10, 'correlation_with_position')
        print("\nTop 10 features correlated with position:")
        print(high_corr[['feature', 'correlation_with_position']].round(3))


def _plot_quality_overview(feature_quality_df, ranked, feature_metadata):
    """Draw the 2x2 overview: missing data, correlation, skewness, group sizes."""
    fig, axes = plt.subplots(2, 2, figsize=(14, 10))

    # Missing data
    missing_data = feature_quality_df.nlargest(15, 'missing_pct')
    axes[0, 0].barh(missing_data['feature'], missing_data['missing_pct'])
    axes[0, 0].set_xlabel('Missing %')
    axes[0, 0].set_title('Features with Most Missing Data')
    axes[0, 0].grid(True, alpha=0.3)

    # Feature importance proxy (absolute correlation)
    if 'abs_correlation' in ranked.columns:
        top_corr = ranked.nlargest(15, 'abs_correlation')
        axes[0, 1].barh(top_corr['feature'], top_corr['abs_correlation'])
        axes[0, 1].set_xlabel('Absolute Correlation with Position')
        axes[0, 1].set_title('Most Predictive Features')
        axes[0, 1].grid(True, alpha=0.3)

    # Skewness distribution
    axes[1, 0].hist(feature_quality_df['skewness'].dropna(), bins=30, edgecolor='black')
    axes[1, 0].set_xlabel('Skewness')
    axes[1, 0].set_ylabel('Number of Features')
    axes[1, 0].set_title('Feature Skewness Distribution')
    axes[1, 0].axvline(x=0, color='red', linestyle='--', label='No skew')
    axes[1, 0].legend()
    axes[1, 0].grid(True, alpha=0.3)

    # Feature count by group
    group_counts = pd.Series({group: metadata['count']
                             for group, metadata in feature_metadata.items()})
    axes[1, 1].pie(group_counts.values, labels=group_counts.index, autopct='%1.1f%%')
    axes[1, 1].set_title('Feature Distribution by Group')

    plt.tight_layout()
    plt.show()


def analyze_feature_quality(feature_store):
    """
    Analyze feature quality and usefulness.

    Computes per-feature statistics for every numeric column of the store
    (ID columns excluded), prints a short report and shows a 2x2 overview
    figure.

    Args:
        feature_store: Object exposing ``base_features`` (a DataFrame) and
            ``feature_metadata`` (a mapping from group name to a dict with a
            ``'count'`` entry).

    Returns:
        DataFrame with one row per analyzed feature and the columns
        ``feature``, ``missing_pct``, ``unique_values``, ``std_dev``,
        ``skewness`` and ``kurtosis``. When ``positionOrder`` is a column of
        the store, ``correlation_with_position`` is included, and
        ``abs_correlation`` as well unless the frame is empty. The frame is
        empty, and no report or figure is produced, when there is no non-ID
        numeric feature.
    """
    target = 'positionOrder'
    df = feature_store.base_features

    feature_quality_df = _compute_feature_stats(df, target)

    # Identify potential issues
    print("\nFeature Quality Analysis:")
    print("=" * 50)

    if feature_quality_df.empty:
        # Nothing to report or plot; return the empty, correctly-shaped frame
        print("\nNo numeric (non-ID) features to analyze.")
        return feature_quality_df

    if 'correlation_with_position' in feature_quality_df.columns:
        feature_quality_df['abs_correlation'] = feature_quality_df['correlation_with_position'].abs()

    # The target correlates perfectly with itself, so it is left out of the
    # correlation rankings (it is still part of the returned frame).
    ranked = feature_quality_df[feature_quality_df['feature'] != target]

    _print_quality_report(feature_quality_df, ranked)
    _plot_quality_overview(feature_quality_df, ranked, feature_store.feature_metadata)

    return feature_quality_df

# Run the quality analysis on the assembled feature store
feature_quality = analyze_feature_quality(feature_store)

# Ask the store for the features belonging to the recommended groups
recommended_groups = ['basic', 'driver', 'track', 'weather', 'momentum', 'advanced']
recommended_features = feature_store.get_feature_set(recommended_groups)

recommended_count = len(recommended_features)
print(f"\nRecommended feature set: {recommended_count} features")

## Summary

The F1 Feature Store provides:

1. **Track Characteristics**: Circuit type, overtaking difficulty, historical performance
2. **Weather Features**: Simulated weather conditions affecting race outcomes
3. **Momentum Indicators**: Recent form, consistency, championship pressure
4. **Strategy Patterns**: Pit stop timing, tire strategies, team preferences
5. **Advanced Metrics**: Era-adjusted performance, clutch factor, head-to-head records

### Key Insights:
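- Weather is expected to significantly impact race strategies and outcomes (the weather features in this notebook are simulated, so this is not validated against real conditions)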
- Momentum features capture driver form better than static averages
- Track characteristics strongly influence overtaking opportunities
- Team strategy patterns are predictable and vary by constructor

The feature store is saved as a Parquet file for efficient storage and quick loading in production models.