# F1 Feature Store

This notebook develops a comprehensive feature store for F1 predictions, including:
- Weather data integration (simulated)
- Momentum indicators
- Track characteristics
- Team strategy patterns
- Advanced performance metrics

The feature store provides a centralized, reusable set of features for all models.

In [None]:
import pandas as pd
import numpy as np
from scipy import stats
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
import json
import warnings
# Silence every warning category for the rest of the session (keeps notebook
# output compact, at the cost of hiding deprecation notices).
warnings.simplefilter('ignore')

# Plot appearance shared by every figure below
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette('husl')

In [None]:
# Load data: make the local loader module importable, then fetch every table
import sys
sys.path.append('.')
from enhanced_f1db_data_loader import load_f1db_data_enhanced

print("Loading F1 data...")
f1_data = load_f1db_data_enhanced(data_dir='../../data/f1db', auto_sync=True)


def _dataset(name):
    """Return the named table from ``f1_data``, or an empty DataFrame if absent."""
    return f1_data.get(name, pd.DataFrame())


# Get all necessary datasets (missing ones become empty frames, never None)
results = _dataset('results')
races = _dataset('races')
drivers = _dataset('drivers')
constructors = _dataset('constructors')
qualifying = _dataset('qualifying')
circuits = _dataset('circuits')
lap_times = _dataset('lap_times')
pit_stops = _dataset('pit_stops')
driver_standings = _dataset('driver_standings')
constructor_standings = _dataset('constructor_standings')
status = _dataset('status')

print(f"Loaded {len(f1_data)} datasets")

## 1. Base Data Preparation

In [None]:
# Create base dataframe: one row per race result, enriched with race, driver,
# constructor and circuit attributes (all joins are inner joins).
race_columns = ['raceId', 'year', 'round', 'circuitId', 'date', 'name']
driver_columns = ['driverId', 'driverRef', 'surname', 'code', 'dob']
constructor_columns = ['constructorId', 'constructorRef', 'name']
circuit_columns = ['circuitId', 'circuitRef', 'location', 'country', 'lat', 'lng']

df_base = (
    results
    .merge(races[race_columns], on='raceId')
    .merge(drivers[driver_columns], on='driverId')
    # Both races and constructors carry a 'name' column; the suffixes keep
    # them apart as name_race / name_constructor.
    .merge(constructors[constructor_columns], on='constructorId',
           suffixes=('_race', '_constructor'))
    .merge(circuits[circuit_columns], on='circuitId')
)

# Convert dates and derive the driver's age (in years) on race day
for date_column in ('date', 'dob'):
    df_base[date_column] = pd.to_datetime(df_base[date_column])
age_in_days = (df_base['date'] - df_base['dob']).dt.days
df_base['driver_age'] = age_in_days / 365.25

# Chronological order, then finishing order within each race
df_base = df_base.sort_values(['date', 'raceId', 'positionOrder'])

first_race_date = df_base['date'].min()
last_race_date = df_base['date'].max()
print(f"Base dataframe shape: {df_base.shape}")
print(f"Date range: {first_race_date} to {last_race_date}")

## 2. Track Characteristics

In [None]:
def create_track_features(df, circuits):
    """
    Create track characteristic features, one row per circuit.

    Args:
        df: Race-result rows. Must provide the columns circuitId,
            positionOrder, statusId, laps, milliseconds, points and grid.
        circuits: Circuit table. Must provide circuitId and circuitRef.

    Returns:
        ``circuits`` with these columns appended:
        avg_position, position_variance (despite the name this is the
        standard deviation of positionOrder), dnf_rate (share of results
        with statusId > 1), avg_laps, avg_race_time, avg_points,
        is_street_circuit / is_high_speed / is_technical / is_high_altitude
        (0/1 flags matched on lower-cased circuitRef), overtaking_index
        (mean |grid - positionOrder|) and overtaking_difficulty
        (1 - overtaking_index / max overtaking_index).
        Circuits without any result rows keep NaN in the statistic columns.
    """
    # Calculate track statistics
    # NOTE(review): dnf_rate treats every statusId other than 1 as a DNF;
    # confirm that lapped-but-classified statuses should count as well.
    track_stats = df.groupby('circuitId').agg({
        'positionOrder': ['mean', 'std'],
        'statusId': lambda x: (x > 1).mean(),  # DNF rate
        'laps': 'mean',
        'milliseconds': 'mean',
        'points': 'mean'
    }).round(3)

    track_stats.columns = ['avg_position', 'position_variance', 'dnf_rate',
                           'avg_laps', 'avg_race_time', 'avg_points']

    # Add circuit information
    track_features = circuits.merge(track_stats, left_on='circuitId', right_index=True, how='left')

    # Categorize tracks: flag column -> circuitRef values belonging to it
    circuit_categories = {
        # Street circuits (Monaco, Singapore, etc.)
        'is_street_circuit': ['monaco', 'singapore', 'adelaide', 'detroit', 'phoenix',
                              'dallas', 'las_vegas', 'baku', 'sochi', 'valencia'],
        # High-speed circuits (Monza, Spa, etc.)
        'is_high_speed': ['monza', 'spa', 'silverstone', 'suzuka', 'interlagos'],
        # Technical circuits (Monaco, Hungary, etc.)
        'is_technical': ['monaco', 'hungaroring', 'marina_bay', 'catalunya'],
        # Altitude effect (Mexico City, Interlagos, Red Bull Ring)
        'is_high_altitude': ['rodriguez', 'interlagos', 'red_bull_ring'],
    }
    circuit_ref = track_features['circuitRef'].str.lower()
    for flag_column, circuit_refs in circuit_categories.items():
        track_features[flag_column] = circuit_ref.isin(circuit_refs).astype(int)

    # Overtaking index: average absolute position change from grid to finish,
    # computed in a single grouped pass instead of re-filtering df per circuit.
    overtaking_df = (
        (df['grid'] - df['positionOrder']).abs()
        .groupby(df['circuitId'])
        .mean()
        .rename('overtaking_index')
        .reset_index()
    )
    track_features = track_features.merge(overtaking_df, on='circuitId', how='left')

    # Normalize overtaking index. When no circuit shows any position change
    # (max == 0) or nothing could be computed (max is NaN), dividing by the
    # max would produce 0/0; fall back to a scale of 1 so circuits with an
    # index of 0 get difficulty 1.0 and missing ones stay NaN.
    max_index = track_features['overtaking_index'].max()
    scale = max_index if max_index > 0 else 1.0
    track_features['overtaking_difficulty'] = 1 - (track_features['overtaking_index'] / scale)

    return track_features

# Build the per-circuit feature table and preview the derived flags
track_features = create_track_features(df_base, circuits)

preview_columns = ['circuitRef', 'is_street_circuit', 'is_high_speed',
                   'is_technical', 'overtaking_difficulty']
print("Track features created:")
print(track_features.loc[:, preview_columns].head(10))

## 3. Weather Features (Simulated)

In [None]:
def simulate_weather_features(df):
    """
    Simulate weather features based on historical patterns
    Note: In production, this would integrate with actual weather APIs

    Args:
        df: Race-result rows with the columns raceId, date (datetime) and
            location. Only the first row of each raceId is consulted.

    Returns:
        DataFrame with one row per raceId (in order of first appearance) and
        the columns raceId, rain_probability, is_wet_race, temperature,
        track_temp, humidity, wind_speed, weather_changeability.

    The draws come from a private generator seeded with 42, so the output is
    reproducible and the process-wide NumPy random state is left untouched.
    ``RandomState(42)`` yields the same number stream as ``np.random.seed(42)``
    followed by the module-level functions.
    """
    rng = np.random.RandomState(42)  # For reproducibility, without reseeding np.random

    high_rain_locations = ('Silverstone', 'Spa-Francorchamps', 'Suzuka', 'São Paulo')
    medium_rain_locations = ('Monaco', 'Singapore', 'Kuala Lumpur')

    # One representative row per race, in order of first appearance. This
    # replaces filtering the whole frame once per race.
    one_row_per_race = df.drop_duplicates(subset='raceId')

    weather_features = []

    for race_id, race_date, location in zip(one_row_per_race['raceId'],
                                            one_row_per_race['date'],
                                            one_row_per_race['location']):
        # Simulate based on location and season
        month = race_date.month

        # Rain probability based on location and season
        if location in high_rain_locations:
            rain_prob_base = 0.3
        elif location in medium_rain_locations:
            rain_prob_base = 0.2
        else:
            rain_prob_base = 0.1

        # Seasonal adjustment
        if month in [6, 7, 8]:  # Summer
            rain_prob = rain_prob_base * 0.7
        elif month in [3, 4, 5, 9, 10, 11]:  # Spring/Fall
            rain_prob = rain_prob_base * 1.2
        else:  # Winter
            rain_prob = rain_prob_base * 1.5

        # Generate weather conditions. The draw order below (wet flag,
        # temperature, track temp, humidity, wind, changeability) determines
        # which random numbers each field receives; keep it stable.
        is_wet = rng.random_sample() < rain_prob

        weather_features.append({
            'raceId': race_id,
            'rain_probability': min(rain_prob, 0.8),
            'is_wet_race': int(is_wet),
            'temperature': rng.normal(22 + (month - 6) * 2, 5),  # Base 22°C, shifted 2°C per month away from June
            'track_temp': rng.normal(30 + (month - 6) * 3, 7),
            'humidity': rng.normal(60 + is_wet * 20, 10),  # wet races centre 20 points higher
            'wind_speed': rng.exponential(10),
            'weather_changeability': rng.beta(2, 5)  # How likely weather is to change
        })

    weather_df = pd.DataFrame(weather_features)

    # Ensure reasonable ranges
    weather_df['temperature'] = weather_df['temperature'].clip(5, 40)
    weather_df['track_temp'] = weather_df['track_temp'].clip(10, 60)
    weather_df['humidity'] = weather_df['humidity'].clip(20, 95)
    weather_df['wind_speed'] = weather_df['wind_speed'].clip(0, 40)

    return weather_df

# Generate weather features
weather_features = simulate_weather_features(df_base)

# Visualize weather distribution
fig, axes = plt.subplots(2, 2, figsize=(12, 8))

axes[0, 0].hist(weather_features['rain_probability'], bins=20, edgecolor='black')
axes[0, 0].set_title('Rain Probability Distribution')
axes[0, 0].set_xlabel('Probability')

axes[0, 1].hist(weather_features['temperature'], bins=20, edgecolor='black')
axes[0, 1].set_title('Temperature Distribution')
axes[0, 1].set_xlabel('Temperature (°C)')

axes[1, 0].scatter(weather_features['temperature'], weather_features['humidity'],
                  alpha=0.5, c=weather_features['is_wet_race'], cmap='coolwarm')
axes[1, 0].set_xlabel('Temperature (°C)')
axes[1, 0].set_ylabel('Humidity (%)')
axes[1, 0].set_title('Temperature vs Humidity (color = wet race)')

# Share of simulated wet races per rain-probability bin. observed=False is
# passed explicitly so that empty bins are kept on every pandas version (the
# default for categorical groupers differs between releases); this keeps the
# bar positions aligned with the five bins.
rain_probability_bins = pd.cut(weather_features['rain_probability'], bins=5)
wet_race_pct = weather_features.groupby(rain_probability_bins, observed=False)['is_wet_race'].mean()
bar_positions = range(len(wet_race_pct))
axes[1, 1].bar(bar_positions, wet_race_pct.values)
# Label each bar with the probability interval it represents
axes[1, 1].set_xticks(list(bar_positions))
axes[1, 1].set_xticklabels([f"{interval.left:.2f}-{interval.right:.2f}" for interval in wet_race_pct.index],
                           rotation=45)
axes[1, 1].set_xlabel('Rain Probability Bins')
axes[1, 1].set_ylabel('Actual Wet Race %')
axes[1, 1].set_title('Rain Probability vs Actual Wet Races')

plt.tight_layout()
plt.show()

print(f"\nWeather features generated for {len(weather_features)} races")
print(f"Wet race percentage: {weather_features['is_wet_race'].mean():.1%}")

## 4. Momentum and Form Features

In [None]:
def create_momentum_features(df, windows=(3, 5, 10), standings=None):
    """
    Create momentum and form indicators

    Every rolling feature is built from *previous* races only (shift(1)
    before rolling), so the value on a row never includes that row's result.

    Args:
        df: Race-result rows with driverId, constructorId, raceId, date,
            positionOrder and points.
        windows: Rolling window lengths (in races) to compute features for.
        standings: Driver standings table with raceId, driverId, position and
            points. When omitted, the module-level ``driver_standings`` is
            used if it exists; when missing or empty the championship
            columns are not added.

    Returns:
        A copy of ``df`` sorted by driver and date with, per window ``w``:
        position_trend_w, points_momentum_w, consistency_w,
        teammate_dominance_w, constructor_momentum_w; plus beat_teammate and,
        when standings are available, championship_position,
        championship_points, points_gap_to_leader, championship_pressure.
    """
    if standings is None:
        # Backward compatibility: earlier versions read this global directly.
        standings = globals().get('driver_standings')

    df = df.copy()
    df = df.sort_values(['driverId', 'date'])

    # Best-placed car of its constructor in that race. Independent of the
    # window, so it is computed once rather than on every loop iteration.
    df['beat_teammate'] = df.groupby(['raceId', 'constructorId'])['positionOrder'].rank() == 1
    # Rolling over a shifted bool column would go through object dtype;
    # use a float copy for the arithmetic and keep the stored column boolean.
    beat_teammate_numeric = df['beat_teammate'].astype(float)

    # Driver momentum features
    for w in windows:
        # Position trend: slope of finishing position over the previous w races
        df[f'position_trend_{w}'] = df.groupby('driverId')['positionOrder'].transform(
            lambda x: x.shift(1).rolling(window=w, min_periods=1).apply(
                lambda y: stats.linregress(range(len(y)), y)[0] if len(y) > 1 else 0
            )
        )

        # Points momentum
        df[f'points_momentum_{w}'] = df.groupby('driverId')['points'].transform(
            lambda x: x.shift(1).rolling(window=w, min_periods=1).mean()
        )

        # Consistency score (inverse of std deviation)
        df[f'consistency_{w}'] = df.groupby('driverId')['positionOrder'].transform(
            lambda x: 1 / (1 + x.shift(1).rolling(window=w, min_periods=1).std())
        )

        # Beat teammate rate
        df[f'teammate_dominance_{w}'] = beat_teammate_numeric.groupby(df['driverId']).transform(
            lambda x: x.shift(1).rolling(window=w, min_periods=1).mean()
        )

    # Constructor momentum: points per constructor per race, ordered by race
    # date (raceId is only a tie-breaker) so the window follows the calendar.
    constructor_points = (
        df.groupby(['raceId', 'constructorId'], as_index=False)
        .agg(points=('points', 'sum'), date=('date', 'first'))
        .sort_values(['constructorId', 'date', 'raceId'])
    )

    for w in windows:
        # shift(1) keeps the current race's points out of its own feature,
        # matching the driver-level features above.
        constructor_points[f'constructor_momentum_{w}'] = constructor_points.groupby('constructorId')['points'].transform(
            lambda x: x.shift(1).rolling(window=w, min_periods=1).mean()
        )

    # Merge back
    df = df.merge(constructor_points.drop(columns=['points', 'date']),
                  on=['raceId', 'constructorId'], how='left')

    # Championship pressure (position in standings)
    # NOTE(review): if the standings rows describe the table *after* each race,
    # these columns include the current race's outcome - confirm before using
    # them as predictors.
    if standings is not None and not standings.empty:
        standings_features = standings.groupby(['raceId', 'driverId']).agg({
            'position': 'first',
            'points': 'first'
        }).reset_index()
        standings_features.columns = ['raceId', 'driverId', 'championship_position', 'championship_points']

        df = df.merge(standings_features, on=['raceId', 'driverId'], how='left')

        # Points gap to leader
        max_points = df.groupby('raceId')['championship_points'].transform('max')
        df['points_gap_to_leader'] = max_points - df['championship_points']
        df['championship_pressure'] = 1 / (1 + df['championship_position'])

    return df

# Create momentum features
df_momentum = create_momentum_features(df_base)

# Visualize momentum trends for the highest-scoring drivers since 2020
is_recent = df_momentum['year'] >= 2020
recent_data = df_momentum.loc[is_recent]
points_by_driver = recent_data.groupby('driverId')['points'].sum()
top_drivers = points_by_driver.nlargest(10).index

fig, ax = plt.subplots(figsize=(14, 8))

for driver_id in top_drivers[:5]:
    driver_rows = recent_data['driverId'] == driver_id
    driver_data = recent_data[driver_rows].sort_values('date')
    driver_name = driver_data['surname'].iloc[0]

    # Smooth the 5-race momentum with a further 3-race rolling mean
    smoothed_momentum = driver_data['points_momentum_5'].rolling(3).mean()
    ax.plot(driver_data['date'], smoothed_momentum, label=driver_name, linewidth=2)

ax.set_title('Driver Momentum Trends (Top 5 Drivers)')
ax.set_xlabel('Date')
ax.set_ylabel('Points Momentum (5-race average)')
ax.grid(True, alpha=0.3)
ax.legend()

plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

# Summarise which momentum/trend columns were produced
momentum_cols = [col for col in df_momentum.columns
                 if any(keyword in col for keyword in ('momentum', 'trend'))]
print("\nMomentum features created:")
print(f"Total momentum features: {len(momentum_cols)}")
print(f"Sample features: {momentum_cols[:5]}")

## 5. Strategy Pattern Features

In [None]:
def create_strategy_features(df, pit_stops, lap_times, lap_sample_races=20, plot=True):
    """
    Create features related to team strategy patterns

    Args:
        df: Race-result rows with raceId, driverId and constructorId (and
            date, used to pick the most recent races for lap-time sampling).
        pit_stops: Pit stop rows with raceId, driverId, stop, lap and
            milliseconds. May be empty.
        lap_times: Lap rows with raceId, driverId and milliseconds. May be
            empty.
        lap_sample_races: Number of most recent races whose lap times are
            summarised (the full lap table is large).
        plot: When True, draw the stacked bar chart of strategy preferences
            per constructor.

    Returns:
        A copy of ``df`` left-joined with, where the inputs allow:
        n_pit_stops, avg_pit_time, pit_time_variance (a standard deviation),
        first_stop_lap, last_stop_lap, strategy_type,
        constructor_avg_pit_time, constructor_pit_consistency, avg_lap_time,
        lap_time_std, fastest_lap, lap_consistency_score.
        Rows without matching pit-stop or lap data keep NaN in those columns.
    """
    # Pit stop analysis
    if not pit_stops.empty:
        # Per driver and race: number of stops, stop duration, first/last stop lap
        pit_stop_stats = pit_stops.groupby(['raceId', 'driverId']).agg({
            'stop': 'count',
            'milliseconds': ['mean', 'std'],
            'lap': ['min', 'max']
        }).reset_index()

        pit_stop_stats.columns = ['raceId', 'driverId', 'n_pit_stops',
                                  'avg_pit_time', 'pit_time_variance',
                                  'first_stop_lap', 'last_stop_lap']

        # Tire strategy patterns (inferred from the number of stops):
        # 1 -> one_stop, 2 -> two_stop, anything else -> multi_stop,
        # missing count -> unknown.
        n_stops = pit_stop_stats['n_pit_stops']
        strategy = n_stops.map({1: 'one_stop', 2: 'two_stop'}).fillna('multi_stop')
        pit_stop_stats['strategy_type'] = strategy.where(n_stops.notna(), 'unknown')

        # Calculate pit stop efficiency by constructor (over all races in df)
        constructor_pit_efficiency = pit_stops.merge(
            df[['raceId', 'driverId', 'constructorId']].drop_duplicates(),
            on=['raceId', 'driverId']
        ).groupby('constructorId')['milliseconds'].agg(['mean', 'std']).reset_index()

        constructor_pit_efficiency.columns = ['constructorId',
                                             'constructor_avg_pit_time',
                                             'constructor_pit_consistency']
    else:
        pit_stop_stats = pd.DataFrame()
        constructor_pit_efficiency = pd.DataFrame()

    # Lap time consistency
    if not lap_times.empty:
        # Sample lap times (full dataset is too large). Pick the most recent
        # races by date: the row order of df is not chronological once it has
        # been sorted by driver, so positional slicing would pick arbitrary
        # races.
        if 'date' in df.columns:
            races_by_date = df.drop_duplicates(subset='raceId').sort_values(
                'date', kind='stable', na_position='first')
            sample_races = races_by_date['raceId'].tail(lap_sample_races)
        else:
            # No date available: fall back to order of appearance
            sample_races = df['raceId'].unique()[-lap_sample_races:]
        lap_time_sample = lap_times[lap_times['raceId'].isin(sample_races)]

        lap_consistency = lap_time_sample.groupby(['raceId', 'driverId']).agg({
            'milliseconds': ['mean', 'std', 'min']
        }).reset_index()

        lap_consistency.columns = ['raceId', 'driverId',
                                  'avg_lap_time', 'lap_time_std', 'fastest_lap']

        # 1 / (1 + coefficient of variation): 1.0 means perfectly even laps
        lap_consistency['lap_consistency_score'] = 1 / (1 + lap_consistency['lap_time_std'] / lap_consistency['avg_lap_time'])
    else:
        lap_consistency = pd.DataFrame()

    # Merge all strategy features
    df_strategy = df.copy()

    if not pit_stop_stats.empty:
        df_strategy = df_strategy.merge(pit_stop_stats, on=['raceId', 'driverId'], how='left')

    if not constructor_pit_efficiency.empty:
        df_strategy = df_strategy.merge(constructor_pit_efficiency, on='constructorId', how='left')

    if not lap_consistency.empty:
        df_strategy = df_strategy.merge(lap_consistency, on=['raceId', 'driverId'], how='left')

    # Calculate and visualize constructor strategy preferences
    if plot and 'strategy_type' in df_strategy.columns:
        strategy_prefs = df_strategy.groupby(['constructorId', 'strategy_type']).size().unstack(fill_value=0)
        strategy_prefs = strategy_prefs.div(strategy_prefs.sum(axis=1), axis=0)

        # Rank constructors only among rows that actually have a strategy:
        # constructors without any pit-stop data are absent from
        # strategy_prefs and selecting them by label would raise KeyError.
        rows_with_strategy = df_strategy[df_strategy['strategy_type'].notna()]
        top_constructors = rows_with_strategy.groupby('constructorId').size().nlargest(10).index
        strategy_prefs_top = strategy_prefs.loc[top_constructors]

        fig, ax = plt.subplots(figsize=(10, 6))

        strategy_prefs_top.plot(kind='bar', stacked=True, ax=ax)
        ax.set_xlabel('Constructor ID')
        ax.set_ylabel('Strategy Preference %')
        ax.set_title('Pit Stop Strategy Preferences by Constructor')
        ax.legend(title='Strategy Type')

        plt.tight_layout()
        plt.show()

    return df_strategy

# Create strategy features on top of the momentum-enriched frame
df_strategy = create_strategy_features(df_momentum, pit_stops, lap_times)

print("\nStrategy features created")

# Report which of the expected strategy columns actually made it into the frame
# (some are absent when the pit stop or lap time tables are empty).
strategy_cols = ['n_pit_stops', 'avg_pit_time', 'constructor_avg_pit_time',
                 'lap_consistency_score', 'strategy_type']
produced_columns = set(df_strategy.columns)
available_cols = [col for col in strategy_cols if col in produced_columns]
print(f"Available strategy features: {available_cols}")

## 6. Advanced Performance Metrics

In [None]:
def _head_to_head_records(df, n_drivers=10, min_shared_races=10):
    """Count, for each pair of top-scoring drivers, who finished ahead in shared races."""
    columns = ['driver1', 'driver2', 'driver1_wins', 'driver2_wins', 'total_races']
    top_drivers = df.groupby('driverId')['points'].sum().nlargest(n_drivers).index
    if len(top_drivers) < 2:
        return pd.DataFrame(columns=columns)

    # raceId x driverId table of finishing positions (NaN = driver not in race).
    # reindex guarantees a column per top driver even if one has no positions.
    finishing = (
        df[df['driverId'].isin(top_drivers)]
        .pivot_table(index='raceId', columns='driverId', values='positionOrder', aggfunc='first')
        .reindex(columns=top_drivers)
    )

    records = []
    for d1 in top_drivers:
        for d2 in top_drivers:
            if not d1 < d2:  # Avoid duplicates and self-pairs
                continue
            shared = finishing[[d1, d2]].dropna()
            if len(shared) <= min_shared_races:  # Minimum races together
                continue
            d1_wins = int((shared[d1] < shared[d2]).sum())
            records.append({
                'driver1': d1,
                'driver2': d2,
                'driver1_wins': d1_wins,
                'driver2_wins': len(shared) - d1_wins,
                'total_races': len(shared)
            })

    return pd.DataFrame(records, columns=columns)


def _plot_advanced_metrics(df, clutch_stats, h2h_df):
    """Draw the 2x2 overview figure for the advanced metrics."""
    # Driver labels come from the frame itself when it carries surnames;
    # unknown ids fall back to a generic 'Driver<id>' label.
    if 'surname' in df.columns:
        surname_by_id = df.drop_duplicates(subset='driverId').set_index('driverId')['surname'].to_dict()
    else:
        surname_by_id = {}

    def driver_label(driver_id):
        return surname_by_id.get(driver_id, f'Driver{driver_id}')

    fig, axes = plt.subplots(2, 2, figsize=(14, 10))

    # Era-adjusted performance over time
    era_performance = df.groupby('year')['era_adjusted_points'].mean()
    axes[0, 0].plot(era_performance.index, era_performance.values)
    axes[0, 0].set_xlabel('Year')
    axes[0, 0].set_ylabel('Era-Adjusted Points')
    axes[0, 0].set_title('Average Era-Adjusted Performance Over Time')
    axes[0, 0].grid(True, alpha=0.3)

    # Clutch factor distribution
    axes[0, 1].hist(clutch_stats['clutch_factor'].dropna(), bins=20, edgecolor='black')
    axes[0, 1].set_xlabel('Clutch Factor')
    axes[0, 1].set_ylabel('Number of Drivers')
    axes[0, 1].set_title('Distribution of Driver Clutch Factors')
    axes[0, 1].grid(True, alpha=0.3)

    # Teammate performance comparison (most negative = furthest ahead of team average)
    recent_teammate_diff = df[df['year'] >= 2020].groupby('driverId')['teammate_position_diff'].mean()
    top_teammates = recent_teammate_diff.nsmallest(10)

    axes[1, 0].barh(range(len(top_teammates)), top_teammates.values)
    axes[1, 0].set_yticks(range(len(top_teammates)))
    axes[1, 0].set_yticklabels([driver_label(d) for d in top_teammates.index])
    axes[1, 0].set_xlabel('Average Position Difference vs Teammate')
    axes[1, 0].set_title('Top 10 Drivers vs Teammates (2020+)')
    axes[1, 0].grid(True, alpha=0.3)

    # Head-to-head matrix
    if not h2h_df.empty:
        h2h_pivot = h2h_df.pivot_table(
            values='driver1_wins',
            index='driver1',
            columns='driver2',
            aggfunc='sum'
        )
        h2h_pivot.index = [driver_label(d) for d in h2h_pivot.index]
        h2h_pivot.columns = [driver_label(d) for d in h2h_pivot.columns]

        sns.heatmap(h2h_pivot.fillna(0), annot=True, fmt='.0f', cmap='RdYlGn',
                   ax=axes[1, 1], cbar_kws={'label': 'Wins'})
        axes[1, 1].set_title('Head-to-Head Records (Top Drivers)')

    plt.tight_layout()
    plt.show()


def create_advanced_metrics(df, plot=True):
    """
    Create advanced performance metrics

    Args:
        df: Race-result rows with raceId, constructorId, driverId, year,
            round, grid, position, positionOrder and points.
        plot: When True, draw the overview figure (era-adjusted points,
            clutch factor histogram, teammate comparison, head-to-head
            heatmap).

    Returns:
        (df, h2h_df) where ``df`` is a copy of the input with
        teammate_position_diff, teammate_points_ratio, era_* statistics,
        era_adjusted_points, era_adjusted_position, start_performance,
        is_late_season, clutch_points and clutch_factor added, and
        ``h2h_df`` has one row per pair of top-10 point scorers that shared
        more than 10 races (driver1, driver2, driver1_wins, driver2_wins,
        total_races).
    """
    df = df.copy()

    # Relative performance to teammate. transform() returns values aligned to
    # df's own index, so every row receives the statistic of its own
    # (race, constructor) group regardless of how df is ordered.
    same_team_same_race = df.groupby(['raceId', 'constructorId'])
    df['teammate_position_diff'] = df['positionOrder'] - same_team_same_race['positionOrder'].transform('mean')
    df['teammate_points_ratio'] = df['points'] / (same_team_same_race['points'].transform('sum') + 0.1)

    # Era-adjusted performance (account for different eras having different competitiveness)
    era_adjustment = df.groupby('year').agg({
        'points': ['mean', 'std'],
        'positionOrder': ['mean', 'std']
    })

    era_adjustment.columns = ['era_avg_points', 'era_std_points',
                              'era_avg_position', 'era_std_position']

    df = df.merge(era_adjustment, left_on='year', right_index=True, how='left')

    # Standardize performance by era (the +0.1 guards against a zero std)
    df['era_adjusted_points'] = (df['points'] - df['era_avg_points']) / (df['era_std_points'] + 0.1)
    df['era_adjusted_position'] = (df['era_avg_position'] - df['positionOrder']) / (df['era_std_position'] + 0.1)

    # Positions gained from grid to finish, relative to the grid slot.
    # A grid value of 0 (or below) cannot serve as a denominator; those rows
    # get NaN instead of a clipped infinity.
    finish_position = pd.to_numeric(df['position'], errors='coerce').fillna(df['positionOrder'])
    valid_grid = df['grid'].where(df['grid'] > 0)
    df['start_performance'] = ((valid_grid - finish_position) / valid_grid).clip(-1, 1)

    # Clutch factor (performance in high-pressure situations)
    # High pressure is defined here as the last quarter of the season's rounds
    df['is_late_season'] = df['round'] >= df.groupby('year')['round'].transform('max') * 0.75
    df['clutch_points'] = df['points'] * df['is_late_season']

    # Calculate driver clutch factor
    clutch_stats = df.groupby('driverId').agg({
        'clutch_points': 'mean',
        'points': 'mean'
    })
    clutch_stats['clutch_factor'] = clutch_stats['clutch_points'] / (clutch_stats['points'] + 0.1)

    df = df.merge(clutch_stats[['clutch_factor']], left_on='driverId', right_index=True, how='left')

    # Head-to-head records among the ten highest-scoring drivers
    h2h_df = _head_to_head_records(df, n_drivers=10, min_shared_races=10)

    # Visualize advanced metrics
    if plot:
        _plot_advanced_metrics(df, clutch_stats, h2h_df)

    return df, h2h_df

# Create advanced metrics; the second return value holds the head-to-head table
df_advanced, h2h_records = create_advanced_metrics(df_strategy)

print("\nAdvanced metrics created:")
advanced_cols = ['era_adjusted_points', 'era_adjusted_position', 'teammate_position_diff',
                 'clutch_factor', 'start_performance']
# Only list the columns that are actually present in the result
present_advanced_cols = [col for col in advanced_cols if col in df_advanced.columns]
print(f"New advanced features: {present_advanced_cols}")

## 7. Feature Store Assembly

In [None]:
class F1FeatureStore:
    """
    Centralized feature store for F1 predictions

    Holds one wide DataFrame (``base_features``) plus ``feature_metadata``,
    a dict that maps each feature group name to the features of that group
    which are present ('features'), their number ('count') and the expected
    ones that are absent ('missing').
    """
    def __init__(self):
        self.base_features = None
        self.track_features = None
        self.weather_features = None
        self.feature_metadata = {}

    @staticmethod
    def _metadata_path(path):
        """Return the metadata file path that belongs to the data file ``path``.

        The extension of ``path`` (whatever it is) is replaced by
        ``_metadata.json``; the result therefore never equals ``path``
        itself, so the metadata can not overwrite the data file.
        """
        import os

        root, _extension = os.path.splitext(os.fspath(path))
        return root + '_metadata.json'

    def build_feature_store(self, df, track_features, weather_features):
        """
        Assemble all features into a unified store

        Args:
            df: Per-result feature rows; must contain circuitId and raceId.
            track_features: Per-circuit features keyed by circuitId.
            weather_features: Per-race features keyed by raceId.

        Returns:
            The assembled DataFrame (also kept as ``self.base_features``).
        """
        # Start with base dataframe
        self.base_features = df.copy()

        # Add track features; columns that also exist in df get a '_track' suffix
        self.base_features = self.base_features.merge(
            track_features, on='circuitId', how='left', suffixes=('', '_track')
        )

        # Add weather features
        self.base_features = self.base_features.merge(
            weather_features, on='raceId', how='left'
        )

        # Store feature metadata
        self._create_feature_metadata()

        return self.base_features

    def _create_feature_metadata(self):
        """
        Create metadata about features for documentation
        """
        feature_groups = {
            'basic': ['grid', 'positionOrder', 'points', 'laps', 'statusId'],
            'driver': ['driver_age', 'driverId', 'constructorId'],
            'track': ['is_street_circuit', 'is_high_speed', 'is_technical',
                     'overtaking_difficulty', 'dnf_rate'],
            'weather': ['rain_probability', 'is_wet_race', 'temperature',
                       'humidity', 'wind_speed'],
            # Momentum features are discovered by name rather than listed
            'momentum': [col for col in self.base_features.columns if 'momentum' in col or 'trend' in col],
            'strategy': ['n_pit_stops', 'avg_pit_time', 'lap_consistency_score'],
            'advanced': ['era_adjusted_points', 'teammate_position_diff',
                        'clutch_factor', 'start_performance']
        }

        present_columns = set(self.base_features.columns)
        for group, features in feature_groups.items():
            available_features = [f for f in features if f in present_columns]
            self.feature_metadata[group] = {
                'features': available_features,
                'count': len(available_features),
                'missing': [f for f in features if f not in present_columns]
            }

    def get_feature_set(self, feature_groups=('basic', 'driver', 'momentum')):
        """
        Get specific feature sets for modeling

        Args:
            feature_groups: Names of the groups to include; unknown names are
                ignored.

        Returns:
            List of feature names without duplicates, in a deterministic
            order (group order, then the order within each group) so that
            model inputs are stable between runs.
        """
        features = []
        for group in feature_groups:
            if group in self.feature_metadata:
                features.extend(self.feature_metadata[group]['features'])

        # dict.fromkeys removes duplicates while keeping first-seen order
        return list(dict.fromkeys(features))

    def get_race_features(self, race_id):
        """
        Get all features for a specific race
        """
        return self.base_features[self.base_features['raceId'] == race_id]

    def get_driver_features(self, driver_id, last_n_races=None):
        """
        Get features for a specific driver

        Args:
            driver_id: driverId to select.
            last_n_races: When given, only the most recent N rows by date are
                returned (0 yields an empty frame); None returns all rows.
        """
        driver_data = self.base_features[self.base_features['driverId'] == driver_id]

        if last_n_races is not None:
            driver_data = driver_data.sort_values('date').tail(last_n_races)

        return driver_data

    def save_feature_store(self, path='f1_feature_store.parquet'):
        """
        Save feature store to disk

        Writes the features as Parquet to ``path`` and the metadata as JSON
        next to it (same name, extension replaced by ``_metadata.json``).

        Raises:
            ValueError: If the store has not been built or loaded yet.
        """
        if self.base_features is None:
            raise ValueError("Feature store is empty; call build_feature_store() or load_feature_store() first")

        self.base_features.to_parquet(path, index=False)

        # Save metadata
        with open(self._metadata_path(path), 'w', encoding='utf-8') as f:
            json.dump(self.feature_metadata, f, indent=2)

        print(f"Feature store saved to {path}")

    def load_feature_store(self, path='f1_feature_store.parquet'):
        """
        Load feature store from disk

        Reads the Parquet file at ``path`` and the JSON metadata file stored
        next to it by ``save_feature_store``.
        """
        self.base_features = pd.read_parquet(path)

        # Load metadata
        with open(self._metadata_path(path), 'r', encoding='utf-8') as f:
            self.feature_metadata = json.load(f)

        print(f"Feature store loaded from {path}")

# Create and populate feature store
feature_store = F1FeatureStore()
all_features = feature_store.build_feature_store(df_advanced, track_features, weather_features)

# Headline numbers of the assembled store
first_date = all_features['date'].min()
last_date = all_features['date'].max()
summary_lines = [
    "\nFeature Store Summary:",
    "=" * 50,
    f"Total records: {len(all_features):,}",
    f"Total features: {len(all_features.columns)}",
    f"Date range: {first_date} to {last_date}",
    "\nFeature groups:",
]
print("\n".join(summary_lines))
for group, metadata in feature_store.feature_metadata.items():
    print(f"  {group}: {metadata['count']} features")

# Persist the features (and their metadata file) to disk
feature_store.save_feature_store('f1_feature_store.parquet')

## 8. Feature Quality Analysis

In [None]:
def analyze_feature_quality(feature_store, plot=True):
    """
    Analyze feature quality and usefulness

    Args:
        feature_store: Object exposing ``base_features`` (DataFrame) and
            ``feature_metadata`` (dict of group -> {'count': ...}).
        plot: When True, draw the 2x2 quality overview figure.

    Returns:
        DataFrame with one row per numeric, non-id feature and the columns
        feature, missing_pct, unique_values, std_dev, skewness, kurtosis
        and, when the target column positionOrder exists,
        correlation_with_position and abs_correlation. Empty when the store
        has no such features.
    """
    df = feature_store.base_features
    target = 'positionOrder'
    id_columns = {'raceId', 'driverId', 'constructorId', 'circuitId'}
    has_target = target in df.columns

    # Get numeric features only
    numeric_features = df.select_dtypes(include=[np.number]).columns

    # Calculate feature statistics
    feature_stats = []

    for feature in numeric_features:
        if feature in id_columns:  # Skip IDs
            continue

        column = df[feature]
        summary = {
            'feature': feature,
            'missing_pct': column.isna().mean() * 100,
            'unique_values': column.nunique(),
            'std_dev': column.std(),
            'skewness': column.skew(),
            'kurtosis': column.kurtosis()
        }

        # Correlation with target (position)
        if has_target:
            summary['correlation_with_position'] = column.corr(df[target])

        feature_stats.append(summary)

    feature_quality_df = pd.DataFrame(feature_stats)

    # Identify potential issues
    print("\nFeature Quality Analysis:")
    print("=" * 50)

    if feature_quality_df.empty:
        # Nothing to rank or plot; the column lookups below would fail
        print("\nNo numeric features to analyze")
        return feature_quality_df

    # High missing data
    high_missing = feature_quality_df[feature_quality_df['missing_pct'] > 20]
    if not high_missing.empty:
        print("\nFeatures with >20% missing data:")
        print(high_missing[['feature', 'missing_pct']].round(1))

    # Low variance features
    low_variance = feature_quality_df[feature_quality_df['std_dev'] < 0.01]
    if not low_variance.empty:
        print("\nLow variance features (might not be useful):")
        print(low_variance[['feature', 'std_dev']])

    # Highly correlated with target. Ranked by absolute value so strong
    # negative correlations are not missed; the target itself (trivially
    # 1.0) is left out of the ranking but stays in the returned table.
    ranking_candidates = None
    if has_target:
        feature_quality_df['abs_correlation'] = feature_quality_df['correlation_with_position'].abs()
        ranking_candidates = feature_quality_df[feature_quality_df['feature'] != target]
        high_corr = ranking_candidates.nlargest(10, 'abs_correlation')
        print("\nTop 10 features correlated with position:")
        print(high_corr[['feature', 'correlation_with_position']].round(3))

    if not plot:
        return feature_quality_df

    # Visualize feature distributions
    fig, axes = plt.subplots(2, 2, figsize=(14, 10))

    # Missing data
    missing_data = feature_quality_df.nlargest(15, 'missing_pct')
    axes[0, 0].barh(missing_data['feature'], missing_data['missing_pct'])
    axes[0, 0].set_xlabel('Missing %')
    axes[0, 0].set_title('Features with Most Missing Data')
    axes[0, 0].grid(True, alpha=0.3)

    # Feature importance proxy (absolute correlation)
    if ranking_candidates is not None:
        top_corr = ranking_candidates.nlargest(15, 'abs_correlation')
        axes[0, 1].barh(top_corr['feature'], top_corr['abs_correlation'])
        axes[0, 1].set_xlabel('Absolute Correlation with Position')
        axes[0, 1].set_title('Most Predictive Features')
        axes[0, 1].grid(True, alpha=0.3)

    # Skewness distribution
    axes[1, 0].hist(feature_quality_df['skewness'].dropna(), bins=30, edgecolor='black')
    axes[1, 0].set_xlabel('Skewness')
    axes[1, 0].set_ylabel('Number of Features')
    axes[1, 0].set_title('Feature Skewness Distribution')
    axes[1, 0].axvline(x=0, color='red', linestyle='--', label='No skew')
    axes[1, 0].legend()
    axes[1, 0].grid(True, alpha=0.3)

    # Feature count by group
    group_counts = pd.Series({group: metadata['count']
                             for group, metadata in feature_store.feature_metadata.items()})
    axes[1, 1].pie(group_counts.values, labels=group_counts.index, autopct='%1.1f%%')
    axes[1, 1].set_title('Feature Distribution by Group')

    plt.tight_layout()
    plt.show()

    return feature_quality_df

# Analyze feature quality (prints a report and draws the overview figure)
feature_quality = analyze_feature_quality(feature_store)

# Get recommended feature set: every group except the pit stop 'strategy' group
recommended_groups = ['basic', 'driver', 'track', 'weather', 'momentum', 'advanced']
recommended_features = feature_store.get_feature_set(recommended_groups)
print(f"\nRecommended feature set: {len(recommended_features)} features")

## Summary

The F1 Feature Store provides:

1. **Track Characteristics**: Circuit type, overtaking difficulty, historical performance
2. **Weather Features**: Simulated weather conditions affecting race outcomes
3. **Momentum Indicators**: Recent form, consistency, championship pressure
4. **Strategy Patterns**: Pit stop timing, tire strategies, team preferences
5. **Advanced Metrics**: Era-adjusted performance, clutch factor, head-to-head records

### Key Insights:
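- Weather is expected to significantly impact race strategies and outcomes; note that the weather features in this notebook are simulated, so this effect still has to be validated against real weather data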
- Momentum features capture driver form better than static averages
- Track characteristics strongly influence overtaking opportunities
- Team strategy patterns are predictable and vary by constructor

The feature store is saved as a Parquet file for efficient storage and quick loading in production models.