# Improved F1 Random Forest and Gradient Boosting Models

This notebook improves upon the original Random Forest and Gradient Boosting models with:
- Top 5 predictions alongside Top 3 and Top 10
- Race name validation in outputs
- Comprehensive overfitting prevention
- Temporal validation splits
- Better feature engineering

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, TimeSeriesSplit
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.metrics import confusion_matrix, roc_auc_score, roc_curve
# imblearn is optional: if it is not installed, SMOTE is disabled via SMOTE_AVAILABLE
try:
    from imblearn.over_sampling import SMOTE
    SMOTE_AVAILABLE = True
except ImportError:
    print("Warning: imblearn not available. SMOTE functionality will be disabled.")
    SMOTE_AVAILABLE = False
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

# Silence pandas' chained-assignment (SettingWithCopy) warning for this session.
pd.options.mode.chained_assignment = None

# Directory with the F1 CSV files; file names are appended directly, so the
# trailing slash is required.
data_path = '/app/data/f1db/'


def _read_table(file_name):
    """Read one CSV file located under ``data_path`` into a DataFrame."""
    return pd.read_csv(data_path + file_name)


print("Loading F1 data...")
drivers = _read_table('drivers.csv')
qualifying = _read_table('qualifying.csv')
races = _read_table('races.csv')
lap_times = _read_table('lap_times.csv')
pit_stop_times = _read_table('pit_stops.csv')
driver_standings = _read_table('driver_standings.csv')
results = _read_table('results.csv')
status = _read_table('status.csv')
constructor_results = _read_table('constructor_results.csv')
constructor_standings = _read_table('constructor_standings.csv')

# 'date' is still the raw CSV column here, so min/max compare the raw values.
first_race_date = races['date'].min()
last_race_date = races['date'].max()

print("Data loaded successfully!")
print(f"Total races: {len(races)}")
print(f"Date range: {first_race_date} to {last_race_date}")

## Data Preparation with Temporal Features

In [None]:
# Status labels (lower-cased) that are treated as a completed race.
finished_labels = ['finished', '+1 lap', '+2 laps', '+3 laps', '+4 laps', '+5 laps']
is_finished_status = status['status'].str.lower().isin(finished_labels)
finished_status_ids = status.loc[is_finished_status, 'statusId'].unique()
finished_results = results[results['statusId'].isin(finished_status_ids)]

# Attach year, date, race name and round to every result row.
race_columns = ['raceId', 'year', 'date', 'name', 'round']
results = pd.merge(results, races[race_columns], on='raceId', how='left')

# Parse the date columns so they can be sorted and compared as timestamps.
results['date'] = pd.to_datetime(results['date'])
races['date'] = pd.to_datetime(races['date'])

# Chronological order keeps later temporal processing consistent.
results = results.sort_values('date')

# Binary targets: finished within the first 10 / 5 / 3 positions.
target_cutoffs = (10, 5, 3)
for cutoff in target_cutoffs:
    results[f'top_{cutoff}'] = results['positionOrder'] <= cutoff

print("\nTarget distribution:")
for cutoff in target_cutoffs:
    hits = results[f'top_{cutoff}']
    print(f"Top {cutoff} finishes: {hits.sum()} ({hits.mean()*100:.1f}%)")

## Advanced Feature Engineering with Rolling Windows

In [None]:
def _prior_rolling_mean(values, window):
    """Mean of up to ``window`` previous entries, excluding the current one."""
    # Cast first so boolean input does not turn into an object-dtype series
    # once shift() introduces a leading NaN.
    return values.astype(float).shift(1).rolling(window=window, min_periods=1).mean()


def create_rolling_features(df, window_sizes=(3, 5, 10), finished_ids=None):
    """Create rolling "recent form" features for every driver.

    For each window size ``w`` three columns are added, each computed per
    driver over the ``w`` races *before* the current row (the current row is
    excluded via ``shift(1)`` so the target never leaks into its own features):

    * ``avg_position_last_{w}`` - mean ``positionOrder``
    * ``avg_points_last_{w}``   - mean ``points``
    * ``dnf_rate_last_{w}``     - share of races whose status is not "finished"

    A boolean ``dnf`` column is added as well.

    Args:
        df: Frame with ``driverId``, ``date``, ``positionOrder``, ``points``
            and ``statusId`` columns.
        window_sizes: Iterable of window lengths (in races).
        finished_ids: Status ids that count as a finish. When ``None`` the
            notebook-level ``finished_status_ids`` is used.

    Returns:
        A new frame sorted by ``driverId`` and ``date`` with the extra columns;
        the frame passed in is left untouched.
    """
    if finished_ids is None:
        finished_ids = finished_status_ids

    # sort_values returns a new frame, so the assignments below do not touch
    # the caller's data.
    df = df.sort_values(['driverId', 'date'])

    # Does not depend on the window size, so compute it once.
    df['dnf'] = ~df['statusId'].isin(finished_ids)

    feature_sources = (
        ('avg_position', 'positionOrder'),
        ('avg_points', 'points'),
        ('dnf_rate', 'dnf'),
    )
    for window in window_sizes:
        for prefix, source_col in feature_sources:
            df[f'{prefix}_last_{window}'] = df.groupby('driverId')[source_col].transform(
                lambda values, w=window: _prior_rolling_mean(values, w)
            )

    return df

# Apply rolling features. create_rolling_features returns the rows ordered by
# driverId and date, which the season-to-date computation below relies on.
results = create_rolling_features(results)

# Season-to-date average finishing position per driver. shift(1) drops the
# current race from its own average, so the feature only uses races that were
# already finished (no target leakage); a driver's first race of a season is NaN.
seasonal_performance = (
    results.groupby(['driverId', 'year'])['positionOrder']
    .transform(lambda positions: positions.shift(1).expanding().mean())
)
results['avg_position_per_season'] = seasonal_performance

# Calculate qualifying features
def process_qualifying_times(qual_str):
    """Convert a qualifying time string of the form ``"M:SS.sss"`` to seconds.

    Args:
        qual_str: Raw cell value from the qualifying table.

    Returns:
        The time in seconds as a float, or ``np.nan`` when the value is
        missing (NaN/None, ``'\\N'`` or an empty string), is not a string, or
        does not consist of exactly two ``:``-separated numeric parts.
    """
    if pd.isna(qual_str) or qual_str == '\\N' or qual_str == '':
        return np.nan
    try:
        # Unpacking raises ValueError unless there are exactly two parts.
        minutes, seconds = qual_str.split(':')
        return int(minutes) * 60 + float(seconds)
    except (AttributeError, TypeError, ValueError):
        # AttributeError/TypeError: non-string input; ValueError: bad format.
        return np.nan

# Convert the three qualifying session columns from time strings to seconds.
quali_sessions = ['q1', 'q2', 'q3']
for session in quali_sessions:
    qualifying[session] = qualifying[session].map(process_qualifying_times)

# Fastest time across the sessions (row-wise minimum, NaN entries are skipped).
qualifying['best_quali_time'] = qualifying[quali_sessions].min(axis=1)

# Attach qualifying position and best time to each result row. 'position'
# already exists in results, so the qualifying one arrives as 'position_quali'.
quali_columns = ['raceId', 'driverId', 'position', 'best_quali_time']
results = pd.merge(
    results,
    qualifying[quali_columns],
    how='left',
    on=['raceId', 'driverId'],
    suffixes=('', '_quali'),
)

# Constructor features.
# The average is computed per race from the constructor's *earlier* races only
# (shift(1) + expanding mean). A single all-time mean would include the race
# being predicted and every later race, leaking future information into the
# training features.
constructor_history = constructor_results[['raceId', 'constructorId', 'points']].merge(
    races[['raceId', 'date']], on='raceId', how='left'
)
constructor_history['date'] = pd.to_datetime(constructor_history['date'])
constructor_history = constructor_history.sort_values(['constructorId', 'date'])
constructor_history['avg_constructor_points'] = (
    constructor_history.groupby('constructorId')['points']
    .transform(lambda pts: pts.shift(1).expanding().mean())
)

# One row per (race, constructor); duplicates are dropped defensively so the
# merge below cannot multiply result rows.
constructor_avg_points = constructor_history[
    ['raceId', 'constructorId', 'avg_constructor_points']
].drop_duplicates(subset=['raceId', 'constructorId'])
results = results.merge(constructor_avg_points, on=['raceId', 'constructorId'], how='left')

# Track-specific features.
# For every (driver, race name) pair, use only the driver's *previous* visits:
# shift(1) removes the current race from its own statistics. Aggregating over
# all rows would put the finishing position being predicted into the feature.
results = results.sort_values(['driverId', 'date'])
track_history = results.groupby(['driverId', 'name'])['positionOrder']
avg_position_at_track = track_history.transform(
    lambda positions: positions.shift(1).expanding().mean()
)
std_position_at_track = track_history.transform(
    lambda positions: positions.shift(1).expanding().std()
)

# Same three columns the previous merge-based version added.
results['track_name'] = results['name']
results['avg_position_at_track'] = avg_position_at_track
results['std_position_at_track'] = std_position_at_track

# Per-row view of the track statistics, kept under the original name.
track_performance = results[
    ['raceId', 'driverId', 'track_name', 'avg_position_at_track', 'std_position_at_track']
]

print("Feature engineering completed!")
print(f"Total features created: {len([col for col in results.columns if col not in ['raceId', 'driverId', 'date']])}")

## Temporal Train-Test Split for Overfitting Prevention

In [None]:
# Restrict modelling to seasons from 2018 onwards.
recent_results = results.loc[results['year'] >= 2018].copy()

# Boundaries of the chronological split (both inclusive upper bounds).
train_end_date = pd.Timestamp('2022-12-31')
val_end_date = pd.Timestamp('2023-12-31')

# Row masks: train up to train_end_date, validation up to val_end_date,
# test is everything after that.
race_dates = recent_results['date']
in_train = race_dates <= train_end_date
in_val = (race_dates > train_end_date) & (race_dates <= val_end_date)
in_test = race_dates > val_end_date

train_data = recent_results[in_train]
val_data = recent_results[in_val]
test_data = recent_results[in_test]

print("\nTemporal split:")
for split_label, split_frame in (('Train', train_data), ('Val', val_data), ('Test', test_data)):
    split_dates = split_frame['date']
    print(f"{split_label}: {split_dates.min()} to {split_dates.max()} ({len(split_frame)} samples)")

# Candidate model inputs.
candidate_features = [
    'grid', 'position_quali', 'avg_constructor_points',
    'avg_position_last_3', 'avg_position_last_5', 'avg_position_last_10',
    'avg_points_last_3', 'avg_points_last_5', 'avg_points_last_10',
    'dnf_rate_last_3', 'dnf_rate_last_5', 'dnf_rate_last_10',
    'avg_position_at_track', 'std_position_at_track',
]

# Keep only the candidates that actually exist as columns.
available_columns = recent_results.columns
feature_cols = [name for name in candidate_features if name in available_columns]

# Prepare features and targets
def prepare_data(df, feature_cols, fill_values=None):
    """Build the feature matrix and the three target vectors for one split.

    Args:
        df: Split to prepare; must contain ``feature_cols`` plus the
            ``top_10``, ``top_5`` and ``top_3`` columns.
        feature_cols: Names of the feature columns.
        fill_values: Per-column values used to replace NaN (anything
            ``DataFrame.fillna`` accepts). When ``None`` the column means of
            ``df`` itself are used.

    Returns:
        Tuple ``(X, y_top_10, y_top_5, y_top_3)``.
    """
    features = df[feature_cols]
    if fill_values is None:
        fill_values = features.mean()
    X = features.fillna(fill_values)
    return X, df['top_10'], df['top_5'], df['top_3']


# Impute every split with the *training* means. Using each split's own means
# would feed validation/test statistics into their features and would differ
# from prediction time, where only training statistics are available.
train_feature_means = train_data[feature_cols].mean()

X_train, y_train_top_10, y_train_top_5, y_train_top_3 = prepare_data(
    train_data, feature_cols, train_feature_means
)
X_val, y_val_top_10, y_val_top_5, y_val_top_3 = prepare_data(
    val_data, feature_cols, train_feature_means
)
X_test, y_test_top_10, y_test_top_5, y_test_top_3 = prepare_data(
    test_data, feature_cols, train_feature_means
)

print(f"\nFeatures used: {feature_cols}")
print(f"Feature dimensions: {X_train.shape[1]}")

## Model Training with Regularization

In [None]:
# Define regularized models to prevent overfitting
rf_params = {
    'n_estimators': 100,
    'max_depth': 8,  # Shallow trees
    'min_samples_split': 50,  # Require many samples to split
    'min_samples_leaf': 20,   # Require many samples in leaves
    'max_features': 'sqrt',   # Use subset of features
    'random_state': 42,
    'n_jobs': -1
}

gb_params = {
    'n_estimators': 100,
    'learning_rate': 0.05,    # Small learning rate
    'max_depth': 4,           # Very shallow trees
    'min_samples_split': 50,
    'min_samples_leaf': 20,
    'subsample': 0.8,         # Use subset of data
    'random_state': 42
}

# Unfitted reference instances; the loop below builds its own estimators.
rf_model = RandomForestClassifier(**rf_params)
gb_model = GradientBoostingClassifier(**gb_params)

# Train one soft-voting ensemble per target
print("Training models...")

models = {}
for target_name, y_train, y_val in [
    ('top_10', y_train_top_10, y_val_top_10),
    ('top_5', y_train_top_5, y_val_top_5),
    ('top_3', y_train_top_3, y_val_top_3)
]:
    print(f"\nTraining {target_name} models...")

    # VotingClassifier.fit fits clones of the estimators it is given, so the
    # base models are passed unfitted; fitting them beforehand would train
    # every model twice without changing the ensemble.
    ensemble = VotingClassifier(
        estimators=[
            ('rf', RandomForestClassifier(**rf_params)),
            ('gb', GradientBoostingClassifier(**gb_params)),
        ],
        voting='soft'
    )
    ensemble.fit(X_train, y_train)

    # Evaluate on validation set
    val_pred = ensemble.predict(X_val)
    val_acc = accuracy_score(y_val, val_pred)
    val_prec = precision_score(y_val, val_pred)
    val_rec = recall_score(y_val, val_pred)
    val_f1 = f1_score(y_val, val_pred)

    print(f"{target_name} Validation Metrics:")
    print(f"  Accuracy: {val_acc:.3f}")
    print(f"  Precision: {val_prec:.3f}")
    print(f"  Recall: {val_rec:.3f}")
    print(f"  F1: {val_f1:.3f}")

    models[target_name] = ensemble

    # Check for overfitting: compare accuracy on the data the model was fitted
    # on with accuracy on the held-out validation split.
    train_pred = ensemble.predict(X_train)
    train_acc = accuracy_score(y_train, train_pred)
    print(f"  Train Accuracy: {train_acc:.3f}")
    if val_acc > 0:
        print(f"  Overfitting ratio: {train_acc/val_acc:.3f}")
    else:
        # The ratio is undefined when nothing in validation was classified correctly.
        print("  Overfitting ratio: n/a (validation accuracy is 0)")

## Overfitting Tests and Validation

In [None]:
def overfitting_analysis(model, X_train, y_train, X_val, y_val, model_name):
    """Compare a fitted classifier's behaviour on train and validation data.

    Draws the ROC curves of both splits next to the validation confusion
    matrix, prints accuracy and AUC for both splits together with their gaps,
    and prints a verdict derived from the accuracy gap (above 0.1:
    significant, above 0.05: moderate, otherwise good generalization).

    Returns:
        Tuple ``(train_acc, val_acc, train_auc, val_auc)``.
    """
    # Hard labels plus the probability in column 1 of predict_proba.
    labels_train = model.predict(X_train)
    labels_val = model.predict(X_val)
    scores_train = model.predict_proba(X_train)[:, 1]
    scores_val = model.predict_proba(X_val)[:, 1]

    train_acc = accuracy_score(y_train, labels_train)
    val_acc = accuracy_score(y_val, labels_val)
    train_auc = roc_auc_score(y_train, scores_train)
    val_auc = roc_auc_score(y_val, scores_val)

    fig, (roc_ax, cm_ax) = plt.subplots(1, 2, figsize=(12, 4))

    # Left panel: ROC curve per split plus the chance diagonal.
    roc_inputs = (
        ('Train', y_train, scores_train, train_auc),
        ('Val', y_val, scores_val, val_auc),
    )
    for split_label, y_true, scores, auc_value in roc_inputs:
        fpr, tpr, _ = roc_curve(y_true, scores)
        roc_ax.plot(fpr, tpr, label=f'{split_label} (AUC={auc_value:.3f})')
    roc_ax.plot([0, 1], [0, 1], 'k--')
    roc_ax.set_xlabel('False Positive Rate')
    roc_ax.set_ylabel('True Positive Rate')
    roc_ax.set_title(f'{model_name} ROC Curves')
    roc_ax.legend()

    # Right panel: confusion matrix on the validation split.
    val_confusion = confusion_matrix(y_val, labels_val)
    sns.heatmap(val_confusion, annot=True, fmt='d', cmap='Blues', ax=cm_ax)
    cm_ax.set_title(f'{model_name} Validation Confusion Matrix')
    cm_ax.set_xlabel('Predicted')
    cm_ax.set_ylabel('Actual')

    plt.tight_layout()
    plt.show()

    accuracy_gap = train_acc - val_acc
    auc_gap = train_auc - val_auc

    print(f"\n{model_name} Overfitting Analysis:")
    print(f"Train Accuracy: {train_acc:.3f}")
    print(f"Val Accuracy: {val_acc:.3f}")
    print(f"Accuracy Gap: {accuracy_gap:.3f}")
    print(f"Train AUC: {train_auc:.3f}")
    print(f"Val AUC: {val_auc:.3f}")
    print(f"AUC Gap: {auc_gap:.3f}")

    # Verdict based on how much accuracy drops from train to validation.
    if accuracy_gap > 0.1:
        verdict = "⚠️ WARNING: Significant overfitting detected!"
    elif accuracy_gap > 0.05:
        verdict = "⚠️ WARNING: Moderate overfitting detected!"
    else:
        verdict = "✅ Model shows good generalization"
    print(verdict)

    return train_acc, val_acc, train_auc, val_auc

# Analyze overfitting for each model.
# Targets are listed explicitly instead of being looked up through locals().
target_splits = {
    'top_10': (y_train_top_10, y_val_top_10),
    'top_5': (y_train_top_5, y_val_top_5),
    'top_3': (y_train_top_3, y_val_top_3),
}

overfitting_results = {}
for target_name, (y_train, y_val) in target_splits.items():
    # Must not be named `results`: that name holds the race results DataFrame
    # that later cells still read.
    analysis_metrics = overfitting_analysis(
        models[target_name], X_train, y_train, X_val, y_val,
        f'{target_name.upper()} Model'
    )
    overfitting_results[target_name] = analysis_metrics

## Predictions for Next Race with Race Name

In [None]:
# Get the next race information.
# "Next" means the earliest race dated after the most recent race that has
# results. (Comparing against the maximum date of the races table itself can
# never match anything.) If no later race exists, fall back to the
# latest-dated race in the table.
latest_race_date = recent_results['date'].max()
races_by_date = races.sort_values('date')
upcoming_races = races_by_date[races_by_date['date'] > latest_race_date]
next_race = upcoming_races.iloc[0] if len(upcoming_races) > 0 else races_by_date.iloc[-1]
next_race_name = next_race['name']
next_race_year = next_race['year']
next_race_round = next_race['round']

print(f"\n{'='*60}")
print(f"PREDICTIONS FOR NEXT RACE: {next_race_name} {next_race_year}")
print(f"Round {next_race_round} - Scheduled for {next_race['date']}")
print(f"{'='*60}\n")

# Get current drivers dynamically from recent results.
# recent_results (seasons from 2018 on) is used rather than `results`, so this
# cell does not depend on what `results` is bound to after earlier cells.
recent_year = recent_results['year'].max()
recent_races = recent_results[recent_results['year'] >= recent_year - 1]  # Last 2 years

# Active drivers: at least 5 race entries in that period
races_per_driver = recent_races.groupby('driverId').size()
active_driver_ids = races_per_driver[races_per_driver >= 5].index

# Get driver information
driver_columns = ['driverId', 'surname', 'forename']
driver_info = drivers[drivers['driverId'].isin(active_driver_ids)][driver_columns]

# If we don't have enough active drivers, get the most recent participants
if len(driver_info) < 10:
    print(f"Found only {len(driver_info)} active drivers, expanding search...")
    # The 200 most recent result rows, then the 20 drivers appearing most often
    latest_races = recent_results.nlargest(200, 'date')
    active_driver_ids = latest_races['driverId'].value_counts().head(20).index
    driver_info = drivers[drivers['driverId'].isin(active_driver_ids)][driver_columns]

print(f"Found {len(driver_info)} active drivers for predictions")

# Prepare features for current drivers.
# Each driver is represented by the feature values of their most recent row in
# recent_results. NOTE(review): this means race-specific inputs such as grid
# and qualifying position come from the driver's last race, not the upcoming one.
latest_rows = (
    recent_results[recent_results['driverId'].isin(driver_info['driverId'])]
    .groupby('driverId', sort=False)
    .tail(1)
)

# Inner merge keeps driver_info's order and drops drivers without any rows.
latest_rows = driver_info[['driverId', 'surname']].merge(
    latest_rows[['driverId'] + feature_cols], on='driverId', how='inner'
)

if latest_rows.empty:
    predictions_df = pd.DataFrame()
else:
    # One numeric matrix for all drivers; missing values are filled with the
    # training means.
    latest_features = (
        latest_rows[feature_cols].astype(float).fillna(X_train[feature_cols].mean())
    )

    predictions_df = pd.DataFrame({'Driver': latest_rows['surname'].to_numpy()})
    for target_name, label in [('top_10', 'Top_10'), ('top_5', 'Top_5'), ('top_3', 'Top_3')]:
        model = models[target_name]
        # One batched call per model instead of two calls per driver.
        predictions_df[label] = model.predict(latest_features)
        predictions_df[f'{label}_Prob'] = model.predict_proba(latest_features)[:, 1]

# Row-wise records, kept under the original name.
predictions_data = predictions_df.to_dict('records')

if len(predictions_df) == 0:
    print("No predictions available - insufficient driver data")
else:
    # Most likely podium finishers first.
    predictions_df = predictions_df.sort_values('Top_3_Prob', ascending=False)

    # (section heading, prediction column) for each of the three targets.
    report_sections = [
        ("\n🏎️ TOP 10 PREDICTIONS:", 'Top_10'),
        ("\n🏎️ TOP 5 PREDICTIONS:", 'Top_5'),
        ("\n🏎️ TOP 3 (PODIUM) PREDICTIONS:", 'Top_3'),
    ]
    for heading, flag_col in report_sections:
        prob_col = f'{flag_col}_Prob'
        print(heading)
        print("-" * 40)
        for _, row in predictions_df.iterrows():
            # bool() turns the stored prediction into a plain truth value.
            emoji = "✅" if bool(row[flag_col]) else "❌"
            print(f"{emoji} {row['Driver']:15} - {row[flag_col]} (Confidence: {row[prob_col]:.1%})")

    # Number of drivers predicted positive for each target.
    print(f"\n📊 SUMMARY FOR {next_race_name}:")
    for cutoff, flag_col in ((10, 'Top_10'), (5, 'Top_5'), (3, 'Top_3')):
        print(f"Drivers predicted to finish Top {cutoff}: {predictions_df[flag_col].sum()}")

## Feature Importance Analysis

In [None]:
# Extract feature importance from ensemble models
def get_ensemble_importance(ensemble_model, feature_names):
    """Average the feature importances of an ensemble's fitted estimators.

    Args:
        ensemble_model: Fitted voting ensemble whose ``estimators_`` attribute
            is a sequence of fitted estimators, each exposing
            ``feature_importances_``.
        feature_names: Feature names in the column order used for fitting.

    Returns:
        DataFrame with ``Feature`` and ``Importance`` columns, sorted by
        importance in descending order.
    """
    # estimators_ holds the fitted estimators themselves, not (name, estimator)
    # tuples, so they must not be indexed with [1]: on a tree ensemble that
    # selects a single sub-tree instead of the model.
    per_model_importance = [
        estimator.feature_importances_ for estimator in ensemble_model.estimators_
    ]
    avg_importance = np.mean(per_model_importance, axis=0)

    importance_df = pd.DataFrame({
        'Feature': feature_names,
        'Importance': avg_importance
    }).sort_values('Importance', ascending=False)

    return importance_df

# One horizontal bar chart per target showing its ten strongest features.
importance_targets = ['top_10', 'top_5', 'top_3']
fig, axes = plt.subplots(1, 3, figsize=(15, 5))

for target_name, ax in zip(importance_targets, axes):
    ranked = get_ensemble_importance(models[target_name], feature_cols)
    leading = ranked.head(10)

    ax.barh(leading['Feature'], leading['Importance'])
    ax.set_xlabel('Importance')
    ax.set_title(f'{target_name.upper()} Feature Importance')
    # Largest bar at the top.
    ax.invert_yaxis()

plt.tight_layout()
plt.show()

# Text summary: the five strongest features per target.
print("\nTop 5 Most Important Features by Model:")
for target_name in importance_targets:
    strongest = get_ensemble_importance(models[target_name], feature_cols).head(5)
    print(f"\n{target_name.upper()}:")
    for feature, importance in zip(strongest['Feature'], strongest['Importance']):
        print(f"  {feature}: {importance:.3f}")

## Model Performance Summary

In [None]:
# Test set evaluation
print("\n" + "="*60)
print("FINAL MODEL EVALUATION ON TEST SET")
print("="*60)

# Targets are listed explicitly instead of being looked up through locals().
test_targets = {
    'top_10': y_test_top_10,
    'top_5': y_test_top_5,
    'top_3': y_test_top_3,
}

test_results = {}
if len(X_test) == 0:
    # The test split is everything after the validation window; it is empty
    # when the data does not extend past it, and predict() would fail.
    print("\nNo test samples available after the validation window - skipping evaluation.")
else:
    for target_name, y_test in test_targets.items():
        model = models[target_name]

        # Make predictions
        y_pred = model.predict(X_test)
        y_proba = model.predict_proba(X_test)[:, 1]

        # Calculate metrics (cast to plain floats so they serialize cleanly)
        acc = float(accuracy_score(y_test, y_pred))
        prec = float(precision_score(y_test, y_pred))
        rec = float(recall_score(y_test, y_pred))
        f1 = float(f1_score(y_test, y_pred))
        if y_test.nunique() > 1:
            auc = float(roc_auc_score(y_test, y_proba))
        else:
            # ROC AUC is undefined when only one class is present.
            auc = float('nan')

        print(f"\n{target_name.upper()} Model Performance:")
        print(f"  Accuracy:  {acc:.3f}")
        print(f"  Precision: {prec:.3f}")
        print(f"  Recall:    {rec:.3f}")
        print(f"  F1 Score:  {f1:.3f}")
        print(f"  AUC-ROC:   {auc:.3f}")

        # Check if performance is realistic
        if acc > 0.95:
            print("  ⚠️ WARNING: Accuracy may be too high - check for overfitting!")
        elif acc > 0.85:
            print("  ⚠️ Note: High accuracy - monitor for overfitting")
        else:
            print("  ✅ Accuracy appears realistic")

        test_results[target_name] = {
            'accuracy': acc,
            'precision': prec,
            'recall': rec,
            'f1': f1,
            'auc': auc
        }

# Create summary DataFrame (one row per target, one column per metric)
summary_df = pd.DataFrame(test_results).T
print("\n" + "="*60)
print("SUMMARY TABLE")
print("="*60)
print(summary_df.round(3))

## Save Models for Production Use

In [None]:
import joblib
import json
import os

# Output directory for the serialized models and their metadata.
model_dir = '/app/notebooks/advanced/models'
os.makedirs(model_dir, exist_ok=True)

# Persist one ensemble per target.
for target_name, model in models.items():
    model_path = f'{model_dir}/f1_{target_name}_model.pkl'
    joblib.dump(model, model_path)
    print(f"Saved {target_name} model to {model_path}")


def _date_range(split_frame):
    """Return the (first, last) race dates of a split as ISO strings."""
    split_dates = split_frame['date']
    return (split_dates.min().isoformat(), split_dates.max().isoformat())


# Metadata describing the features, splits, metrics and hyperparameters.
model_info = {
    'feature_cols': feature_cols,
    'train_date_range': _date_range(train_data),
    'val_date_range': _date_range(val_data),
    'test_date_range': _date_range(test_data),
    'performance_metrics': test_results,
    'next_race': next_race_name,
    'model_params': {
        'rf': rf_params,
        'gb': gb_params,
    },
}

info_path = f'{model_dir}/model_info.json'
with open(info_path, 'w') as f:
    json.dump(model_info, f, indent=2)

print(f"\nModel information saved to {model_dir}/model_info.json")