# F1 Telemetry Analysis - Complete Workflow

This notebook demonstrates the complete workflow for F1 telemetry analysis:
1. Data ingestion with FastF1
2. Distributed processing with PySpark
3. Data quality validation
4. ML-powered predictive analytics
5. Saving the best model and the results summary

## Setup

In [None]:
import sys
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split

# Make the project's packages under ../src importable before the local imports
sys.path.append('../src')

from ingestion.telemetry_loader import TelemetryLoader
from processing.spark_processor import SparkProcessor
from quality.telemetry_validator import TelemetryQualityValidator
from ml.lap_time_predictor import LapTimeFeatureEngineer, LapTimePredictorModel

# Apply seaborn's 'darkgrid' style to the figures drawn in this notebook
sns.set_style('darkgrid')
# IPython magic: render matplotlib figures inline in the notebook output
%matplotlib inline

## 1. Data Ingestion

Load F1 race data using the FastF1 library.

In [None]:
# Create the telemetry loader, pointing it at the local cache directory
loader = TelemetryLoader(cache_dir='../data/cache')

# Fetch the 2024 Bahrain GP race session
print("Loading 2024 Bahrain GP...")
session = loader.load_session(2024, 'Bahrain', 'R')

# Pull the per-lap table out of the session and report its size
laps_df = loader.extract_lap_data(session)
lap_total = len(laps_df)
driver_total = laps_df['Driver'].nunique()
print(f"\nLoaded {lap_total} laps from {driver_total} drivers")

# Preview the first rows of the lap table
laps_df.head()

In [None]:
# Quick EDA: lap-time histogram next to the lap count per tire compound
fig, axes = plt.subplots(1, 2, figsize=(12, 6))
ax_times, ax_tires = axes

# Left panel - distribution of lap times (missing values dropped)
lap_times = laps_df['LapTime'].dropna()
ax_times.hist(lap_times, bins=50, edgecolor='black')
ax_times.set_xlabel('Lap Time (seconds)')
ax_times.set_ylabel('Frequency')
ax_times.set_title('Lap Time Distribution')

# Right panel - how many laps were recorded on each compound
tire_counts = laps_df['Compound'].value_counts()
ax_tires.bar(tire_counts.index, tire_counts.values)
ax_tires.set_xlabel('Tire Compound')
ax_tires.set_ylabel('Number of Laps')
ax_tires.set_title('Laps by Tire Compound')

fig.tight_layout()
plt.show()

## 2. PySpark Processing

Process the data through the medallion architecture layers (Bronze → Silver → Gold).

In [None]:
# Create the Spark-backed processor
processor = SparkProcessor()

# Bronze layer: built from the lap table loaded above
bronze_df = processor.process_bronze_laps(laps_df)
bronze_count = bronze_df.count()
print(f"Bronze layer: {bronze_count} records")

# Silver layer: derived from the Bronze layer
silver_df = processor.process_silver_laps(bronze_df)
silver_count = silver_df.count()
print(f"Silver layer: {silver_count} records")

# Preview a handful of the Silver columns for the first 10 rows
preview_columns = [
    'Driver', 'LapNumber', 'LapTime', 'CompoundStd',
    'DeltaToFastest', 'TimeMismatch',
]
silver_df.select(*preview_columns).show(10)

In [None]:
# Build the two Gold-layer aggregates from the Silver laps
gold_drivers = processor.process_gold_driver_stats(silver_df)
gold_tires = processor.process_gold_tire_analysis(silver_df)

# Driver stats, ordered by fastest lap (first 10 rows, untruncated columns)
print("\n=== Driver Performance Stats ===")
drivers_by_fastest = gold_drivers.orderBy('FastestLap')
drivers_by_fastest.show(10, truncate=False)

# Tire compound stats, ordered by average lap time
print("\n=== Tire Compound Analysis ===")
tires_by_avg = gold_tires.orderBy('AvgLapTime')
tires_by_avg.show()

## 3. Data Quality Validation

Run comprehensive quality checks on the telemetry data.

In [None]:
# Bring the Silver layer back into pandas for the validator
laps_validated = silver_df.toPandas()

# Run the lap-data validation; it yields a pass flag and a report mapping
validator = TelemetryQualityValidator()
passed, report = validator.validate_lap_data(laps_validated)
status = 'PASSED' if passed else 'FAILED'

# Headline figures
print("=== Data Quality Report ===")
print(f"Validation Status: {status}")
print(f"Total Rows: {report['total_rows']}")
print(f"Schema Valid: {report['schema_valid']}")

# Itemized issues
found_issues = report['issues']
print(f"\nIssues: {len(found_issues)}")
for entry in found_issues:
    print(f"  - {entry}")

# Itemized warnings
found_warnings = report['warnings']
print(f"\nWarnings: {len(found_warnings)}")
for entry in found_warnings:
    print(f"  - {entry}")

In [None]:
# Cross-compound performance check on the validated laps
compound_analysis = validator.validate_cross_compound_performance(laps_validated)

print("\n=== Cross-Compound Performance Analysis ===")
for gp_name, gp_stats in compound_analysis.items():
    print(f"\n{gp_name}:")

    # One line per compound: median lap time and number of laps
    for row in gp_stats['compound_performance']:
        label = row['Compound']
        median_time = row['median']
        lap_count = row['count']
        print(f"  {label}: Median={median_time:.2f}s, Count={lap_count}")

    # The expected ordering is optional in the result
    if 'expected_order' in gp_stats:
        print(f"  Expected order: {gp_stats['expected_order']}")

## 4. Machine Learning - Lap Time Prediction

Build and train predictive models for lap time forecasting.

In [None]:
# Derive model features from the validated lap table
engineer = LapTimeFeatureEngineer()
df_features = engineer.engineer_lap_features(laps_validated)

total_columns = len(df_features.columns)
print(f"Feature engineering complete: {total_columns} total columns")

# List the columns present after feature engineering but not before
print("\nNew features created:")
new_cols = set(df_features.columns).difference(laps_validated.columns)
for feature in sorted(new_cols):
    print(f"  - {feature}")

In [None]:
# Prepare model data
# (train_test_split is imported in the setup cell with the other imports)
X, y, feature_names = engineer.prepare_model_data(df_features)

print(f"Model dataset: {X.shape[0]} samples, {X.shape[1]} features")
print(f"Target range: {y.min():.2f}s to {y.max():.2f}s")

# Split parameters: hold out 20% of the samples for evaluation, and fix the
# shuffle seed so the split is reproducible across Restart & Run All.
TEST_SIZE = 0.2
RANDOM_STATE = 42

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE
)

print(f"\nTrain set: {len(X_train)} samples")
print(f"Test set: {len(X_test)} samples")

In [None]:
# Fit one predictor per algorithm and collect its hold-out metrics.
# `models` maps model type -> fitted model; `results` maps model type -> metrics.
models = {}
results = {}

for model_type in ('xgboost', 'lightgbm', 'random_forest'):
    banner = model_type.upper()
    print(f"\n=== Training {banner} ===")

    # Train without hyperparameter optimization
    model = LapTimePredictorModel(model_type=model_type)
    model.train(X_train, y_train, optimize=False)

    # Score on the held-out test split
    metrics = model.evaluate(X_test, y_test)

    models[model_type], results[model_type] = model, metrics

    print(f"MAE: {metrics['mae']:.3f}s")
    print(f"RMSE: {metrics['rmse']:.3f}s")
    print(f"R²: {metrics['r2']:.3f}")
    print(f"MAPE: {metrics['mape']:.2f}%")

In [None]:
# One row per model, one column per metric
results_df = pd.DataFrame(results).T

fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# (metric column, panel title) for each of the four panels, in row-major order
panel_specs = [
    ('mae', 'Mean Absolute Error (s)'),
    ('rmse', 'Root Mean Squared Error (s)'),
    ('r2', 'R² Score'),
    ('mape', 'Mean Absolute Percentage Error (%)'),
]
bar_colors = ['#FF1E1E', '#00D2BE', '#FFF500']

for ax, (metric, title) in zip(axes.flat, panel_specs):
    results_df[metric].plot(kind='bar', ax=ax, color=bar_colors)
    ax.set_title(title, fontsize=12, fontweight='bold')
    ax.set_xlabel('Model')
    ax.set_ylabel(metric.upper())
    ax.grid(axis='y', alpha=0.3)

    # Tilt the model names so they stay readable
    ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha='right')

fig.tight_layout()
plt.show()

print("\n=== Model Comparison ===")
print(results_df)

In [None]:
# Pick the model with the highest R² on the test split
best_model_name = results_df['r2'].idxmax()
best_model = models[best_model_name]
model_label = best_model_name.upper()

print(f"Best Model: {model_label}")
print("\n=== Top 15 Most Important Features ===")
top_features = best_model.feature_importance.head(15)
print(top_features)

# Horizontal bar chart of the same 15 rows, first row at the top
fig, ax = plt.subplots(figsize=(10, 8))
bar_positions = range(len(top_features))
ax.barh(bar_positions, top_features['importance'])
ax.set_yticks(bar_positions)
ax.set_yticklabels(top_features['feature'])
ax.set_xlabel('Importance')
ax.set_title(f'Top 15 Feature Importances - {model_label}')
ax.invert_yaxis()
fig.tight_layout()
plt.show()

In [None]:
# Predict on the held-out set and compute residuals (actual minus predicted)
y_pred = best_model.predict(X_test)
residuals = y_test - y_pred

fig, axes = plt.subplots(1, 2, figsize=(14, 5))
ax_fit, ax_resid = axes

# Left panel: actual vs predicted, with a dashed y = x reference line
ax_fit.scatter(y_test, y_pred, alpha=0.5)
ax_fit.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()],
            'r--', lw=2)
ax_fit.set_xlabel('Actual Lap Time (s)')
ax_fit.set_ylabel('Predicted Lap Time (s)')
ax_fit.set_title('Actual vs Predicted Lap Times')
ax_fit.grid(alpha=0.3)

# Right panel: residuals against predictions, with a dashed zero line
ax_resid.scatter(y_pred, residuals, alpha=0.5)
ax_resid.axhline(y=0, color='r', linestyle='--', lw=2)
ax_resid.set_xlabel('Predicted Lap Time (s)')
ax_resid.set_ylabel('Residual (s)')
ax_resid.set_title('Residual Plot')
ax_resid.grid(alpha=0.3)

fig.tight_layout()
plt.show()

# Summary of the residuals; the last line prints 1.96 standard deviations
print(f"\nResidual Statistics:")
print(f"Mean: {residuals.mean():.4f}s")
print(f"Std Dev: {residuals.std():.4f}s")
print(f"95% within: ±{1.96 * residuals.std():.4f}s")

## 5. Save Model and Results

In [None]:
# Save best model
model_path = f'../data/models/{best_model_name}_lap_predictor.pkl'
# Create the target directory first so the save does not depend on it already
# existing (e.g. on a fresh checkout). NOTE(review): save_model may create it
# itself — the mkdir is a no-op in that case.
Path(model_path).parent.mkdir(parents=True, exist_ok=True)
best_model.save_model(model_path)
print(f"Model saved to {model_path}")

# Save results summary
results_path = '../data/output/model_comparison.csv'
# DataFrame.to_csv does not create missing parent directories
Path(results_path).parent.mkdir(parents=True, exist_ok=True)
results_df.to_csv(results_path)
print(f"Results saved to {results_path}")

In [None]:
# Cleanup: run this last, once no more processing is needed.
# NOTE(review): stop() presumably shuts down the Spark session held by the
# processor, as the message below states — confirm against SparkProcessor.
processor.stop()
print("Spark session stopped")

## Next Steps

1. **Expand Dataset**: Load multiple races/seasons for more robust training
2. **Hyperparameter Tuning**: Run Optuna optimization for better performance
3. **Advanced Features**: 
   - Weather data integration
   - Detailed telemetry features (throttle, brake, speed profiles)
   - Driver-specific learned patterns
4. **Production Pipeline**:
   - Delta Lake storage
   - Scheduled data refreshes
   - Model retraining automation
5. **Deployment**:
   - Real-time prediction API
   - Interactive dashboard
   - Race strategy optimization