# PitWall Live - Data Loading & Exploration

This notebook covers:
1. Loading F1 data from FastF1
2. Exploring race results, qualifying, and lap times
3. Data quality analysis
4. Initial feature exploration

## Setup

In [None]:
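# Install dependencies if needed (%pip targets this kernel's environment;
# pyarrow is required by the DataFrame.to_parquet call further down)
# %pip install fastf1 pandas numpy matplotlib seaborn tqdm pyarrow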

In [None]:
import os
import sys
from pathlib import Path

# Add src to path
sys.path.insert(0, str(Path.cwd().parent / 'src'))

import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
from tqdm.notebook import tqdm

import fastf1
from fastf1 import get_session, get_event_schedule

# Shared figure styling for every plot in this notebook
plt.style.use('seaborn-v0_8-whitegrid')
plt.rcParams.update({
    'figure.figsize': (12, 6),
    'font.size': 12,
})

# Turn on FastF1's cache, creating the cache folder first if it is missing
CACHE_DIR = Path('../data/cache')
CACHE_DIR.mkdir(parents=True, exist_ok=True)
fastf1.Cache.enable_cache(str(CACHE_DIR))

print(f"FastF1 version: {fastf1.__version__}")
print(f"Cache directory: {CACHE_DIR.absolute()}")

## 1. Load Season Schedule

First, let's explore what data is available for recent seasons.

In [None]:
# Fetch the 2024 calendar and preview its first ten rows
schedule_2024 = get_event_schedule(2024)
print(f"\n2024 Season: {len(schedule_2024)} events")

schedule_preview_columns = [
    'RoundNumber', 'EventName', 'Country', 'Location', 'EventDate', 'EventFormat',
]
schedule_2024[schedule_preview_columns].head(10)

In [None]:
# Seasons covered by this exploration
SEASONS = [2022, 2023, 2024]

# season -> event schedule, reporting the event count as each one arrives
schedules = {}
for season in SEASONS:
    season_schedule = get_event_schedule(season)
    schedules[season] = season_schedule
    print(f"{season}: {len(season_schedule)} events")

## 2. Load Race Session Data

Load a complete race session with results and lap data.

In [None]:
# Use the 2024 Bahrain race ('R') as the worked example for the next sections
session = get_session(2024, 'Bahrain', 'R')
session.load()

event_name = session.event['EventName']
driver_count = len(session.drivers)
print(f"Session: {event_name} {session.name}")
print(f"Date: {session.date}")
print(f"Drivers: {driver_count}")

In [None]:
# Results table of the loaded session; show the first 20 rows of the key columns
results = session.results
result_columns = [
    'Position', 'Abbreviation', 'FullName', 'TeamName',
    'GridPosition', 'Status', 'Points', 'Time',
]
print("Race Results:")
results[result_columns].head(20)

In [None]:
# Positions gained (+) or lost (-) between the starting grid and the finish.
# Work on a copy so the DataFrame owned by `session.results` is not modified
# and this cell can be re-run without side effects on later cells.
positions = results[['Abbreviation', 'GridPosition', 'Position']].copy()

# NOTE(review): a GridPosition of 0 appears to denote a pit-lane start; taken
# literally it would plot as a large bogus "loss". Such rows (and rows with a
# missing grid or finish position) are left out of the chart — confirm against the data.
valid_grid = positions['GridPosition'].where(positions['GridPosition'] > 0)
positions['PositionsGained'] = valid_grid - positions['Position']
positions = positions.dropna(subset=['PositionsGained'])

bar_colors = [
    'green' if delta > 0 else 'red' if delta < 0 else 'gray'
    for delta in positions['PositionsGained']
]

fig, ax = plt.subplots(figsize=(12, 6))
ax.barh(positions['Abbreviation'], positions['PositionsGained'], color=bar_colors)
ax.set_xlabel('Positions Gained/Lost')
ax.set_ylabel('Driver')
# Build the title from the loaded session so it stays correct if another race is loaded
ax.set_title(
    f"Positions Gained/Lost from Grid to Finish - "
    f"{session.event['EventName']} {session.date.year}"
)
ax.axvline(x=0, color='black', linestyle='-', linewidth=0.5)
plt.tight_layout()
plt.show()

## 3. Lap Time Analysis

In [None]:
# Lap table of the loaded session, with its size and available columns
laps = session.laps
lap_columns = list(laps.columns)
print(f"Total laps recorded: {len(laps)}")
print(f"\nColumns: {lap_columns}")

In [None]:
# Lap-time window (seconds, exclusive bounds) used to drop anomalous laps.
# NOTE(review): fixed thresholds, presumably chosen for this circuit — revisit
# before reusing the cell for another race.
MIN_LAP_SECONDS = 80
MAX_LAP_SECONDS = 120

# Derive the seconds column on a new frame rather than writing it into `laps`,
# which is the object owned by `session.laps`.
laps_timed = laps.assign(LapTimeSeconds=laps['LapTime'].dt.total_seconds())

# Laps with a missing LapTime compare False on both bounds and are dropped too
in_time_window = (
    (laps_timed['LapTimeSeconds'] > MIN_LAP_SECONDS)
    & (laps_timed['LapTimeSeconds'] < MAX_LAP_SECONDS)
)
not_pit_in_lap = laps_timed['PitInTime'].isna()
not_pit_out_lap = laps_timed['PitOutTime'].isna()

clean_laps = laps_timed[in_time_window & not_pit_in_lap & not_pit_out_lap].copy()

# Guard the percentage against an empty lap table
clean_share = len(clean_laps) / len(laps) * 100 if len(laps) else 0.0
print(f"Clean laps: {len(clean_laps)} ({clean_share:.1f}%)")

In [None]:
# Box plot of clean lap times for the drivers in the first ten result rows
top_drivers = results['Abbreviation'].iloc[:10].tolist()
is_top_driver = clean_laps['Driver'].isin(top_drivers)
top_laps = clean_laps[is_top_driver]

fig, ax = plt.subplots(figsize=(14, 8))
sns.boxplot(
    data=top_laps,
    x='Driver',
    y='LapTimeSeconds',
    order=top_drivers,
    palette='viridis',
    ax=ax,
)
ax.set(
    xlabel='Driver',
    ylabel='Lap Time (seconds)',
    title='Lap Time Distribution - Top 10 Finishers',
)
plt.tight_layout()
plt.show()

In [None]:
# One line per selected driver: clean lap time against lap number
selected_drivers = ('VER', 'PER', 'SAI', 'LEC')

fig, ax = plt.subplots(figsize=(14, 8))
for code in selected_drivers:
    code_laps = clean_laps.loc[clean_laps['Driver'] == code]
    ax.plot(
        code_laps['LapNumber'],
        code_laps['LapTimeSeconds'],
        label=code,
        marker='o',
        markersize=3,
        alpha=0.7,
    )

ax.set(
    xlabel='Lap Number',
    ylabel='Lap Time (seconds)',
    title='Lap Time Evolution - Select Drivers',
)
ax.legend()
plt.tight_layout()
plt.show()

## 4. Tire Strategy Analysis

In [None]:
# Laps per driver and tire compound (drivers as rows, compounds as columns)
if 'Compound' in laps.columns:
    laps_per_driver_compound = laps.groupby(['Driver', 'Compound']).size()
    compound_counts = laps_per_driver_compound.unstack(fill_value=0)
    print("Laps by Compound:")
    display(compound_counts.head(10))

In [None]:
# Tire degradation analysis for VER: clean lap times coloured by tire compound
ver_laps = clean_laps[clean_laps['Driver'] == 'VER'].copy()

if 'Compound' in ver_laps.columns:
    # Identify stints. Use the lap data's own 'Stint' counter when it is present
    # and complete. Inferring stints from compound changes merges back-to-back
    # stints on the same compound, because the pit-in/pit-out laps that would
    # separate them have already been filtered out of clean_laps.
    if 'Stint' in ver_laps.columns and ver_laps['Stint'].notna().all():
        ver_laps['StintNumber'] = ver_laps['Stint'].astype(int)
    else:
        ver_laps['StintNumber'] = (ver_laps['Compound'] != ver_laps['Compound'].shift()).cumsum()

    fig, ax = plt.subplots(figsize=(14, 6))

    # Compounds without an entry here fall back to gray
    compound_colors = {
        'SOFT': 'red',
        'MEDIUM': 'yellow',
        'HARD': 'white',
        'INTERMEDIATE': 'green',
        'WET': 'blue',
    }

    labelled_compounds = set()
    for stint_number, stint_laps in ver_laps.groupby('StintNumber'):
        compound = stint_laps['Compound'].iloc[0]
        color = compound_colors.get(compound, 'gray')

        # One legend entry per compound (not only the first stint's compound);
        # matplotlib leaves labels that start with an underscore out of the legend.
        if compound in labelled_compounds:
            label = '_nolegend_'
        else:
            label = f'{compound}'
            labelled_compounds.add(compound)

        ax.scatter(stint_laps['LapNumber'], stint_laps['LapTimeSeconds'],
                   c=color, edgecolors='black', s=50, label=label)

    ax.set_xlabel('Lap Number')
    ax.set_ylabel('Lap Time (seconds)')
    ax.set_title('VER Tire Strategy and Degradation')
    if labelled_compounds:
        # Skip the legend when nothing was plotted
        ax.legend()
    plt.tight_layout()
    plt.show()

## 5. Load Multiple Races for Training Data

In [None]:
def load_race_results(year: int, round_num: int) -> pd.DataFrame:
    """Load the race results table for one championship round.

    Args:
        year: Championship season.
        round_num: Round number within that season.

    Returns:
        A copy of the race session's results with four columns added:
        ``Season``, ``Round``, ``GrandPrix`` (the event name) and ``Date``
        (the event date). If anything goes wrong while loading, the error is
        printed and an empty DataFrame is returned, so a season-wide loop can
        carry on with the remaining rounds.
    """
    try:
        # Address the race by round number directly instead of looking the
        # round up in the schedule and passing its name back in; this also
        # avoids downloading the season schedule again on every call.
        session = get_session(year, int(round_num), 'R')

        # Only the results table is used here, so skip the heavier
        # lap, telemetry, weather and race-control-message downloads.
        session.load(laps=False, telemetry=False, weather=False, messages=False)

        event = session.event
        results = session.results.copy()
        results['Season'] = year
        results['Round'] = round_num
        results['GrandPrix'] = event['EventName']
        results['Date'] = event['EventDate']

        return results
    except Exception as e:
        # Deliberately best-effort: report the failure and let the caller skip this round
        print(f"Error loading {year} R{round_num}: {e}")
        return pd.DataFrame()

In [None]:
# Load 2023 season results
schedule_2023 = get_event_schedule(2023)
# Skip schedule entries whose format is 'testing'
race_events = schedule_2023[schedule_2023['EventFormat'] != 'testing']

all_results_2023 = []
for round_num in tqdm(race_events['RoundNumber'], total=len(race_events), desc="Loading 2023"):
    # Use a dedicated name: `results` still refers to the Bahrain 2024 table
    # that earlier cells read, and must not be overwritten by this loop.
    race_results = load_race_results(2023, int(round_num))
    if not race_results.empty:
        all_results_2023.append(race_results)

# pd.concat raises an unhelpful "No objects to concatenate" on an empty list
if not all_results_2023:
    raise RuntimeError(
        "No 2023 race results could be loaded - check network access and the FastF1 cache"
    )

df_2023 = pd.concat(all_results_2023, ignore_index=True)
print(f"\nLoaded {len(df_2023)} driver-race results for 2023")

In [None]:
# Headline counts for the combined 2023 results
print("2023 Season Summary:")
summary_columns = (
    ('Races', 'GrandPrix'),
    ('Drivers', 'Abbreviation'),
    ('Teams', 'TeamName'),
)
for heading, column in summary_columns:
    print(f"{heading}: {df_2023[column].nunique()}")

# Rows with Position == 1, counted per driver, most wins first
winning_rows = df_2023[df_2023['Position'] == 1]
wins_per_driver = winning_rows.groupby('Abbreviation').size()
wins = wins_per_driver.sort_values(ascending=False)
print("\nWins by Driver:")
print(wins.head(10))

In [None]:
# Write the combined 2023 results to ../data/processed as parquet
output_dir = Path('../data/processed')
output_dir.mkdir(parents=True, exist_ok=True)

results_path = output_dir / 'race_results_2023.parquet'
df_2023.to_parquet(results_path, index=False)
print(f"Saved to {results_path}")

## 6. Qualifying Data

In [None]:
# Qualifying ('Q') session of the same 2024 Bahrain event
quali_session = get_session(2024, 'Bahrain', 'Q')
quali_session.load()

quali_results = quali_session.results
quali_columns = ['Position', 'Abbreviation', 'TeamName', 'Q1', 'Q2', 'Q3']
print("Qualifying Results:")
quali_results[quali_columns].head(20)

In [None]:
# Gap to the fastest Q3 time, for every driver that has a Q3 time
has_q3_time = quali_results['Q3'].notna()
q3_drivers = quali_results.loc[has_q3_time].copy()

# Times in seconds; the minimum is the pole time
q3_seconds = q3_drivers['Q3'].dt.total_seconds()
q3_drivers['Q3_seconds'] = q3_seconds
pole_time = q3_seconds.min()
q3_drivers['GapToPole'] = q3_drivers['Q3_seconds'] - pole_time

fig, ax = plt.subplots(figsize=(12, 6))
bars = ax.barh(q3_drivers['Abbreviation'], q3_drivers['GapToPole'])
ax.set(
    xlabel='Gap to Pole (seconds)',
    ylabel='Driver',
    title='Q3 Gap to Pole - Bahrain 2024',
)
ax.invert_yaxis()

# Write each gap just right of its bar, vertically centred on the bar
for bar, gap in zip(bars, q3_drivers['GapToPole']):
    label_y = bar.get_y() + bar.get_height()/2
    ax.text(gap + 0.01, label_y, f'+{gap:.3f}s', va='center', fontsize=9)

plt.tight_layout()
plt.show()

## 7. Data Quality Check

In [None]:
# Per-column count and percentage of missing values; only affected columns are shown
print("Missing Values in 2023 Race Results:")
missing = df_2023.isna().sum()
missing_pct = missing.div(len(df_2023)).mul(100).round(2)

missing_report = pd.DataFrame({'Missing': missing, 'Percent': missing_pct})
missing_report[missing > 0]

In [None]:
# Column dtypes of the combined 2023 results
print("\nData Types:")
column_dtypes = df_2023.dtypes
column_dtypes

In [None]:
# Side-by-side histograms of grid and finish positions (missing values dropped)
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

histogram_panels = (
    ('GridPosition', 'Grid Position'),
    ('Position', 'Finish Position'),
)
for panel_ax, (column, axis_label) in zip(axes, histogram_panels):
    panel_ax.hist(df_2023[column].dropna(), bins=20, edgecolor='black')
    panel_ax.set_xlabel(axis_label)
    panel_ax.set_ylabel('Count')
    panel_ax.set_title(f'{axis_label} Distribution')

plt.tight_layout()
plt.show()

## 8. Key Insights Summary

From this exploration we've learned:

1. **Data Availability**: FastF1 provides comprehensive data from 2018 onwards
2. **Key Features**: Grid position, lap times, tire compounds, and pit stops are well recorded
3. **Data Quality**: Some missing values in qualifying times for drivers eliminated early
4. **Patterns**: Grid position and race finish appear closely related (see the positions gained/lost chart); the correlation itself is not yet quantified in this notebook

Next steps:
- Feature engineering notebook
- Race winner prediction model
- Lap time prediction model

In [None]:
# Write a small JSON summary of this exploration next to the processed data
import json

data_quality = {
    'missing_grid_position': missing_pct.get('GridPosition', 0),
    'missing_points': missing_pct.get('Points', 0),
}
summary = {
    'seasons_available': SEASONS,
    'total_races_2023': df_2023['GrandPrix'].nunique(),
    'total_drivers_2023': df_2023['Abbreviation'].nunique(),
    'data_quality': data_quality,
}

summary_path = output_dir / 'data_summary.json'
with open(summary_path, 'w') as f:
    # default=str stringifies any value the json module cannot encode natively
    json.dump(summary, f, indent=2, default=str)

print("Data exploration complete!")