# F1 Data Preprocessing and Analysis

This notebook demonstrates the data preprocessing pipeline for F1 race data. We'll go through:
1. Loading and examining raw data
2. Data cleaning and preprocessing
3. Feature engineering
4. Data validation
5. Exploratory data analysis

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import sys

# Add src to path
sys.path.append('..')
from src.data.preprocessor import F1DataPreprocessor

# Set plotting style
plt.style.use('seaborn')
sns.set_palette('husl')

# Configure pandas display options
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 50)

OSError: 'seaborn' is not a valid package style, path of style file, URL of style file, or library style name (library styles are listed in `style.available`)

## 1. Load Raw Data

First, let's load and examine the raw F1 data.

In [None]:
# Load raw data
raw_data_path = '../data/processed/f1_2023_processed.csv'
df_raw = pd.read_csv(raw_data_path, low_memory=False)

print("Raw Data Shape:", df_raw.shape)
print("\nColumns:")
for col in df_raw.columns:
    print(f"- {col}: {df_raw[col].dtype}")

print("\nSample Data:")
df_raw.head()

## 2. Analyze Missing Values

Let's examine missing values in our dataset.

In [None]:
# Calculate missing value statistics
missing = df_raw.isnull().sum()
missing_pct = (missing / len(df_raw)) * 100

missing_df = pd.DataFrame({
    'Missing Values': missing,
    'Percentage': missing_pct
}).sort_values('Percentage', ascending=False)

# Plot missing values
plt.figure(figsize=(12, 6))
plt.bar(range(len(missing_df[missing_df['Percentage'] > 0])), 
        missing_df[missing_df['Percentage'] > 0]['Percentage'])
plt.xticks(range(len(missing_df[missing_df['Percentage'] > 0])), 
           missing_df[missing_df['Percentage'] > 0].index, rotation=45)
plt.title('Missing Values by Column')
plt.ylabel('Percentage Missing')
plt.tight_layout()

print("Columns with missing values:")
print(missing_df[missing_df['Missing Values'] > 0])

## 3. Preprocess Data

Now let's run our preprocessing pipeline.

In [None]:
# Initialize preprocessor
preprocessor = F1DataPreprocessor()

# Preprocess data
df_processed = preprocessor.preprocess_data(df_raw)

print("Processed Data Shape:", df_processed.shape)
print("\nProcessed Data Types:")
print(df_processed.dtypes)

## 4. Validate Processed Data

Let's check the quality of our processed data.

In [None]:
# Check remaining missing values
missing_processed = df_processed.isnull().sum()
print("Columns with missing values after processing:")
print(missing_processed[missing_processed > 0])

# Check value ranges for numerical columns
print("\nValue ranges for numerical columns:")
numeric_cols = df_processed.select_dtypes(include=[np.number]).columns
for col in numeric_cols:
    print(f"{col}: {df_processed[col].min()} - {df_processed[col].max()}")

## 5. Exploratory Data Analysis

Let's visualize some key aspects of our processed data.

In [None]:
# 1. Lap Time Distribution
plt.figure(figsize=(12, 6))
sns.histplot(data=df_processed, x='LapTime', bins=50)
plt.title('Lap Time Distribution')
plt.xlabel('Lap Time (seconds)')
plt.show()

# 2. Position Changes
plt.figure(figsize=(12, 6))
sns.boxplot(data=df_processed, x='Driver', y='Position')
plt.xticks(rotation=45)
plt.title('Position Distribution by Driver')
plt.tight_layout()
plt.show()

# 3. Lap Time Evolution
plt.figure(figsize=(15, 6))
for driver in df_processed['Driver'].unique()[:5]:  # Plot top 5 drivers
    driver_data = df_processed[df_processed['Driver'] == driver]
    plt.plot(driver_data['LapNumber'], driver_data['LapTime'], label=driver)
plt.title('Lap Time Evolution')
plt.xlabel('Lap Number')
plt.ylabel('Lap Time (seconds)')
plt.legend()
plt.show()

## 6. Feature Analysis

Let's examine our derived features.

In [None]:
# 1. Lap Time Delta Distribution
plt.figure(figsize=(12, 6))
sns.histplot(data=df_processed, x='LapTimeDelta', bins=50)
plt.title('Lap Time Delta Distribution')
plt.xlabel('Lap Time Delta (seconds)')
plt.show()

# 2. Position Changes
plt.figure(figsize=(12, 6))
sns.histplot(data=df_processed, x='PositionChange', bins=20)
plt.title('Position Changes Distribution')
plt.xlabel('Position Change')
plt.show()

# 3. Sector Time Analysis
sector_cols = ['Sector1Time', 'Sector2Time', 'Sector3Time']
if all(col in df_processed.columns for col in sector_cols):
    plt.figure(figsize=(15, 5))
    for i, col in enumerate(sector_cols, 1):
        plt.subplot(1, 3, i)
        sns.boxplot(data=df_processed, y=col)
        plt.title(f'{col} Distribution')
    plt.tight_layout()
    plt.show()

## 7. Save Processed Data

Finally, let's save our processed dataset.

In [None]:
# Save processed data
output_path = '../data/processed/f1_2023_cleaned.csv'
df_processed.to_csv(output_path, index=False)
print(f"Processed data saved to: {output_path}")