In [11]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# Define the project root and data path
# Adjust this path if your notebook is not in the project root or scripts/model_testing
current_dir = Path.cwd() # This assumes you run the notebook from your project root
# If you run it from F1/scripts/model_testing, you might need:
# project_root = current_dir.parent.parent
# PROCESSED_DATA_DIR = project_root / 'data' / 'processed'

# Assuming the processed data is in data/processed relative to your project root
data_path = "D:/Projects/F1/data/processed/seasons_2023_2024_data.csv"

# print(f"Attempting to load data from: {data_path.resolve()}")

In [12]:
try:
    df = pd.read_csv(data_path)
    print("Data loaded successfully!")
    print(f"Initial DataFrame shape: {df.shape}")
except FileNotFoundError:
    print(f"Error: Data file not found at {data_path.resolve()}")
    print("Please ensure 'seasons_2023_2024_data_enhanced.csv' is in 'data/processed'.")
    df = None # Set df to None to prevent errors in subsequent cells

Data loaded successfully!
Initial DataFrame shape: (55072, 98)


In [13]:
if df is not None:
    print("\n--- First 5 Rows ---")
    display(df.head())

    print("\n--- DataFrame Info (Data Types and Non-Null Counts) ---")
    df.info()

    print("\n--- Descriptive Statistics for Numerical Columns ---")
    display(df.describe())


--- First 5 Rows ---


Unnamed: 0,Time,Driver,DriverNumber,LapTime,LapNumber,Stint,PitOutTime,PitInTime,Sector1Time,Sector2Time,...,SpeedI2_Diff,SpeedFL_Diff,SpeedST_Diff,PitStopDuration,PitLap,StintLength,TempDelta,WetTrack,GripLevel,WeatherStability
0,0 days 00:18:01.997000,VER,1,,1,1,0 days 00:15:56.201000,,,0 days 00:00:44.778000,...,,,,,True,44,1.628723,1,0.0,1.840767
1,0 days 00:19:35.532000,VER,1,0 days 00:01:33.535000,2,1,,,0 days 00:00:28.534000,0 days 00:00:36.657000,...,32.0,7.0,78.0,,False,44,1.628723,1,0.0,1.840767
2,0 days 00:21:07.677000,VER,1,0 days 00:01:32.145000,3,1,,,0 days 00:00:28.441000,0 days 00:00:36.572000,...,-2.0,-1.0,-1.0,,False,44,1.628723,1,0.0,1.840767
3,0 days 00:23:45.559000,VER,1,,4,1,,,0 days 00:00:48.387000,0 days 00:01:09.690000,...,-44.0,-1.0,-162.0,,False,44,1.628723,1,0.0,1.840767
4,0 days 00:25:16.278000,VER,1,0 days 00:01:30.719000,5,1,,,0 days 00:00:28.109000,0 days 00:00:36.109000,...,46.0,3.0,162.0,,False,44,1.628723,1,0.0,1.840767



--- DataFrame Info (Data Types and Non-Null Counts) ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 55072 entries, 0 to 55071
Data columns (total 98 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Time                   55072 non-null  object 
 1   Driver                 55072 non-null  object 
 2   DriverNumber           55072 non-null  int64  
 3   LapTime                50601 non-null  object 
 4   LapNumber              55072 non-null  int64  
 5   Stint                  55072 non-null  int64  
 6   PitOutTime             5044 non-null   object 
 7   PitInTime              5053 non-null   object 
 8   Sector1Time            51226 non-null  object 
 9   Sector2Time            54787 non-null  object 
 10  Sector3Time            54035 non-null  object 
 11  Sector1SessionTime     51145 non-null  object 
 12  Sector2SessionTime     54787 non-null  object 
 13  Sector3SessionTime     54035 non-null  object 
 1

Unnamed: 0,DriverNumber,LapNumber,Stint,SpeedI1,SpeedI2,SpeedFL,SpeedST,TyreLife,Position,Year,...,SpeedI1_Diff,SpeedI2_Diff,SpeedFL_Diff,SpeedST_Diff,PitStopDuration,StintLength,TempDelta,WetTrack,GripLevel,WeatherStability
count,55072.0,55072.0,55072.0,55065.0,55065.0,55064.0,55065.0,55072.0,42596.0,55072.0,...,54043.0,54043.0,54043.0,54043.0,157.0,55072.0,55072.0,55072.0,55072.0,55072.0
mean,28.278363,26.010568,2.340264,251.686394,243.368463,267.080349,288.208314,17.079333,9.822307,2023.397189,...,-0.094891,0.033338,0.156315,0.239816,-135.313968,45.960415,11.392135,0.282539,0.557086,1.641662
std,23.648565,18.627357,1.233419,46.82497,46.498309,31.891366,40.282623,17.985965,5.436444,0.48932,...,34.574408,32.876078,21.336856,49.055999,96.218701,26.357281,6.02121,0.450238,0.555774,0.634641
min,1.0,1.0,1.0,34.0,38.0,1.0,27.0,0.0,1.0,2023.0,...,-240.0,-224.0,-304.865672,-298.0,-866.898,1.0,-0.055556,0.0,0.0,0.534153
25%,11.0,10.0,1.0,215.0,210.0,248.0,282.0,1.0,5.0,2023.0,...,-4.0,-2.0,-1.0,-4.0,-125.924,26.0,6.065068,0.0,0.0,1.220964
50%,22.0,22.0,2.0,268.0,254.0,273.0,297.0,10.0,10.0,2023.0,...,0.928571,0.0,0.0,0.0,-114.734,41.0,11.153333,0.0,0.383459,1.507202
75%,44.0,41.0,3.0,287.0,276.0,286.0,309.0,30.0,14.0,2024.0,...,5.0,3.0,1.0,5.0,-101.483,61.0,16.142,1.0,1.053067,1.985643
max,81.0,78.0,8.0,355.0,343.0,355.0,361.0,77.0,20.0,2024.0,...,227.0,245.0,314.0,294.0,-77.718,131.0,26.315847,1.0,1.817658,3.194976


In [10]:
if df is not None:
    print("\n--- Value Counts for Key Categorical Features ---")
    
    # Convert LapTime to seconds if not already (as done in training script)
    if 'LapTime' in df.columns:
        if pd.api.types.is_timedelta64_dtype(df['LapTime']):
            df['LapTime_s'] = df['LapTime'].dt.total_seconds()
        elif isinstance(df['LapTime'].iloc[0], str):
            # This handles '0 days 00:01:23.456000000' string format
            df['LapTime_s'] = df['LapTime'].apply(lambda x: pd.to_timedelta(x).total_seconds() if pd.notna(x) else np.nan)
    
    # Apply the same clean lap filtering as in the training script
    initial_rows = df.shape[0]
    if "IsCleanLap" in df.columns:
        df_clean = df[df["IsCleanLap"] == True].copy()
    else:
        df_clean = df[df["Deleted"] == False].copy()
        if "IsOutlap" in df_clean.columns:
            df_clean = df_clean[df_clean["IsOutlap"] == False]
        if "IsInlap" in df_clean.columns:
            df_clean = df_clean[df_clean["IsInlap"] == False]
            
    print(f"Filtered down to {df_clean.shape[0]} clean laps from {initial_rows} total laps.")

    # Check for NaN values in key columns after filtering
    key_cols = ['Driver', 'Team', 'Event', 'LapTime_s', 'TyreWearPercentage']
    print("\n--- NaN counts in key columns (after clean lap filter) ---")
    print(df_clean[key_cols].isnull().sum())
    
    df_clean.dropna(subset=['Driver', 'Team', 'Event', 'LapTime_s', 'TyreWearPercentage'], inplace=True)
    print(f"After dropping NaNs in key columns: {df_clean.shape[0]} rows remaining.")
    
    print("\nDriver Counts:")
    display(df_clean['Driver'].value_counts())
    
    print("\nTeam Counts:")
    display(df_clean['Team'].value_counts())

    print("\nEvent Counts:")
    display(df_clean['Event'].value_counts())
    
    # Check 'CompoundHardness' unique values if it's not a direct numerical feature
    if 'CompoundHardness' in df_clean.columns:
        print("\nCompoundHardness Unique Values:")
        display(df_clean['CompoundHardness'].value_counts())
    else:
        print("CompoundHardness column not found.")
        
    if 'Compound' in df_clean.columns:
        print("\nCompound Unique Values:")
        display(df_clean['Compound'].value_counts())

    print("\n--- Average Lap Times and Tire Wear per Driver and Team ---")

    if 'LapTime_s' in df_clean.columns:
        print("\nAverage Lap Time (seconds) per Driver:")
        avg_laptime_driver = df_clean.groupby('Driver')['LapTime_s'].mean().sort_values()
        display(avg_laptime_driver)

        print("\nAverage Lap Time (seconds) per Team:")
        avg_laptime_team = df_clean.groupby('Team')['LapTime_s'].mean().sort_values()
        display(avg_laptime_team)
    else:
        print("'LapTime_s' column not found or not correctly converted.")

    if 'TyreWearPercentage' in df_clean.columns:
        print("\nAverage Tyre Wear Percentage per Driver:")
        avg_tyrewear_driver = df_clean.groupby('Driver')['TyreWearPercentage'].mean().sort_values(ascending=False)
        display(avg_tyrewear_driver)

        print("\nAverage Tyre Wear Percentage per Team:")
        avg_tyrewear_team = df_clean.groupby('Team')['TyreWearPercentage'].mean().sort_values(ascending=False)
        display(avg_tyrewear_team)
    else:
        print("'TyreWearPercentage' column not found.")
        
    print("\n--- Distributions of Key Numerical Features ---")
    fig, axes = plt.subplots(nrows=3, ncols=2, figsize=(15, 15))
    axes = axes.flatten()
    
    num_features = ['TyreLife', 'CompoundHardness', 'TrackTemp_Avg', 'AirTemp_Avg', 'Humidity_Avg', 'Rainfall']
    
    for i, col in enumerate(num_features):
        if col in df_clean.columns:
            sns.histplot(df_clean[col].dropna(), kde=True, ax=axes[i])
            axes[i].set_title(f'Distribution of {col}')
        else:
            axes[i].set_title(f'{col} not found')
            axes[i].axis('off')

    plt.tight_layout()
    plt.show()


--- Value Counts for Key Categorical Features ---
Filtered down to 44338 clean laps from 55072 total laps.

--- NaN counts in key columns (after clean lap filter) ---


KeyError: "['TyreWearPercentage'] not in index"