In [1]:
# Importando bibliotecas
import fastf1
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import os
from typing import Tuple, List, Optional
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.multioutput import MultiOutputClassifier, MultiOutputRegressor
from sklearn.metrics import accuracy_score, mean_absolute_error

In [2]:
def add_weather_info(session, stints_df):
    """
    Adds mean weather information from the session to the stints dataframe
    
    Args:
        session: FastF1 session object containing weather data
        stints_df: DataFrame containing stint data
        
    Returns:
        DataFrame with added weather columns
    """
    # Get weather data from session
    weather = session.weather_data
    
    # Calculate mean values
    mean_weather = weather.mean()
    
    # Add weather columns with broadcast values
    for col in mean_weather.index:
        stints_df[col] = mean_weather[col]
        
    return stints_df


def add_starting_positions(session, stints_df):
    """
    Adds starting grid position for each driver in the stints dataframe
    
    Args:
        session: FastF1 session object containing grid position info
        stints_df: DataFrame containing stint data
        
    Returns:
        DataFrame with added StartingPosition column
    """
    # Get starting grid positions
    grid = session.results[['Abbreviation', 'GridPosition']]
    grid = grid.set_index('Abbreviation')
    grid_dict = grid['GridPosition'].to_dict()
    
    # Add starting position column based on driver
    stints_df['StartingPosition'] = stints_df['Driver'].map(grid_dict)
    
    return stints_df


def add_team_info(session, stints_df):
    """
    Adds team information for each driver in the stints dataframe
    
    Args:
        session: FastF1 session object containing driver info
        stints_df: DataFrame containing stint data
        
    Returns:
        DataFrame with added Team column
    """
    # Create driver to team mapping
    driver_teams = {}
    for driver in session.drivers:
        driver_info = session.get_driver(driver)
        driver_teams[driver_info['Abbreviation']] = driver_info['TeamName']
    
    # Add team column based on driver
    stints_df['Team'] = stints_df['Driver'].map(driver_teams)
    
    return stints_df


def get_stints_race(session):
    laps = session.laps
    drivers = session.drivers
    drivers = [session.get_driver(driver)["Abbreviation"] for driver in drivers]
    stints = laps[["Driver", "Stint", "Compound", "LapNumber"]]
    stints = stints.groupby(["Driver", "Stint", "Compound"])
    stints = stints.count().reset_index()
    stints = stints.rename(columns={"LapNumber": "StintLength"})
    return stints


for year in range(2024, 2025):
    # Get the calendar for the year
    calendar = fastf1.get_event_schedule(year, include_testing=False)
    
    # Initialize an empty list to store all stints data
    all_stints = []
    
    # Loop through each race in the calendar
    for idx, event in calendar.iterrows():
        try:
            # Load the race session
            session = fastf1.get_session(year, event['EventName'], 'R')
            session.load()
            
            # Get stints for this race
            race_stints = get_stints_race(session)

            # Add the team names for each row
            race_stints = add_team_info(session, race_stints)
            
            # Add the starting position for each driver
            race_stints = add_starting_positions(session, race_stints)

            # Add mean weather information
            race_stints = add_weather_info(session, race_stints)

            # Add year and circuit information
            race_stints['Year'] = year
            race_stints['Circuit'] = event['EventName']
            
            # Append to our list
            all_stints.append(race_stints)
            
            print(f"Processed {year} {event['EventName']}")
        except Exception as e:
            print(f"Error processing {year} {event['EventName']}: {e}")
    
    # If we have data for this year, concatenate it
    if all_stints:
        year_stints = pd.concat(all_stints, ignore_index=True)
        
        # Save the data for this year (optional)
        year_stints.to_csv(f"stints_data_{year}.csv", index=False)

core           INFO 	Loading data for Bahrain Grand Prix - Race [v3.4.4]
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info
req            INFO 	Using cached data for session_status_data
req            INFO 	Using cached data for lap_count
req            INFO 	Using cached data for track_status_data
req            INFO 	Using cached data for _extended_timing_data
req            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...
req            INFO 	Using cached data for car_data
req            INFO 	Using cached data for position_data
req            INFO 	Using cached data for weather_data
req            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 20 drivers: ['1', '11', '55', '16', '63', '4', '44', '81', '14', '18', '24', '20', '3', '22', '23', '27', '31', '10', '77', '2']
core           INFO 	Loading data for Saudi Arabian Grand Prix

Processed 2024 Bahrain Grand Prix


req            INFO 	Using cached data for session_status_data
req            INFO 	Using cached data for lap_count
req            INFO 	Using cached data for track_status_data
req            INFO 	Using cached data for _extended_timing_data
req            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...
req            INFO 	Using cached data for car_data
req            INFO 	Using cached data for position_data
req            INFO 	Using cached data for weather_data
req            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 20 drivers: ['1', '11', '16', '81', '14', '63', '38', '4', '44', '27', '23', '20', '31', '2', '22', '3', '77', '24', '18', '10']
core           INFO 	Loading data for Australian Grand Prix - Race [v3.4.4]
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info


Processed 2024 Saudi Arabian Grand Prix


req            INFO 	Using cached data for session_status_data
req            INFO 	Using cached data for lap_count
req            INFO 	Using cached data for track_status_data
req            INFO 	Using cached data for _extended_timing_data
req            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...
req            INFO 	Using cached data for car_data
req            INFO 	Using cached data for position_data
req            INFO 	Using cached data for weather_data
req            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 19 drivers: ['55', '16', '4', '81', '11', '18', '22', '14', '27', '20', '23', '3', '10', '77', '24', '31', '63', '44', '1']
core           INFO 	Loading data for Japanese Grand Prix - Race [v3.4.4]
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info


Processed 2024 Australian Grand Prix


req            INFO 	Using cached data for session_status_data
req            INFO 	Using cached data for lap_count
req            INFO 	Using cached data for track_status_data
req            INFO 	Using cached data for _extended_timing_data
req            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...
req            INFO 	Using cached data for car_data
req            INFO 	Using cached data for position_data
req            INFO 	Using cached data for weather_data
req            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 20 drivers: ['1', '11', '55', '16', '4', '14', '63', '81', '44', '22', '27', '18', '20', '77', '31', '10', '2', '24', '3', '23']
core           INFO 	Loading data for Chinese Grand Prix - Race [v3.4.4]
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info


Processed 2024 Japanese Grand Prix


req            INFO 	Using cached data for session_status_data
req            INFO 	Using cached data for lap_count
req            INFO 	Using cached data for track_status_data
req            INFO 	Using cached data for _extended_timing_data
req            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...
req            INFO 	Using cached data for car_data
req            INFO 	Using cached data for position_data
req            INFO 	Using cached data for weather_data
req            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 20 drivers: ['1', '4', '11', '16', '55', '63', '14', '81', '44', '27', '31', '23', '10', '24', '18', '20', '2', '3', '22', '77']
core           INFO 	Loading data for Miami Grand Prix - Race [v3.4.4]
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info


Processed 2024 Chinese Grand Prix


req            INFO 	Using cached data for session_status_data
req            INFO 	Using cached data for lap_count
req            INFO 	Using cached data for track_status_data
req            INFO 	Using cached data for _extended_timing_data
req            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...
req            INFO 	Using cached data for car_data
req            INFO 	Using cached data for position_data
req            INFO 	Using cached data for weather_data
req            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 20 drivers: ['4', '1', '16', '11', '55', '44', '22', '63', '14', '31', '27', '10', '81', '24', '3', '77', '18', '23', '20', '2']
core           INFO 	Loading data for Emilia Romagna Grand Prix - Race [v3.4.4]
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info


Processed 2024 Miami Grand Prix


req            INFO 	Using cached data for session_status_data
req            INFO 	Using cached data for lap_count
req            INFO 	Using cached data for track_status_data
req            INFO 	Using cached data for _extended_timing_data
req            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...
req            INFO 	Using cached data for car_data
req            INFO 	Using cached data for position_data
req            INFO 	Using cached data for weather_data
req            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 20 drivers: ['1', '4', '16', '81', '55', '44', '63', '11', '18', '22', '27', '20', '3', '31', '24', '10', '2', '77', '14', '23']
core           INFO 	Loading data for Monaco Grand Prix - Race [v3.4.4]
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info


Processed 2024 Emilia Romagna Grand Prix


req            INFO 	Using cached data for session_status_data
req            INFO 	Using cached data for lap_count
req            INFO 	Using cached data for track_status_data
req            INFO 	Using cached data for _extended_timing_data
req            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...
req            INFO 	Using cached data for car_data
req            INFO 	Using cached data for position_data
req            INFO 	Using cached data for weather_data
req            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 20 drivers: ['16', '81', '55', '4', '63', '1', '44', '22', '23', '10', '14', '3', '77', '18', '2', '24', '31', '11', '27', '20']
core           INFO 	Loading data for Canadian Grand Prix - Race [v3.4.4]
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info


Processed 2024 Monaco Grand Prix


req            INFO 	Using cached data for session_status_data
req            INFO 	Using cached data for lap_count
req            INFO 	Using cached data for track_status_data
req            INFO 	Using cached data for _extended_timing_data
req            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...
req            INFO 	Using cached data for car_data
req            INFO 	Using cached data for position_data
req            INFO 	Using cached data for weather_data
req            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 20 drivers: ['1', '4', '63', '44', '81', '14', '18', '3', '10', '31', '27', '20', '77', '22', '24', '55', '23', '11', '16', '2']
core           INFO 	Loading data for Spanish Grand Prix - Race [v3.4.4]
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info


Processed 2024 Canadian Grand Prix


req            INFO 	Using cached data for session_status_data
req            INFO 	Using cached data for lap_count
req            INFO 	Using cached data for track_status_data
req            INFO 	Using cached data for _extended_timing_data
req            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...
req            INFO 	Using cached data for car_data
req            INFO 	Using cached data for position_data
req            INFO 	Using cached data for weather_data
req            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 20 drivers: ['1', '4', '44', '63', '16', '55', '81', '11', '10', '31', '27', '14', '24', '18', '3', '77', '20', '23', '22', '2']
core           INFO 	Loading data for Austrian Grand Prix - Race [v3.4.4]
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info


Processed 2024 Spanish Grand Prix


req            INFO 	Using cached data for session_status_data
req            INFO 	Using cached data for lap_count
req            INFO 	Using cached data for track_status_data
req            INFO 	Using cached data for _extended_timing_data
req            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...
req            INFO 	Using cached data for car_data
req            INFO 	Using cached data for position_data
req            INFO 	Using cached data for weather_data
req            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 20 drivers: ['63', '81', '55', '44', '1', '27', '11', '20', '3', '10', '16', '31', '18', '22', '23', '77', '24', '14', '2', '4']
core           INFO 	Loading data for British Grand Prix - Race [v3.4.4]
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info


Processed 2024 Austrian Grand Prix


req            INFO 	Using cached data for session_status_data
req            INFO 	Using cached data for lap_count
req            INFO 	Using cached data for track_status_data
req            INFO 	Using cached data for _extended_timing_data
req            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...
req            INFO 	Using cached data for car_data
req            INFO 	Using cached data for position_data
req            INFO 	Using cached data for weather_data
req            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 20 drivers: ['44', '1', '4', '81', '55', '27', '18', '14', '23', '22', '2', '20', '3', '16', '77', '31', '11', '24', '63', '10']
core           INFO 	Loading data for Hungarian Grand Prix - Race [v3.4.4]
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info


Processed 2024 British Grand Prix


req            INFO 	Using cached data for session_status_data
req            INFO 	Using cached data for lap_count
req            INFO 	Using cached data for track_status_data
req            INFO 	Using cached data for _extended_timing_data
req            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...
req            INFO 	Using cached data for car_data
req            INFO 	Using cached data for position_data
req            INFO 	Using cached data for weather_data
req            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 20 drivers: ['81', '4', '44', '16', '1', '55', '11', '63', '22', '18', '14', '3', '27', '23', '20', '77', '2', '31', '24', '10']
core           INFO 	Loading data for Belgian Grand Prix - Race [v3.4.4]
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info


Processed 2024 Hungarian Grand Prix


req            INFO 	Using cached data for session_status_data
req            INFO 	Using cached data for lap_count
req            INFO 	Using cached data for track_status_data
req            INFO 	Using cached data for _extended_timing_data
req            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...
req            INFO 	Using cached data for car_data
req            INFO 	Using cached data for position_data
req            INFO 	Using cached data for weather_data
req            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 20 drivers: ['44', '81', '16', '1', '4', '55', '11', '14', '31', '3', '18', '23', '10', '20', '77', '22', '2', '27', '24', '63']
core           INFO 	Loading data for Dutch Grand Prix - Race [v3.4.4]
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info


Processed 2024 Belgian Grand Prix


req            INFO 	Using cached data for session_status_data
req            INFO 	Using cached data for lap_count
req            INFO 	Using cached data for track_status_data
req            INFO 	Using cached data for _extended_timing_data
req            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...
req            INFO 	Using cached data for car_data
req            INFO 	Using cached data for position_data
req            INFO 	Using cached data for weather_data
req            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 20 drivers: ['4', '1', '16', '81', '55', '11', '63', '44', '10', '14', '27', '3', '18', '23', '31', '2', '22', '20', '77', '24']
core           INFO 	Loading data for Italian Grand Prix - Race [v3.4.4]
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info


Processed 2024 Dutch Grand Prix


req            INFO 	Using cached data for session_status_data
req            INFO 	Using cached data for lap_count
req            INFO 	Using cached data for track_status_data
req            INFO 	Using cached data for _extended_timing_data
req            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...
req            INFO 	Using cached data for car_data
req            INFO 	Using cached data for position_data
req            INFO 	Using cached data for weather_data
req            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 20 drivers: ['16', '81', '4', '55', '44', '1', '63', '11', '23', '20', '14', '43', '3', '31', '10', '77', '27', '24', '18', '22']
core           INFO 	Loading data for Azerbaijan Grand Prix - Race [v3.4.4]
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info


Processed 2024 Italian Grand Prix


req            INFO 	Using cached data for session_status_data
req            INFO 	Using cached data for lap_count
req            INFO 	Using cached data for track_status_data
req            INFO 	Using cached data for _extended_timing_data
req            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...
req            INFO 	Using cached data for car_data
req            INFO 	Using cached data for position_data
req            INFO 	Using cached data for weather_data
req            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 20 drivers: ['81', '16', '63', '4', '1', '14', '23', '43', '44', '50', '27', '10', '3', '24', '31', '77', '11', '55', '18', '22']
core           INFO 	Loading data for Singapore Grand Prix - Race [v3.4.4]
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info


Processed 2024 Azerbaijan Grand Prix


req            INFO 	Using cached data for session_status_data
req            INFO 	Using cached data for lap_count
req            INFO 	Using cached data for track_status_data
req            INFO 	Using cached data for _extended_timing_data
req            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...
req            INFO 	Using cached data for car_data
req            INFO 	Using cached data for position_data
req            INFO 	Using cached data for weather_data
req            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 20 drivers: ['4', '1', '81', '63', '16', '44', '55', '14', '27', '11', '43', '22', '31', '18', '24', '77', '10', '3', '20', '23']
core           INFO 	Loading data for United States Grand Prix - Race [v3.4.4]
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info


Processed 2024 Singapore Grand Prix


req            INFO 	Using cached data for session_status_data
req            INFO 	Using cached data for lap_count
req            INFO 	Using cached data for track_status_data
req            INFO 	Using cached data for _extended_timing_data
req            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...
req            INFO 	Using cached data for car_data
req            INFO 	Using cached data for position_data
req            INFO 	No cached data found for weather_data. Loading data...
_api           INFO 	Fetching weather data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for race_control_messages. Loading data...
_api           INFO 	Fetching race control messages...
req            INFO 	Data has been written to cache!
core           INFO 	Finished loading data for 20 drivers: ['16', '55', '1', '4', '81', '63', '11', '27', '30', '43', '20', '10', '14', '22', '18', '23', '77', '31', '24', '44

Processed 2024 United States Grand Prix


req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for driver_info. Loading data...
_api           INFO 	Fetching driver list...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for session_status_data. Loading data...
_api           INFO 	Fetching session status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for lap_count. Loading data...
_api           INFO 	Fetching lap count data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for track_status_data. Loading data...
_api           INFO 	Fetching track status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for _extended_timing_data. Loading data...
_api           INFO 	Fetching timing data...
_api           INFO 	Parsing timing data...
req            INFO 	Data has been written to cache!

Processed 2024 Mexico City Grand Prix


req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for driver_info. Loading data...
_api           INFO 	Fetching driver list...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for session_status_data. Loading data...
_api           INFO 	Fetching session status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for lap_count. Loading data...
_api           INFO 	Fetching lap count data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for track_status_data. Loading data...
_api           INFO 	Fetching track status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for _extended_timing_data. Loading data...
_api           INFO 	Fetching timing data...
_api           INFO 	Parsing timing data...
req            INFO 	Data has been written to cache!

Processed 2024 São Paulo Grand Prix


req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for driver_info. Loading data...
_api           INFO 	Fetching driver list...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for session_status_data. Loading data...
_api           INFO 	Fetching session status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for lap_count. Loading data...
_api           INFO 	Fetching lap count data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for track_status_data. Loading data...
_api           INFO 	Fetching track status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for _extended_timing_data. Loading data...
_api           INFO 	Fetching timing data...
_api           INFO 	Parsing timing data...
req            INFO 	Data has been written to cache!

Processed 2024 Las Vegas Grand Prix


req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for driver_info. Loading data...
_api           INFO 	Fetching driver list...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for session_status_data. Loading data...
_api           INFO 	Fetching session status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for lap_count. Loading data...
_api           INFO 	Fetching lap count data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for track_status_data. Loading data...
_api           INFO 	Fetching track status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for _extended_timing_data. Loading data...
_api           INFO 	Fetching timing data...
_api           INFO 	Parsing timing data...
req            INFO 	Data has been written to cache!

Processed 2024 Qatar Grand Prix


req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for driver_info. Loading data...
_api           INFO 	Fetching driver list...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for session_status_data. Loading data...
_api           INFO 	Fetching session status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for lap_count. Loading data...
_api           INFO 	Fetching lap count data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for track_status_data. Loading data...
_api           INFO 	Fetching track status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for _extended_timing_data. Loading data...
_api           INFO 	Fetching timing data...
_api           INFO 	Parsing timing data...
req            INFO 	Data has been written to cache!

Processed 2024 Abu Dhabi Grand Prix


In [2]:
stints_2019 = pd.read_csv("stints_data_2019.csv", index_col=None)
stints_2020 = pd.read_csv("stints_data_2020.csv", index_col=None)
stints_2021 = pd.read_csv("stints_data_2021.csv", index_col=None)
stints_2022 = pd.read_csv("stints_data_2022.csv", index_col=None)
stints_2023 = pd.read_csv("stints_data_2023.csv", index_col=None)
stints_2024 = pd.read_csv("stints_data_2024.csv", index_col=None)

stints = pd.concat([stints_2019, stints_2020, stints_2021, stints_2022, stints_2023, stints_2024])
stints

Unnamed: 0,Driver,Stint,Compound,StintLength,Team,StartingPosition,Time,AirTemp,Humidity,Pressure,Rainfall,TrackTemp,WindDirection,WindSpeed,Year,Circuit
0,ALB,1.0,SOFT,14,Toro Rosso,13.0,0 days 01:00:46.617934426,23.477869,70.453279,1015.334426,0.0,41.313115,155.327869,1.166393,2019,Australian Grand Prix
1,ALB,2.0,MEDIUM,43,Toro Rosso,13.0,0 days 01:00:46.617934426,23.477869,70.453279,1015.334426,0.0,41.313115,155.327869,1.166393,2019,Australian Grand Prix
2,BOT,1.0,SOFT,23,Mercedes,2.0,0 days 01:00:46.617934426,23.477869,70.453279,1015.334426,0.0,41.313115,155.327869,1.166393,2019,Australian Grand Prix
3,BOT,2.0,MEDIUM,35,Mercedes,2.0,0 days 01:00:46.617934426,23.477869,70.453279,1015.334426,0.0,41.313115,155.327869,1.166393,2019,Australian Grand Prix
4,GAS,1.0,MEDIUM,37,Red Bull Racing,17.0,0 days 01:00:46.617934426,23.477869,70.453279,1015.334426,0.0,41.313115,155.327869,1.166393,2019,Australian Grand Prix
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1297,VER,1.0,MEDIUM,29,Red Bull Racing,4.0,0 days 01:14:27.206128378,26.768243,51.445946,1017.426351,0.0,31.805405,137.033784,1.900676,2024,Abu Dhabi Grand Prix
1298,VER,2.0,HARD,29,Red Bull Racing,4.0,0 days 01:14:27.206128378,26.768243,51.445946,1017.426351,0.0,31.805405,137.033784,1.900676,2024,Abu Dhabi Grand Prix
1299,ZHO,1.0,MEDIUM,12,Kick Sauber,15.0,0 days 01:14:27.206128378,26.768243,51.445946,1017.426351,0.0,31.805405,137.033784,1.900676,2024,Abu Dhabi Grand Prix
1300,ZHO,2.0,HARD,27,Kick Sauber,15.0,0 days 01:14:27.206128378,26.768243,51.445946,1017.426351,0.0,31.805405,137.033784,1.900676,2024,Abu Dhabi Grand Prix


In [3]:
stints['Compound'].value_counts()

Compound
MEDIUM          2700
HARD            2204
SOFT            1590
INTERMEDIATE     503
WET               83
UNKNOWN           15
Name: count, dtype: int64

In [4]:
def transform_stints_data(df):
    """
    Transform stints data into a format where each row represents a driver's complete race strategy.
    
    Parameters:
    -----------
    df : pandas.DataFrame
        DataFrame containing stints data with columns: Driver, Stint, Compound, StintLength, Team,
        StartingPosition, Time, AirTemp, Humidity, Pressure, Rainfall, TrackTemp, WindDirection,
        WindSpeed, Year, Circuit
        
    Returns:
    --------
    pandas.DataFrame
        Transformed DataFrame with columns: Driver, Year, Circuit, Compounds, StintLengths, Team,
        StartingPosition, AirTemp, Humidity, Pressure, Rainfall, TrackTemp, WindDirection, WindSpeed
    """
    # Group by Driver, Year, and Circuit
    grouped = df.groupby(['Driver', 'Year', 'Circuit'])
    
    # Create the transformed DataFrame
    transformed = pd.DataFrame({
        'Compounds': grouped['Compound'].apply(list),
        'StintLengths': grouped['StintLength'].apply(list),
        'Team': grouped['Team'].first(),
        'StartingPosition': grouped['StartingPosition'].first(),
        'AirTemp': grouped['AirTemp'].first(),
        'Humidity': grouped['Humidity'].first(), 
        'Pressure': grouped['Pressure'].first(),
        'Rainfall': grouped['Rainfall'].first(),
        'TrackTemp': grouped['TrackTemp'].first(),
        'WindDirection': grouped['WindDirection'].first(),
        'WindSpeed': grouped['WindSpeed'].first()
    }).reset_index()

    transformed = transformed[~transformed['Compounds'].apply(lambda compounds: 'UNKNOWN' in compounds)]
    
    return transformed

# Example usage:
transformed_stints = transform_stints_data(stints)
transformed_stints

Unnamed: 0,Driver,Year,Circuit,Compounds,StintLengths,Team,StartingPosition,AirTemp,Humidity,Pressure,Rainfall,TrackTemp,WindDirection,WindSpeed
0,AIT,2020,Sakhir Grand Prix,"[MEDIUM, HARD, MEDIUM, SOFT]","[31, 23, 5, 28]",Williams,17.0,20.947244,58.169291,1015.941732,0.000000,23.469291,122.755906,1.948031
1,ALB,2019,Abu Dhabi Grand Prix,"[MEDIUM, HARD]","[13, 42]",Red Bull Racing,5.0,26.018045,64.471429,1015.383459,0.000000,29.566165,221.097744,1.264662
2,ALB,2019,Australian Grand Prix,"[SOFT, MEDIUM]","[14, 43]",Toro Rosso,13.0,23.477869,70.453279,1015.334426,0.000000,41.313115,155.327869,1.166393
3,ALB,2019,Austrian Grand Prix,"[MEDIUM, HARD]","[35, 35]",Toro Rosso,18.0,34.444068,17.965254,940.888983,0.000000,50.744068,188.855932,1.225424
4,ALB,2019,Azerbaijan Grand Prix,"[SOFT, MEDIUM]","[12, 38]",Toro Rosso,11.0,19.763566,51.408527,1013.775194,0.000000,39.654264,167.480620,1.249612
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2548,ZHO,2024,Saudi Arabian Grand Prix,"[MEDIUM, SOFT]","[41, 8]",Kick Sauber,20.0,25.528082,62.335616,1012.682877,0.000000,31.593151,129.143836,1.302740
2549,ZHO,2024,Singapore Grand Prix,"[HARD, MEDIUM]","[34, 27]",Kick Sauber,20.0,30.786250,74.650000,1008.024375,0.000000,36.445625,192.493750,0.955000
2550,ZHO,2024,Spanish Grand Prix,"[SOFT, MEDIUM, HARD]","[9, 32, 24]",Kick Sauber,15.0,24.132468,63.441558,1001.356494,0.006494,41.096104,207.370130,2.123377
2551,ZHO,2024,São Paulo Grand Prix,"[INTERMEDIATE, INTERMEDIATE, WET, INTERMEDIATE...","[27, 1, 3, 1, 37]",Kick Sauber,19.0,21.662687,85.258706,926.456716,0.582090,25.769652,188.955224,0.735821


In [5]:
filtered_transformed_stints = transformed_stints[transformed_stints['Compounds'].apply(len) <= 4]
set(compound for compounds in filtered_transformed_stints['Compounds'] for compound in compounds)

{'HARD', 'INTERMEDIATE', 'MEDIUM', 'SOFT', 'WET'}

In [6]:
transformed_stints.to_csv(".\\stints_2019-2024.csv")

In [7]:
def prepare_df(df: pd.DataFrame, max_stints: int = 4) -> pd.DataFrame:
    """
    Prepares the DataFrame for modeling:
    1. Filters out entries with more than max_stints.
    2. Pads the 'Compounds' and 'StintLengths' lists.
    3. Explodes these lists into separate columns for each stint.
    """
    # 1. Filter rows with too many stints
    # Ensure original df is not modified if it's used elsewhere
    df_copy = df[df['StintLengths'].map(len) <= max_stints].copy()

    # 2. Pad lists to ensure uniform length
    df_copy['Compounds_padded'] = df_copy['Compounds'].map(
        lambda lst: lst + ['NONE'] * (max_stints - len(lst))
    )
    df_copy['Lengths_padded'] = df_copy['StintLengths'].map(
        lambda lst: lst + [0] * (max_stints - len(lst)) # Using 0 as padding for lengths
    )

    # 3. Explode lists into separate columns for each stint
    for i in range(max_stints):
        df_copy[f'Compound_{i+1}'] = df_copy['Compounds_padded'].str[i]
        df_copy[f'Length_{i+1}']   = df_copy['Lengths_padded'].str[i]
    return df_copy

In [8]:
def build_feature_matrix(df: pd.DataFrame, max_stints: int) -> Tuple[
    pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame, 
    pd.DataFrame, pd.DataFrame, List[LabelEncoder], StandardScaler
]:
    """
    Builds feature and target matrices, splits data, and preprocesses:
    1. Defines feature and target column names based on max_stints.
    2. Splits data into training and testing sets *before* any fitting.
    3. Scales numeric features (StandardScaler fit on train, transform train/test).
    4. Encodes categorical compound targets (LabelEncoder fit on train, transform train/test).
    Returns:
        X_train_scaled, X_test_scaled,
        yc_train_encoded, yc_test_encoded,
        yl_train, yl_test,
        encoders (for compounds), scaler (for features)
    """
    # Define feature columns (ensure these columns exist in df)
    # Your original feature set:
    feature_cols = ['Year','StartingPosition','AirTemp','Humidity',
                    'Pressure','Rainfall','TrackTemp','WindDirection','WindSpeed']
    # Consider adding 'Driver', 'Circuit', 'Team' after appropriate encoding (e.g., OneHotEncoder)
    
    # Define target column names dynamically using max_stints
    compound_target_cols = [f'Compound_{i+1}' for i in range(max_stints)]
    length_target_cols = [f'Length_{i+1}' for i in range(max_stints)]

    # Ensure all specified columns are present in the DataFrame
    missing_cols = [col for col in feature_cols + compound_target_cols + length_target_cols if col not in df.columns]
    if missing_cols:
        raise ValueError(f"Missing columns in DataFrame: {missing_cols}")

    X = df[feature_cols]
    y_comp_raw = df[compound_target_cols].copy() # Raw compound strings
    y_len = df[length_target_cols].copy()        # Lengths are already numeric

    # Split data *before* any fitting to prevent data leakage
    X_train, X_test, \
    yc_train_raw, yc_test_raw, \
    yl_train, yl_test = train_test_split(
        X, y_comp_raw, y_len,
        test_size=0.20,
        random_state=72,
        shuffle=True
    )

    # --- Preprocess Features (Scaling) ---
    scaler = StandardScaler()
    # Fit scaler ONLY on training data's numeric features
    X_train_scaled = pd.DataFrame(
        scaler.fit_transform(X_train), 
        columns=X_train.columns, 
        index=X_train.index
    )
    # Transform test data using the SAME fitted scaler
    X_test_scaled = pd.DataFrame(
        scaler.transform(X_test), 
        columns=X_test.columns, 
        index=X_test.index
    )

    # --- Preprocess Compound Targets (Label Encoding) ---
    encoders: List[LabelEncoder] = []
    yc_train_encoded = yc_train_raw.copy()
    yc_test_encoded = yc_test_raw.copy()

    all_compounds = ['SOFT', 'MEDIUM', 'HARD', 'NONE', 'WET', 'INTERMEDIATE']
    for col in compound_target_cols:
        le = LabelEncoder()
        # Fit LabelEncoder ONLY on the training data for this target column
        le.fit(all_compounds)
        
        # Transform both training and testing data for this column
        yc_train_encoded[col] = le.transform(yc_train_raw[col])
        yc_test_encoded[col] = le.transform(yc_test_raw[col])
        
        encoders.append(le)
    
    return X_train_scaled, X_test_scaled, yc_train_encoded, yc_test_encoded, yl_train, yl_test, encoders, scaler

In [9]:
def train_and_eval(X_train: pd.DataFrame, X_test: pd.DataFrame,
                   yc_train: pd.DataFrame, yc_test: pd.DataFrame,  # Encoded compound targets
                   yl_train: pd.DataFrame, yl_test: pd.DataFrame,  # Stint length targets
                   encoders: List[LabelEncoder]  # <-- New argument: list of fitted LabelEncoders for compounds
                   ) -> tuple: # Adjusted return type hint if you want to be specific
    """
    Trains compound and stint length models, evaluates them, 
    and creates DataFrames for side-by-side comparison of predictions and actuals.

    Args:
        X_train: Training features.
        X_test: Testing features.
        yc_train: Encoded training compound labels.
        yc_test: Encoded testing compound labels.
        yl_train: Training stint lengths.
        yl_test: Testing stint lengths.
        encoders: List of fitted LabelEncoder objects for each compound target column.

    Returns:
        A tuple containing:
            - clf_model: Trained multi-output classifier for compounds.
            - reg_model: Trained multi-output regressor for stint lengths.
            - compounds_comparison_df: DataFrame comparing actual and predicted compounds (decoded).
            - lengths_comparison_df: DataFrame comparing actual and predicted stint lengths.
    """
    # 1) Compounds model (RandomForestClassifier)
    clf_model = MultiOutputClassifier(RandomForestClassifier(
        n_estimators=100, random_state=72, n_jobs=-1
    ))
    clf_model.fit(X_train, yc_train)
    # yc_pred will contain encoded predictions
    yc_pred = pd.DataFrame(clf_model.predict(X_test), columns=yc_test.columns, index=X_test.index)

    # 2) Lengths model (RandomForestRegressor)
    reg_model = MultiOutputRegressor(RandomForestRegressor(
        n_estimators=100, random_state=72, n_jobs=-1
    ))
    reg_model.fit(X_train, yl_train)
    # yl_pred will contain float predictions
    yl_pred = pd.DataFrame(reg_model.predict(X_test), columns=yl_test.columns, index=X_test.index)

    # 3) Metrics
    # Compounds: overall accuracy on encoded labels
    comp_acc = np.mean([
        accuracy_score(yc_test[col], yc_pred[col])
        for col in yc_test.columns
    ])
    # Lengths: mean absolute error
    length_mae = mean_absolute_error(yl_test, yl_pred)

    print(f"Compound accuracy (avg over slots, encoded): {comp_acc:.3f}")
    print(f"Length MAE (all slots): {length_mae:.3f}")

    # 4) Create Comparison DataFrames

    # --- Compounds Comparison (Decoded) ---
    # `encoders` list should correspond to the order of columns in yc_test/yc_pred
    compound_comparison_cols = {}
    for i, col_name in enumerate(yc_test.columns):
        encoder = encoders[i] # Get the specific encoder for this compound stint
        compound_comparison_cols[f'{col_name}_Actual'] = encoder.inverse_transform(yc_test[col_name])
        compound_comparison_cols[f'{col_name}_Predicted'] = encoder.inverse_transform(yc_pred[col_name])
    
    compounds_comparison_df = pd.DataFrame(compound_comparison_cols, index=X_test.index)
    # Reorder columns for better side-by-side view if many stints
    ordered_compound_cols = []
    for col_name in yc_test.columns: # e.g., Compound_1, Compound_2
        ordered_compound_cols.append(f'{col_name}_Actual')
        ordered_compound_cols.append(f'{col_name}_Predicted')
    compounds_comparison_df = compounds_comparison_df[ordered_compound_cols]


    # --- Stint Lengths Comparison ---
    # Round predicted lengths to the nearest integer for better comparison
    yl_pred_rounded = yl_pred.round().astype(int)

    length_comparison_cols = {}
    for col_name in yl_test.columns: # e.g., Length_1, Length_2
        length_comparison_cols[f'{col_name}_Actual'] = yl_test[col_name]
        length_comparison_cols[f'{col_name}_Predicted'] = yl_pred_rounded[col_name]
        
    lengths_comparison_df = pd.DataFrame(length_comparison_cols, index=X_test.index)
    # Reorder columns
    ordered_length_cols = []
    for col_name in yl_test.columns:
        ordered_length_cols.append(f'{col_name}_Actual')
        ordered_length_cols.append(f'{col_name}_Predicted')
    lengths_comparison_df = lengths_comparison_df[ordered_length_cols]


    # Display the head of the comparison DataFrames
    # print("\n--- Compound Predictions vs Actual (Decoded) ---")
    # print(compounds_comparison_df.head())
    # print("\n--- Stint Length Predictions vs Actual ---")
    # print(lengths_comparison_df.head())
    
    return clf_model, reg_model, compounds_comparison_df, lengths_comparison_df

In [10]:
MAX_STINTS = 4 # Define this once

df_prepared = prepare_df(transformed_stints, max_stints=MAX_STINTS)

# Now, build_feature_matrix also takes max_stints
X_train, X_test, \
yc_train, yc_test, \
yl_train, yl_test, \
compound_encoders, feature_scaler = build_feature_matrix(df_prepared, max_stints=MAX_STINTS)

# Train and evaluate
clf_model, reg_model, compounds_comp_df, lengths_comp_df = train_and_eval(
    X_train, X_test, 
    yc_train, yc_test, 
    yl_train, yl_test,
    compound_encoders  # <-- Pass the encoders here
)

Compound accuracy (avg over slots, encoded): 0.682
Length MAE (all slots): 6.298


In [11]:
def simplify_compound_columns(df: pd.DataFrame) -> pd.DataFrame:
    actual_cols = [col for col in df.columns if "Actual" in col]
    predicted_cols = [col for col in df.columns if "Predicted" in col]

    df["Compound_Actual"] = df[actual_cols].values.tolist()
    df["Compound_Predicted"] = df[predicted_cols].values.tolist()

    return df[["Compound_Actual", "Compound_Predicted"]]

simplified_df = simplify_compound_columns(compounds_comp_df)
simplified_df


Unnamed: 0,Compound_Actual,Compound_Predicted
1100,"[HARD, MEDIUM, MEDIUM, NONE]","[HARD, MEDIUM, SOFT, NONE]"
795,"[HARD, MEDIUM, NONE, NONE]","[MEDIUM, HARD, SOFT, NONE]"
219,"[MEDIUM, HARD, HARD, NONE]","[MEDIUM, HARD, HARD, NONE]"
2461,"[MEDIUM, HARD, NONE, NONE]","[MEDIUM, HARD, NONE, NONE]"
837,"[MEDIUM, HARD, MEDIUM, NONE]","[MEDIUM, HARD, NONE, NONE]"
...,...,...
564,"[MEDIUM, HARD, SOFT, NONE]","[MEDIUM, HARD, MEDIUM, NONE]"
1876,"[MEDIUM, HARD, NONE, NONE]","[MEDIUM, HARD, NONE, NONE]"
2443,"[MEDIUM, HARD, NONE, NONE]","[MEDIUM, HARD, SOFT, NONE]"
1836,"[SOFT, HARD, MEDIUM, SOFT]","[SOFT, MEDIUM, HARD, NONE]"


In [12]:
def simplify_lengths_columns(df: pd.DataFrame) -> pd.DataFrame:
    actual_cols = [col for col in df.columns if "Actual" in col]
    predicted_cols = [col for col in df.columns if "Predicted" in col]
    
    # Sum the actual and predicted columns
    df["Length_Actual"] = df[actual_cols].sum(axis=1)
    df["Length_Predicted"] = df[predicted_cols].sum(axis=1)
    # Calculate the difference directly from the summed columns
    df["Sum_difference"] = df["Length_Actual"] - df["Length_Predicted"]
    
    return df[["Length_Actual", "Length_Predicted", "Sum_difference"]]

# simplified_lengths_df = simplify_lengths_columns(lengths_comp_df)
# simplified_lengths_df
lengths_comp_df.dtypes


Length_1_Actual       int64
Length_1_Predicted    int64
Length_2_Actual       int64
Length_2_Predicted    int64
Length_3_Actual       int64
Length_3_Predicted    int64
Length_4_Actual       int64
Length_4_Predicted    int64
dtype: object