En este Jupyter Notebook construiremos un modelo que nos permita predecir quién ganará una carrera y en qué posición quedará cada piloto.

In [100]:
import sys
sys.path.append('../')

from src.race_models_support import *

In [1]:
import pandas as pd
import numpy as np
import requests

---

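Primero consultamos la clasificación de pilotos de una temporada mediante la API de Jolpica (compatible con Ergast).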

In [None]:
# Season whose driver standings are requested
season = 2023

# Driver-standings endpoint of the Jolpica API for that season
url = f'http://api.jolpi.ca/ergast/f1/{season}/driverstandings/'

# The timeout keeps the cell from hanging indefinitely if the API never answers
response = requests.get(url, timeout=30)
print(response.status_code)

# Stop here with a clear HTTP error rather than a confusing JSON decoding
# error on the next line when the API answers with an error page
response.raise_for_status()

content = response.json()

Obtener los circuitos de cada temporada

In [20]:
from fastf1.ergast import Ergast
import fastf1

ergast = Ergast()

---

Vamos a obtener los datos para crear el modelo predictivo

Cargamos los resultados de una sesión de carrera. No necesitamos la telemetría ni los datos meteorológicos.

In [101]:
session = fastf1.get_session(2014, 1, 'R')

In [15]:
def get_race_results(session, margin = 100):
    """
    Retrieves race results and processes the data to standardize timing and statuses.

    The first row of ``session.results`` is treated as the winner and its time is
    set to 0. Every driver whose status is not exactly 'Finished' (retirements,
    but also any other status text) receives the slowest 'Finished' time plus
    ``margin``, so the 'Time' column contains no missing values as long as at
    least one driver finished.

    Parameters
    -----------
    - session (fastf1.core.Session): The FastF1 session object containing race data, including results.
    - margin (int, optional): The time margin (in seconds) added to non-finishers' times. Defaults to 100.

    Returns
    --------
    - (pd.DataFrame): A DataFrame containing processed race results with the following columns:
        - 'DriverId': Identifier for the driver.
        - 'TeamId': Identifier for the team.
        - 'Position': Final position of the driver.
        - 'GridPosition': Starting grid position of the driver.
        - 'Time': Race time in seconds (adjusted for winners and non-finishers).
        - 'Status': Status of the driver (e.g., 'Finished', 'Retired').
        - 'Points': Points scored by the driver.
      If the session has no result rows, the empty selection is returned unchanged.
    """

    columns = ['DriverId', 'TeamId', 'Position', 'GridPosition', 'Time', 'Status', 'Points']

    # Explicit copy: the edits below must never reach the session's own results table
    results = session.results.loc[:, columns].copy()

    # Without rows there is no winner to index and nothing to normalise
    if results.empty:
        return results

    # Fix winner time to 0 and convert to seconds
    # (assumes the first row is the race winner -- TODO confirm the results are sorted by position)
    results.iloc[0, results.columns.get_loc('Time')] = pd.Timedelta(0)
    results['Time'] = results['Time'].dt.total_seconds()

    # Fix non-finishers time to avoid NaT
    finished = results['Status'] == 'Finished'
    max_finished_time = results.loc[finished, 'Time'].max()
    results.loc[~finished, 'Time'] = max_finished_time + margin

    return results


In [188]:
from tqdm import tqdm
from time import sleep

In [203]:
season

2022

In [204]:
rnd

21

In [None]:
from ratelimit import limits, sleep_and_retry

# Limit: 500 calls per hour.
# `sleep_and_retry` must be the OUTER decorator: it catches the exception that
# `limits` raises once the quota is used up and sleeps until the period ends.
# With `limits` on the outside that exception is raised outside the retry
# wrapper and reaches the caller instead of triggering the wait.
@sleep_and_retry
@limits(calls=500, period=3600)
def llamada_api():
    """Placeholder for the real API call, throttled to 500 calls per hour."""
    # The code performing the actual API request goes here
    print("Llamada realizada")
    # For now the real call is only simulated by the print above

# Run the calls: exactly 500 of them
for i in range(500):
    llamada_api()


In [213]:
import fastf1

# Sessions that loaded successfully, keyed by (season, round)
sesiones = {}

# Seasons and rounds to visit; only rounds 1 to 3 of each season are requested here
seasons = [2023, 2024]
rounds = [*range(1, 4)]

# Walk every (season, round) pair, season by season
for season, rnd in [(year, number) for year in seasons for number in rounds]:
    try:
        print(f"Cargando temporada {season}, ronda {rnd}...")
        session = fastf1.get_session(season, rnd, 'R')
        session.load(telemetry=False, weather=False)
        # Keep the loaded session for later use
        sesiones[(season, rnd)] = session
    except Exception as error:
        # Best effort: report the failure and carry on with the next race
        print(f"No se pudo cargar la temporada {season}, ronda {rnd}: {error}")

print("Carga completa.")


req            INFO 	No cached data found for season_schedule. Loading data...
_api           INFO 	Fetching season schedule...


Cargando temporada 2023, ronda 1...




No se pudo cargar la temporada 2023, ronda 1: Failed to load any schedule data.
Cargando temporada 2023, ronda 2...


req            INFO 	No cached data found for season_schedule. Loading data...
_api           INFO 	Fetching season schedule...


No se pudo cargar la temporada 2023, ronda 2: Failed to load any schedule data.
Cargando temporada 2023, ronda 3...


req            INFO 	No cached data found for season_schedule. Loading data...
_api           INFO 	Fetching season schedule...


No se pudo cargar la temporada 2023, ronda 3: Failed to load any schedule data.
Cargando temporada 2024, ronda 1...


req            INFO 	Using cached data for season_schedule


No se pudo cargar la temporada 2024, ronda 1: Invalid round: 1
Cargando temporada 2024, ronda 2...


req            INFO 	Using cached data for season_schedule


No se pudo cargar la temporada 2024, ronda 2: Invalid round: 2
Cargando temporada 2024, ronda 3...


req            INFO 	Using cached data for season_schedule


No se pudo cargar la temporada 2024, ronda 3: Invalid round: 3
Carga completa.


In [209]:
sesiones

{}

In [202]:
# Per-race result tables and per-season schedule tables are collected in lists
# and concatenated once at the end; concatenating inside the loop would copy
# every previously collected row again on each iteration
results_frames = []
races_frames = []

for year in range(2021, 2024):

    # Race schedule of the season, reduced to the identifying columns
    races = ergast.get_race_schedule(year)
    races = races.loc[:, ['season', 'round', 'circuitId']]

    # Extra per-race information, kept separately from the driver results
    info = {'weather': [], 'yellows': [], 'reds': [], 'sc': [], 'vsc': []}

    # Within a season, iterate race by race
    for race in tqdm(races.itertuples(), total=len(races)):

        # Season and round identify the session to load
        season = race.season
        rnd = race.round

        # Load the race session (no telemetry, no weather data)
        session = fastf1.get_session(season, rnd, 'R')
        session.load(telemetry=False, weather=False)

        # Processed results of this race
        results = get_race_results(session)

        # Tag every result row with the race it belongs to
        results['season'] = season
        results['round'] = rnd
        results['circuitId'] = race.circuitId

        results_frames.append(results)

        # Presumably appends one value per key of `info` for this race --
        # TODO confirm against get_extra_info
        get_extra_info(session, info)

    info_df = pd.DataFrame(info)

    # Attach the extra information column-wise; rows are matched on the index,
    # which assumes the schedule uses the default 0..n-1 index -- TODO confirm
    races = pd.concat([races, info_df], axis=1)
    races_frames.append(races)

# Build the final tables in a single concat each (empty frame if nothing was collected)
results_df = pd.concat(results_frames) if results_frames else pd.DataFrame()
races_df = pd.concat(races_frames) if races_frames else pd.DataFrame()


0it [00:00, ?it/s]core           INFO 	Loading data for Bahrain Grand Prix - Race [v3.4.4]
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info
req            INFO 	Using cached data for session_status_data
req            INFO 	Using cached data for lap_count
req            INFO 	Using cached data for track_status_data
req            INFO 	Using cached data for _extended_timing_data
req            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...
req            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 20 drivers: ['44', '33', '77', '4', '11', '16', '3', '55', '22', '18', '7', '99', '31', '63', '5', '47', '10', '6', '14', '9']
1it [00:05,  5.45s/it]core           INFO 	Loading data for Emilia Romagna Grand Prix - Race [v3.4.4]
req            INFO 	No cached data found for session_info. Loading data...
_api           INFO 	Fetching ses

DataNotLoadedError: The data you are trying to access has not been loaded yet. See `Session.load`

In [189]:
# Drop the per-race context columns from the driver results.
# errors='ignore': a results_df built from the current get_race_results output
# no longer contains these columns, and a plain drop would raise KeyError.
df_aux = results_df.drop(
    columns=['weather', 'yellows', 'reds', 'safety_cars', 'virtual_safety_cars'],
    errors='ignore',
)

In [200]:
races_df

Unnamed: 0,season,round,circuitId,weather,yellows,reds,sc,vsc
0,2020,1,red_bull_ring,dry,7,0,3,0
1,2020,2,red_bull_ring,dry,1,0,1,0
2,2020,3,hungaroring,dry,4,0,0,0
3,2020,4,silverstone,dry,3,0,2,0
4,2020,5,silverstone,dry,2,0,0,0
5,2020,6,catalunya,dry,1,0,0,0
6,2020,7,spa,dry,4,0,1,0
7,2020,8,monza,dry,3,1,2,0
8,2020,9,mugello,dry,3,2,3,0
9,2020,10,sochi,dry,2,0,1,1


In [198]:
df_aux

Unnamed: 0,DriverId,TeamId,Position,GridPosition,Time,Status,Points,season,round,circuitId,DriverPointsCumulative,TeamPointsCumulative
77,bottas,mercedes,1.0,1.0,0.000,Finished,25.0,2020,1,red_bull_ring,25.0,25.0
33,max_verstappen,red_bull,20.0,2.0,131.650,Electronics,0.0,2020,1,red_bull_ring,0.0,0.0
3,ricciardo,renault,19.0,10.0,131.650,Overheating,0.0,2020,1,red_bull_ring,0.0,0.0
18,stroll,racing_point,18.0,9.0,131.650,Engine,0.0,2020,1,red_bull_ring,0.0,0.0
20,kevin_magnussen,haas,17.0,16.0,131.650,Brakes,0.0,2020,1,red_bull_ring,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
77,bottas,mercedes,2.0,2.0,15.976,Finished,18.0,2020,17,yas_marina,223.0,573.0
33,max_verstappen,red_bull,1.0,1.0,0.000,Finished,25.0,2020,17,yas_marina,214.0,319.0
51,pietro_fittipaldi,haas,19.0,17.0,202.738,+2 Laps,0.0,2020,17,yas_marina,0.0,3.0
31,ocon,renault,9.0,10.0,101.069,Finished,2.0,2020,17,yas_marina,62.0,181.0


In [197]:
results_df

Unnamed: 0,DriverId,TeamId,Position,GridPosition,Time,Status,Points,weather,yellows,reds,safety_cars,virtual_safety_cars,season,round,circuitId,DriverPointsCumulative,TeamPointsCumulative
77,bottas,mercedes,1.0,1.0,0.000,Finished,25.0,dry,7,0,3,0,2020,1,red_bull_ring,25.0,25.0
33,max_verstappen,red_bull,20.0,2.0,131.650,Electronics,0.0,dry,7,0,3,0,2020,1,red_bull_ring,0.0,0.0
3,ricciardo,renault,19.0,10.0,131.650,Overheating,0.0,dry,7,0,3,0,2020,1,red_bull_ring,0.0,0.0
18,stroll,racing_point,18.0,9.0,131.650,Engine,0.0,dry,7,0,3,0,2020,1,red_bull_ring,0.0,0.0
20,kevin_magnussen,haas,17.0,16.0,131.650,Brakes,0.0,dry,7,0,3,0,2020,1,red_bull_ring,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
77,bottas,mercedes,2.0,2.0,15.976,Finished,18.0,dry,1,0,1,1,2020,17,yas_marina,223.0,573.0
33,max_verstappen,red_bull,1.0,1.0,0.000,Finished,25.0,dry,1,0,1,1,2020,17,yas_marina,214.0,319.0
51,pietro_fittipaldi,haas,19.0,17.0,202.738,+2 Laps,0.0,dry,1,0,1,1,2020,17,yas_marina,0.0,3.0
31,ocon,renault,9.0,10.0,101.069,Finished,2.0,dry,1,0,1,1,2020,17,yas_marina,62.0,181.0


In [190]:
# Merge with races_df (all seasons). `races` only holds the schedule of the
# last season processed by the collection loop, so an inner join against it
# would silently discard the results of every other season.
df_aux.merge(right=races_df, how='inner', on=['season', 'round', 'circuitId'])

Unnamed: 0,DriverId,TeamId,Position,GridPosition,Time,Status,Points,season,round,circuitId,DriverPointsCumulative,TeamPointsCumulative,weather,yellows,reds,sc,vsc
0,bottas,mercedes,1.0,1.0,0.000,Finished,25.0,2020,1,red_bull_ring,25.0,25.0,dry,7,0,3,0
1,max_verstappen,red_bull,20.0,2.0,131.650,Electronics,0.0,2020,1,red_bull_ring,0.0,0.0,dry,7,0,3,0
2,ricciardo,renault,19.0,10.0,131.650,Overheating,0.0,2020,1,red_bull_ring,0.0,0.0,dry,7,0,3,0
3,stroll,racing_point,18.0,9.0,131.650,Engine,0.0,2020,1,red_bull_ring,0.0,0.0,dry,7,0,3,0
4,kevin_magnussen,haas,17.0,16.0,131.650,Brakes,0.0,2020,1,red_bull_ring,0.0,0.0,dry,7,0,3,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
335,bottas,mercedes,2.0,2.0,15.976,Finished,18.0,2020,17,yas_marina,223.0,573.0,dry,1,0,1,1
336,max_verstappen,red_bull,1.0,1.0,0.000,Finished,25.0,2020,17,yas_marina,214.0,319.0,dry,1,0,1,1
337,pietro_fittipaldi,haas,19.0,17.0,202.738,+2 Laps,0.0,2020,17,yas_marina,0.0,3.0,dry,1,0,1,1
338,ocon,renault,9.0,10.0,101.069,Finished,2.0,2020,17,yas_marina,62.0,181.0,dry,1,0,1,1


In [None]:
def features(results, verbose=True):
    """
    Adds season-to-date cumulative points per driver and per team.

    Parameters
    -----------
    - results (pd.DataFrame): Race results with at least the columns 'season',
      'round', 'DriverId', 'TeamId' and 'Points'. It is not modified.
    - verbose (bool, optional): If True, prints the first rows of the key
      columns as a quick visual check. Defaults to True.

    Returns
    --------
    - (pd.DataFrame): A copy of `results` sorted by season and round, with two
      extra columns:
        - 'DriverPointsCumulative': running total of 'Points' per season and driver.
        - 'TeamPointsCumulative': running total of 'Points' per season and team.
    """

    # Sort chronologically so the running totals accumulate in race order;
    # sort_values returns a new frame, so the caller's frame stays untouched
    results = results.sort_values(by=['season', 'round'], ascending=[True, True])

    # Running total of points per driver within each season
    results['DriverPointsCumulative'] = results.groupby(['season', 'DriverId'])['Points'].cumsum()

    # Running total of points per team within each season
    results['TeamPointsCumulative'] = results.groupby(['season', 'TeamId'])['Points'].cumsum()

    # Optional quick look at the first rows
    if verbose:
        print(results[['season', 'round', 'DriverId', 'TeamId', 'Points', 'DriverPointsCumulative', 'TeamPointsCumulative']].head())

    # The enriched frame must be returned: the work is done on a sorted copy,
    # so without this the computed columns would be lost
    return results


In [89]:
# Sort chronologically (season first, then round) so the running totals
# accumulate in race order even when several seasons are present
results_df = results_df.sort_values(by=['season', 'round'], ascending=True)

# Running total of points per driver, restarted every season
results_df['DriverPointsCumulative'] = results_df.groupby(['season', 'DriverId'])['Points'].cumsum()

# Running total of points per team, restarted every season
results_df['TeamPointsCumulative'] = results_df.groupby(['season', 'TeamId'])['Points'].cumsum()

# Display the result to check it
results_df

Unnamed: 0,DriverId,TeamId,Position,GridPosition,Time,Status,Points,weather,yellows,reds,safety_cars,virtual_safety_cars,season,round,circuitId,DriverPointsCumulative,TeamPointsCumulative
77,bottas,mercedes,1.0,1.0,0.000,Finished,25.0,dry,7,0,3,0,2020,1,red_bull_ring,25.0,25.0
33,max_verstappen,red_bull,20.0,2.0,131.650,Electronics,0.0,dry,7,0,3,0,2020,1,red_bull_ring,0.0,0.0
3,ricciardo,renault,19.0,10.0,131.650,Overheating,0.0,dry,7,0,3,0,2020,1,red_bull_ring,0.0,0.0
18,stroll,racing_point,18.0,9.0,131.650,Engine,0.0,dry,7,0,3,0,2020,1,red_bull_ring,0.0,0.0
20,kevin_magnussen,haas,17.0,16.0,131.650,Brakes,0.0,dry,7,0,3,0,2020,1,red_bull_ring,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
77,bottas,mercedes,2.0,2.0,15.976,Finished,18.0,dry,1,0,1,1,2020,17,yas_marina,223.0,573.0
33,max_verstappen,red_bull,1.0,1.0,0.000,Finished,25.0,dry,1,0,1,1,2020,17,yas_marina,214.0,319.0
51,pietro_fittipaldi,haas,19.0,17.0,202.738,+2 Laps,0.0,dry,1,0,1,1,2020,17,yas_marina,0.0,3.0
31,ocon,renault,9.0,10.0,101.069,Finished,2.0,dry,1,0,1,1,2020,17,yas_marina,62.0,181.0


In [234]:
import os 
# Para ver path actual
os.listdir('../data/')

['test', 'output']

Guardamos el fichero

In [238]:
results_df.to_csv('../data/test/results_test.csv')

Cargamos el fichero para preprocesar

In [241]:
df = pd.read_csv('../data/test/results_test.csv', index_col=0)

In [242]:
df

Unnamed: 0,DriverId,TeamId,Position,GridPosition,Time,Status,Points,weather,yellows,reds,safety_cars,virtual_safety_cars,season,round,circuitId
44,hamilton,mercedes,1.0,2.0,0.000,Finished,25.0,dry,4,0,1,1,2021,1,bahrain
33,max_verstappen,red_bull,2.0,1.0,0.745,Finished,18.0,dry,4,0,1,1,2021,1,bahrain
77,bottas,mercedes,3.0,3.0,37.383,Finished,16.0,dry,4,0,1,1,2021,1,bahrain
4,norris,mclaren,4.0,7.0,46.466,Finished,12.0,dry,4,0,1,1,2021,1,bahrain
11,perez,red_bull,5.0,0.0,52.047,Finished,10.0,dry,4,0,1,1,2021,1,bahrain
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
47,mick_schumacher,haas,16.0,15.0,168.774,+1 Lap,0.0,dry,2,0,0,1,2022,20,rodriguez
20,kevin_magnussen,haas,17.0,19.0,168.774,+1 Lap,0.0,dry,2,0,0,1,2022,20,rodriguez
6,latifi,williams,18.0,18.0,168.774,+2 Laps,0.0,dry,2,0,0,1,2022,20,rodriguez
14,alonso,alpine,19.0,9.0,168.774,Engine,0.0,dry,2,0,0,1,2022,20,rodriguez


In [246]:
# Data processing  
# -----------------------------------------------------------------------
import pandas as pd
import numpy as np
import pickle

# Pandas options  
# -----------------------------------------------------------------------
pd.options.display.max_colwidth = None

# Path configuration for custom module imports  
# -----------------------------------------------------------------------
import sys  
sys.path.append('../')  # Adds the parent directory to the path for custom module imports  

# Ignore warnings  
# -----------------------------------------------------------------------
import warnings  
warnings.filterwarnings("ignore") 

# Machine learning imports
# -----------------------------------------------------------------------
from sklearn.preprocessing import StandardScaler

# Custom functions and classes
# -----------------------------------------------------------------------
from src.preprocess_support import Encoding

In [248]:
df.select_dtypes(include='number').columns

Index(['Position', 'GridPosition', 'Time', 'Points', 'yellows', 'reds',
       'safety_cars', 'virtual_safety_cars', 'season', 'round'],
      dtype='object')

Por el momento no vamos a escalar nada

In [None]:
# StandardScaler
# Columns to standardise; intentionally empty for now (nothing is scaled yet)
numeric_features = []

numeric_transformer = StandardScaler()

# Only scale when there is something to scale: fitting the scaler on a
# selection with zero columns makes scikit-learn raise a ValueError
if numeric_features:
    scaled_data = numeric_transformer.fit_transform(df[numeric_features])
    df[numeric_features] = scaled_data

### Encoding

In [260]:
df.dropna(inplace=True)

In [261]:
df.select_dtypes(include='O').columns

Index(['DriverId', 'TeamId', 'Status', 'weather', 'circuitId'], dtype='object')

In [273]:
# Columns encoded ordinally; each gets its category list in order of first
# appearance in df (presumably Encoding assigns codes in that order -- TODO confirm)
ordinal_columns = ('circuitId', 'Status')

# Encoding strategy per column, grouped by method
encoding_methods = dict(
    onehot=['TeamId', 'weather'],
    target=['DriverId'],
    ordinal={column: df[column].unique().tolist() for column in ordinal_columns},
    frequency=[],
)

# 'Position' is passed as the third argument (presumably the target variable -- TODO confirm)
encoder = Encoding(df, encoding_methods, 'Position')

In [274]:
df_preprocessed = encoder.execute_all_encodings()

In [275]:
df_preprocessed

Unnamed: 0,DriverId,Position,GridPosition,Time,Points,yellows,reds,safety_cars,virtual_safety_cars,season,...,TeamId_haas,TeamId_mclaren,TeamId_mercedes,TeamId_red_bull,TeamId_williams,weather_dry,weather_mixed,weather_wet,circuitId,Status
0,5.075382,1.0,2.0,0.000,25.0,4,0,1,1,2021,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,4.496650,2.0,1.0,0.745,18.0,4,0,1,1,2021,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
2,9.726671,3.0,3.0,37.383,16.0,4,0,1,1,2021,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,7.969041,4.0,7.0,46.466,12.0,4,0,1,1,2021,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,6.880219,5.0,0.0,52.047,10.0,4,0,1,1,2021,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
823,15.050453,16.0,15.0,168.774,0.0,2,0,0,1,2022,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,16.0,1.0
824,12.116546,17.0,19.0,168.774,0.0,2,0,0,1,2022,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,16.0,1.0
825,15.278210,18.0,18.0,168.774,0.0,2,0,0,1,2022,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,16.0,6.0
826,10.313456,19.0,9.0,168.774,0.0,2,0,0,1,2022,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,16.0,14.0
