En este Jupyter Notebook construiremos un modelo que nos permita predecir quién ganará una carrera y en qué posición terminará cada piloto.

In [1]:
import pandas as pd
import numpy as np

import fastf1
from fastf1.ergast import Ergast

from tqdm import tqdm
import os

import sys
sys.path.append('../')

from src.race_prediction_model.extract import *
from src.race_prediction_model.feature_engineering import add_features_to_results

In [2]:
# Reduce FastF1 logging verbosity by setting the library's log level to 'ERROR'.
fastf1.set_log_level('ERROR')

Extracción de datos

In [64]:
# Read the races table from ../data/output/races.csv (path is relative to the notebook).
df_races = pd.read_csv('../data/output/races.csv')

In [65]:
# Display the races table; the rendered output below is abbreviated with "..." rows.
df_races

Unnamed: 0,season,round,circuitId
0,2010,1,bahrain
1,2010,2,albert_park
2,2010,3,sepang
3,2010,4,shanghai
4,2010,5,catalunya
...,...,...,...
300,2024,20,rodriguez
301,2024,21,interlagos
302,2024,22,vegas
303,2024,23,losail


Tenemos 305 carreras desde 2010 hasta 2024. Extraemos los resultados; en esta ejecución de prueba solo se procesan las cinco primeras carreras (`df_races.head()`).

In [66]:
# Extract results only for the rows returned by df_races.head(), not for the whole table.
# The helper returns two values: the results frame and `sessions`.
# The log output of this run reports the results being saved to ../data/output/results.csv.
results_final_df, sessions = extract_results_dataframe(df_races.head())

Processing results.: 0it [00:00, ?it/s]

Loading 2010 season. Round: 1...


Processing results.: 1it [00:04,  4.56s/it]

Loading 2010 season. Round: 2...


Processing results.: 2it [00:06,  2.84s/it]

Loading 2010 season. Round: 3...


Processing results.: 3it [00:07,  2.20s/it]

Loading 2010 season. Round: 4...
Call limit reached. Waiting...


Processing results.: 4it [01:09, 25.61s/it]

Loading 2010 season. Round: 5...


Processing results.: 5it [01:11, 14.21s/it]

Saving results in ../data/output/results.csv





In [67]:
# Display the extracted results frame returned by the previous cell.
results_final_df

Unnamed: 0_level_0,DriverId,TeamId,Position,GridPosition,Time,Status,Points,season,round,circuitId
DriverNumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
8,alonso,ferrari,1.0,3.0,0.000,Finished,25.0,2010,1,bahrain
7,massa,ferrari,2.0,2.0,16.099,Finished,18.0,2010,1,bahrain
2,hamilton,mclaren,3.0,4.0,23.182,Finished,15.0,2010,1,bahrain
5,vettel,red_bull,4.0,1.0,38.799,Finished,12.0,2010,1,bahrain
4,rosberg,mercedes,5.0,5.0,40.213,Finished,10.0,2010,1,bahrain
...,...,...,...,...,...,...,...,...,...,...
16,buemi,toro_rosso,20.0,14.0,173.677,Hydraulics,0.0,2010,5,catalunya
20,chandhok,hrt,21.0,24.0,173.677,Suspension,0.0,2010,5,catalunya
22,rosa,sauber,22.0,12.0,173.677,Collision,0.0,2010,5,catalunya
21,bruno_senna,hrt,23.0,21.0,173.677,Accident,0.0,2010,5,catalunya


Carga de datos

In [68]:
# Reload both tables from disk, using the first CSV column as the index in each.
# The generator is consumed in order, so races is read first and results second.
df_races, df_results = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (
        '../data/output/races.csv',
        '../data/output/results.csv',
    )
)

Unimos carreras y resultados

In [69]:
# Inner-join results (left) with races (right) on the race identifier keys,
# keeping only rows whose (season, round, circuitId) appear in both tables.
full_dataset = pd.merge(
    left=df_results,
    right=df_races,
    how='inner',
    on=['season', 'round', 'circuitId'],
)

Guardamos el dataset unido en una carpeta de pruebas (`../data/test/`) y lo volvemos a cargar desde ahí.

In [70]:
# Write the merged dataset to ../data/test/results_test.csv without the index column.
full_dataset.to_csv('../data/test/results_test.csv', index=False)

In [71]:
# Load the merged dataset back from the CSV written in the previous cell.
df = pd.read_csv('../data/test/results_test.csv')

---

### Feature engineering

Aquí añadiremos algunas variables que puedan mejorar el rendimiento de nuestros modelos.

In [40]:
# Add the engineered feature columns to the results frame.
# The return value is not captured, so this cell relies on the helper modifying
# `df` in place. NOTE(review): confirm against add_features_to_results.
add_features_to_results(df)

# Display the frame after the feature-engineering step.
df

Unnamed: 0,DriverId,TeamId,Position,GridPosition,Time,Status,Points,season,round,circuitId,...,yellows,reds,sc,vsc,DriverPointsCumulative,TeamPointsCumulative,Winner,Podium,WinsCumulative,PodiumsCumulative
0,hamilton,mercedes,1.0,2.0,0.000,Finished,25.0,2021,1,bahrain,...,4,0,1,1,25.0,25.0,1,1,1,1
1,max_verstappen,red_bull,2.0,1.0,0.745,Finished,18.0,2021,1,bahrain,...,4,0,1,1,18.0,18.0,0,1,0,1
2,bottas,mercedes,3.0,3.0,37.383,Finished,16.0,2021,1,bahrain,...,4,0,1,1,16.0,41.0,0,1,0,1
3,norris,mclaren,4.0,7.0,46.466,Finished,12.0,2021,1,bahrain,...,4,0,1,1,12.0,12.0,0,0,0,0
4,perez,red_bull,5.0,0.0,52.047,Finished,10.0,2021,1,bahrain,...,4,0,1,1,10.0,28.0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
435,latifi,williams,16.0,16.0,167.527,Accident,0.0,2021,22,yas_marina,...,2,0,1,1,7.0,23.0,0,0,0,0
436,giovinazzi,alfa,17.0,14.0,167.527,Gearbox,0.0,2021,22,yas_marina,...,2,0,1,1,3.0,13.0,0,0,0,0
437,russell,williams,18.0,17.0,167.527,Gearbox,0.0,2021,22,yas_marina,...,2,0,1,1,16.0,23.0,0,0,0,1
438,raikkonen,alfa,19.0,18.0,167.527,Brakes,0.0,2021,22,yas_marina,...,2,0,1,1,10.0,13.0,0,0,0,0


### Preproceso

In [246]:
# Setup for the preprocessing section.
# NOTE(review): pandas, numpy, sys and the sys.path tweak already appear in the
# first cell of the notebook; they are repeated here.

# Data processing  
# -----------------------------------------------------------------------
import pandas as pd
import numpy as np
import pickle

# Pandas options: do not truncate the text shown in wide columns
# -----------------------------------------------------------------------
pd.options.display.max_colwidth = None

# Path configuration for custom module imports  
# -----------------------------------------------------------------------
import sys  
sys.path.append('../')  # Adds the parent directory to the path for custom module imports  

# Ignore warnings: this filter silences every warning for the rest of the session
# -----------------------------------------------------------------------
import warnings  
warnings.filterwarnings("ignore") 

# Machine learning imports
# -----------------------------------------------------------------------
from sklearn.preprocessing import StandardScaler

# Custom functions and classes
# -----------------------------------------------------------------------
from src.preprocess_support import Encoding

In [None]:
# List the names of the numeric columns (candidates for scaling).
df.select_dtypes(include='number').columns

Por el momento no vamos a escalar nada: la lista `numeric_features` se deja vacía.

In [None]:
# StandardScaler
# Columns to standardise. The list is intentionally empty for now, so the
# cell must behave as a no-op until column names are added here.
numeric_features = []

numeric_transformer = StandardScaler()

# Only fit/transform when there is something to scale: selecting an empty
# column list yields a frame with zero columns, and StandardScaler raises a
# ValueError on input with zero features.
if numeric_features:
    scaled_data = numeric_transformer.fit_transform(df[numeric_features])
    df[numeric_features] = scaled_data

### Encoding

In [260]:
# Drop every row that has at least one missing value, modifying `df` in place.
df.dropna(inplace=True)

In [None]:
# List the names of the object-dtype columns (candidates for encoding).
df.select_dtypes(include='O').columns

In [273]:
# Encoding plan: each key names an encoding strategy and lists the columns it
# applies to. For "ordinal", every column maps to its list of categories.
# NOTE(review): the ordinal categories come from Series.unique(), so their
# order is simply the order of first appearance in `df` -- confirm that this
# is the intended ranking for circuitId and Status.
encoding_methods = dict(
    onehot=['TeamId', 'weather'],
    target=['DriverId'],
    ordinal={
        column: df[column].unique().tolist()
        for column in ('circuitId', 'Status')
    },
    frequency=[],
)

# Third argument is 'Position' -- presumably the response column used by the
# target encoding; verify against src.preprocess_support.Encoding.
encoder = Encoding(df, encoding_methods, 'Position')

In [274]:
# Run the encoder built in the previous cell and keep the result as the preprocessed dataset.
df_preprocessed = encoder.execute_all_encodings()

In [None]:
# Display the result of the encoding step.
df_preprocessed