En este Jupyter Notebook construiremos un modelo que permita predecir quién ganará una carrera y en qué posición terminará cada piloto.

In [39]:
# Data handling
import pandas as pd
import numpy as np

# Formula 1 data sources
import fastf1
from fastf1.ergast import Ergast

# Progress bars and filesystem helpers
from tqdm import tqdm
import os

# Append the parent directory to the import path so the `src` package below can be imported
import sys
sys.path.append('../')

# Project helpers.
# NOTE(review): the wildcard import hides which names come from `extract`; the cells
# visible in this notebook call only `extract_races_and_results_dataframes`, which is
# presumably defined there -- consider importing it explicitly.
from src.race_prediction_model.extract import *
from src.race_prediction_model.feature_engineering import add_features_to_results

In [2]:
# Reduce FastF1 verbosity: set its log level to ERROR to keep cell output short
fastf1.set_log_level('ERROR')

In [23]:
# Get the 'R' session of the 2022 Monza event and load its data.
# NOTE(review): `session` is not referenced by any later cell visible in this notebook,
# and this cell's execution count (23) is out of sequence with its neighbours -- it looks
# like leftover exploration; confirm whether it can be removed.
session = fastf1.get_session(2022, 'Monza', 'R')
session.load()

Extracción de datos

In [3]:
# Build the results and races dataframes for the 2021 season. Per the recorded output,
# the helper loads the season round by round, waits whenever a call limit is reached
# and saves the results to ../data/output/results.csv.
# NOTE(review): none of the three returned objects is used by the later cells visible
# here -- the next section re-reads the CSV files from disk.
results_final_df, races_final_df, sessions = extract_races_and_results_dataframes(2021)

Loading 2021 season. Round: 1...
Loading 2021 season. Round: 2...
Loading 2021 season. Round: 3...
Loading 2021 season. Round: 4...
Call limit reached. Waiting...
Loading 2021 season. Round: 5...
Loading 2021 season. Round: 6...
Loading 2021 season. Round: 7...
Call limit reached. Waiting...
Loading 2021 season. Round: 8...
Loading 2021 season. Round: 9...
Loading 2021 season. Round: 10...
Call limit reached. Waiting...
Loading 2021 season. Round: 11...
Loading 2021 season. Round: 12...
Loading 2021 season. Round: 13...
Call limit reached. Waiting...
Loading 2021 season. Round: 14...
Loading 2021 season. Round: 15...
Loading 2021 season. Round: 16...
Call limit reached. Waiting...
Loading 2021 season. Round: 17...
Loading 2021 season. Round: 18...
Loading 2021 season. Round: 19...
Call limit reached. Waiting...
Loading 2021 season. Round: 20...
Loading 2021 season. Round: 21...
Loading 2021 season. Round: 22...
Call limit reached. Waiting...
Saving results in ../data/output/results.csv

Carga de datos

In [11]:
# Read the results and races CSV files from ../data/output,
# using the first CSV column of each file as the index.
df_results = pd.read_csv('../data/output/results.csv', index_col=0)
df_races = pd.read_csv('../data/output/races.csv', index_col=0)

Unimos carreras y resultados

In [12]:
# Attach the race-level columns to every result row. Rows are matched on
# season, round and circuit, and only matching pairs are kept (inner join).
full_dataset = pd.merge(
    df_results,
    df_races,
    how='inner',
    on=['season', 'round', 'circuitId'],
)

Guardamos el dataset unificado en una carpeta de pruebas (`../data/test/`).

In [13]:
# Persist the merged dataset without its index. DataFrame.to_csv does not create
# missing parent directories, so make sure the target folder exists before writing
# (otherwise the cell fails on a checkout where ../data/test is absent).
os.makedirs('../data/test', exist_ok=True)
full_dataset.to_csv('../data/test/results_test.csv', index=False)

In [14]:
# Reload the dataset that was just written; `df` is the working frame used by the cells below
df = pd.read_csv('../data/test/results_test.csv')

In [15]:
# Display the merged dataset
df

Unnamed: 0,DriverId,TeamId,Position,GridPosition,Time,Status,Points,season,round,circuitId,weather,yellows,reds,sc,vsc
0,hamilton,mercedes,1.0,2.0,0.000,Finished,25.0,2021,1,bahrain,dry,4,0,1,1
1,max_verstappen,red_bull,2.0,1.0,0.745,Finished,18.0,2021,1,bahrain,dry,4,0,1,1
2,bottas,mercedes,3.0,3.0,37.383,Finished,16.0,2021,1,bahrain,dry,4,0,1,1
3,norris,mclaren,4.0,7.0,46.466,Finished,12.0,2021,1,bahrain,dry,4,0,1,1
4,perez,red_bull,5.0,0.0,52.047,Finished,10.0,2021,1,bahrain,dry,4,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
435,latifi,williams,16.0,16.0,167.527,Accident,0.0,2021,22,yas_marina,dry,2,0,1,1
436,giovinazzi,alfa,17.0,14.0,167.527,Gearbox,0.0,2021,22,yas_marina,dry,2,0,1,1
437,russell,williams,18.0,17.0,167.527,Gearbox,0.0,2021,22,yas_marina,dry,2,0,1,1
438,raikkonen,alfa,19.0,18.0,167.527,Brakes,0.0,2021,22,yas_marina,dry,2,0,1,1


### Feature engineering

Aquí añadiremos variables que puedan mejorar el rendimiento de nuestros modelos.

In [40]:
# Add the engineered columns. The return value is discarded and the new columns show up
# on `df` in the output below, so the helper modifies `df` in place.
# NOTE(review): presumably re-running this cell on an already-augmented `df` is not
# safe -- confirm, or have the helper return a new frame instead.
add_features_to_results(df)

# Display the frame with the added feature columns
df

Unnamed: 0,DriverId,TeamId,Position,GridPosition,Time,Status,Points,season,round,circuitId,...,yellows,reds,sc,vsc,DriverPointsCumulative,TeamPointsCumulative,Winner,Podium,WinsCumulative,PodiumsCumulative
0,hamilton,mercedes,1.0,2.0,0.000,Finished,25.0,2021,1,bahrain,...,4,0,1,1,25.0,25.0,1,1,1,1
1,max_verstappen,red_bull,2.0,1.0,0.745,Finished,18.0,2021,1,bahrain,...,4,0,1,1,18.0,18.0,0,1,0,1
2,bottas,mercedes,3.0,3.0,37.383,Finished,16.0,2021,1,bahrain,...,4,0,1,1,16.0,41.0,0,1,0,1
3,norris,mclaren,4.0,7.0,46.466,Finished,12.0,2021,1,bahrain,...,4,0,1,1,12.0,12.0,0,0,0,0
4,perez,red_bull,5.0,0.0,52.047,Finished,10.0,2021,1,bahrain,...,4,0,1,1,10.0,28.0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
435,latifi,williams,16.0,16.0,167.527,Accident,0.0,2021,22,yas_marina,...,2,0,1,1,7.0,23.0,0,0,0,0
436,giovinazzi,alfa,17.0,14.0,167.527,Gearbox,0.0,2021,22,yas_marina,...,2,0,1,1,3.0,13.0,0,0,0,0
437,russell,williams,18.0,17.0,167.527,Gearbox,0.0,2021,22,yas_marina,...,2,0,1,1,16.0,23.0,0,0,0,1
438,raikkonen,alfa,19.0,18.0,167.527,Brakes,0.0,2021,22,yas_marina,...,2,0,1,1,10.0,13.0,0,0,0,0


### Preproceso

In [246]:
# Data processing
# -----------------------------------------------------------------------
# NOTE(review): pandas and numpy are already imported in the notebook's first cell;
# `pickle` is not used by any cell visible here.
import pandas as pd
import numpy as np
import pickle

# Pandas options
# -----------------------------------------------------------------------
# Do not truncate long cell values when displaying frames
pd.options.display.max_colwidth = None

# Path configuration for custom module imports
# -----------------------------------------------------------------------
# NOTE(review): repeats the first cell, so '../' is appended to sys.path a second time
# when both cells run.
import sys
sys.path.append('../')  # Adds the parent directory to the path for custom module imports

# Ignore warnings
# -----------------------------------------------------------------------
# NOTE(review): this silences every warning from here on, including deprecation
# warnings from pandas and scikit-learn -- consider narrowing the filter.
import warnings
warnings.filterwarnings("ignore")

# Machine learning imports
# -----------------------------------------------------------------------
from sklearn.preprocessing import StandardScaler

# Custom functions and classes
# -----------------------------------------------------------------------
from src.preprocess_support import Encoding

In [None]:
# List the numeric columns, i.e. the candidates for scaling
df.select_dtypes(include='number').columns

Por el momento no vamos a escalar nada

In [None]:
# StandardScaler
# Columns to standardise. The list is deliberately empty for now (no scaling yet);
# add column names here to enable scaling.
numeric_features = []

numeric_transformer = StandardScaler()

# Fitting the scaler on an empty column selection makes scikit-learn raise a
# ValueError, which would break a full run of the notebook. Scaling is therefore
# skipped until at least one feature is listed.
if numeric_features:
    scaled_data = numeric_transformer.fit_transform(df[numeric_features])
    df[numeric_features] = scaled_data

### Encoding

In [260]:
# Drop every row that has a missing value in any column. Reassigning instead of
# using inplace=True avoids mutating the frame behind any other reference to it
# and keeps the call chainable.
df = df.dropna()

In [None]:
# List the object-dtype columns, i.e. the ones that need encoding
df.select_dtypes(include='O').columns

In [273]:
# Encoding strategy for each group of columns.
# For the ordinal columns, the category order is the order of first appearance in `df`.
# NOTE(review): that order carries no inherent ranking, and `Status` (values such as
# 'Finished' / 'Accident') looks like a race outcome -- confirm both are intended.
encoding_methods = {
    "onehot": ['TeamId', 'weather'],
    "target": ['DriverId'],
    "ordinal": {
        column: df[column].unique().tolist()
        for column in ('circuitId', 'Status')
    },
    "frequency": [],
}

# 'Position' is presumably the response variable used by the target encoding -- TODO confirm
encoder = Encoding(df, encoding_methods, 'Position')

In [274]:
# Run all the encodings configured on `encoder` and keep the result
df_preprocessed = encoder.execute_all_encodings()

In [None]:
# Display the encoded result
df_preprocessed