# 1. Ideia inicial

Será feita um regressão de probabilidade de vitoria do piloto. Assim, vamos dar uma probabilidade de vitoria para os 10 primeiros colocados de 100% a 0%, para o primeiro ao ultimo colocado, respectivamente.
- Os dados serão agregados por driver_standings.
- Serão criados 3 modelos um para antes da corrida, outro apos as qualificações e outro durante a corrida.
    - Antes: só vai levar em considerações os dados da pista e do piloto.
    - Qualificação: vai levar em consideração os dados da pista, do piloto e da qualificação(tempos de volta).
    - Corrida: vai levar em consideração os dados da pista, do piloto, da qualificação e da corrida(melhor volta, voltas lideradas, pit stops, etc).

In [31]:
import pickle

import numpy as np
import pandas as pd
import numpy
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.model_selection import cross_validate

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)


class EvaluatedClassifier(BaseEstimator, ClassifierMixin):
    def __init__(self, classifier, cv=10, graphic=False, compact=False, is_regression=False):
        self.classifier = classifier
        self.cv = cv
        self.graphic = graphic
        self.compact = compact
        self.is_regression = is_regression

    def fit(self, X, y, *args, **kwargs):
        self.classifier.fit(X, y, *args, **kwargs)
        return self

    def predict(self, X):
        return self.classifier.predict(X)

    def save(self, path):
        pickle.dump(self, open(path, 'wb'))

    def fit_predict_cv(self, X, y, show_result=True):
        res = self._cross_validate(X, y, show_result=show_result)
        self.fit(X, y)
        return res

    def _cross_validate(self, X, y, show_result=True):
        classification = ['accuracy', 'precision', 'recall', 'f1', 'roc_auc']
        regression = ['r2', 'neg_mean_absolute_error', 'neg_mean_squared_error', 'explained_variance',
                      'neg_median_absolute_error']
        scoring = classification if not self.is_regression else regression

        result = cross_validate(self.classifier, X, y,
                                scoring=scoring, n_jobs=-1)
        if show_result:
            print(f"> Validação Cruzada (cv={self.cv}):")
            if self.is_regression:
                print(
                    f"R2: {result['test_r2'].mean():.3f} (+/- {result['test_r2'].std() * 2:.3f})\n" + \
                    f"MAE: {-result['test_neg_mean_absolute_error'].mean():.3f} (+/- {result['test_neg_mean_absolute_error'].std() * 2:.3f})\n" + \
                    f"MSE: {-result['test_neg_mean_squared_error'].mean():.3f} (+/- {result['test_neg_mean_squared_error'].std() * 2:.3f})\n" + \
                    f"Explained Variance: {result['test_explained_variance'].mean():.3f} (+/- {result['test_explained_variance'].std() * 2:.3f})\n" + \
                    f"Median Absolute Error: {-result['test_neg_median_absolute_error'].mean():.3f} (+/- {result['test_neg_median_absolute_error'].std() * 2:.3f})"
                )
            else:
                print(
                    f"Accuracy: {result['test_accuracy'].mean():.3f} (+/- {result['test_accuracy'].std() * 2:.3f})\n" + \
                    f"Precision: {result['test_precision'].mean():.3f} (+/- {result['test_precision'].std() * 2:.3f})\n" + \
                    f"Recall: {result['test_recall'].mean():.3f} (+/- {result['test_recall'].std() * 2:.3f})\n" + \
                    f"F1: {result['test_f1'].mean():.3f} (+/- {result['test_f1'].std() * 2:.3f})\n" + \
                    f"ROC AUC: {result['test_roc_auc'].mean():.3f} (+/- {result['test_roc_auc'].std() * 2:.3f})"
                )
        return result


In [118]:
pd.set_option('display.max_columns', None)

path = '../data/ergast/'
circuits = pd.read_csv(path + 'circuits.csv')
constructor_results = pd.read_csv(path + 'constructor_results.csv')
constructor_standings = pd.read_csv(path + 'constructor_standings.csv')
constructors = pd.read_csv(path + 'constructors.csv')
driver_standings = pd.read_csv(path + 'driver_standings_update.csv')
drivers = pd.read_csv(path + 'drivers.csv')
lap_times = pd.read_csv(path + 'lap_times.csv')
pit_stops = pd.read_csv(path + 'pit_stops.csv')
qualifying = pd.read_csv(path + 'qualifying.csv')
races = pd.read_csv(path + 'races.csv')
results = pd.read_csv(path + 'results_update.csv')
seasons = pd.read_csv(path + 'seasons.csv')
sprint_results = pd.read_csv(path + 'sprint_results.csv')
status = pd.read_csv(path + 'status.csv')

In [119]:
races_shift = races.copy()
races_shift['nextRaceId'] = races_shift['raceId'].shift(-1)

In [120]:
# Creating a driver_standings_previus, that going contains the previous raceId standings for a drive, if is the first of season fill points, position and wins with 0.
races_shift = races.copy()
races_shift['nextRaceId'] = races_shift['raceId'].shift(-1)

driver_standings_previus = driver_standings.copy()
driver_standings_previus['raceId'] = driver_standings_previus['raceId'].apply(lambda x: races_shift.loc[races_shift['raceId'] == x, 'nextRaceId'].values[0])

# Create a DataFrame with zeros for points, position, and wins for the first race of each season
driver_standings_first_round = driver_standings.loc[driver_standings['raceId'] == 1].copy()
driver_standings_first_round[['points', 'position', 'wins']] = 0

driver_standings_previus = driver_standings_previus.append(driver_standings_first_round)

races_first_round = races_shift.loc[races['round'] == 1, 'raceId'].to_list()

# for each raceId races_first_round, in driver_standings_previus fill fill points, position and wins with 0.
for race_id in races_first_round:
    driver_standings_previus.loc[driver_standings_previus['raceId'] == race_id, 'points'] = 0
    driver_standings_previus.loc[driver_standings_previus['raceId'] == race_id, 'position'] = 0
    driver_standings_previus.loc[driver_standings_previus['raceId'] == race_id, 'wins'] = 0

  driver_standings_previus = driver_standings_previus.append(driver_standings_first_round)


In [121]:
# check if number of rows is the same
driver_standings_previus.isnull().sum()

driverStandingsId    0
raceId               0
driverId             0
points               0
position             0
positionText         0
wins                 0
dtype: int64

In [122]:
df = pd.merge(results, drivers, on='driverId', suffixes=("", "_drivers"))
df = pd.merge(df, races, on='raceId', suffixes=("", "_races"))
df = pd.merge(df, circuits, on='circuitId', suffixes=("", "_circuits"))
df = pd.merge(df, constructors, on='constructorId', suffixes=("", "_constructors"))
df = pd.merge(df, status, on='statusId')
df = pd.merge(df, driver_standings_previus, on=['raceId', 'driverId'], suffixes=("", "_acc"))

In [123]:
df.to_csv('../data/partial/all.csv', index=False)

In [124]:
df_laps = pd.merge(lap_times, races, on="raceId", suffixes=("", "_race"))
df_laps = pd.merge(df_laps, circuits, on="circuitId", suffixes=("", "_circuits"))
df_laps['date'] = pd.to_datetime(df_laps['date'])

In [125]:
from concurrent.futures import ThreadPoolExecutor

df['dob'] = pd.to_datetime(df['dob'])
df['date'] = pd.to_datetime(df['date'])
# Calculate age compare to date
df['age'] = (df['date'] - df['dob']).dt.days / 365
df['age'] = df['age'].astype(int)

# set nulll where \N
df = df.replace('\\N', numpy.nan)

weather = pd.read_csv('../data/weather/weather.csv')
df = df.merge(weather, on=['raceId'])

# round to 2 decimal humidity and temperature
df['humidity'] = df['humidity'].round(2)
df['temperature'] = df['temperature'].round(2)

# based on table lap_times find the best milliseconds, that a reace made on a circuit, before a specificy race
# def best_lap_time(raceId, circuitId, race_date):
#     return df_laps[
#         (df_laps['raceId'] < raceId) & (df_laps['circuitId'] == circuitId) & (df_laps['date'] < race_date)][
#         'milliseconds'].min()
#
#
# def get_best_lap_time(row):
#     return best_lap_time(row['raceId'], row['circuitId'], row['date'])
#
#
# with ThreadPoolExecutor() as executor:
#     df['faster_lap_circuit_ever'] = list(executor.map(get_best_lap_time, df.to_dict('records')))

# Convert 'grid' and 'position' column to numeric
df['grid'] = pd.to_numeric(df['grid'], errors='coerce')
df['position'] = pd.to_numeric(df['position'], errors='coerce')

# Ensure 'date' is in datetime format
df['date'] = pd.to_datetime(df['date'])

# Create a new dataframe with raceId, driverId, grid and position
df_grid_position = df[['raceId', 'driverId', 'grid', 'position', 'wins']].sort_values(['driverId', 'raceId'])

# Calculate the expanding mean of 'grid' and 'position' grouped by 'driverId'
df_grid_position['AvgGrid'] = df_grid_position.groupby('driverId')['grid'].expanding().mean().reset_index(level=0,
                                                                                                          drop=True)
df_grid_position['AvgFn'] = df_grid_position.groupby('driverId')['position'].expanding().mean().reset_index(level=0,
                                                                                                            drop=True)

df_grid_position['wins_acc'] = df_grid_position.groupby('driverId')['wins'].expanding().sum().reset_index(level=0,
                                                                                                          drop=True)

# Merge the df_grid_position dataframe back into the main dataframe (df)
df = pd.merge(df, df_grid_position[['raceId', 'driverId', 'AvgGrid', 'AvgFn', 'wins_acc']], on=['raceId', 'driverId'])

# Ensure that AvgGrid and AvgFn are rounded to two decimal places
df['AvgGrid'] = df['AvgGrid'].round(2)
df['AvgFn'] = df['AvgFn'].round(2)
df['wins_acc'] = df['wins_acc'].round(2)


In [126]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 23726 entries, 0 to 23725
Data columns (total 69 columns):
 #   Column                    Non-Null Count  Dtype         
---  ------                    --------------  -----         
 0   resultId                  23726 non-null  int64         
 1   raceId                    23726 non-null  int64         
 2   driverId                  23726 non-null  int64         
 3   constructorId             23726 non-null  int64         
 4   number                    23724 non-null  object        
 5   grid                      23726 non-null  int64         
 6   position                  14186 non-null  float64       
 7   positionText              23726 non-null  object        
 8   positionOrder             23726 non-null  int64         
 9   points                    23726 non-null  float64       
 10  laps                      23726 non-null  int64         
 11  time                      6954 non-null   object        
 12  milliseconds      

In [127]:
clean = [
    'resultId',
    'position',
    'positionText',
    'points',
    'fastestLap',
    'time',  # time to finish the race
    'milliseconds',  # time in milisecond to finish the race
    'fastestLapSpeed',
    'fastestLapTime',
    'rank',  # rank of fast lap in a race
    'statusId',
    'status',
    # 'grid',
    # 'laps',

    'raceId',
    'year',
    'date',
    'time_races',
    'name',  #Nome do grand pix
    'url_races',

    'driverId',
    'driverRef',
    'number_drivers',
    'dob',
    'code',
    'url',
    'forename',
    'surname',

    'driverStandingsId',
    'number',  # car number
    'positionText_acc',  # position acumulated unit a race

    'fp1_time',
    'fp1_date',
    'fp2_time',
    'fp2_date',
    'fp3_time',
    'fp3_date',

    'quali_time',
    'quali_date',
    'sprint_date',
    'sprint_time',

    'constructorRef',
    'name_constructors',
    'url_constructors',

    'circuitRef',
    'name_circuits',
    'location',
    'url_circuits',

    'lat',
    'lng',

    'wmo_code'
]

for col in clean:
    if col not in df.columns:
        print(col)

df_clean = df.copy()

df_clean.drop(columns=clean, inplace=True)

df_clean = df_clean.rename(
    columns={'positionOrder': 'position', 'points_acc': 'points_season', 'position_acc': 'position_season',
             'wins': 'wins_season', 'alt': 'height', 'country': 'country_circuit'
             })

# df_clean drop null at position
df_clean = df_clean.dropna(subset=['position'])


In [128]:
# from ydata_profiling import ProfileReport
#
# profile = ProfileReport(df, title='Pandas Profiling Report', )
# profile.to_file("profile/final.html")

In [129]:
position_prob = [i for i in range(10, 110, 10)]
# inver position_prob
position_prob.reverse()

df_clean['position'] = df_clean.position.astype(int)
df_clean['position'] = df_clean['position'].apply(lambda x: position_prob[x - 1] if 1 <= x <= 10 else 0)

In [130]:
df_clean.to_csv('../data/partial/clean.csv', index=False)

In [131]:
nationality_to_country = {
    'British': 'UK',
    'Italian': 'Italy',
    'French': 'France',
    'German': 'Germany',
    'Brazilian': 'Brazil',
    'American': 'USA',
    'Finnish': 'Finland',  # Not in the country list, added manually
    'Spanish': 'Spain',
    'Australian': 'Australia',
    'Austrian': 'Austria',
    'Japanese': 'Japan',
    'Belgian': 'Belgium',
    'Swedish': 'Sweden',
    'Swiss': 'Switzerland',
    'Dutch': 'Netherlands',
    'Canadian': 'Canada',
    'Mexican': 'Mexico',
    'New Zealander': 'New Zealand',  # Not in the country list, added manually
    'Argentine': 'Argentina',
    'Russian': 'Russia',
    'South African': 'South Africa',
    'Danish': 'Denmark',  # Not in the country list, added manually
    'Monegasque': 'Monaco',
    'Colombian': 'Colombia',  # Not in the country list, added manually
    'Venezuelan': 'Venezuela',  # Not in the country list, added manually
    'Polish': 'Poland',  # Not in the country list, added manually
    'Irish': 'Ireland',  # Not in the country list, added manually
    'Portuguese': 'Portugal',
    'Thai': 'Thailand',  # Not in the country list, added manually
    'Indian': 'India',
    'Chilean': 'Chile',  # Not in the country list, added manually
    'Chinese': 'China',
    'Hungarian': 'Hungary',
    'Rhodesian': 'Zimbabwe',  # Rhodesia is the former name of Zimbabwe
    'Malaysian': 'Malaysia',
    'Liechtensteiner': 'Liechtenstein',  # Not in the country list, added manually
    'Indonesian': 'Indonesia',  # Not in the country list, added manually
    'Uruguayan': 'Uruguay',  # Not in the country list, added manually
    'East German': 'Germany',
    'Czech': 'Czech Republic',  # Not in the country list, added manually
    'American-Italian': 'USA',
    'Argentine-Italian': 'Argentina',
    'Hong Kong': 'Hong Kong',  # Not in the country list, added manually
    'Bahrain': 'Bahrain',
    'Turkey': 'Turkey',
    'Singapore': 'Singapore',
    'UAE': 'UAE',
    'Korea': 'Korea',
    'Azerbaijan': 'Azerbaijan',
    'Morocco': 'Morocco',
    'Qatar': 'Qatar',
    'Saudi Arabia': 'Saudi Arabia',
}

# based on the map above create a encoder for each country
country_encoder = [i for i in nationality_to_country.values()]

In [132]:
df_clean['nationality'] = df_clean['nationality'].apply(lambda x: nationality_to_country[x])
df_clean['nationality_constructors'] = df_clean['nationality_constructors'].apply(lambda x: nationality_to_country[x])

# parse nationality, nationality_constructors and country_circuit to category

df_clean['nationality'] = df_clean['nationality'].apply(lambda x: country_encoder.index(x))
df_clean['nationality_constructors'] = df_clean['nationality_constructors'].apply(lambda x: country_encoder.index(x))
df_clean['country_circuit'] = df_clean['country_circuit'].apply(lambda x: country_encoder.index(x))

In [133]:
from sklearn.preprocessing import LabelEncoder

# label encoder weather_conditions
le = LabelEncoder()

df_clean['weather_condition'] = le.fit_transform(df_clean['weather_condition'])
df_clean['weather_condition'] = df_clean['weather_condition'].astype('int')

In [134]:
df_clean['height'] = df_clean['height'].astype('float')
# columns types
df_clean.dtypes

constructorId                 int64
grid                          int64
position                      int64
laps                          int64
nationality                   int64
round                         int64
circuitId                     int64
country_circuit               int64
height                      float64
nationality_constructors      int64
points_season               float64
position_season               int64
wins_season                   int64
age                           int64
weather_condition             int64
humidity                    float64
temperature                 float64
AvgGrid                     float64
AvgFn                       float64
wins_acc                    float64
dtype: object

In [135]:
df_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 23726 entries, 0 to 23725
Data columns (total 20 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   constructorId             23726 non-null  int64  
 1   grid                      23726 non-null  int64  
 2   position                  23726 non-null  int64  
 3   laps                      23726 non-null  int64  
 4   nationality               23726 non-null  int64  
 5   round                     23726 non-null  int64  
 6   circuitId                 23726 non-null  int64  
 7   country_circuit           23726 non-null  int64  
 8   height                    23726 non-null  float64
 9   nationality_constructors  23726 non-null  int64  
 10  points_season             23726 non-null  float64
 11  position_season           23726 non-null  int64  
 12  wins_season               23726 non-null  int64  
 13  age                       23726 non-null  int64  
 14  weathe

In [136]:
# Remove faster_lap_circuit_ever column
# df_clean = df_clean.drop(columns=['faster_lap_circuit_ever'])

df_clean = df_clean.dropna(subset=['AvgFn'])
# remove duplicate rows
# df_clean = df_clean.dropna()

In [137]:
# detect null values
df_clean.isnull().sum()

constructorId               0
grid                        0
position                    0
laps                        0
nationality                 0
round                       0
circuitId                   0
country_circuit             0
height                      0
nationality_constructors    0
points_season               0
position_season             0
wins_season                 0
age                         0
weather_condition           0
humidity                    0
temperature                 0
AvgGrid                     0
AvgFn                       0
wins_acc                    0
dtype: int64

In [138]:
df_clean.columns.tolist()

['constructorId',
 'grid',
 'position',
 'laps',
 'nationality',
 'round',
 'circuitId',
 'country_circuit',
 'height',
 'nationality_constructors',
 'points_season',
 'position_season',
 'wins_season',
 'age',
 'weather_condition',
 'humidity',
 'temperature',
 'AvgGrid',
 'AvgFn',
 'wins_acc']

In [139]:
X, y = df_clean.drop(columns=['position']), df_clean['position']

In [140]:
# pca
from sklearn.decomposition import PCA

pca = PCA(n_components='mle', svd_solver='full')
X_pca = pca.fit_transform(X)

In [141]:
# Randon Forest
from sklearn.ensemble import RandomForestRegressor

_ = EvaluatedClassifier(RandomForestRegressor(), is_regression=True).fit_predict_cv(X, y)

> Validação Cruzada (cv=10):
R2: 0.599 (+/- 0.121)
MAE: 14.181 (+/- 2.709)
MSE: 458.098 (+/- 170.904)
Explained Variance: 0.602 (+/- 0.113)
Median Absolute Error: 9.020 (+/- 1.949)


In [142]:
_ = EvaluatedClassifier(RandomForestRegressor(n_jobs=-1), is_regression=True).fit_predict_cv(X, y)

> Validação Cruzada (cv=10):
R2: 0.598 (+/- 0.124)
MAE: 14.190 (+/- 2.753)
MSE: 458.825 (+/- 174.537)
Explained Variance: 0.601 (+/- 0.116)
Median Absolute Error: 9.060 (+/- 1.831)


In [143]:
# Gradient Boosting
from sklearn.ensemble import GradientBoostingRegressor

_ = EvaluatedClassifier(GradientBoostingRegressor(
    learning_rate=0.2,
    n_estimators=100,
), is_regression=True).fit_predict_cv(X, y)

> Validação Cruzada (cv=10):
R2: 0.597 (+/- 0.126)
MAE: 14.944 (+/- 2.006)
MSE: 460.617 (+/- 180.710)
Explained Variance: 0.599 (+/- 0.120)
Median Absolute Error: 9.730 (+/- 1.366)


In [144]:
# Gradient Boosting

# finded through RandomizedSearchCV
parms = {'validation_fraction': 0.1, 'tol': 0.0001, 'subsample': 1.0, 'random_state': 42, 'n_iter_no_change': 20,
         'n_estimators': 500, 'min_samples_split': 10, 'min_samples_leaf': 4, 'max_features': 'log2', 'max_depth': 5,
         'loss': 'huber', 'learning_rate': 0.1, 'criterion': 'friedman_mse', 'ccp_alpha': 0.1, 'alpha': 0.9}

_ = EvaluatedClassifier(GradientBoostingRegressor(**parms), is_regression=True).fit_predict_cv(X, y)

> Validação Cruzada (cv=10):
R2: 0.610 (+/- 0.136)
MAE: 14.217 (+/- 2.381)
MSE: 445.944 (+/- 190.628)
Explained Variance: 0.612 (+/- 0.129)
Median Absolute Error: 8.387 (+/- 1.268)


In [145]:
# XGBoost
from xgboost import XGBRegressor

model_eval = EvaluatedClassifier(XGBRegressor(
    # n_estimators=1000,
    n_jobs=-1,
    learning_rate=0.1,
    # max_depth=8,
), is_regression=True)

_ = model_eval.fit_predict_cv(X, y)

> Validação Cruzada (cv=10):
R2: 0.614 (+/- 0.148)
MAE: 13.905 (+/- 2.747)
MSE: 441.192 (+/- 201.549)
Explained Variance: 0.617 (+/- 0.137)
Median Absolute Error: 8.206 (+/- 1.877)


In [150]:
# XGBoost
from xgboost import XGBRegressor

# Finded through RandomizedSearchCV
parms = {'subsample': 0.7, 'reg_lambda': 0.1, 'reg_alpha': 0.2, 'n_jobs': -1, 'n_estimators': 1500,
         'min_child_weight': 7, 'max_depth': 8, 'learning_rate': 0.01, 'gamma': 0.2, 'colsample_bytree': 0.5}

model_eval = EvaluatedClassifier(XGBRegressor(**parms), is_regression=True)
_ = model_eval.fit_predict_cv(X_pca, y)

> Validação Cruzada (cv=10):
R2: 0.594 (+/- 0.117)
MAE: 14.986 (+/- 2.678)
MSE: 463.959 (+/- 165.778)
Explained Variance: 0.596 (+/- 0.112)
Median Absolute Error: 9.873 (+/- 1.754)


In [147]:
# Support Vector Regressor
from sklearn.svm import SVR

_ = EvaluatedClassifier(SVR(max_iter=-1), is_regression=True).fit_predict_cv(X, y)

> Validação Cruzada (cv=10):
R2: 0.203 (+/- 0.274)
MAE: 21.120 (+/- 3.749)
MSE: 911.605 (+/- 390.605)
Explained Variance: 0.301 (+/- 0.090)
Median Absolute Error: 12.230 (+/- 1.066)


In [148]:
 # MLP
from sklearn.neural_network import MLPRegressor

# Finded through HalvingRandomSearchCV
parms = {'solver': 'adam', 'momentum': 0.95, 'max_iter': 500, 'learning_rate_init': 0.001, 'learning_rate': 'adaptive',
         'hidden_layer_sizes': (100, 50), 'epsilon': 1e-06, 'beta_2': 0.9, 'beta_1': 0.9, 'batch_size': 32,
         'alpha': 0.1, 'activation': 'tanh'}

_ = EvaluatedClassifier(MLPRegressor(**parms), is_regression=True).fit_predict_cv(X, y)

> Validação Cruzada (cv=10):
R2: 0.248 (+/- 0.188)
MAE: 22.861 (+/- 4.264)
MSE: 858.001 (+/- 266.221)
Explained Variance: 0.251 (+/- 0.180)
Median Absolute Error: 17.465 (+/- 4.526)
> Validação Cruzada (cv=10):
R2: 0.549 (+/- 0.121)
MAE: 14.994 (+/- 2.691)
MSE: 514.886 (+/- 174.013)
Explained Variance: 0.556 (+/- 0.112)
Median Absolute Error: 8.471 (+/- 1.535)


In [149]:
# Export the model
model_eval.save('../model/model_grid.pkl')