# TFG - Fórmula 1 - Classic AI

Autor: Manuel Ventura

Modelado de datos para predecir resultados de Fórmula 1 utilizando técnicas clásicas de IA. Utilizamos la codificación one-hot de las columnas categóricas.

In [34]:
import numpy as np
import pandas as pd
import time
# Regresores
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDRegressor
from lightgbm import LGBMRegressor
from xgboost.sklearn import XGBRegressor
# Métricas, preprocesado y validación cruzada
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import cross_val_score

from joblib import dump, load

## Carga de datos

In [2]:
# Read the one-hot encoded learning set and keep only the rows whose
# 'year' is 2000 or later; .copy() detaches the result from the full frame.
df = (
    pd.read_csv('data_ready/LEARNING_DF_ONEHOT.csv')
    .loc[lambda frame: frame['year'] >= 2000]
    .copy()
)

In [3]:
# Show the filtered DataFrame as the cell output.
df

Unnamed: 0,driverId_1,driverId_2,driverId_3,driverId_4,driverId_5,driverId_6,driverId_7,driverId_8,driverId_9,driverId_10,...,round,weather_warm,weather_cold,weather_dry,weather_wet,weather_cloudy,age,experience,driversPointsBeforeRace,constPointsBeforeRace
0,1,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,8469,364,0.0,0.0
1,1,0,0,0,0,0,0,0,0,0,...,2,0,0,0,0,1,8476,371,10.0,14.0
2,1,0,0,0,0,0,0,0,0,0,...,3,0,0,1,0,0,8490,385,14.0,24.0
3,1,0,0,0,0,0,0,0,0,0,...,4,1,0,0,0,0,8511,406,14.0,28.0
4,1,0,0,0,0,0,0,0,0,0,...,5,1,0,0,0,0,8525,420,20.0,34.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25895,0,0,0,0,0,0,0,0,0,0,...,2,1,0,0,0,0,8113,14,0.0,1.0
25896,0,0,0,0,0,0,0,0,0,0,...,3,1,0,0,0,0,8127,28,0.0,1.0
25897,0,0,0,0,0,0,0,0,0,0,...,1,1,0,0,0,0,8003,0,0.0,0.0
25898,0,0,0,0,0,0,0,0,0,0,...,2,1,0,0,0,0,8017,14,0.0,0.0


## Escalado del dataset

In [4]:
# Clipping: cap both the starting grid slot and the finishing position at 20.
df[['grid', 'position']] = df[['grid', 'position']].clip(upper=20)

# MinMaxScaler: rescale the numeric features into the [1, 20] interval.
# NOTE(review): the scaler is fitted on the whole dataset, i.e. before any
# cross-validation split is made further down.
columns_to_scale = ['year', 'age', 'experience', 'driversPointsBeforeRace', 'constPointsBeforeRace']
scaler_mm = MinMaxScaler(feature_range=(1, 20))
scaler_mm.fit(df[columns_to_scale])
df[columns_to_scale] = scaler_mm.transform(df[columns_to_scale])

# Fit Boolean Columns: every 1 in the weather flags becomes 20 (0 is left
# untouched) -- presumably to match the upper bound of the scaled columns.
columns_to_replace = ['weather_warm', 'weather_cold', 'weather_dry', 'weather_wet', 'weather_cloudy']
df[columns_to_replace] = df[columns_to_replace].replace({1: 20})


# Split into features (everything except 'position') and target ('position').
X = df.drop(columns=['position'])
y = df['position']

## Test con validación cruzada

In [35]:
# Shared seed so that the stochastic estimators give the same scores on
# every run of the notebook and the comparison table is reproducible.
RANDOM_STATE = 42

# SVR and LinearRegression take no random_state argument.
# NOTE(review): LogisticRegression is a classifier; it treats each value of
# the target as a class label rather than a continuous quantity.
forest = RandomForestRegressor(n_estimators=1000, n_jobs=-1, random_state=RANDOM_STATE)
svr = SVR()
grad_boost = GradientBoostingRegressor(n_estimators=1000, random_state=RANDOM_STATE)
linear = LinearRegression(n_jobs=-1)
logistic = LogisticRegression(n_jobs=-1, random_state=RANDOM_STATE)
sgd = SGDRegressor(loss='huber', random_state=RANDOM_STATE)
lgbm = LGBMRegressor(n_estimators=100, n_jobs=-1, random_state=RANDOM_STATE)
xgb = XGBRegressor(n_estimators=1000, n_jobs=-1, random_state=RANDOM_STATE)

In [36]:
def test_regressors(clf, X, y):
    """Cross-validate an estimator and report its error and run time.

    Runs 10-fold cross-validation with the ``neg_mean_absolute_error``
    scorer and negates the mean of the fold scores, so the reported score
    is a positive mean absolute error (lower is better).

    Args:
        clf: Estimator object accepted by ``cross_val_score``.
        X: Feature matrix.
        y: Target values.

    Returns:
        Tuple ``(estimator class name, mean MAE over the folds,
        elapsed seconds)``.
    """
    # perf_counter() is monotonic, so the measured interval cannot be
    # distorted by system clock adjustments the way time.time() can.
    start = time.perf_counter()
    fold_scores = cross_val_score(clf, X, y, cv=10, scoring='neg_mean_absolute_error')
    elapsed = time.perf_counter() - start
    return (clf.__class__.__name__, -np.mean(fold_scores), elapsed)

In [37]:
# Estimators to benchmark, listed in the order their rows will be added.
estimators = [
    forest,
    svr,
    grad_boost,
    linear,
    logistic,
    sgd,
    lgbm,
    xgb,
]

# Empty table that receives one (estimator, score, time) row per estimator.
results = pd.DataFrame(columns=['estimator', 'score', 'time'])

In [38]:
# Benchmark each estimator and append its (name, score, time) tuple as a
# new row at the end of the results table.
for clf in estimators:
    tupla = test_regressors(clf, X, y)
    results.loc[len(results)] = list(tupla)

In [39]:
# Show the benchmark table: one row per estimator with its class name,
# cross-validated mean absolute error and elapsed time in seconds.
results

Unnamed: 0,estimator,score,time
0,RandomForestRegressor,3.770766,228.125885
1,SVR,3.574285,396.396057
2,GradientBoostingRegressor,3.909671,377.632848
3,LinearRegression,418910700000.0,3.950926
4,LogisticRegression,5.478742,80.821629
5,SGDRegressor,3.652619,2.618739
6,LGBMRegressor,3.777989,2.947724
7,XGBRegressor,4.183351,490.418708
