# TFG - Fórmula 1 - Classic AI

Autor: Manuel Ventura

Modelado de datos para predecir resultados de Fórmula 1 utilizando técnicas clásicas de IA. Utilizamos las columnas categóricas sin codificar, con las labels.

In [1]:
import numpy as np
import pandas as pd
import time
# Regresores
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDRegressor
from lightgbm import LGBMRegressor
from xgboost.sklearn import XGBRegressor
# Metricas
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import cross_val_score

from joblib import dump, load

## Carga de datos

In [2]:
df = pd.read_csv('data_ready/LEARNING_DF_NORMAL.csv')
df = df.loc[df['year']>=2000].copy()

In [3]:
df

Unnamed: 0,driverId,constructorId,grid,position,year,round,circuitId,weather_warm,weather_cold,weather_dry,weather_wet,weather_cloudy,age,experience,driversPointsBeforeRace,constPointsBeforeRace
0,1,1,1,1,2008,1,1,0,0,0,0,0,8469,364,0.0,0.0
1,1,1,9,5,2008,2,2,0,0,0,0,1,8476,371,10.0,14.0
2,1,1,3,13,2008,3,3,0,0,1,0,0,8490,385,14.0,24.0
3,1,1,5,3,2008,4,4,1,0,0,0,0,8511,406,14.0,28.0
4,1,1,3,2,2008,5,5,1,0,0,0,0,8525,420,20.0,34.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25895,858,3,20,16,2023,2,77,1,0,0,0,0,8113,14,0.0,1.0
25896,858,3,18,16,2023,3,1,1,0,0,0,0,8127,28,0.0,1.0
25897,857,1,18,20,2023,1,3,1,0,0,0,0,8003,0,0.0,0.0
25898,857,1,8,15,2023,2,77,1,0,0,0,0,8017,14,0.0,0.0


## Escalado del dataset

Datos  escalados a la misma escala que la salida.

In [4]:
# Clipping
df['grid'] = df['grid'].clip(upper=20)
df['position'] = df['position'].clip(upper=20)

# MinMaxScaler
scaler_mm = MinMaxScaler(feature_range=(1,20))
columns_to_scale = ['year', 'age', 'experience', 'driversPointsBeforeRace', 'constPointsBeforeRace']
df[columns_to_scale] = scaler_mm.fit_transform(df[columns_to_scale])

# Fit Boolean Columns
columns_to_replace = ['weather_warm', 'weather_cold', 'weather_dry', 'weather_wet', 'weather_cloudy']
df[columns_to_replace] = df[columns_to_replace].replace(1,20)


y = df['position']
X = df.drop(['position'], axis=1)

## Test con validación cruzada



In [5]:
forest = RandomForestRegressor(n_estimators=1000, n_jobs=-1)
svr = SVR()
grad_boost = GradientBoostingRegressor(n_estimators=1000)
linear = LinearRegression(n_jobs=-1)
logistic = LogisticRegression(n_jobs=-1)
sgd = SGDRegressor(loss='huber')
lgbm = LGBMRegressor(n_estimators=100, n_jobs=-1)
xgb = XGBRegressor(n_estimators=1000, n_jobs=-1)

In [6]:
def test_regressors(clf, X, y):
    st = time.time()
    score = -np.mean(cross_val_score(clf, X, y, cv=10, scoring='neg_mean_absolute_error'))
    et = time.time()
    time_taken = et - st
    return (clf.__class__.__name__, score, time_taken)

In [7]:
results = pd.DataFrame(columns=['estimator', 'score', 'time'])

estimators = [forest, svr, grad_boost, linear, logistic, sgd, lgbm, xgb]

In [8]:
for clf in estimators:
    tupla = test_regressors(clf, X, y)
    results.loc[len(results.index)] = [tupla[0], tupla[1], tupla[2]]

In [9]:
results

Unnamed: 0,estimator,score,time
0,RandomForestRegressor,3.889797,55.041918
1,SVR,4.997768,34.788339
2,GradientBoostingRegressor,4.063868,76.778738
3,LinearRegression,3.798663,0.055572
4,LogisticRegression,5.770174,5.820316
5,SGDRegressor,6.965231,0.674924
6,LGBMRegressor,3.78039,0.893409
7,XGBRegressor,4.312992,34.032385


# Resultados con todos los datos normalizados

Datos escalados en el intervalo $[0,1]$.

In [14]:
df = pd.read_csv('data_ready/LEARNING_DF_NORMAL.csv')
df = df.loc[df['year']>=2000].copy()

In [16]:
# Clipping
df['grid'] = df['grid'].clip(upper=20)
df['position'] = df['position'].clip(upper=20)

# MinMaxScaler
scaler_mm = MinMaxScaler(feature_range=(0,1))
columns_to_scale = df.columns.values.tolist()
df[columns_to_scale] = scaler_mm.fit_transform(df[columns_to_scale])

y = df['position']
X = df.drop(['position'], axis=1)

In [22]:
forest = RandomForestRegressor(n_estimators=1000, n_jobs=-1)
svr = SVR()
grad_boost = GradientBoostingRegressor(n_estimators=1000)
linear = LinearRegression(n_jobs=-1)
logistic = LogisticRegression(solver='saga', n_jobs=-1)
sgd = SGDRegressor(loss='huber')
lgbm = LGBMRegressor(n_estimators=100, n_jobs=-1)
xgb = XGBRegressor(n_estimators=1000, n_jobs=-1)

In [23]:
def test_regressors(clf, X, y):
    st = time.time()
    score = -np.mean(cross_val_score(clf, X, y, cv=10, scoring='neg_mean_absolute_error'))
    et = time.time()
    time_taken = et - st
    return (clf.__class__.__name__, score, time_taken)

In [24]:
results = pd.DataFrame(columns=['estimator', 'score', 'time'])

estimators = [forest, svr, grad_boost, linear, logistic, sgd, lgbm, xgb]

In [None]:
for clf in estimators:
    tupla = test_regressors(clf, X, y)
    results.loc[len(results.index)] = [tupla[0], tupla[1], tupla[2]]

In [27]:
results

Unnamed: 0,estimator,score,time
0,RandomForestRegressor,0.2049,52.414689
1,SVR,0.191126,25.534818
2,GradientBoostingRegressor,0.214687,78.225143
3,LinearRegression,0.19993,0.031627
4,LogisticRegression,,0.015621
5,SGDRegressor,0.194608,0.094116
6,LGBMRegressor,0.198848,0.877815
7,XGBRegressor,0.229765,33.851747
