In [1]:
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm, datasets
import sklearn.model_selection as model_selection
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

In [2]:
data = pd.read_csv(Path('../Resources/PythonExport/data_final.csv'))
print(data.shape)
data.head()

(25040, 19)


Unnamed: 0.1,Unnamed: 0,year,round,resultId,raceId,circuitId,circuitRef,driverId,driverRef,constructorId,grid,position,statusId,weather_warm,weather_cold,weather_dry,weather_wet,weather_cloudy,Win
0,0,2008,1,1,18,1,albert_park,1,hamilton,1,1,1,1,1,0,0,0,0,1
1,1,2008,1,2,18,1,albert_park,2,heidfeld,2,5,2,1,1,0,0,0,0,0
2,2,2008,1,3,18,1,albert_park,3,rosberg,3,7,3,1,1,0,0,0,0,0
3,3,2008,1,4,18,1,albert_park,4,alonso,4,11,4,1,1,0,0,0,0,0
4,4,2008,1,5,18,1,albert_park,5,kovalainen,1,3,5,1,1,0,0,0,0,0


In [3]:
df = data.copy()

In [4]:
# Train the data with all outcomes before 2019
train = df[df.year <2019]
X_train = train[['year', 'round', 'circuitId','driverId', 'constructorId', 'grid', 'weather_warm', 'weather_cold', 'weather_dry',
       'weather_wet', 'weather_cloudy']]
y_train = train.Win

scaler = StandardScaler()
X_train = pd.DataFrame(scaler.fit_transform(X_train), columns = X_train.columns)

In [5]:
# Create the test dataset
test = df[(df.year == 2019)]
test = test.reset_index()
test = test.drop(["index"], axis=1)

X_test = test[['year', 'round', 'circuitId','driverId', 'constructorId', 'grid', 'weather_warm', 'weather_cold', 'weather_dry',
       'weather_wet', 'weather_cloudy']]
y_test = test.Win
X_test = pd.DataFrame(scaler.transform(X_test), columns = X_test.columns)

In [6]:
rf_model = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 42)
rf_model = rf_model.fit(X_train, y_train)

In [8]:
predictions = rf_model.predict(X_test)
prediction1 = rf_model.predict_proba(X_test)

In [10]:
acc_score = accuracy_score(y_test, predictions)
print(f'Accuracy Score (Random Forrest): ', "%.2f" % (acc_score*100))

Accuracy Score (Random Forrest):  95.00


In [11]:
Z = pd.DataFrame({"Prediction": predictions, "Actual": y_test})
# Create second Data Frame for prediction probabilities
Z1 = pd.DataFrame(prediction1, columns = ['proba_0', 'proba_1'])
# Merging the Z and Z1 dataframes 
Z_final = pd.merge(Z, Z1, left_index=True, right_index=True, how='outer')

In [12]:
# Merge Z_final with test. Only bring in columns from test that we need
Z_final = (Z_final.merge(test, left_index=True, right_index=True, how='outer').reindex(
            columns=['round', 'circuitRef','driverRef','Prediction', 'Actual', 'proba_1']))
print(Z_final.shape)
Z_final.head()

(420, 6)


Unnamed: 0,round,circuitRef,driverRef,Prediction,Actual,proba_1
0,1,albert_park,bottas,0,1,0.0
1,1,albert_park,hamilton,0,0,0.3
2,1,albert_park,max_verstappen,0,0,0.1
3,1,albert_park,vettel,0,0,0.5
4,1,albert_park,leclerc,0,0,0.0


In [13]:
# We will groupby 'round' and take the max probability for a driver to win
maxprob = Z_final.groupby(['round']).agg({'proba_1':'max'})
maxprob = maxprob.reset_index()
maxprob = maxprob.rename(columns={'proba_1':'proba_1_max'})

# We will then merge the proba_1_max column to Z_Final
Z_final = pd.merge(Z_final, maxprob, how='left', on=['round'])
Z_final

Unnamed: 0,round,circuitRef,driverRef,Prediction,Actual,proba_1,proba_1_max
0,1,albert_park,bottas,0,1,0.0,0.5
1,1,albert_park,hamilton,0,0,0.3,0.5
2,1,albert_park,max_verstappen,0,0,0.1,0.5
3,1,albert_park,vettel,0,0,0.5,0.5
4,1,albert_park,leclerc,0,0,0.0,0.5
...,...,...,...,...,...,...,...
415,21,yas_marina,giovinazzi,0,0,0.0,0.6
416,21,yas_marina,russell,0,0,0.0,0.6
417,21,yas_marina,gasly,0,0,0.0,0.6
418,21,yas_marina,kubica,0,0,0.0,0.6


In [14]:
# We will create a new dataframe with our predicted winner
pred_winner = Z_final[Z_final['proba_1'] == Z_final['proba_1_max']]
pred_winner = pred_winner.reset_index()
pred_winner = pred_winner.rename(columns={'driverRef':'pred_driver'})

# We will create a second dataframe with our actual winners
actual_winner = Z_final[Z_final['Actual']==1]
actual_winner = actual_winner.reset_index()
actual_winner = actual_winner.rename(columns={'driverRef':'actual_driver'})

In [15]:
# We can merge pred_winners and actual_winners into one dataframe
winners = (actual_winner.merge(pred_winner, left_on='round', right_on='round').reindex(
            columns=['round', 'actual_driver', 'pred_driver']))
print(winners.shape)
winners.head()

(23, 3)


Unnamed: 0,round,actual_driver,pred_driver
0,1,bottas,vettel
1,2,hamilton,vettel
2,3,hamilton,hamilton
3,4,bottas,hamilton
4,5,hamilton,max_verstappen


In [16]:
score = len(winners[winners['actual_driver']==winners['pred_driver']])
score

10