In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score
from sklearn.neural_network import MLPClassifier, MLPRegressor
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Print NumPy floating-point values with 4 digits of precision.
np.set_printoptions(precision=4)

In [2]:
# Location of the exported results table, relative to this notebook.
csv_path = Path('../Resources/PythonExport/data_final.csv')

# Read the full table, report its (rows, columns) size, then preview it.
data = pd.read_csv(csv_path)
print(data.shape)
data.head()

(7987, 14)


Unnamed: 0.1,Unnamed: 0,year,round,resultId,raceId,circuitId,circuitRef,driverId,driverRef,constructorId,grid,position,statusId,Win
0,0,2008,1,1,18,1,albert_park,1,hamilton,1,1,1,1,1
1,1,2008,1,2,18,1,albert_park,2,heidfeld,2,5,2,1,0
2,2,2008,1,3,18,1,albert_park,3,rosberg,3,7,3,1,0
3,3,2008,1,4,18,1,albert_park,4,alonso,4,11,4,1,0
4,4,2008,1,5,18,1,albert_park,5,kovalainen,1,3,5,1,0


In [3]:
# Work on an independent (deep) copy so the loaded `data` frame stays untouched.
df = data.copy(deep=True)

In [4]:
# Training split: every row from seasons before 2019.
train = df[df.year < 2019]
X_train = train[['circuitId', 'driverId', 'constructorId', 'grid']]
y_train = train.Win

# Standardize the features. The fitted `scaler` is reused later on the test
# split, so it must only ever be fitted here, on the training rows.
# NOTE(review): circuitId / driverId / constructorId look like categorical
# identifiers that are being scaled as if they were numeric - confirm intent.
scaler = StandardScaler()

# fit_transform returns a bare array; rebuild the frame with the original
# column names AND the original row labels so X_train stays aligned with
# y_train (which still carries df's index).
X_train = pd.DataFrame(
    scaler.fit_transform(X_train),
    columns=X_train.columns,
    index=X_train.index,
)

In [5]:
# Hold-out split: the 2019 season only.
test = df[(df.year == 2019)]
X_test = test[['circuitId', 'driverId', 'constructorId', 'grid']]
y_test = test.Win

# Apply the scaler that was fitted on the training rows (transform only, no
# re-fit). The result is a bare array, so rebuild the frame with the original
# column names and row labels; that keeps X_test aligned with y_test and
# `test`, both of which still carry df's index.
X_test = pd.DataFrame(
    scaler.transform(X_test),
    columns=X_test.columns,
    index=X_test.index,
)

In [6]:
# Display the hold-out frame (the rows of df where year == 2019) for inspection.
test

Unnamed: 0.1,Unnamed: 0,year,round,resultId,raceId,circuitId,circuitRef,driverId,driverRef,constructorId,grid,position,statusId,Win
7147,7147,2019,1,24203,1010,1,albert_park,822,bottas,131,2,1,1,1
7148,7148,2019,1,24204,1010,1,albert_park,1,hamilton,131,1,2,1,0
7149,7149,2019,1,24205,1010,1,albert_park,830,max_verstappen,9,4,3,1,0
7150,7150,2019,1,24206,1010,1,albert_park,20,vettel,6,3,4,1,0
7151,7151,2019,1,24207,1010,1,albert_park,844,leclerc,6,5,5,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7562,7562,2019,21,24621,1030,24,yas_marina,841,giovinazzi,51,16,16,11,0
7563,7563,2019,21,24622,1030,24,yas_marina,847,russell,3,18,17,11,0
7564,7564,2019,21,24623,1030,24,yas_marina,842,gasly,5,11,18,12,0
7565,7565,2019,21,24624,1030,24,yas_marina,9,kubica,3,19,19,12,0


In [7]:
# Binary win / no-win model: logistic regression using the liblinear solver,
# allowing up to 1000 iterations.
classifier = LogisticRegression(max_iter=1000, solver='liblinear')

# Fit on the scaled training features; fit() returns the estimator, which the
# notebook then displays.
classifier.fit(X=X_train, y=y_train)

LogisticRegression(max_iter=1000, solver='liblinear')

In [8]:
# Per-class probabilities for every test row (one column per class).
prediction1 = classifier.predict_proba(X_test)

# Hard class labels for the same rows.
prediction = classifier.predict(X_test)

In [9]:
# Predicted label next to the true label. `prediction` is a plain array, so the
# resulting frame takes its row labels from y_test.
outcome_columns = {"Prediction": prediction, "Actual": y_test}
Z = pd.DataFrame(outcome_columns)
Z

Unnamed: 0,Prediction,Actual
7147,0,1
7148,0,0
7149,0,0
7150,0,0
7151,0,0
...,...,...
7562,0,0
7563,0,0
7564,0,0
7565,0,0


In [10]:
# Wrap the probability array in a frame with one named column per class.
# No index is supplied, so rows are labelled 0..n-1 here.
proba_columns = ['proba_0', 'proba_1']
Z1 = pd.DataFrame(data=prediction1, columns=proba_columns)
Z1

Unnamed: 0,proba_0,proba_1
0,0.870432,0.129568
1,0.575882,0.424118
2,0.963486,0.036514
3,0.844804,0.155196
4,0.979085,0.020915
...,...,...
415,0.999941,0.000059
416,0.999982,0.000018
417,0.999170,0.000830
418,0.999970,0.000030


In [11]:
## Z1 was built from a bare array, so its rows are labelled 0..n-1, while Z is
## labelled with the original row labels of the test split. Align them here.

# Take the labels straight from y_test instead of adding a hard-coded offset.
# The probabilities are in the same row order as y_test, so this is a pure
# relabelling. Unlike a fixed offset it:
#   * stays correct if the split changes or the labels are not contiguous,
#   * raises if the number of rows does not match,
#   * gives the same result when the cell is re-run.
# rename(None) returns a new, unnamed Index, so y_test's own index object is
# left untouched and Z1 gets no index header.
Z1 = Z1.set_axis(y_test.index.rename(None), axis=0)

Z1.tail()

Unnamed: 0,proba_0,proba_1
7562,0.999941,5.9e-05
7563,0.999982,1.8e-05
7564,0.99917,0.00083
7565,0.99997,3e-05
7566,0.999255,0.000745


In [12]:
# Combine labels (Z) and probabilities (Z1) on their shared row labels.
# An outer join keeps rows present on only one side, filling the gaps with NaN.
Z_final = Z.merge(Z1, how='outer', left_index=True, right_index=True)
Z_final

Unnamed: 0,Prediction,Actual,proba_0,proba_1
7147,0,1,0.870432,0.129568
7148,0,0,0.575882,0.424118
7149,0,0,0.963486,0.036514
7150,0,0,0.844804,0.155196
7151,0,0,0.979085,0.020915
...,...,...,...,...
7562,0,0,0.999941,0.000059
7563,0,0,0.999982,0.000018
7564,0,0,0.999170,0.000830
7565,0,0,0.999970,0.000030


In [13]:
# Share of test rows whose predicted label equals the true label.
# (accuracy_score is imported with the other sklearn.metrics names in the
# import cell at the top of the notebook.)
# NOTE(review): winners are a small minority of rows, so this number is
# dominated by correctly predicted non-winners - read it alongside the
# per-race winner score computed further down.
print(accuracy_score(y_test, prediction))

0.9404761904761905


In [14]:
# Attach race context (round, circuit, driver) from `test` to the model output.
# The needed columns are selected explicitly on both sides, so:
#   * re-running the cell right away gives the same frame (merging the whole of
#     Z_final back into `test` would create round_x / round_y style duplicates),
#   * a missing column raises a KeyError instead of silently becoming an
#     all-NaN column.
context_cols = ['round', 'circuitRef', 'driverRef']
model_cols = ['Prediction', 'Actual', 'proba_1']
Z_final = test[context_cols].merge(
    Z_final[model_cols], left_index=True, right_index=True, how='outer'
)
print(Z_final.shape)
Z_final.head()

(420, 6)


Unnamed: 0,round,circuitRef,driverRef,Prediction,Actual,proba_1
7147,1,albert_park,bottas,0,1,0.129568
7148,1,albert_park,hamilton,0,0,0.424118
7149,1,albert_park,max_verstappen,0,0,0.036514
7150,1,albert_park,vettel,0,0,0.155196
7151,1,albert_park,leclerc,0,0,0.020915


In [15]:
# Spot check: every driver's row for the first round of the season.
is_first_round = Z_final['round'].eq(1)
X = Z_final[is_first_round]
X

Unnamed: 0,round,circuitRef,driverRef,Prediction,Actual,proba_1
7147,1,albert_park,bottas,0,1,0.129568
7148,1,albert_park,hamilton,0,0,0.424118
7149,1,albert_park,max_verstappen,0,0,0.036514
7150,1,albert_park,vettel,0,0,0.155196
7151,1,albert_park,leclerc,0,0,0.020915
7152,1,albert_park,kevin_magnussen,0,0,0.011146
7153,1,albert_park,hulkenberg,0,0,0.000827
7154,1,albert_park,raikkonen,0,0,0.00756
7155,1,albert_park,stroll,0,0,7.9e-05
7156,1,albert_park,kvyat,0,0,9e-05


In [16]:
# Highest predicted win probability within each round, one row per round.
maxprob = (
    Z_final
    .groupby('round', as_index=False)['proba_1']
    .max()
    .rename(columns={'proba_1': 'proba_1_max'})
)

# Broadcast that per-round maximum back onto every driver row.
# Any proba_1_max left over from a previous run is dropped first; otherwise a
# re-run would produce proba_1_max_x / proba_1_max_y instead of one column.
# The left merge keeps Z_final's row order and yields a fresh 0..n-1 index.
Z_final = (
    Z_final
    .drop(columns='proba_1_max', errors='ignore')
    .merge(maxprob, how='left', on='round')
)
Z_final

Unnamed: 0,round,circuitRef,driverRef,Prediction,Actual,proba_1,proba_1_max
0,1,albert_park,bottas,0,1,0.129568,0.424118
1,1,albert_park,hamilton,0,0,0.424118,0.424118
2,1,albert_park,max_verstappen,0,0,0.036514,0.424118
3,1,albert_park,vettel,0,0,0.155196,0.424118
4,1,albert_park,leclerc,0,0,0.020915,0.424118
...,...,...,...,...,...,...,...
415,21,yas_marina,giovinazzi,0,0,0.000059,0.435555
416,21,yas_marina,russell,0,0,0.000018,0.435555
417,21,yas_marina,gasly,0,0,0.000830,0.435555
418,21,yas_marina,kubica,0,0,0.000030,0.435555


In [17]:
# Predicted winner of each round: the driver whose win probability equals the
# round's maximum.
# If two drivers tie on that maximum, only the first is kept. Without this a
# tied round would contribute several rows to the later winners merge and
# distort the hit rate.
is_top_pick = Z_final['proba_1'] == Z_final['proba_1_max']
pred_winner = (
    Z_final[is_top_pick]
    .drop_duplicates(subset='round', keep='first')
    .reset_index()
    .rename(columns={'driverRef': 'pred_driver'})
)

# Actual winner of each round: the rows flagged Actual == 1.
actual_winner = (
    Z_final[Z_final['Actual'] == 1]
    .reset_index()
    .rename(columns={'driverRef': 'actual_driver'})
)

In [18]:
# Pair each round's actual winner with the model's pick for that round and keep
# only the three columns needed for the comparison.
winner_cols = ['round', 'actual_driver', 'pred_driver']
winners = pd.merge(actual_winner, pred_winner, on='round').reindex(columns=winner_cols)
print(winners.shape)
winners.head()

(21, 3)


Unnamed: 0,round,actual_driver,pred_driver
0,1,bottas,hamilton
1,2,hamilton,vettel
2,3,hamilton,hamilton
3,4,bottas,raikkonen
4,5,hamilton,hamilton


In [19]:
# Fraction of rounds in which the predicted winner is the actual winner.
correct_calls = winners['actual_driver'].eq(winners['pred_driver'])
score = int(correct_calls.sum()) / len(winners)
score

0.38095238095238093