In [1]:
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn import svm, datasets
from sklearn.svm import SVC
import sklearn.model_selection as model_selection
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

In [2]:
data = pd.read_csv(Path('../Resources/PythonExport/data_final.csv'))
print(data.shape)
data.head()

(7940, 17)


Unnamed: 0.1,Unnamed: 0,year,round,resultId,raceId,circuitId,driverId,constructorId,grid,position,weather_warm,weather_cold,weather_dry,weather_wet,weather_cloudy,Win,podium
0,0,2008,1,1,18,1,1,1,1,1,1,0,0,0,0,1,1
1,1,2008,1,2,18,1,2,2,5,2,1,0,0,0,0,0,2
2,2,2008,1,3,18,1,3,3,7,3,1,0,0,0,0,0,3
3,3,2008,1,4,18,1,4,4,11,4,1,0,0,0,0,0,0
4,4,2008,1,5,18,1,5,1,3,5,1,0,0,0,0,0,0


In [3]:
df = data.copy()

In [4]:
# Train the data with all outcomes before 2019
train = df[df['year']<=2018]
X_train = train[['year', 'round', 'circuitId','driverId', 'constructorId', 'grid', 'weather_warm', 'weather_cold', 'weather_dry',
       'weather_wet', 'weather_cloudy']]
y_train = train.Win

scaler = StandardScaler()
X_train = pd.DataFrame(scaler.fit_transform(X_train), columns = X_train.columns)

In [5]:
# Create the test dataset
test = df[(df.year == 2019)]
test = test.reset_index()
test = test.drop(["index"], axis=1)

X_test = test[['year', 'round','circuitId','driverId', 'constructorId', 'grid', 'weather_warm', 'weather_cold', 'weather_dry',
       'weather_wet', 'weather_cloudy']]
y_test = test.Win
X_test = pd.DataFrame(scaler.transform(X_test), columns = X_test.columns)

In [6]:
rbf = svm.SVC(kernel='rbf',probability=True).fit(X_train, y_train)

In [7]:
rbf_pred = rbf.predict(X_test)
rbf_pred1 = rbf.predict_proba(X_test)

In [8]:
rbf_accuracy = accuracy_score(y_test, rbf_pred)
rbf_f1 = f1_score(y_test, rbf_pred, average='weighted')
print('Accuracy (Rbf Kernel): ', "%.2f" % (rbf_accuracy*100))
print('F1 (Rbf Kernel): ', "%.2f" % (rbf_f1*100))

Accuracy (Rbf Kernel):  95.00
F1 (Rbf Kernel):  92.56


In [9]:
Z = pd.DataFrame({"Prediction": rbf_pred, "Actual": y_test})
# Create second Data Frame for prediction probabilities
Z1 = pd.DataFrame(rbf_pred1, columns = ['proba_0', 'proba_1'])
# Merging the Z and Z1 dataframes 
Z_final = pd.merge(Z, Z1, left_index=True, right_index=True, how='outer')

In [10]:
# Merge Z_final with test. Only bring in columns from test that we need
Z_final = (Z_final.merge(test, left_index=True, right_index=True, how='outer').reindex(
            columns=['raceId', 'round', 'circuitId','driverId','Prediction', 'Actual', 'proba_1', 'Win', 'podium']))
print(Z_final.shape)
Z_final.head()

(420, 9)


Unnamed: 0,raceId,round,circuitId,driverId,Prediction,Actual,proba_1,Win,podium
0,1010,1,1,822,0,1,0.027366,1,1
1,1010,1,1,1,0,0,0.072876,0,2
2,1010,1,1,830,0,0,0.060218,0,3
3,1010,1,1,20,0,0,0.06128,0,0
4,1010,1,1,844,0,0,0.060402,0,0


In [11]:
# We will groupby 'round' and take the max probability for a driver to win
maxprob = Z_final.groupby(['raceId']).agg({'proba_1':'max'})
maxprob = maxprob.reset_index()
maxprob = maxprob.rename(columns={'proba_1':'proba_1_max'})

# We will then merge the proba_1_max column to Z_Final
Z_final = pd.merge(Z_final, maxprob, how='left', on=['raceId'])
Z_final

Unnamed: 0,raceId,round,circuitId,driverId,Prediction,Actual,proba_1,Win,podium,proba_1_max
0,1010,1,1,822,0,1,0.027366,1,1,0.072876
1,1010,1,1,1,0,0,0.072876,0,2,0.072876
2,1010,1,1,830,0,0,0.060218,0,3,0.072876
3,1010,1,1,20,0,0,0.061280,0,0,0.072876
4,1010,1,1,844,0,0,0.060402,0,0,0.072876
...,...,...,...,...,...,...,...,...,...,...
415,1030,21,24,841,0,0,0.022312,0,0,0.500000
416,1030,21,24,847,0,0,0.026308,0,0,0.500000
417,1030,21,24,842,0,0,0.030151,0,0,0.500000
418,1030,21,24,9,0,0,0.017742,0,0,0.500000


In [12]:
# Number of times our predicted winner is the actual winner
winner = []
for index, row in Z_final.iterrows():
    if row['proba_1'] == row['proba_1_max']:
        if row['podium'] in [1]:
            winner.append(1)
        else:
            winner.append(0)
    else:
        winner.append(0)
        
Z_final['winner'] = winner
Z_final['winner'].sum()

12

In [13]:
# Number of times our predicted winner is in the top 2
top2 = []
for index, row in Z_final.iterrows():
    if row['proba_1'] == row['proba_1_max']:
        if row['podium'] in [1,2]:
            top2.append(1)
        else:
            top2.append(0)
    else:
        top2.append(0)
        
Z_final['top2'] = top2
Z_final['top2'].sum()

15

In [14]:
# Number of times our predicted winner is in the top 2
top3 = []
for index, row in Z_final.iterrows():
    if row['proba_1'] == row['proba_1_max']:
        if row['podium'] in [1,2,3]:
            top3.append(1)
        else:
            top3.append(0)
    else:
        top3.append(0)
        
Z_final['top3'] = top3
Z_final['top3'].sum()

16

In [15]:
# Extract Z_final.CSV file for all race position predictions
Z_final.to_csv('../Resources/PythonExport/race_positions_raw.csv')
test.to_csv('../Resources/PythonExport/race_positions_raw1.csv')