In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import os
from pathlib import Path
from sklearn.preprocessing import StandardScaler

In [2]:
df = pd.read_csv(Path('../Resources/PythonExport/race_final.csv'))
print(df.shape)
df.head()

(4297, 14)


Unnamed: 0.1,Unnamed: 0,resultId,raceId,name,year,circuitId,driverId,constructorId,starting_position,finishing_status,firstlap_position,lap,train_test,ending_position
0,20323,20323,337,Bahrain Grand Prix,2010,3,4,6,3,1,2,1,1,1
1,20324,20324,337,Bahrain Grand Prix,2010,3,13,6,2,1,3,1,1,2
2,20325,20325,337,Bahrain Grand Prix,2010,3,1,1,4,1,5,1,1,3
3,20326,20326,337,Bahrain Grand Prix,2010,3,20,9,1,1,1,1,1,4
4,20327,20327,337,Bahrain Grand Prix,2010,3,3,131,5,1,4,1,1,5


In [3]:
# We are only taking rows where the finishing_status = 1, aka driver finished the race
#df1 = df.loc[(df["finishing_status"] == 1)]

In [4]:
# Creating our train and test data sets using the train_test column
df_train = df.loc[df["train_test"] == 1]
df_test = df.loc[df["train_test"] == 2]
print(df_train.shape)
print(df_test.shape)

(3877, 14)
(420, 14)


In [5]:
X_train = df_train[['circuitId','driverId','constructorId', 'starting_position', 'firstlap_position']]
y_train = df_train['ending_position']

In [6]:
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)

In [7]:
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(solver='lbfgs', multi_class='multinomial', max_iter=1000)
classifier

LogisticRegression(max_iter=1000, multi_class='multinomial')

In [8]:
classifier.fit(X_train_scaled, y_train)

LogisticRegression(max_iter=1000, multi_class='multinomial')

In [9]:
X_test = df_test[['circuitId','driverId','constructorId', 'starting_position', 'firstlap_position']]
y_test = df_test['ending_position']

In [10]:
df_test['ending_position'].unique()

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
        0, 18, 19, 20])

In [11]:
scaler1 = StandardScaler().fit(X_test)
X_test_scaled = scaler.transform(X_test)

In [12]:
y_pred = classifier.predict(X_test_scaled)
Z = pd.DataFrame({"Prediction": y_pred, "Actual": y_test})

In [13]:
from sklearn.metrics import accuracy_score
print(accuracy_score(y_test, y_pred))

0.16904761904761906


In [14]:
# Merging the Z dataframe and df_test datframe 
df_test1 = pd.merge(df_test, Z, left_index=True, right_index=True, how='outer')

In [15]:
# Number of races

positions = [1,2,3]

for i in positions:
    x = len(df_test1.loc[(df_test1["Prediction"] == i) & (df_test1["Actual"] == i)])
    y = len(df_test1.loc[df_test1["Actual"] == i])
    print(f"The model predicted {x} race positions accurately out of {y} for race position {i} for 2019 season")

The model predicted 15 race positions accurately out of 21 for race position 1 for 2019 season
The model predicted 7 race positions accurately out of 21 for race position 2 for 2019 season
The model predicted 3 race positions accurately out of 21 for race position 3 for 2019 season


## Support Vector Machines - multi class¶

In [16]:
from sklearn import svm, datasets
import sklearn.model_selection as model_selection
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
import numpy as np

In [17]:
rbf = svm.SVC(kernel='rbf', gamma=0.5, C=0.1).fit(X_train_scaled, y_train)
poly = svm.SVC(kernel='poly', degree=3, C=1).fit(X_train_scaled, y_train)

In [18]:
poly_pred = poly.predict(X_test_scaled)
rbf_pred = rbf.predict(X_test_scaled)

In [19]:
poly_accuracy = accuracy_score(y_test, poly_pred)
poly_f1 = f1_score(y_test, poly_pred, average='weighted')
print('Accuracy (Polynomial Kernel): ', "%.2f" % (poly_accuracy*100))
print('F1 (Polynomial Kernel): ', "%.2f" % (poly_f1*100))

Accuracy (Polynomial Kernel):  16.67
F1 (Polynomial Kernel):  8.57


In [20]:
rbf_accuracy = accuracy_score(y_test, rbf_pred)
rbf_f1 = f1_score(y_test, rbf_pred, average='weighted')
print('Accuracy (RBF Kernel): ', "%.2f" % (rbf_accuracy*100))
print('F1 (RBF Kernel): ', "%.2f" % (rbf_f1*100))

Accuracy (RBF Kernel):  14.76
F1 (RBF Kernel):  5.71


In [21]:
# Making predictions using the testing data.
Z1 = pd.DataFrame({"Prediction": rbf_pred, "Actual": y_test})

In [22]:
# Merging the Z dataframe and df_test datframe 
df_test2 = pd.merge(df_test, Z1, left_index=True, right_index=True, how='outer')

In [23]:
# Number of races

positions = [1,2,3]

for i in positions:
    x = len(df_test2.loc[(df_test1["Prediction"] == i) & (df_test1["Actual"] == i)])
    y = len(df_test2.loc[df_test1["Actual"] == i])
    print(f"The model predicted {x} race positions accurately out of {y} for race position {i} for 2019 season")

The model predicted 15 race positions accurately out of 21 for race position 1 for 2019 season
The model predicted 7 race positions accurately out of 21 for race position 2 for 2019 season
The model predicted 3 race positions accurately out of 21 for race position 3 for 2019 season


## Random Forrest - multi class

In [24]:
from sklearn.ensemble import RandomForestClassifier

In [25]:
# Create a random forest classifier.
##rf_model = RandomForestClassifier(n_estimators=128, random_state=78) 
rf_model = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 42)

In [26]:
# Fitting the model
rf_model = rf_model.fit(X_train_scaled, y_train)

In [27]:
# Making predictions using the testing data.
predictions = rf_model.predict(X_test_scaled)
Z2 = pd.DataFrame({"Prediction": predictions, "Actual": y_test})

In [28]:
# Calculating the accuracy score.
acc_score = accuracy_score(y_test, predictions)
acc_score

0.14523809523809525

In [29]:
# Merging the Z dataframe and df_test datframe 
df_test3 = pd.merge(df_test, Z2, left_index=True, right_index=True, how='outer')

In [30]:
# Number of races

positions = [1,2,3]

for i in positions:
    x = len(df_test3.loc[(df_test1["Prediction"] == i) & (df_test1["Actual"] == i)])
    y = len(df_test3.loc[df_test1["Actual"] == i])
    print(f"The model predicted {x} race positions accurately out of {y} for race position {i} for 2019 season")

The model predicted 15 race positions accurately out of 21 for race position 1 for 2019 season
The model predicted 7 race positions accurately out of 21 for race position 2 for 2019 season
The model predicted 3 race positions accurately out of 21 for race position 3 for 2019 season


In [31]:
importances = rf_model.feature_importances_
# We can sort the features by their importance.
sorted(zip(rf_model.feature_importances_, X_train.columns), reverse=True)

[(0.23947617727936282, 'circuitId'),
 (0.2350281650040956, 'firstlap_position'),
 (0.2155385919357343, 'starting_position'),
 (0.17481675553160927, 'driverId'),
 (0.135140310249198, 'constructorId')]

## XGBoost

In [32]:
#from numpy import mean
#from numpy import std
#from sklearn.datasets import make_classification
#from sklearn.model_selection import cross_val_score
#from sklearn.model_selection import RepeatedStratifiedKFold
#from xgboost import XGBRFClassifier

# define the model
#model2 = XGBRFClassifier(n_estimators=100, subsample=0.9, colsample_bynode=0.2)
# define the model evaluation procedure
#cv = RepeatedStratifiedKFold(n_splits=9, n_repeats=3, random_state=1)
# evaluate the model and collect the scores
#n_scores = cross_val_score(model2, X_train_scaled, y_train, scoring='accuracy', cv=cv, n_jobs=-1)
# report performance
#print('Mean Accuracy: %.3f (%.3f)' % (mean(n_scores), std(n_scores)))