In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import os
from pathlib import Path
from sklearn.preprocessing import StandardScaler

In [2]:
# Load the three-bin race results export and take a first look at it.
csv_path = Path('../Resources/PythonExport/threebin_final.csv')
df = pd.read_csv(csv_path)
print(df.shape)
df.head()

(4297, 15)


Unnamed: 0.1,Unnamed: 0,resultId,raceId,name,year,circuitId,driverId,constructorId,starting_position,finishing_status,firstlap_position,lap,train_test,ending_position,ending_bin
0,20323,20323,337,Bahrain Grand Prix,2010,3,4,6,3,1,2.0,1.0,1,1,1
1,20324,20324,337,Bahrain Grand Prix,2010,3,13,6,2,1,3.0,1.0,1,2,1
2,20325,20325,337,Bahrain Grand Prix,2010,3,1,1,4,1,5.0,1.0,1,3,1
3,20326,20326,337,Bahrain Grand Prix,2010,3,20,9,1,1,1.0,1.0,1,4,2
4,20327,20327,337,Bahrain Grand Prix,2010,3,3,131,5,1,4.0,1.0,1,5,2


In [3]:
# Keep only the rows with finishing_status == 1, i.e. drivers who finished the race.
finished_mask = df["finishing_status"] == 1
df1 = df[finished_mask]

In [4]:
# The train_test column carries the split label: 1 selects training rows, 2 selects test rows.
split_label = df1["train_test"]
df_train = df1[split_label == 1]
df_test = df1[split_label == 2]
for subset in (df_train, df_test):
    print(subset.shape)

(3092, 15)
(360, 15)


In [5]:
# Model inputs and target (ending_bin) for the training rows.
feature_columns = ['circuitId', 'driverId', 'constructorId', 'starting_position', 'firstlap_position']
X_train = df_train.loc[:, feature_columns]
y_train = df_train.loc[:, 'ending_bin']

In [6]:
# Fit the scaler on the training features only, then standardise those same features.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

In [7]:
from sklearn.linear_model import LogisticRegression
# Multinomial logistic regression using the lbfgs solver, allowed up to 1000 iterations.
classifier = LogisticRegression(
    max_iter=1000,
    multi_class='multinomial',
    solver='lbfgs',
)
classifier

LogisticRegression(max_iter=1000, multi_class='multinomial')

In [8]:
# Train on the standardised training features; the fitted estimator is the cell's displayed value.
classifier.fit(X=X_train_scaled, y=y_train)

LogisticRegression(max_iter=1000, multi_class='multinomial')

In [9]:
# Same input columns and target as the training set, taken from the test rows.
test_feature_columns = ['circuitId', 'driverId', 'constructorId', 'starting_position', 'firstlap_position']
X_test = df_test.loc[:, test_feature_columns]
y_test = df_test.loc[:, 'ending_bin']

In [10]:
# Standardise the test features with the scaler that was fitted on the training data.
# A second scaler fitted on X_test used to be created here but was never used.
# Fitting on the test rows would leak test statistics and put the test features on a
# different scale from the one the models were trained on, so it has been removed.
X_test_scaled = scaler.transform(X_test)

In [11]:
# Predict a bin for every test row and place it next to the actual bin, keyed by y_test's index.
y_pred = classifier.predict(X_test_scaled)
Z = pd.DataFrame({
    "Prediction": pd.Series(y_pred, index=y_test.index),
    "Actual": y_test,
})

In [12]:
from sklearn.metrics import accuracy_score
# Share of test rows whose predicted bin equals the actual bin.
logreg_accuracy = accuracy_score(y_test, y_pred)
print(logreg_accuracy)

0.7166666666666667


In [13]:
# Join the Prediction/Actual frame (Z) onto df_test by row index.
df_test1 = df_test.merge(Z, how='outer', left_index=True, right_index=True)

In [14]:
# For each bin: how many test rows actually fall in it, and how many of those were predicted correctly.

bins = [1, 2, 3]

for i in bins:
    in_bin = df_test1["Actual"] == i
    predicted_bin = df_test1["Prediction"] == i
    x = int((predicted_bin & in_bin).sum())
    y = int(in_bin.sum())
    print(f"The model predicted {x} race positions accurately out of {y} for bin {i} for 2019 season")

The model predicted 46 race positions accurately out of 63 for bin 1 for 2019 season
The model predicted 96 race positions accurately out of 147 for bin 2 for 2019 season
The model predicted 116 race positions accurately out of 150 for bin 3 for 2019 season


In [15]:
# First row of the fitted coefficient matrix: one weight per input column.
# NOTE(review): for a multinomial model coef_ holds one row per class, so this looks like
# the weights for the first class only - confirm that is the intended "importance".
importance1 = classifier.coef_[0, :]
importance1

array([-1.09250115, -0.19513351,  0.45766828,  0.67821542,  1.67060666])

In [16]:
# Pair each weight with its column name and list the pairs from largest to smallest signed value
# (strongly negative weights therefore end up last).
coef_by_feature = list(zip(importance1, X_train.columns))
coef_by_feature.sort(reverse=True)
coef_by_feature

[(1.6706066551109284, 'firstlap_position'),
 (0.6782154211743238, 'starting_position'),
 (0.45766827934752774, 'constructorId'),
 (-0.19513350712799624, 'driverId'),
 (-1.092501150637339, 'circuitId')]

## Support Vector Machines - multi class

In [17]:
import numpy as np

In [18]:
from sklearn import svm, datasets
import sklearn.model_selection as model_selection
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

In [19]:
# Two support vector classifiers trained on the standardised training data:
# one with an RBF kernel, one with a degree-3 polynomial kernel.
rbf_svc = svm.SVC(C=0.1, gamma=0.5, kernel='rbf')
poly_svc = svm.SVC(C=1, degree=3, kernel='poly')
rbf = rbf_svc.fit(X_train_scaled, y_train)
poly = poly_svc.fit(X_train_scaled, y_train)

In [20]:
# Predicted bins for the test rows from each fitted SVM.
poly_pred, rbf_pred = (fitted.predict(X_test_scaled) for fitted in (poly, rbf))

In [21]:
# Accuracy and weighted F1 of the polynomial-kernel SVM on the test rows, printed as percentages.
poly_accuracy = accuracy_score(y_test, poly_pred)
poly_f1 = f1_score(y_test, poly_pred, average='weighted')
for label, score in (
    ('Accuracy (Polynomial Kernel): ', poly_accuracy),
    ('F1 (Polynomial Kernel): ', poly_f1),
):
    print(label, "%.2f" % (score*100))

Accuracy (Polynomial Kernel):  69.17
F1 (Polynomial Kernel):  69.23


In [22]:
# Accuracy and weighted F1 of the RBF-kernel SVM on the test rows, printed as percentages.
rbf_accuracy = accuracy_score(y_test, rbf_pred)
rbf_f1 = f1_score(y_test, rbf_pred, average='weighted')
for label, score in (
    ('Accuracy (RBF Kernel): ', rbf_accuracy),
    ('F1 (RBF Kernel): ', rbf_f1),
):
    print(label, "%.2f" % (score*100))

Accuracy (RBF Kernel):  63.89
F1 (RBF Kernel):  63.19


## Random Forest

In [23]:
from sklearn.ensemble import RandomForestClassifier

In [24]:
# Random forest with 128 trees and a fixed random_state.
# Alternative configuration kept for reference:
#   RandomForestClassifier(n_estimators=10, criterion='entropy', random_state=42)
rf_model = RandomForestClassifier(random_state=78, n_estimators=128)

In [25]:
# Train the forest on the standardised training features.
rf_model = rf_model.fit(X=X_train_scaled, y=y_train)

In [26]:
# Predict the test rows with the forest and show the predictions beside the actual bins.
predictions = rf_model.predict(X_test_scaled)
Z = pd.DataFrame({
    "Prediction": pd.Series(predictions, index=y_test.index),
    "Actual": y_test,
})
Z

Unnamed: 0,Prediction,Actual
3877,1,1
3878,1,1
3879,2,1
3880,1,2
3881,2,2
...,...,...
4291,3,3
4292,3,3
4293,3,3
4294,3,3


In [27]:
# Fraction of test rows the random forest assigned to the correct bin.
acc_score = accuracy_score(y_true=y_test, y_pred=predictions)
acc_score

0.6805555555555556

In [28]:
# Feature importances reported by the fitted random forest, one value per input column
# (presumably in the same order as X_train.columns, which the next cell zips them with).
importances = rf_model.feature_importances_
importances

array([0.18491573, 0.14755016, 0.11447309, 0.25160403, 0.30145699])

In [29]:
# Pair each importance with its column name and list the pairs from highest to lowest.
ranked_importances = list(zip(rf_model.feature_importances_, X_train.columns))
ranked_importances.sort(reverse=True)
ranked_importances

[(0.30145698637455515, 'firstlap_position'),
 (0.2516040316478057, 'starting_position'),
 (0.18491573142489443, 'circuitId'),
 (0.14755016054374692, 'driverId'),
 (0.11447309000899786, 'constructorId')]

## XGBoost

In [32]:
from numpy import mean
from numpy import std
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from xgboost import XGBRFClassifier

# define the model
# random_state pins the row/column subsampling (subsample, colsample_bynode) so that
# re-running the cell reproduces the same scores.
model2 = XGBRFClassifier(n_estimators=100, subsample=0.9, colsample_bynode=0.2, random_state=1)
# define the model evaluation procedure
cv = RepeatedStratifiedKFold(n_splits=9, n_repeats=3, random_state=1)
# Recent XGBoost releases reject class labels that are not 0..n_classes-1, so encode the
# bins as zero-based codes (sorted, so the class order is kept). Accuracy and the
# stratified folds are unaffected by this relabelling.
y_train_codes = pd.factorize(y_train, sort=True)[0]
# evaluate the model and collect the scores
# NOTE: this is cross-validation on the training rows only, so the figure is not directly
# comparable with the test-set accuracies reported for the other models.
n_scores = cross_val_score(model2, X_train_scaled, y_train_codes, scoring='accuracy', cv=cv, n_jobs=-1)
# report performance
print('Mean Accuracy: %.3f (%.3f)' % (mean(n_scores), std(n_scores)))

Mean Accuracy: 0.748 (0.018)
