In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import os
import numpy as np
from pathlib import Path
from sklearn.preprocessing import StandardScaler
from sklearn import linear_model
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression

In [26]:
df = pd.read_csv(Path('../Resources/PythonExport/race_final.csv'))
print(df.shape)
df

(4297, 14)


Unnamed: 0.1,Unnamed: 0,resultId,raceId,name,year,circuitId,driverId,constructorId,starting_position,finishing_status,firstlap_position,lap,train_test,ending_position
0,20323,20323,337,Bahrain Grand Prix,2010,3,4,6,3,1,2,1,1,1
1,20324,20324,337,Bahrain Grand Prix,2010,3,13,6,2,1,3,1,1,2
2,20325,20325,337,Bahrain Grand Prix,2010,3,1,1,4,1,5,1,1,3
3,20326,20326,337,Bahrain Grand Prix,2010,3,20,9,1,1,1,1,1,4
4,20327,20327,337,Bahrain Grand Prix,2010,3,3,131,5,1,4,1,1,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4292,24635,24621,1030,Abu Dhabi Grand Prix,2019,24,841,51,16,1,13,1,2,16
4293,24636,24622,1030,Abu Dhabi Grand Prix,2019,24,847,3,18,1,19,1,2,17
4294,24637,24623,1030,Abu Dhabi Grand Prix,2019,24,842,5,11,1,20,1,2,18
4295,24638,24624,1030,Abu Dhabi Grand Prix,2019,24,9,3,19,1,18,1,2,19


In [27]:
df["circuitId"].value_counts()

4     218
6     218
15    218
11    218
7     218
9     218
13    218
17    218
22    218
14    218
24    216
1     216
18    214
3     194
2     178
69    166
10    132
70    124
71    123
32    102
35     94
73     82
12     72
68     70
5      48
20     46
34     40
Name: circuitId, dtype: int64

In [31]:
df.dtypes

Unnamed: 0            int64
resultId              int64
raceId                int64
name                 object
year                  int64
circuitId             int64
driverId              int64
constructorId         int64
starting_position     int64
finishing_status      int64
firstlap_position     int64
lap                   int64
train_test            int64
ending_position       int64
dtype: object

In [3]:
# We are only taking rows where the finishing_status = 1, aka driver finished the race
#df1 = df.loc[(df["finishing_status"] == 1)]

In [4]:
# Creating our train and test data sets using the train_test column
df_train = df.loc[df["train_test"] == 1]
df_test = df.loc[df["train_test"] == 2]
print(df_train.shape)
print(df_test.shape)

(3877, 14)
(420, 14)


In [30]:
df_test["circuitId"].value_counts()

1     20
17    20
71    20
70    20
69    20
34    20
32    20
24    20
22    20
18    20
15    20
3     20
14    20
13    20
11    20
10    20
9     20
7     20
6     20
4     20
73    20
Name: circuitId, dtype: int64

In [5]:
X_train = df_train[['circuitId','driverId','constructorId', 'starting_position', 'firstlap_position']]
y_train = df_train['ending_position']

In [6]:
X_test = df_test[['circuitId','driverId','constructorId', 'starting_position', 'firstlap_position']]
y_test = df_test['ending_position']

In [7]:
mlr = LinearRegression()  
mlr.fit(X_train, y_train)

LinearRegression()

In [8]:
print("Intercept: ", mlr.intercept_)
print("Coefficients:")
list(zip(X_train, mlr.coef_))

Intercept:  1.3623136658997765
Coefficients:


[('circuitId', 0.00338719461197072),
 ('driverId', 0.0011511350339709664),
 ('constructorId', 0.0035930987376151585),
 ('starting_position', 0.1273152163669493),
 ('firstlap_position', 0.39311980311426176)]

In [9]:
X_test = df_test[['circuitId','driverId','constructorId', 'starting_position', 'firstlap_position']]
y_test = df_test['ending_position']

In [10]:
y_pred_mlr= mlr.predict(X_test)

In [11]:
mlr_diff = pd.DataFrame({'Actual value': y_test, 'Predicted value': y_pred_mlr})
mlr_diff.head()

Unnamed: 0,Actual value,Predicted value
3877,1,3.43038
3878,2,2.751103
3879,3,4.435221
3880,4,2.971587
3881,5,4.960993


In [12]:
mlr_diff["Predicted Round"] = mlr_diff["Predicted value"].round(0)
mlr_diff["Predicted Round"] = mlr_diff["Predicted Round"].astype(int)
mlr_diff.head()

Unnamed: 0,Actual value,Predicted value,Predicted Round
3877,1,3.43038,3
3878,2,2.751103,3
3879,3,4.435221,4
3880,4,2.971587,3
3881,5,4.960993,5


In [13]:
mlr_diff.dtypes

Actual value         int64
Predicted value    float64
Predicted Round      int64
dtype: object

In [14]:
df_test1 = pd.merge(df_test, mlr_diff, left_index=True, right_index=True, how='outer')

In [15]:
# Predicting the Podium
positions = [1,2,3,4,5,6,7,8,9,10]

for i in positions:
    x = len(df_test1.loc[(df_test1["Predicted Round"] == i) & (df_test1["Actual value"] == i)])
    y = len(df_test1.loc[df_test1["Actual value"] == i])
    print(f"The model predicted {x} race positions accurately out of {y} for race position {i} for 2019 season")

The model predicted 0 race positions accurately out of 21 for race position 1 for 2019 season
The model predicted 2 race positions accurately out of 21 for race position 2 for 2019 season
The model predicted 3 race positions accurately out of 21 for race position 3 for 2019 season
The model predicted 2 race positions accurately out of 21 for race position 4 for 2019 season
The model predicted 6 race positions accurately out of 21 for race position 5 for 2019 season
The model predicted 7 race positions accurately out of 21 for race position 6 for 2019 season
The model predicted 7 race positions accurately out of 21 for race position 7 for 2019 season
The model predicted 6 race positions accurately out of 21 for race position 8 for 2019 season
The model predicted 4 race positions accurately out of 21 for race position 9 for 2019 season
The model predicted 2 race positions accurately out of 21 for race position 10 for 2019 season


In [16]:
# Summary of the Model
model = sm.OLS.from_formula("ending_position ~ circuitId + driverId + constructorId + starting_position + firstlap_position", data=df_train)
result = model.fit()
result.summary()

0,1,2,3
Dep. Variable:,ending_position,R-squared:,0.318
Model:,OLS,Adj. R-squared:,0.317
Method:,Least Squares,F-statistic:,360.4
Date:,"Tue, 13 Jul 2021",Prob (F-statistic):,6.17e-318
Time:,20:35:11,Log-Likelihood:,-11821.0
No. Observations:,3877,AIC:,23650.0
Df Residuals:,3871,BIC:,23690.0
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,1.3623,0.195,6.995,0.000,0.980,1.744
circuitId,0.0034,0.004,0.848,0.397,-0.004,0.011
driverId,0.0012,0.000,5.206,0.000,0.001,0.002
constructorId,0.0036,0.001,3.337,0.001,0.001,0.006
starting_position,0.1273,0.022,5.901,0.000,0.085,0.170
firstlap_position,0.3931,0.021,18.847,0.000,0.352,0.434

0,1,2,3
Omnibus:,252.147,Durbin-Watson:,0.821
Prob(Omnibus):,0.0,Jarque-Bera (JB):,305.847
Skew:,-0.642,Prob(JB):,3.86e-67
Kurtosis:,3.494,Cond. No.,1390.0


## Support Vector Regression

In [17]:
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)

In [18]:
scaler1 = StandardScaler().fit(X_test)
X_test_scaled = scaler.transform(X_test)

In [19]:
from sklearn.svm import SVR
regressor = SVR(kernel = 'rbf')
regressor.fit(X_train, y_train)

SVR()

In [20]:
print('R squared: {:.2f}'.format(regressor.score(X_train, y_train)*100))

R squared: 10.32


## Random Forrest - multi class

In [21]:
from sklearn.ensemble import RandomForestClassifier

In [22]:
# Create a random forest classifier.
##rf_model = RandomForestClassifier(n_estimators=128, random_state=78) 
rf_model = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 42)

In [23]:
# Fitting the model
rf_model = rf_model.fit(X_train_scaled, y_train)

In [24]:
# Making predictions using the testing data.
predictions = rf_model.predict(X_test_scaled)
Z2 = pd.DataFrame({"Prediction": predictions, "Actual": y_test})

In [25]:
# Calculating the accuracy score.
acc_score = accuracy_score(y_test, predictions)
acc_score

NameError: name 'accuracy_score' is not defined

In [None]:
# Merging the Z dataframe and df_test datframe 
df_test3 = pd.merge(df_test, Z2, left_index=True, right_index=True, how='outer')

In [None]:
# Number of races

positions = [1,2,3]

for i in positions:
    x = len(df_test3.loc[(df_test1["Prediction"] == i) & (df_test1["Actual"] == i)])
    y = len(df_test3.loc[df_test1["Actual"] == i])
    print(f"The model predicted {x} race positions accurately out of {y} for race position {i} for 2019 season")

In [None]:
importances = rf_model.feature_importances_
# We can sort the features by their importance.
sorted(zip(rf_model.feature_importances_, X_train.columns), reverse=True)

## XGBoost

In [None]:
#from numpy import mean
#from numpy import std
#from sklearn.datasets import make_classification
#from sklearn.model_selection import cross_val_score
#from sklearn.model_selection import RepeatedStratifiedKFold
#from xgboost import XGBRFClassifier

# define the model
#model2 = XGBRFClassifier(n_estimators=100, subsample=0.9, colsample_bynode=0.2)
# define the model evaluation procedure
#cv = RepeatedStratifiedKFold(n_splits=9, n_repeats=3, random_state=1)
# evaluate the model and collect the scores
#n_scores = cross_val_score(model2, X_train_scaled, y_train, scoring='accuracy', cv=cv, n_jobs=-1)
# report performance
#print('Mean Accuracy: %.3f (%.3f)' % (mean(n_scores), std(n_scores)))