In [1]:
import pandas as pd
from pathlib import Path

In [2]:
# Read results.csv and races.csv 
results_df = pd.read_csv(Path('../Resources/Dataset/results.csv'))
races_df = pd.read_csv(Path('../Resources/Dataset/races.csv'))

In [3]:
# Drop columns we dont need
results_df = results_df.drop(columns =['number', 'positionText', 'positionOrder', 
                                       'points', 'laps', 'time', 'milliseconds', 
                                       'fastestLap', 'rank', 'fastestLapTime',
                                      'fastestLapSpeed'])                      

In [4]:
# Rename certain columns to make them easier to understand
results_df = results_df.rename(columns={"position": "ending_position", 
                                        "grid": "starting_position",
                                        "statusId": "finishing_status" })                 

In [5]:
print(results_df.shape)
results_df.head()

(25040, 7)


Unnamed: 0,resultId,raceId,driverId,constructorId,starting_position,ending_position,finishing_status
0,1,18,1,1,1,1,1
1,2,18,2,2,5,2,1
2,3,18,3,3,7,3,1
3,4,18,4,4,11,4,1
4,5,18,5,1,3,5,1


In [6]:
# Merge results_df with races_df on raceId. Only bring in columns from races_df that we need
results_df = (results_df.merge(races_df, left_on='raceId', right_on='raceId')
          .reindex(columns=['resultId', 'raceId', 'driverId', 'constructorId',
                                  'starting_position', 'ending_position', 'finishing_status', 
                                 'year', 'circuitId', 'name']))

In [7]:
print(results_df.shape)
results_df.head()

(25040, 10)


Unnamed: 0,resultId,raceId,driverId,constructorId,starting_position,ending_position,finishing_status,year,circuitId,name
0,1,18,1,1,1,1,1,2008,1,Australian Grand Prix
1,2,18,2,2,5,2,1,2008,1,Australian Grand Prix
2,3,18,3,3,7,3,1,2008,1,Australian Grand Prix
3,4,18,4,4,11,4,1,2008,1,Australian Grand Prix
4,5,18,5,1,3,5,1,2008,1,Australian Grand Prix


These represent finishing statuses. Drivers can finish multiple laps down from the lead driver. For our purpose, we only care that the driver finished the race so we will convert all these statuses to 1. 

1: Finished, 11: +1 Lap, 12: +2 Laps, 13: +3 Laps, 14: +4 Laps,15: +5 Laps, 16: +6 Laps,17: +7 Laps,
18: +8 Laps, 19: +9 Laps, 45: +11 Laps, 50: +17 Laps,12: +42 Laps, 53: +13 Laps,55: +12 Laps, 
58: +26 Laps, 88: +10 Laps, 111: +14 Laps, 112: +15 Laps, 113: +25 Laps, 114: +18 Laps, 115: +22 Laps,
116: +16 Laps, 117: +24 Laps, 118: +29 Laps, 119: +23 Laps, 120: +21 Laps, 122: +44 Laps, 123: +30 Laps, 
124: +19 Laps, 125: +46 Laps,127: +20 Laps,133: +49 Laps,134: +38 Laps

In [8]:
# We will finish all the statuses where drivers finish the race to 1
replace = {
    1:1, 11:1, 12:1, 13:1, 14:1, 15:1, 16:1, 17:1, 18:1, 19:1, 45:1, 50:1, 128:1, 53:1, 55:1, 58:1, 
    88:1, 111:1, 112:1, 113:1, 114:1, 115:1, 116:1, 117:1, 118:1, 119:1, 120:1, 122:1, 123:1, 124:1, 
    125:1, 127:1, 133:1, 134:1  
}
results_df["finishing_status"] = results_df["finishing_status"].replace(replace)

In [9]:
# Transform finish_status column. Anything which is not 1 (driver did not finish race) is converted to 2
def change_status(x):
    if x == 1:
        return 1
    else:
        return 2
    
results_df["finishing_status"] = results_df["finishing_status"].apply(change_status)
print(results_df.shape)
results_df.head()

(25040, 10)


Unnamed: 0,resultId,raceId,driverId,constructorId,starting_position,ending_position,finishing_status,year,circuitId,name
0,1,18,1,1,1,1,1,2008,1,Australian Grand Prix
1,2,18,2,2,5,2,1,2008,1,Australian Grand Prix
2,3,18,3,3,7,3,1,2008,1,Australian Grand Prix
3,4,18,4,4,11,4,1,2008,1,Australian Grand Prix
4,5,18,5,1,3,5,1,2008,1,Australian Grand Prix


In [10]:
# Identify unique ending_positions to identify if any rows need to be dropped
results_df["ending_position"].unique()

array(['1', '2', '3', '4', '5', '6', '7', '8', '\\N', '9', '10', '11',
       '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22',
       '23', '24', '25', '26', '27', '28', '29', '30', '31', '32', '33'],
      dtype=object)

In [11]:
# Replace all "\\N" ending_positions to 0 
replace1 = {"\\N":0}
results_df["ending_position"] = results_df["ending_position"].replace(replace1)

In [12]:
# Create a train_test column. 2019 data will be test and 2010 to 2018 will be 

results_df['train_test'] = results_df['year']

def change_year(x):
    if x < 2010:
        return 3
    elif x == 2019:
        return 2 
    elif x > 2019:
        return 4
    else:
        return 1
results_df["train_test"] = results_df["train_test"].apply(change_year)

print(results_df.shape)
results_df.head()

(25040, 11)


Unnamed: 0,resultId,raceId,driverId,constructorId,starting_position,ending_position,finishing_status,year,circuitId,name,train_test
0,1,18,1,1,1,1,1,2008,1,Australian Grand Prix,3
1,2,18,2,2,5,2,1,2008,1,Australian Grand Prix,3
2,3,18,3,3,7,3,1,2008,1,Australian Grand Prix,3
3,4,18,4,4,11,4,1,2008,1,Australian Grand Prix,3
4,5,18,5,1,3,5,1,2008,1,Australian Grand Prix,3


In [13]:
# Rearrainging columns
results_df = results_df[['resultId',
                         'raceId',
                         'name',
                         'year',
                         'circuitId',
                         'driverId',
                         'constructorId',
                         'starting_position',
                         'finishing_status',
                         'train_test',
                         'ending_position']]
print(results_df.shape)
results_df.head()

(25040, 11)


Unnamed: 0,resultId,raceId,name,year,circuitId,driverId,constructorId,starting_position,finishing_status,train_test,ending_position
0,1,18,Australian Grand Prix,2008,1,1,1,1,1,3,1
1,2,18,Australian Grand Prix,2008,1,2,2,5,1,3,2
2,3,18,Australian Grand Prix,2008,1,3,3,7,1,3,3
3,4,18,Australian Grand Prix,2008,1,4,4,11,1,3,4
4,5,18,Australian Grand Prix,2008,1,5,1,3,1,3,5


In [14]:
results_final_df = results_df.loc[results_df["train_test"] <= 2]
print(results_final_df.shape)
results_final_df.head()

(4297, 11)


Unnamed: 0,resultId,raceId,name,year,circuitId,driverId,constructorId,starting_position,finishing_status,train_test,ending_position
20323,20323,337,Bahrain Grand Prix,2010,3,4,6,3,1,1,1
20324,20324,337,Bahrain Grand Prix,2010,3,13,6,2,1,1,2
20325,20325,337,Bahrain Grand Prix,2010,3,1,1,4,1,1,3
20326,20326,337,Bahrain Grand Prix,2010,3,20,9,1,1,1,4
20327,20327,337,Bahrain Grand Prix,2010,3,3,131,5,1,1,5


In [15]:
results_final_df.to_csv('../Resources/PythonExport/results_final.csv')

In [16]:
%matplotlib inline
import matplotlib.pyplot as plt
import os
from sklearn.preprocessing import StandardScaler

In [18]:
df = pd.read_csv(Path('../Resources/PythonExport/results_final.csv'))
print(df.shape)
df.head()

(4297, 12)


Unnamed: 0.1,Unnamed: 0,resultId,raceId,name,year,circuitId,driverId,constructorId,starting_position,finishing_status,train_test,ending_position
0,20323,20323,337,Bahrain Grand Prix,2010,3,4,6,3,1,1,1
1,20324,20324,337,Bahrain Grand Prix,2010,3,13,6,2,1,1,2
2,20325,20325,337,Bahrain Grand Prix,2010,3,1,1,4,1,1,3
3,20326,20326,337,Bahrain Grand Prix,2010,3,20,9,1,1,1,4
4,20327,20327,337,Bahrain Grand Prix,2010,3,3,131,5,1,1,5


In [20]:
df1 = df.loc[(df["finishing_status"] == 1)]

In [21]:
# Creating our train and test data sets using the train_test column
df_train = df1.loc[df["train_test"] == 1]
df_test = df1.loc[df["train_test"] == 2]

In [23]:
X_train = df_train[['circuitId','driverId','constructorId', 'starting_position']]
y_train = df_train['ending_position']

In [24]:
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)

In [27]:
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(solver='lbfgs', multi_class='multinomial')
classifier

LogisticRegression(multi_class='multinomial')

In [28]:
classifier.fit(X_train_scaled, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression(multi_class='multinomial')

In [30]:
X_test = df_test[['circuitId','driverId','constructorId', 'starting_position']]
y_test = df_test['ending_position']

In [31]:
scaler1 = StandardScaler().fit(X_test)
X_test_scaled = scaler.transform(X_test)

In [32]:
y_pred = classifier.predict(X_test_scaled)
Z = pd.DataFrame({"Prediction": y_pred, "Actual": y_test})

In [33]:
from sklearn.metrics import accuracy_score
print(accuracy_score(y_test, y_pred))

0.15833333333333333


In [34]:
# Merging the Z dataframe and df_test datframe 
df_test1 = pd.merge(df_test, Z, left_index=True, right_index=True, how='outer')

In [41]:
df_pred_result = df_test1.loc[(df_test1["Prediction"] == 3) & (df_test1["ending_position"] == 3)]
print(df_pred_result.shape)
df_pred_result

(5, 14)


Unnamed: 0.1,Unnamed: 0,resultId,raceId,name,year,circuitId,driverId,constructorId,starting_position,finishing_status,train_test,ending_position,Prediction,Actual
3999,24322,24328,1016,Canadian Grand Prix,2019,7,844,6,3,1,2,3,3,3
4019,24342,24348,1017,French Grand Prix,2019,34,844,6,3,1,2,3,3,3
4059,24382,24388,1019,British Grand Prix,2019,9,844,6,3,1,2,3,3,3
4239,24582,24568,1028,United States Grand Prix,2019,69,830,9,3,1,2,3,3,3
4279,24622,24608,1030,Abu Dhabi Grand Prix,2019,24,844,6,3,1,2,3,3,3
