In [48]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
import plotly.express as px
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [49]:
df_train = pd.read_csv('./data/train.csv').drop_duplicates()
df_test = pd.read_csv('./data/test.csv').drop_duplicates()
df_test

Unnamed: 0,raceId,year,circuitId,weather_warm,weather_cold,weather_dry,weather_wet,weather_cloudy,driverId,constructorId,...,milliseconds,statusId,circuit_country,constructor_position,constructor_wins,constructor_nationality,driver_nationality,driver_wins,driver_age,results_positionOrder
0,1010,2019,-1.399483,1.0,0.0,0.0,0.0,0.0,-2.305908,-1.891242,...,0.164847,1,2,1.0,-0.307582,-1.511226,33,-0.307582,0.708333,2
1,1012,2019,-1.492647,0.0,0.0,0.0,0.0,1.0,-2.305908,-1.891242,...,0.177463,1,9,1.0,1.527107,-1.511226,33,1.527107,0.708333,1
2,1011,2019,-1.652281,1.0,0.0,0.0,0.0,0.0,-2.305908,-1.891242,...,0.181965,1,5,1.0,0.609763,-1.511226,33,0.609763,0.708333,1
3,1014,2019,-0.795461,1.0,0.0,0.0,0.0,0.0,-2.305908,-1.891242,...,0.184939,1,28,1.0,2.444452,-1.511226,33,2.444452,0.708333,1
4,1015,2019,-0.115243,0.0,0.0,0.0,0.0,1.0,-2.305908,-1.891242,...,0.200219,1,19,1.0,3.361796,-1.511226,33,3.361796,0.708333,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2381,648,1970,-2.106547,0.0,0.0,0.0,0.0,1.0,0.141578,-1.184930,...,1.000000,0,10,7.0,-0.307582,-0.177747,14,-0.307582,0.458333,14
2382,643,1970,0.487922,0.0,0.0,1.0,0.0,0.0,0.194650,0.115553,...,1.000000,0,27,5.0,-0.307582,0.055528,46,-0.307582,1.000000,8
2383,643,1970,0.487922,0.0,0.0,1.0,0.0,0.0,0.066005,0.372834,...,1.000000,0,27,1.0,-0.307582,0.055528,27,-0.307582,0.708333,11
2384,654,1970,-0.181407,0.0,0.0,1.0,0.0,1.0,0.141578,-0.086398,...,1.000000,0,34,6.0,-0.307582,0.055528,33,-0.307582,0.625000,25


### Linear Regression

In [50]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold, cross_val_score, cross_val_predict, GridSearchCV
from sklearn.feature_selection import RFE
def linearRegression(X_train, Y_train):
    model = LinearRegression(fit_intercept=False)
    model.fit(X_train, Y_train)        

    return model

In [51]:
def get_Linear_Regression_Score():
    X_train = df_train.drop(columns = ['results_positionOrder'])
    Y_train = df_train['results_positionOrder']
    model = linearRegression(X_train, Y_train)

    precision = 0
    accuracy = 0
    recall = 0
    f1 = 0
    for i in df_test['raceId'].unique():
        X_test = df_test[df_test['raceId'] == i].drop(columns = ['results_positionOrder'])
        Y_test = df_test[df_test['raceId'] == i]['results_positionOrder']

        prediction_df = pd.DataFrame(model.predict(X_test), columns = ['results'])
        prediction_df['actual_position'] = Y_test.reset_index(drop = True)
        prediction_df['actual_podium'] = prediction_df.actual_position.map(lambda x: 1 if x in [1,2,3] else 0)
        prediction_df.sort_values('results', ascending = True, inplace = True)
        prediction_df.reset_index(inplace = True, drop = True)                
        prediction_df['predicted_podium'] = prediction_df.index.map(lambda x: 1 if x == 0 else 0)        

        precision += precision_score(prediction_df.actual_podium, prediction_df.predicted_podium)
        accuracy += accuracy_score(prediction_df.actual_podium, prediction_df.predicted_podium)
        recall += recall_score(prediction_df.actual_podium, prediction_df.predicted_podium)
        f1 += f1_score(prediction_df.actual_podium, prediction_df.predicted_podium)

    return precision/len(df_test['raceId'].unique()), accuracy/len(df_test['raceId'].unique()), recall/len(df_test['raceId'].unique()), f1/len(df_test['raceId'].unique())


### Naive Bayes Regression - Gaussian

In [52]:
from sklearn.naive_bayes import GaussianNB
def gaussian_NB(X_train, Y_train):
    model = GaussianNB()
    model.fit(X_train, Y_train)

    return model

In [53]:
from sklearn.metrics import precision_score

def get_NB_Regression_Score():
    X_train = df_train.drop(columns = ['results_positionOrder'])
    Y_train = df_train['results_positionOrder']
    model = gaussian_NB(X_train, Y_train)

    precision = 0
    accuracy = 0
    recall = 0
    f1 = 0
    for i in df_test['raceId'].unique():
        X_test = df_test[df_test['raceId'] == i].drop(columns = ['results_positionOrder'])
        Y_test = df_test[df_test['raceId'] == i]['results_positionOrder']

        prediction_df = pd.DataFrame(model.predict(X_test), columns = ['results'])
        prediction_df['actual_position'] = Y_test.reset_index(drop = True)
        prediction_df['actual_podium'] = prediction_df.actual_position.map(lambda x: 1 if x in [1,2,3] else 0)
        prediction_df.sort_values('results', ascending = True, inplace = True)
        prediction_df.reset_index(inplace = True, drop = True)                
        prediction_df['predicted_podium'] = prediction_df.index.map(lambda x: 1 if x == 0 else 0)        

        precision += precision_score(prediction_df.actual_podium, prediction_df.predicted_podium)
        accuracy += accuracy_score(prediction_df.actual_podium, prediction_df.predicted_podium)
        recall += recall_score(prediction_df.actual_podium, prediction_df.predicted_podium)
        f1 += f1_score(prediction_df.actual_podium, prediction_df.predicted_podium)

    return precision/len(df_test['raceId'].unique()), accuracy/len(df_test['raceId'].unique()), recall/len(df_test['raceId'].unique()), f1/len(df_test['raceId'].unique())

### Decision Tree regressor

In [54]:
from sklearn.tree import DecisionTreeRegressor
def DTregressor(X_train, Y_train):
    model = DecisionTreeRegressor()
    model.fit(X_train, Y_train)

    return model

In [55]:
from sklearn.metrics import precision_score

def get_DT_Regression_Score():
    X_train = df_train.drop(columns = ['results_positionOrder'])
    Y_train = df_train['results_positionOrder']
    model = DTregressor(X_train, Y_train)

    precision = 0
    accuracy = 0
    recall = 0
    f1 = 0
    for i in df_test['raceId'].unique():
        X_test = df_test[df_test['raceId'] == i].drop(columns = ['results_positionOrder'])
        Y_test = df_test[df_test['raceId'] == i]['results_positionOrder']

        prediction_df = pd.DataFrame(model.predict(X_test), columns = ['results'])
        prediction_df['actual_position'] = Y_test.reset_index(drop = True)
        prediction_df['actual_podium'] = prediction_df.actual_position.map(lambda x: 1 if x in [1,2,3] else 0)
        prediction_df.sort_values('results', ascending = True, inplace = True)
        prediction_df.reset_index(inplace = True, drop = True)                
        prediction_df['predicted_podium'] = prediction_df.index.map(lambda x: 1 if x == 0 else 0)        

        precision += precision_score(prediction_df.actual_podium, prediction_df.predicted_podium)
        accuracy += accuracy_score(prediction_df.actual_podium, prediction_df.predicted_podium)
        recall += recall_score(prediction_df.actual_podium, prediction_df.predicted_podium)
        f1 += f1_score(prediction_df.actual_podium, prediction_df.predicted_podium)

    return precision/len(df_test['raceId'].unique()), accuracy/len(df_test['raceId'].unique()), recall/len(df_test['raceId'].unique()), f1/len(df_test['raceId'].unique())

### Random Forest regressor

In [56]:
from sklearn.ensemble import RandomForestRegressor
def RFregressor(X_train, Y_train):
    model = RandomForestRegressor()
    model.fit(X_train, Y_train)

    return model

In [57]:
from sklearn.metrics import precision_score

def get_RF_Regression_Score():
    X_train = df_train.drop(columns = ['results_positionOrder'])
    Y_train = df_train['results_positionOrder']
    model = RFregressor(X_train, Y_train)

    precision = 0
    accuracy = 0
    recall = 0
    f1 = 0
    for i in df_test['raceId'].unique():
        X_test = df_test[df_test['raceId'] == i].drop(columns = ['results_positionOrder'])
        Y_test = df_test[df_test['raceId'] == i]['results_positionOrder']

        prediction_df = pd.DataFrame(model.predict(X_test), columns = ['results'])
        prediction_df['actual_position'] = Y_test.reset_index(drop = True)
        prediction_df['actual_podium'] = prediction_df.actual_position.map(lambda x: 1 if x in [1,2,3] else 0)
        prediction_df.sort_values('results', ascending = True, inplace = True)
        prediction_df.reset_index(inplace = True, drop = True)                
        prediction_df['predicted_podium'] = prediction_df.index.map(lambda x: 1 if x == 0 else 0)        

        precision += precision_score(prediction_df.actual_podium, prediction_df.predicted_podium)
        accuracy += accuracy_score(prediction_df.actual_podium, prediction_df.predicted_podium)
        recall += recall_score(prediction_df.actual_podium, prediction_df.predicted_podium)
        f1 += f1_score(prediction_df.actual_podium, prediction_df.predicted_podium)

    return precision/len(df_test['raceId'].unique()), accuracy/len(df_test['raceId'].unique()), recall/len(df_test['raceId'].unique()), f1/len(df_test['raceId'].unique())

## Compiling all regressions

In [58]:
regression_scores = {}
regression_scores['Linear Regression'] = get_Linear_Regression_Score()
regression_scores['NB Regression'] = get_NB_Regression_Score()
regression_scores['DT Regression'] = get_DT_Regression_Score()
regression_scores['RF Regression'] = get_RF_Regression_Score()

In [59]:
pd.DataFrame(regression_scores, index = ['Precision', 'Accuracy', 'Recall', 'F1'])

Unnamed: 0,Linear Regression,NB Regression,DT Regression,RF Regression
Precision,0.660194,0.669903,0.825243,0.893204
Accuracy,0.88367,0.882404,0.897058,0.90308
Recall,0.220065,0.223301,0.275081,0.297735
F1,0.330097,0.334951,0.412621,0.446602


### Logistic Regression

In [60]:
from sklearn.linear_model import LogisticRegression

def logisticRegression(X_train, Y_train):
    model = LogisticRegression(max_iter=1000)
    model.fit(X_train, Y_train)

    return model

In [61]:
from sklearn.metrics import precision_score

def get_Logistic_Regression_Score():
    X_train = df_train.drop(columns = ['results_positionOrder'])
    Y_train = df_train['results_positionOrder'].to_frame()    
    Y_train['results_positionOrder'] = Y_train['results_positionOrder'].map(lambda x: 1 if x in [1,2,3] else 0)    
    model = logisticRegression(X_train, Y_train.values.ravel())

    precision = 0
    accuracy = 0
    recall = 0
    f1 = 0
    for i in df_test['raceId'].unique():
        X_test = df_test[df_test['raceId'] == i].drop(columns = ['results_positionOrder'])
        Y_test = df_test[df_test['raceId'] == i]['results_positionOrder']

        prediction_df = pd.DataFrame(model.predict_proba(X_test), columns = ['results0', 'results1'])        
        prediction_df['actual_position'] = Y_test.reset_index(drop = True)
        prediction_df['actual_podium'] = prediction_df.actual_position.map(lambda x: 1 if x in [1,2,3] else 0)
        prediction_df.sort_values('results1', ascending = False, inplace = True)
        prediction_df.reset_index(inplace = True, drop = True)                
        prediction_df['predicted_podium'] = prediction_df.index.map(lambda x: 1 if x == 0 else 0)            
        # print(prediction_df)            
        precision += precision_score(prediction_df.actual_podium, prediction_df.predicted_podium)
        accuracy += accuracy_score(prediction_df.actual_podium, prediction_df.predicted_podium)
        recall += recall_score(prediction_df.actual_podium, prediction_df.predicted_podium)
        f1 += f1_score(prediction_df.actual_podium, prediction_df.predicted_podium)

    return precision/len(df_test['raceId'].unique()), accuracy/len(df_test['raceId'].unique()), recall/len(df_test['raceId'].unique()), f1/len(df_test['raceId'].unique())


In [62]:
# df_train
get_Logistic_Regression_Score()

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


(0.8737864077669902, 0.901288108224765, 0.2912621359223297, 0.4368932038834951)

### Naive Bayes Classifier

In [63]:
from sklearn.naive_bayes import GaussianNB

def gaussianNB(X_train, Y_train):
    model = GaussianNB()
    model.fit(X_train, Y_train)

    return model

In [64]:
from sklearn.metrics import precision_score

def get_Gaussian_NB_Score():
    X_train = df_train.drop(columns = ['results_positionOrder'])
    Y_train = df_train['results_positionOrder'].to_frame()    
    Y_train['results_positionOrder'] = Y_train['results_positionOrder'].map(lambda x: 1 if x in [1,2,3] else 0)    
    model = gaussianNB(X_train, Y_train.values.ravel())

    precision = 0
    accuracy = 0
    recall = 0
    f1 = 0
    for i in df_test['raceId'].unique():
        X_test = df_test[df_test['raceId'] == i].drop(columns = ['results_positionOrder'])
        Y_test = df_test[df_test['raceId'] == i]['results_positionOrder']

        prediction_df = pd.DataFrame(model.predict_proba(X_test), columns = ['results0', 'results1'])        
        prediction_df['actual_position'] = Y_test.reset_index(drop = True)
        prediction_df['actual_podium'] = prediction_df.actual_position.map(lambda x: 1 if x in [1,2,3] else 0)
        prediction_df.sort_values('results1', ascending = False, inplace = True)
        prediction_df.reset_index(inplace = True, drop = True)                
        prediction_df['predicted_podium'] = prediction_df.index.map(lambda x: 1 if x == 0 else 0)            
        # print(prediction_df)            
        precision += precision_score(prediction_df.actual_podium, prediction_df.predicted_podium)
        accuracy += accuracy_score(prediction_df.actual_podium, prediction_df.predicted_podium)
        recall += recall_score(prediction_df.actual_podium, prediction_df.predicted_podium)
        f1 += f1_score(prediction_df.actual_podium, prediction_df.predicted_podium)

    return precision/len(df_test['raceId'].unique()), accuracy/len(df_test['raceId'].unique()), recall/len(df_test['raceId'].unique()), f1/len(df_test['raceId'].unique())


In [65]:
get_Gaussian_NB_Score()

(0.8058252427184466,
 0.8955115726424042,
 0.2686084142394819,
 0.4029126213592233)

### Decision Tree Classifier

In [66]:
from sklearn.tree import DecisionTreeClassifier

def DT_classifier(X_train, Y_train):
    model = DecisionTreeClassifier()
    model.fit(X_train, Y_train)

    return model

In [67]:
from sklearn.metrics import precision_score

def get_DT_classifier_Score():
    X_train = df_train.drop(columns = ['results_positionOrder'])
    Y_train = df_train['results_positionOrder'].to_frame()    
    Y_train['results_positionOrder'] = Y_train['results_positionOrder'].map(lambda x: 1 if x in [1,2,3] else 0)    
    model = DT_classifier(X_train, Y_train.values.ravel())

    precision = 0
    accuracy = 0
    recall = 0
    f1 = 0
    for i in df_test['raceId'].unique():
        X_test = df_test[df_test['raceId'] == i].drop(columns = ['results_positionOrder'])
        Y_test = df_test[df_test['raceId'] == i]['results_positionOrder']

        prediction_df = pd.DataFrame(model.predict_proba(X_test), columns = ['results0', 'results1'])        
        prediction_df['actual_position'] = Y_test.reset_index(drop = True)
        prediction_df['actual_podium'] = prediction_df.actual_position.map(lambda x: 1 if x in [1,2,3] else 0)
        prediction_df.sort_values('results1', ascending = False, inplace = True)
        prediction_df.reset_index(inplace = True, drop = True)                
        prediction_df['predicted_podium'] = prediction_df.index.map(lambda x: 1 if x == 0 else 0)            
        # print(prediction_df)            
        precision += precision_score(prediction_df.actual_podium, prediction_df.predicted_podium)
        accuracy += accuracy_score(prediction_df.actual_podium, prediction_df.predicted_podium)
        recall += recall_score(prediction_df.actual_podium, prediction_df.predicted_podium)
        f1 += f1_score(prediction_df.actual_podium, prediction_df.predicted_podium)

    return precision/len(df_test['raceId'].unique()), accuracy/len(df_test['raceId'].unique()), recall/len(df_test['raceId'].unique()), f1/len(df_test['raceId'].unique())


In [68]:
get_DT_classifier_Score()

(0.6310679611650486,
 0.8805868034892437,
 0.2103559870550161,
 0.3155339805825243)

### Random Forest Classifier

In [69]:
from sklearn.ensemble import RandomForestClassifier

def RF_classifier(X_train, Y_train):
    model = RandomForestClassifier()
    model.fit(X_train, Y_train)

    return model

In [70]:
from sklearn.metrics import precision_score

def get_RF_classifier_Score():
    X_train = df_train.drop(columns = ['results_positionOrder'])
    Y_train = df_train['results_positionOrder'].to_frame()    
    Y_train['results_positionOrder'] = Y_train['results_positionOrder'].map(lambda x: 1 if x in [1,2,3] else 0)    
    model = RF_classifier(X_train, Y_train.values.ravel())

    precision = 0
    accuracy = 0
    recall = 0
    f1 = 0
    for i in df_test['raceId'].unique():
        X_test = df_test[df_test['raceId'] == i].drop(columns = ['results_positionOrder'])
        Y_test = df_test[df_test['raceId'] == i]['results_positionOrder']

        prediction_df = pd.DataFrame(model.predict_proba(X_test), columns = ['results0', 'results1'])        
        prediction_df['actual_position'] = Y_test.reset_index(drop = True)
        prediction_df['actual_podium'] = prediction_df.actual_position.map(lambda x: 1 if x in [1,2,3] else 0)
        prediction_df.sort_values('results1', ascending = False, inplace = True)
        prediction_df.reset_index(inplace = True, drop = True)                
        prediction_df['predicted_podium'] = prediction_df.index.map(lambda x: 1 if x == 0 else 0)            
        # print(prediction_df)            
        precision += precision_score(prediction_df.actual_podium, prediction_df.predicted_podium)
        accuracy += accuracy_score(prediction_df.actual_podium, prediction_df.predicted_podium)
        recall += recall_score(prediction_df.actual_podium, prediction_df.predicted_podium)
        f1 += f1_score(prediction_df.actual_podium, prediction_df.predicted_podium)

    return precision/len(df_test['raceId'].unique()), accuracy/len(df_test['raceId'].unique()), recall/len(df_test['raceId'].unique()), f1/len(df_test['raceId'].unique())


In [71]:
get_RF_classifier_Score()

(0.8737864077669902, 0.901332626812973, 0.2912621359223297, 0.4368932038834951)

## Compiling all classifications

In [72]:
classification_scores = {}
classification_scores['Logistic Regression'] = get_Logistic_Regression_Score()
classification_scores['NB classification'] = get_Gaussian_NB_Score()
classification_scores['DT classification'] = get_DT_classifier_Score()
classification_scores['RF classification'] = get_RF_classifier_Score()

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [73]:
pd.DataFrame(classification_scores, index = ['Precision', 'Accuracy', 'Recall', 'F1'])

Unnamed: 0,Logistic Regression,NB classification,DT classification,RF classification
Precision,0.873786,0.805825,0.68932,0.873786
Accuracy,0.901288,0.895512,0.885733,0.901185
Recall,0.291262,0.268608,0.229773,0.291262
F1,0.436893,0.402913,0.34466,0.436893
