In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
import plotly.express as px

In [2]:
df_train = pd.read_csv('./data/train.csv').drop_duplicates()
df_test = pd.read_csv('./data/test.csv').drop_duplicates()
df_test

Unnamed: 0,raceId,year,circuitId,weather_warm,weather_cold,weather_dry,weather_wet,weather_cloudy,driverId,constructorId,...,milliseconds,statusId,circuit_country,constructor_position,constructor_wins,constructor_nationality,driver_nationality,driver_wins,driver_age,results_positionOrder
0,338,2010,-1.429235,0.0,0.0,0.0,1.0,1.0,-0.851798,-0.883873,...,0.180472,1,2,2.0,0.608447,0.044247,33,0.608447,0.521739,1
1,339,2010,-1.398920,0.0,0.0,1.0,0.0,0.0,-0.851798,-0.883873,...,0.182133,1,17,2.0,0.608447,0.044247,33,0.608447,0.521739,8
2,340,2010,-1.555827,0.0,0.0,0.0,1.0,0.0,-0.851798,-0.883873,...,0.206682,1,9,1.0,1.527000,0.044247,33,1.527000,0.521739,1
3,337,2010,-1.721793,1.0,0.0,0.0,0.0,0.0,-0.851798,-0.883873,...,0.193454,1,5,2.0,-0.310106,0.044247,33,-0.310106,0.521739,7
4,341,2010,-0.830986,0.0,0.0,1.0,0.0,0.0,-0.851798,-0.883873,...,0.186854,1,28,1.0,1.527000,0.044247,33,1.527000,0.521739,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1998,616,1973,-0.150902,1.0,0.0,0.0,0.0,0.0,0.153611,1.177792,...,1.000000,0,3,10.0,-0.310106,0.044247,39,-0.310106,0.565217,9
1999,614,1973,-0.332844,0.0,0.0,1.0,0.0,0.0,0.153611,1.177792,...,1.000000,0,21,9.0,-0.310106,0.044247,39,-0.310106,0.565217,6
2000,606,1973,-0.450273,1.0,0.0,0.0,0.0,1.0,0.131847,1.041634,...,1.000000,0,7,9.0,-0.310106,0.044247,7,-0.310106,0.782609,12
2001,607,1973,0.365017,0.0,0.0,1.0,0.0,0.0,0.440952,1.177792,...,1.000000,0,27,8.0,-0.310106,0.044247,27,-0.310106,0.869565,18


### Linear Regression

In [3]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold, cross_val_score, cross_val_predict, GridSearchCV
from sklearn.feature_selection import RFE
def linearRegression(X_train, Y_train):
    model = LinearRegression(fit_intercept=False)
    model.fit(X_train, Y_train)        

    return model

In [62]:
from sklearn.metrics import precision_score

def get_Linear_Regression_Score():
    X_train = df_train.drop(columns = ['results_positionOrder'])
    Y_train = df_train['results_positionOrder']
    model = linearRegression(X_train, Y_train)

    score = 0
    for i in df_test['raceId'].unique():
        X_test = df_test[df_test['raceId'] == i].drop(columns = ['results_positionOrder'])
        Y_test = df_test[df_test['raceId'] == i]['results_positionOrder']

        prediction_df = pd.DataFrame(model.predict(X_test), columns = ['results'])
        prediction_df['actual_position'] = Y_test.reset_index(drop = True)
        prediction_df['actual_podium'] = prediction_df.actual_position.map(lambda x: 1 if x in [1,2,3] else 0)
        prediction_df.sort_values('results', ascending = True, inplace = True)
        prediction_df.reset_index(inplace = True, drop = True)                
        prediction_df['predicted_podium'] = prediction_df.index.map(lambda x: 1 if x == 0 else 0)        

        score += precision_score(prediction_df.actual_podium, prediction_df.predicted_podium)
    modelScore = score / df_test['raceId'].nunique()
    return modelScore


### Naive Bayes Regression - Gaussian

In [5]:
from sklearn.naive_bayes import GaussianNB
def gaussian_NB(X_train, Y_train):
    model = GaussianNB()
    model.fit(X_train, Y_train)

    return model

In [6]:
from sklearn.metrics import precision_score

def get_NB_Regression_Score():
    X_train = df_train.drop(columns = ['results_positionOrder'])
    Y_train = df_train['results_positionOrder']
    model = gaussian_NB(X_train, Y_train)

    score = 0
    for i in df_test['raceId'].unique():
        X_test = df_test[df_test['raceId'] == i].drop(columns = ['results_positionOrder'])
        Y_test = df_test[df_test['raceId'] == i]['results_positionOrder']

        prediction_df = pd.DataFrame(model.predict(X_test), columns = ['results'])
        prediction_df['actual_position'] = Y_test.reset_index(drop = True)
        prediction_df['actual_podium'] = prediction_df.actual_position.map(lambda x: 1 if x in [1,2,3] else 0)
        prediction_df.sort_values('results', ascending = True, inplace = True)
        prediction_df.reset_index(inplace = True, drop = True)                
        prediction_df['predicted_podium'] = prediction_df.index.map(lambda x: 1 if x == 0 else 0)        

        score += precision_score(prediction_df.actual_podium, prediction_df.predicted_podium)
    modelScore = score / df_test['raceId'].nunique()
    return modelScore


### Decision Tree regressor

In [7]:
from sklearn.tree import DecisionTreeRegressor
def DTregressor(X_train, Y_train):
    model = DecisionTreeRegressor()
    model.fit(X_train, Y_train)

    return model

In [8]:
from sklearn.metrics import precision_score

def get_DT_Regression_Score():
    X_train = df_train.drop(columns = ['results_positionOrder'])
    Y_train = df_train['results_positionOrder']
    model = DTregressor(X_train, Y_train)

    score = 0
    for i in df_test['raceId'].unique():
        X_test = df_test[df_test['raceId'] == i].drop(columns = ['results_positionOrder'])
        Y_test = df_test[df_test['raceId'] == i]['results_positionOrder']

        prediction_df = pd.DataFrame(model.predict(X_test), columns = ['results'])
        prediction_df['actual_position'] = Y_test.reset_index(drop = True)
        prediction_df['actual_podium'] = prediction_df.actual_position.map(lambda x: 1 if x in [1,2,3] else 0)
        prediction_df.sort_values('results', ascending = True, inplace = True)
        prediction_df.reset_index(inplace = True, drop = True)                
        prediction_df['predicted_podium'] = prediction_df.index.map(lambda x: 1 if x == 0 else 0)        

        score += precision_score(prediction_df.actual_podium, prediction_df.predicted_podium)
    modelScore = score / df_test['raceId'].nunique()
    return modelScore


### Random Forest regressor

In [9]:
from sklearn.ensemble import RandomForestRegressor
def RFregressor(X_train, Y_train):
    model = RandomForestRegressor()
    model.fit(X_train, Y_train)

    return model

In [10]:
from sklearn.metrics import precision_score

def get_RF_Regression_Score():
    X_train = df_train.drop(columns = ['results_positionOrder'])
    Y_train = df_train['results_positionOrder']
    model = RFregressor(X_train, Y_train)

    score = 0
    for i in df_test['raceId'].unique():
        X_test = df_test[df_test['raceId'] == i].drop(columns = ['results_positionOrder'])
        Y_test = df_test[df_test['raceId'] == i]['results_positionOrder']

        prediction_df = pd.DataFrame(model.predict(X_test), columns = ['results'])
        prediction_df['actual_position'] = Y_test.reset_index(drop = True)
        prediction_df['actual_podium'] = prediction_df.actual_position.map(lambda x: 1 if x in [1,2,3] else 0)
        prediction_df.sort_values('results', ascending = True, inplace = True)
        prediction_df.reset_index(inplace = True, drop = True)                
        prediction_df['predicted_podium'] = prediction_df.index.map(lambda x: 1 if x == 0 else 0)        

        score += precision_score(prediction_df.actual_podium, prediction_df.predicted_podium)
    modelScore = score / df_test['raceId'].nunique()
    return modelScore


## Compiling all regressions

In [11]:
regression_scores = {}
regression_scores['Linear Regression'] = get_Linear_Regression_Score()
regression_scores['NB Regression'] = get_NB_Regression_Score()
regression_scores['DT Regression'] = get_DT_Regression_Score()
regression_scores['RF Regression'] = get_RF_Regression_Score()

In [13]:
pd.DataFrame(regression_scores.items(), columns = ['model', 'accuracy'])

Unnamed: 0,model,accuracy
0,Linear Regression,0.534091
1,NB Regression,0.625
2,DT Regression,0.852273
3,RF Regression,0.920455


### Logistic Regression

In [69]:
from sklearn.linear_model import LogisticRegression

def logisticRegression(X_train, Y_train):
    model = LogisticRegression(max_iter=1000)
    model.fit(X_train, Y_train)

    return model

In [72]:
from sklearn.metrics import precision_score

def get_Logistic_Regression_Score():
    X_train = df_train.drop(columns = ['results_positionOrder'])
    Y_train = df_train['results_positionOrder'].to_frame()    
    Y_train['results_positionOrder'] = Y_train['results_positionOrder'].map(lambda x: 1 if x in [1,2,3] else 0)    
    model = logisticRegression(X_train, Y_train.values.ravel())

    score = 0
    for i in df_test['raceId'].unique():
        X_test = df_test[df_test['raceId'] == i].drop(columns = ['results_positionOrder'])
        Y_test = df_test[df_test['raceId'] == i]['results_positionOrder']

        prediction_df = pd.DataFrame(model.predict_proba(X_test), columns = ['results0', 'results1'])        
        prediction_df['actual_position'] = Y_test.reset_index(drop = True)
        prediction_df['actual_podium'] = prediction_df.actual_position.map(lambda x: 1 if x in [1,2,3] else 0)
        prediction_df.sort_values('results1', ascending = False, inplace = True)
        prediction_df.reset_index(inplace = True, drop = True)                
        prediction_df['predicted_podium'] = prediction_df.index.map(lambda x: 1 if x == 0 else 0)            
        # print(prediction_df)            
        score += precision_score(prediction_df.actual_podium, prediction_df.predicted_podium)
    modelScore = score / df_test['raceId'].nunique()
    return modelScore


In [73]:
# df_train
get_Logistic_Regression_Score()

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.8636363636363636

### Naive Bayes Classifier

In [80]:
from sklearn.naive_bayes import GaussianNB

def gaussianNB(X_train, Y_train):
    model = GaussianNB()
    model.fit(X_train, Y_train)

    return model

In [81]:
from sklearn.metrics import precision_score

def get_Gaussian_NB_Score():
    X_train = df_train.drop(columns = ['results_positionOrder'])
    Y_train = df_train['results_positionOrder'].to_frame()    
    Y_train['results_positionOrder'] = Y_train['results_positionOrder'].map(lambda x: 1 if x in [1,2,3] else 0)    
    model = gaussianNB(X_train, Y_train.values.ravel())

    score = 0
    for i in df_test['raceId'].unique():
        X_test = df_test[df_test['raceId'] == i].drop(columns = ['results_positionOrder'])
        Y_test = df_test[df_test['raceId'] == i]['results_positionOrder']

        prediction_df = pd.DataFrame(model.predict_proba(X_test), columns = ['results0', 'results1'])        
        prediction_df['actual_position'] = Y_test.reset_index(drop = True)
        prediction_df['actual_podium'] = prediction_df.actual_position.map(lambda x: 1 if x in [1,2,3] else 0)
        prediction_df.sort_values('results1', ascending = False, inplace = True)
        prediction_df.reset_index(inplace = True, drop = True)                
        prediction_df['predicted_podium'] = prediction_df.index.map(lambda x: 1 if x == 0 else 0)            
        # print(prediction_df)            
        score += precision_score(prediction_df.actual_podium, prediction_df.predicted_podium)
    modelScore = score / df_test['raceId'].nunique()
    return modelScore


In [83]:
get_Gaussian_NB_Score()

0.8295454545454546

### Decision Tree Classifier

In [84]:
from sklearn.tree import DecisionTreeClassifier

def DT_classifier(X_train, Y_train):
    model = DecisionTreeClassifier()
    model.fit(X_train, Y_train)

    return model

In [85]:
from sklearn.metrics import precision_score

def get_DT_classifier_Score():
    X_train = df_train.drop(columns = ['results_positionOrder'])
    Y_train = df_train['results_positionOrder'].to_frame()    
    Y_train['results_positionOrder'] = Y_train['results_positionOrder'].map(lambda x: 1 if x in [1,2,3] else 0)    
    model = DT_classifier(X_train, Y_train.values.ravel())

    score = 0
    for i in df_test['raceId'].unique():
        X_test = df_test[df_test['raceId'] == i].drop(columns = ['results_positionOrder'])
        Y_test = df_test[df_test['raceId'] == i]['results_positionOrder']

        prediction_df = pd.DataFrame(model.predict_proba(X_test), columns = ['results0', 'results1'])        
        prediction_df['actual_position'] = Y_test.reset_index(drop = True)
        prediction_df['actual_podium'] = prediction_df.actual_position.map(lambda x: 1 if x in [1,2,3] else 0)
        prediction_df.sort_values('results1', ascending = False, inplace = True)
        prediction_df.reset_index(inplace = True, drop = True)                
        prediction_df['predicted_podium'] = prediction_df.index.map(lambda x: 1 if x == 0 else 0)            
        # print(prediction_df)            
        score += precision_score(prediction_df.actual_podium, prediction_df.predicted_podium)
    modelScore = score / df_test['raceId'].nunique()
    return modelScore


In [86]:
get_DT_classifier_Score()

0.6590909090909091

### Random Forest Classifier

In [87]:
from sklearn.ensemble import RandomForestClassifier

def RF_classifier(X_train, Y_train):
    model = RandomForestClassifier()
    model.fit(X_train, Y_train)

    return model

In [88]:
from sklearn.metrics import precision_score

def get_RF_classifier_Score():
    X_train = df_train.drop(columns = ['results_positionOrder'])
    Y_train = df_train['results_positionOrder'].to_frame()    
    Y_train['results_positionOrder'] = Y_train['results_positionOrder'].map(lambda x: 1 if x in [1,2,3] else 0)    
    model = RF_classifier(X_train, Y_train.values.ravel())

    score = 0
    for i in df_test['raceId'].unique():
        X_test = df_test[df_test['raceId'] == i].drop(columns = ['results_positionOrder'])
        Y_test = df_test[df_test['raceId'] == i]['results_positionOrder']

        prediction_df = pd.DataFrame(model.predict_proba(X_test), columns = ['results0', 'results1'])        
        prediction_df['actual_position'] = Y_test.reset_index(drop = True)
        prediction_df['actual_podium'] = prediction_df.actual_position.map(lambda x: 1 if x in [1,2,3] else 0)
        prediction_df.sort_values('results1', ascending = False, inplace = True)
        prediction_df.reset_index(inplace = True, drop = True)                
        prediction_df['predicted_podium'] = prediction_df.index.map(lambda x: 1 if x == 0 else 0)            
        # print(prediction_df)            
        score += precision_score(prediction_df.actual_podium, prediction_df.predicted_podium)
    modelScore = score / df_test['raceId'].nunique()
    return modelScore


In [89]:
get_RF_classifier_Score()

0.8977272727272727

## Compiling all classifications

In [97]:
classification_scores = {}
classification_scores['Logistic Regression'] = get_Logistic_Regression_Score()
classification_scores['NB classification'] = get_Gaussian_NB_Score()
classification_scores['DT classification'] = get_DT_classifier_Score()
classification_scores['RF classification'] = get_RF_classifier_Score()

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [98]:
pd.DataFrame(classification_scores.items(), columns = ['model', 'accuracy'])

Unnamed: 0,model,accuracy
0,Logistic Regression,0.863636
1,NB classification,0.829545
2,DT classification,0.647727
3,RF classification,0.909091
