In [205]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
import plotly.express as px
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, accuracy_score, make_scorer
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.model_selection import KFold, cross_val_score, cross_val_predict, GridSearchCV
from sklearn.feature_selection import RFE
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import GaussianNB

In [183]:
# Load the train and test CSVs and drop exact duplicate rows from each.
# NOTE: drop_duplicates() keeps the original row labels, so the index may
# contain gaps afterwards.
df_train = pd.read_csv('./data/train.csv').drop_duplicates()
df_test = pd.read_csv('./data/test.csv').drop_duplicates()
# Bare expression: renders the test frame as the cell output.
df_test

Unnamed: 0,raceId,year,circuitId,weather_warm,weather_cold,weather_dry,weather_wet,weather_cloudy,driverId,constructorId,...,milliseconds,statusId,circuit_country,constructor_position,constructor_wins,constructor_nationality,driver_nationality,driver_wins,driver_age,results_positionOrder
0,880,2013,-1.264318,1.0,0.0,0.0,0.0,0.0,-0.671359,-0.965034,...,0.176079,1,2,6.0,-0.310902,0.105238,33,-0.310902,0.666667,9
1,881,2013,-1.224938,0.0,0.0,1.0,1.0,1.0,-0.671359,-0.965034,...,1.000000,0,17,7.0,-0.310902,0.105238,33,-0.310902,0.666667,17
2,882,2013,-1.395410,1.0,0.0,0.0,0.0,0.0,-0.671359,-0.965034,...,0.187334,1,9,5.0,-0.310902,0.105238,33,-0.310902,0.666667,5
3,883,2013,-1.495754,1.0,0.0,0.0,0.0,0.0,-0.671359,-0.965034,...,0.186830,1,5,6.0,-0.310902,0.105238,33,-0.310902,0.666667,10
4,884,2013,-0.691174,1.0,0.0,0.0,0.0,0.0,-0.671359,-0.965034,...,0.194470,1,28,6.0,-0.310902,0.105238,33,-0.310902,0.666667,8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2499,567,1976,1.281412,1.0,0.0,0.0,0.0,0.0,0.129289,-0.955050,...,1.000000,0,10,17.0,-0.310902,0.105238,33,-0.310902,0.541667,20
2500,571,1976,-0.613072,0.0,0.0,1.0,0.0,0.0,0.645484,1.129065,...,1.000000,0,21,10.0,-0.310902,0.105238,29,-0.310902,0.791667,20
2501,563,1976,0.861440,1.0,0.0,0.0,0.0,0.0,0.129289,-0.595194,...,1.000000,0,28,15.0,-0.310902,0.105238,28,-0.310902,0.583333,20
2502,574,1976,-0.072666,1.0,0.0,0.0,0.0,0.0,0.129289,0.043700,...,1.000000,0,34,14.0,-0.310902,0.076599,2,-0.310902,0.375000,14


In [185]:
class customCrossValidation():
    """Random cross-validation splitter that holds out whole years.

    Each fold picks a random subset of the distinct ``year`` values as the
    validation set and uses the remaining years for training, so rows that
    share a year never end up on both sides of a split.

    Parameters
    ----------
    n_splits : int, default 5
        Number of (train, validation) pairs to generate.
    n_val_years : int, default 6
        Number of distinct years held out for validation in every fold.
    random_state : int or None, default None
        Seed for a private ``numpy.random.RandomState``.  ``None`` uses the
        global NumPy generator (reproducible only via ``np.random.seed``).
    """

    def __init__(self, n_splits=5, n_val_years=6, random_state=None):
        self.n_splits = n_splits
        self.n_val_years = n_val_years
        self.random_state = random_state

    def split(self, x):
        """Build the list of folds for ``x``.

        Parameters
        ----------
        x : pandas.DataFrame
            Must contain a ``year`` column.

        Returns
        -------
        list of (numpy.ndarray, numpy.ndarray)
            ``(train_positions, validation_positions)`` per fold.  The arrays
            hold *positional* row numbers (0 .. len(x) - 1), which is what
            scikit-learn expects from a ``cv`` iterable; index labels are not
            used because they may have gaps (e.g. after ``drop_duplicates``).

        Raises
        ------
        ValueError
            If ``n_val_years`` would leave the training or validation side
            without any year.
        """
        year_column = x['year']
        unique_years = year_column.unique()
        if not 0 < self.n_val_years < len(unique_years):
            raise ValueError(
                "n_val_years must be between 1 and the number of distinct years minus 1"
            )

        # Fall back to the module-level generator so np.random.seed() still applies.
        if self.random_state is None:
            rng = np.random
        else:
            rng = np.random.RandomState(self.random_state)

        folds = []
        for _ in range(self.n_splits):
            # Shuffle a fresh copy so every fold starts from the same base order.
            years = unique_years.copy()
            rng.shuffle(years)
            val_mask = year_column.isin(years[:self.n_val_years]).to_numpy()
            train_mask = year_column.isin(years[self.n_val_years:]).to_numpy()
            folds.append((np.flatnonzero(train_mask), np.flatnonzero(val_mask)))
        return folds

##### Ridge Regression

In [186]:
class linearRegression() :
    """Ridge regression wrapper: per-race podium scoring, grid search, final fit.

    Attributes
    ----------
    podium_positions : tuple
        Finishing positions counted as a positive ("podium") label.
    n_predicted : int
        Number of best-ranked drivers per race flagged as predicted positives.
        With the default of 1 only the predicted winner is flagged, so the
        per-race recall cannot exceed 1/3 when three podium rows are present.
    ridge_metrics : dict
        Written by ``my_scoring``: mean per-race precision/accuracy/recall/f1.
    ridge_params : dict
        Written by ``find_best_param_ridge``: best grid-search parameters.
    model : Ridge
        Written by ``fit_ridge``.
    """

    podium_positions = (1, 2, 3)
    n_predicted = 1

    def my_scoring(self, model, x, y):
        """Score ``model`` race by race and return the mean precision.

        For every ``raceId`` the rows are ranked by predicted value
        (ascending, i.e. lowest prediction first); the first ``n_predicted``
        rows are flagged as predicted positives and compared with the rows
        whose true value is in ``podium_positions``.

        The signature matches scikit-learn's callable-scorer protocol
        ``(estimator, X, y)``, so it can be passed as ``scoring=``.

        Parameters
        ----------
        model : fitted estimator exposing ``predict``.
        x : pandas.DataFrame with a ``raceId`` column.
        y : pandas.Series of true finishing positions, indexed like ``x``.

        Returns
        -------
        float
            Precision averaged over races.  All four averaged metrics are
            also stored in ``self.ridge_metrics`` (overwritten on each call).

        Raises
        ------
        ValueError
            If ``x`` contains no races.
        """
        race_ids = x['raceId']
        races = race_ids.unique()
        n_races = len(races)
        if n_races == 0:
            raise ValueError("x contains no races to score")

        totals = {'precision': 0.0, 'accuracy': 0.0, 'recall': 0.0, 'f1': 0.0}
        podium = list(self.podium_positions)
        for race in races:
            race_x = x[race_ids == race]
            ranking = pd.DataFrame(model.predict(race_x), columns=['results'])
            ranking['podium'] = y.loc[race_x.index].to_numpy()
            ranking['actual'] = ranking['podium'].isin(podium).astype(int)
            ranking = ranking.sort_values('results', ascending=True).reset_index(drop=True)
            # After the sort, row position == predicted rank within the race.
            ranking['predicted'] = (ranking.index < self.n_predicted).astype(int)

            actual = ranking['actual']
            predicted = ranking['predicted']
            # zero_division=0: a race without podium rows scores 0 without warnings.
            totals['precision'] += precision_score(actual, predicted, zero_division=0)
            totals['accuracy'] += accuracy_score(actual, predicted)
            totals['recall'] += recall_score(actual, predicted, zero_division=0)
            totals['f1'] += f1_score(actual, predicted, zero_division=0)

        self.ridge_metrics = {name: total / n_races for name, total in totals.items()}
        return self.ridge_metrics['precision']

    def find_best_param_ridge(self, x, y):
        """Grid-search Ridge ``alpha``/``solver`` using year-grouped folds.

        Stores ``x`` and ``y`` on the instance and writes the winning
        parameter combination to ``self.ridge_params``.  ``my_scoring`` is the
        scorer, so ``self.ridge_metrics`` is overwritten during the search.
        """
        self.x = x
        self.y = y

        folds = customCrossValidation().split(x)
        param_grid = [{
            'alpha': [0.001, 0.01, 0.1, 1, 5, 10, 100, 1000],
            'solver': ['svd', 'cholesky', 'saga'],
        }]

        search = GridSearchCV(estimator=Ridge(), param_grid=param_grid, scoring=self.my_scoring,
                              cv=folds, return_train_score=True, verbose=3)
        search.fit(x, y)
        self.ridge_params = search.best_params_

    def fit_ridge(self, x, y):
        """Fit a Ridge model with ``self.ridge_params`` and keep it in ``self.model``.

        Raises
        ------
        AttributeError
            If ``find_best_param_ridge`` has not populated ``ridge_params``.
        """
        params = getattr(self, 'ridge_params', None)
        if params is None:
            raise AttributeError("ridge_params is not set; call find_best_param_ridge() first")
        fitted = Ridge(**params)
        fitted.fit(x, y)
        self.model = fitted
        


In [187]:
# Wrapper instance that will hold the tuned Ridge parameters, the fitted model and the metrics.
lr = linearRegression()

In [189]:
# Grid-search Ridge hyper-parameters: features are every column except the
# target 'results_positionOrder'; the result is stored in lr.ridge_params.
lr.find_best_param_ridge(df_train.drop(['results_positionOrder'], axis = 1), df_train['results_positionOrder'])

Fitting 5 folds for each of 24 candidates, totalling 120 fits
[CV 1/5] END alpha=0.001, solver=svd;, score=(train=0.826, test=0.886) total time=   0.5s
[CV 2/5] END alpha=0.001, solver=svd;, score=(train=0.831, test=0.914) total time=   0.4s
[CV 3/5] END alpha=0.001, solver=svd;, score=(train=0.835, test=0.806) total time=   0.6s
[CV 4/5] END alpha=0.001, solver=svd;, score=(train=0.834, test=0.753) total time=   0.6s
[CV 5/5] END alpha=0.001, solver=svd;, score=(train=0.829, test=0.912) total time=   0.5s
[CV 1/5] END alpha=0.001, solver=cholesky;, score=(train=0.826, test=0.886) total time=   0.4s
[CV 2/5] END alpha=0.001, solver=cholesky;, score=(train=0.831, test=0.914) total time=   0.4s
[CV 3/5] END alpha=0.001, solver=cholesky;, score=(train=0.835, test=0.806) total time=   0.4s
[CV 4/5] END alpha=0.001, solver=cholesky;, score=(train=0.834, test=0.753) total time=   0.4s
[CV 5/5] END alpha=0.001, solver=cholesky;, score=(train=0.829, test=0.912) total time=   0.4s
[CV 1/5] END 

In [190]:
# Show the parameter combination selected by the grid search (GridSearchCV.best_params_).
lr.ridge_params

{'alpha': 1000, 'solver': 'svd'}

In [191]:
# Fit Ridge on the full training frame with the selected parameters; the model is kept in lr.model.
lr.fit_ridge(df_train.drop(['results_positionOrder'], axis = 1), df_train['results_positionOrder'])

In [192]:
# Evaluate on the test frame: returns the mean per-race precision and stores
# precision/accuracy/recall/f1 in lr.ridge_metrics.
lr.my_scoring(lr.model, df_test.drop(['results_positionOrder'], axis = 1), df_test['results_positionOrder'])

0.7830188679245284

In [194]:
# One-row table of the metrics recorded by the most recent lr.my_scoring call.
pd.DataFrame(lr.ridge_metrics, index = ['ridge'], columns = ['precision', 'accuracy', 'recall', 'f1'])

Unnamed: 0,precision,accuracy,recall,f1
ridge,0.783019,0.895487,0.261006,0.391509


##### Lasso Regression

In [198]:
class lassoRegression():
    """Lasso regression wrapper: per-race podium scoring, grid search, final fit.

    Attributes
    ----------
    podium_positions : tuple
        Finishing positions counted as a positive ("podium") label.
    n_predicted : int
        Number of best-ranked drivers per race flagged as predicted positives.
        With the default of 1 only the predicted winner is flagged, so the
        per-race recall cannot exceed 1/3 when three podium rows are present.
    lasso_metrics : dict
        Written by ``my_scoring``: mean per-race precision/accuracy/recall/f1.
    ridge_metrics : dict
        Same object as ``lasso_metrics``; kept under the historical
        (misleading) name so existing callers keep working.
    lasso_params : dict
        Written by ``find_best_param_lasso``: best grid-search parameters.
    model : Lasso
        Written by ``fit_lasso``.
    """

    podium_positions = (1, 2, 3)
    n_predicted = 1

    def my_scoring(self, model, x, y):
        """Score ``model`` race by race and return the mean precision.

        For every ``raceId`` the rows are ranked by predicted value
        (ascending, i.e. lowest prediction first); the first ``n_predicted``
        rows are flagged as predicted positives and compared with the rows
        whose true value is in ``podium_positions``.

        The signature matches scikit-learn's callable-scorer protocol
        ``(estimator, X, y)``, so it can be passed as ``scoring=``.

        Parameters
        ----------
        model : fitted estimator exposing ``predict``.
        x : pandas.DataFrame with a ``raceId`` column.
        y : pandas.Series of true finishing positions, indexed like ``x``.

        Returns
        -------
        float
            Precision averaged over races.  All four averaged metrics are
            stored in ``self.lasso_metrics`` / ``self.ridge_metrics``
            (overwritten on each call).

        Raises
        ------
        ValueError
            If ``x`` contains no races.
        """
        race_ids = x['raceId']
        races = race_ids.unique()
        n_races = len(races)
        if n_races == 0:
            raise ValueError("x contains no races to score")

        totals = {'precision': 0.0, 'accuracy': 0.0, 'recall': 0.0, 'f1': 0.0}
        podium = list(self.podium_positions)
        for race in races:
            race_x = x[race_ids == race]
            ranking = pd.DataFrame(model.predict(race_x), columns=['results'])
            ranking['podium'] = y.loc[race_x.index].to_numpy()
            ranking['actual'] = ranking['podium'].isin(podium).astype(int)
            ranking = ranking.sort_values('results', ascending=True).reset_index(drop=True)
            # After the sort, row position == predicted rank within the race.
            ranking['predicted'] = (ranking.index < self.n_predicted).astype(int)

            actual = ranking['actual']
            predicted = ranking['predicted']
            # zero_division=0: a race without podium rows scores 0 without warnings.
            totals['precision'] += precision_score(actual, predicted, zero_division=0)
            totals['accuracy'] += accuracy_score(actual, predicted)
            totals['recall'] += recall_score(actual, predicted, zero_division=0)
            totals['f1'] += f1_score(actual, predicted, zero_division=0)

        self.lasso_metrics = {name: total / n_races for name, total in totals.items()}
        # Backward-compatible alias: earlier code read the metrics from `ridge_metrics`.
        self.ridge_metrics = self.lasso_metrics
        return self.lasso_metrics['precision']

    def find_best_param_lasso(self, x, y):
        """Grid-search Lasso ``alpha``/``selection`` using year-grouped folds.

        Stores ``x`` and ``y`` on the instance and writes the winning
        parameter combination to ``self.lasso_params``.  ``my_scoring`` is the
        scorer, so the stored metrics are overwritten during the search.
        """
        self.x = x
        self.y = y

        folds = customCrossValidation().split(x)
        param_grid = [{
            'alpha': [0.1, 1, 5, 10, 100, 1000, 2000],
            'selection': ['cyclic', 'random'],
        }]

        search = GridSearchCV(estimator=Lasso(), param_grid=param_grid, scoring=self.my_scoring,
                              cv=folds, return_train_score=True, verbose=3)
        search.fit(x, y)
        self.lasso_params = search.best_params_

    def fit_lasso(self, x, y):
        """Fit a Lasso model with ``self.lasso_params`` and keep it in ``self.model``.

        Raises
        ------
        AttributeError
            If ``find_best_param_lasso`` has not populated ``lasso_params``.
        """
        params = getattr(self, 'lasso_params', None)
        if params is None:
            raise AttributeError("lasso_params is not set; call find_best_param_lasso() first")
        fitted = Lasso(**params)
        fitted.fit(x, y)
        self.model = fitted
    

In [199]:
# Wrapper instance that will hold the tuned Lasso parameters, the fitted model and the metrics.
lasso = lassoRegression()

In [200]:
# Grid-search Lasso hyper-parameters: features are every column except the
# target 'results_positionOrder'; the result is stored in lasso.lasso_params.
lasso.find_best_param_lasso(df_train.drop(['results_positionOrder'], axis = 1), df_train['results_positionOrder'])

Fitting 5 folds for each of 14 candidates, totalling 70 fits
[CV 1/5] END alpha=0.1, selection=cyclic;, score=(train=0.822, test=0.973) total time=   0.5s
[CV 2/5] END alpha=0.1, selection=cyclic;, score=(train=0.841, test=0.857) total time=   0.5s
[CV 3/5] END alpha=0.1, selection=cyclic;, score=(train=0.833, test=0.903) total time=   0.4s
[CV 4/5] END alpha=0.1, selection=cyclic;, score=(train=0.831, test=0.852) total time=   0.5s
[CV 5/5] END alpha=0.1, selection=cyclic;, score=(train=0.843, test=0.797) total time=   0.5s
[CV 1/5] END alpha=0.1, selection=random;, score=(train=0.822, test=0.973) total time=   0.4s
[CV 2/5] END alpha=0.1, selection=random;, score=(train=0.841, test=0.857) total time=   0.4s
[CV 3/5] END alpha=0.1, selection=random;, score=(train=0.833, test=0.903) total time=   0.3s
[CV 4/5] END alpha=0.1, selection=random;, score=(train=0.831, test=0.852) total time=   0.4s
[CV 5/5] END alpha=0.1, selection=random;, score=(train=0.843, test=0.797) total time=   0.6s

In [201]:
# Show the parameter combination selected by the grid search (GridSearchCV.best_params_).
lasso.lasso_params

{'alpha': 1, 'selection': 'cyclic'}

In [202]:
# Fit Lasso on the full training frame with the selected parameters; the model is kept in lasso.model.
lasso.fit_lasso(df_train.drop(['results_positionOrder'], axis = 1), df_train['results_positionOrder'])

In [203]:
# Evaluate on the test frame: returns the mean per-race precision and records
# precision/accuracy/recall/f1 on the instance.
lasso.my_scoring(lasso.model, df_test.drop(['results_positionOrder'], axis = 1), df_test['results_positionOrder'])

0.6981132075471698

In [204]:
# One-row table of the Lasso metrics. NOTE: despite its name, `ridge_metrics`
# is the attribute lassoRegression.my_scoring writes its results to.
pd.DataFrame(lasso.ridge_metrics, index = ['lasso'], columns = ['precision', 'accuracy', 'recall', 'f1'])

Unnamed: 0,precision,accuracy,recall,f1
lasso,0.698113,0.88822,0.232704,0.349057


##### Naive Bayes (GaussianNB classifier)

In [None]:
class NaiveBayesRegressor() :
    """GaussianNB wrapper: per-race podium scoring, grid search, final fit.

    ``GaussianNB`` is a classifier, so the target values passed to
    ``fit_nb`` / ``find_best_param_nb`` are treated as class labels and the
    predicted label is what ``my_scoring`` ranks on.

    Attributes
    ----------
    podium_positions : tuple
        Finishing positions counted as a positive ("podium") label.
    n_predicted : int
        Number of best-ranked drivers per race flagged as predicted positives.
        With the default of 1 only the predicted winner is flagged, so the
        per-race recall cannot exceed 1/3 when three podium rows are present.
    nb_metrics : dict
        Written by ``my_scoring``: mean per-race precision/accuracy/recall/f1.
    ridge_metrics : dict
        Same object as ``nb_metrics``; kept under the historical name.
    nb_params : dict
        Written by ``find_best_param_nb``: best grid-search parameters.
    model : estimator
        Written by ``fit_nb`` (or by the legacy ``fit_ridge``).
    """

    podium_positions = (1, 2, 3)
    n_predicted = 1

    def my_scoring(self, model, x, y):
        """Score ``model`` race by race and return the mean precision.

        For every ``raceId`` the rows are ranked by predicted value
        (ascending, i.e. lowest prediction first); the first ``n_predicted``
        rows are flagged as predicted positives and compared with the rows
        whose true value is in ``podium_positions``.

        The signature matches scikit-learn's callable-scorer protocol
        ``(estimator, X, y)``, so it can be passed as ``scoring=``.

        Parameters
        ----------
        model : fitted estimator exposing ``predict``.
        x : pandas.DataFrame with a ``raceId`` column.
        y : pandas.Series of true finishing positions, indexed like ``x``.

        Returns
        -------
        float
            Precision averaged over races.  All four averaged metrics are
            stored in ``self.nb_metrics`` / ``self.ridge_metrics``
            (overwritten on each call).

        Raises
        ------
        ValueError
            If ``x`` contains no races.
        """
        race_ids = x['raceId']
        races = race_ids.unique()
        n_races = len(races)
        if n_races == 0:
            raise ValueError("x contains no races to score")

        totals = {'precision': 0.0, 'accuracy': 0.0, 'recall': 0.0, 'f1': 0.0}
        podium = list(self.podium_positions)
        for race in races:
            race_x = x[race_ids == race]
            ranking = pd.DataFrame(model.predict(race_x), columns=['results'])
            ranking['podium'] = y.loc[race_x.index].to_numpy()
            ranking['actual'] = ranking['podium'].isin(podium).astype(int)
            ranking = ranking.sort_values('results', ascending=True).reset_index(drop=True)
            # After the sort, row position == predicted rank within the race.
            ranking['predicted'] = (ranking.index < self.n_predicted).astype(int)

            actual = ranking['actual']
            predicted = ranking['predicted']
            # zero_division=0: a race without podium rows scores 0 without warnings.
            totals['precision'] += precision_score(actual, predicted, zero_division=0)
            totals['accuracy'] += accuracy_score(actual, predicted)
            totals['recall'] += recall_score(actual, predicted, zero_division=0)
            totals['f1'] += f1_score(actual, predicted, zero_division=0)

        self.nb_metrics = {name: total / n_races for name, total in totals.items()}
        # Backward-compatible alias: the metrics used to live under `ridge_metrics`.
        self.ridge_metrics = self.nb_metrics
        return self.nb_metrics['precision']

    # NOTE(review): the two Ridge helpers below look like copy-paste leftovers
    # from the Ridge wrapper; they are kept so existing callers do not break.
    def find_best_param_ridge(self, x, y):
        """Grid-search Ridge ``alpha``/``solver``; result goes to ``self.ridge_params``."""
        self.x = x
        self.y = y

        folds = customCrossValidation().split(x)
        param_grid = [{
            'alpha': [0.001, 0.01, 0.1, 1, 5, 10, 100, 1000],
            'solver': ['svd', 'cholesky', 'saga'],
        }]

        search = GridSearchCV(estimator=Ridge(), param_grid=param_grid, scoring=self.my_scoring,
                              cv=folds, return_train_score=True, verbose=3)
        search.fit(x, y)
        self.ridge_params = search.best_params_

    def fit_ridge(self, x, y):
        """Fit a Ridge model with ``self.ridge_params`` and keep it in ``self.model``."""
        fitted = Ridge(**self.ridge_params)
        fitted.fit(x, y)
        self.model = fitted

    def find_best_param_nb(self, x, y):
        """Grid-search GaussianNB ``var_smoothing`` using year-grouped folds.

        Stores ``x`` and ``y`` on the instance and writes the winning
        parameter combination to ``self.nb_params``.  ``my_scoring`` is the
        scorer, so the stored metrics are overwritten during the search.
        """
        self.x = x
        self.y = y

        folds = customCrossValidation().split(x)
        param_grid = [{'var_smoothing': [1e-9, 1e-8, 1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1]}]

        search = GridSearchCV(estimator=GaussianNB(), param_grid=param_grid, scoring=self.my_scoring,
                              cv=folds, return_train_score=True, verbose=3)
        search.fit(x, y)
        self.nb_params = search.best_params_

    def fit_nb(self, x, y):
        """Fit GaussianNB with ``self.nb_params`` and keep it in ``self.model``.

        Raises
        ------
        AttributeError
            If ``find_best_param_nb`` has not populated ``nb_params``.
        """
        params = getattr(self, 'nb_params', None)
        if params is None:
            raise AttributeError("nb_params is not set; call find_best_param_nb() first")
        fitted = GaussianNB(**params)
        fitted.fit(x, y)
        self.model = fitted