In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
import plotly.express as px
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, accuracy_score, make_scorer
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.model_selection import KFold, cross_val_score, cross_val_predict, GridSearchCV
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

In [2]:
df_train = pd.read_csv('./data_new/train_pre.csv').drop_duplicates()
df_test = pd.read_csv('./data_new/test_pre.csv').drop_duplicates()
df_test

Unnamed: 0,raceId,year,circuitId,weather_warm,weather_cold,weather_dry,weather_wet,weather_cloudy,driverId,constructorId,...,milliseconds,statusId,circuit_country,constructor_position,constructor_wins,constructor_nationality,driver_nationality,driver_wins,driver_age,results_positionOrder
0,880,2013,-1.264318,1.0,0.0,0.0,0.0,0.0,-0.671359,-0.965034,...,0.176079,1,2,6.0,-0.310902,0.105238,33,-0.310902,0.666667,9
1,881,2013,-1.224938,0.0,0.0,1.0,1.0,1.0,-0.671359,-0.965034,...,1.000000,0,17,7.0,-0.310902,0.105238,33,-0.310902,0.666667,17
2,882,2013,-1.395410,1.0,0.0,0.0,0.0,0.0,-0.671359,-0.965034,...,0.187334,1,9,5.0,-0.310902,0.105238,33,-0.310902,0.666667,5
3,883,2013,-1.495754,1.0,0.0,0.0,0.0,0.0,-0.671359,-0.965034,...,0.186830,1,5,6.0,-0.310902,0.105238,33,-0.310902,0.666667,10
4,884,2013,-0.691174,1.0,0.0,0.0,0.0,0.0,-0.671359,-0.965034,...,0.194470,1,28,6.0,-0.310902,0.105238,33,-0.310902,0.666667,8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2499,567,1976,1.281412,1.0,0.0,0.0,0.0,0.0,0.129289,-0.955050,...,1.000000,0,10,17.0,-0.310902,0.105238,33,-0.310902,0.541667,20
2500,571,1976,-0.613072,0.0,0.0,1.0,0.0,0.0,0.645484,1.129065,...,1.000000,0,21,10.0,-0.310902,0.105238,29,-0.310902,0.791667,20
2501,563,1976,0.861440,1.0,0.0,0.0,0.0,0.0,0.129289,-0.595194,...,1.000000,0,28,15.0,-0.310902,0.105238,28,-0.310902,0.583333,20
2502,574,1976,-0.072666,1.0,0.0,0.0,0.0,0.0,0.129289,0.043700,...,1.000000,0,34,14.0,-0.310902,0.076599,2,-0.310902,0.375000,14


In [3]:
class customCrossValidation():

    def split(self, x):
        indices = []
        for i in range(5):
            years = x['year'].unique()
            np.random.shuffle(years)
            val_years = years[:6]
            train_years = years[6:]

            indices.append((x[x['year'].isin(train_years)].index, x[x['year'].isin(val_years)].index))
        return indices

##### Ridge Regression

In [35]:
class linearRegression() :
    
    def my_scoring(self, model, x, y):

        precision = 0
        accuracy = 0
        recall = 0
        f1 = 0
        for i in x['raceId'].unique():
            prediction_df = pd.DataFrame(model.predict(x[x['raceId'] == i]), columns = ['results'])
            # print(np.unique(np.round(prediction_df['results'].values)))
            Y_test = y[x[x['raceId'] == i].index]

            
            prediction_df['podium'] = Y_test.reset_index(drop = True)
            prediction_df['actual'] = prediction_df.podium.map(lambda x: 1 if x in [1, 2, 3] else 0)
            prediction_df.sort_values('results', ascending = True, inplace = True)
            prediction_df.reset_index(inplace = True, drop = True)
            prediction_df['predicted'] = prediction_df.index
            prediction_df['predicted'] = prediction_df.predicted.map(lambda x: 1 if x in [0, 1, 2] else 0)

            precision += precision_score(prediction_df.actual, prediction_df.predicted)
            accuracy += accuracy_score(prediction_df.actual, prediction_df.predicted)
            recall += recall_score(prediction_df.actual, prediction_df.predicted)
            f1 += f1_score(prediction_df.actual, prediction_df.predicted)
        
        self.ridge_metrics = {'precision': precision/len(x['raceId'].unique()), 'accuracy': accuracy/len(x['raceId'].unique()), 'recall': recall/len(x['raceId'].unique()), 'f1': f1/len(x['raceId'].unique())}
        return precision/len(x['raceId'].unique())

    def find_best_param_ridge(self, x, y):

        self.x = x
        self.y = y

        splitter = customCrossValidation().split(x)
        
        # hyper_params = [{'alpha': [0.001, 0.01, 0.1, 1, 5, 10, 100, 1000], 'solver': ['svd', 'cholesky', 'saga']}]
        hyper_params = [{'alpha': [1000], 'solver': ['svd']}]

        model_ridge = Ridge()
        model_cv = GridSearchCV(estimator=model_ridge, param_grid=hyper_params, scoring=self.my_scoring, cv = splitter, return_train_score=True, verbose = 3)
        model_cv.fit(x, y)
        self.ridge_params = model_cv.best_params_

    def fit_ridge(self, x, y):
        model = Ridge(**self.ridge_params)
        model.fit(x, y)
        self.model = model
        return
        


In [36]:
lr = linearRegression()

In [37]:
# print(df_train)
lr.find_best_param_ridge(df_train.drop(['results_positionOrder'], axis = 1), df_train['results_positionOrder'])

Fitting 5 folds for each of 1 candidates, totalling 5 fits
[ 4.  5.  6.  7.  8. 13. 14. 16.]
[ 4.  5.  6.  7. 12. 13. 14. 15.]
[ 4.  5.  6.  7.  8. 13. 14. 15.]
[ 3.  4.  5.  6.  7. 13. 14. 15.]
[ 4.  5.  6. 11. 13. 14. 15. 16.]
[ 3.  5.  6.  7.  8. 11. 12. 13. 14. 15. 16.]
[ 3.  4.  5.  6.  7. 12. 14. 15. 16.]
[ 3.  4.  5.  6.  7. 11. 13. 14. 15. 16.]
[ 3.  4.  5.  6.  7.  8.  9. 12. 13. 15.]
[ 3.  4.  5.  6.  7. 11. 13. 14. 15. 16.]
[ 3.  4.  5.  6.  7.  8. 11. 14. 15. 16.]
[ 4.  5.  6.  7.  8. 10. 11. 13. 14. 15.]
[ 3.  4.  5.  6.  7.  9. 12. 13. 14. 15. 16.]
[ 3.  4.  5.  6.  7.  8. 12. 13. 14. 15. 16.]
[ 2.  4.  5.  6.  7.  8. 12. 14. 15. 16.]
[ 2.  3.  4.  5.  6.  7.  8.  9. 13. 14. 15. 16.]
[ 3.  4.  5.  6.  7.  8. 11. 12. 14. 15.]
[ 1.  4.  6.  7.  8. 13. 14. 15. 16. 17.]
[ 3.  4.  5. 13. 14. 15. 16. 17.]
[ 0.  5.  6.  7.  8. 11. 14. 15. 16. 17.]
[ 3.  5.  6.  7.  8.  9. 10. 15. 16. 17.]
[ 3.  5.  6.  7.  8. 15. 16. 17.]
[ 2.  5.  6.  7.  8.  9. 10. 12. 14. 15. 16.]
[ 3.  4.  5

KeyboardInterrupt: 

In [25]:
lr.ridge_params

{'alpha': 1000, 'solver': 'svd'}

In [26]:
lr.fit_ridge(df_train.drop(['results_positionOrder'], axis = 1), df_train['results_positionOrder'])

In [27]:
lr.my_scoring(lr.model, df_test.drop(['results_positionOrder'], axis = 1), df_test['results_positionOrder'])

0.6006289308176098

In [28]:
pd.DataFrame(lr.ridge_metrics, index = ['ridge'], columns = ['precision', 'accuracy', 'recall', 'f1'])

Unnamed: 0,precision,accuracy,recall,f1
ridge,0.600629,0.89656,0.600629,0.600629


##### Lasso Regression

In [198]:
class lassoRegression():
    def my_scoring(self, model, x, y):

        precision = 0
        accuracy = 0
        recall = 0
        f1 = 0
        for i in x['raceId'].unique():
            prediction_df = pd.DataFrame(model.predict(x[x['raceId'] == i]), columns = ['results'])
            Y_test = y[x[x['raceId'] == i].index]

            
            prediction_df['podium'] = Y_test.reset_index(drop = True)
            prediction_df['actual'] = prediction_df.podium.map(lambda x: 1 if x in [1, 2, 3] else 0)
            prediction_df.sort_values('results', ascending = True, inplace = True)
            prediction_df.reset_index(inplace = True, drop = True)
            prediction_df['predicted'] = prediction_df.index
            prediction_df['predicted'] = prediction_df.predicted.map(lambda x: 1 if x == 0 else 0)

            precision += precision_score(prediction_df.actual, prediction_df.predicted)
            accuracy += accuracy_score(prediction_df.actual, prediction_df.predicted)
            recall += recall_score(prediction_df.actual, prediction_df.predicted)
            f1 += f1_score(prediction_df.actual, prediction_df.predicted)
        
        self.ridge_metrics = {'precision': precision/len(x['raceId'].unique()), 'accuracy': accuracy/len(x['raceId'].unique()), 'recall': recall/len(x['raceId'].unique()), 'f1': f1/len(x['raceId'].unique())}
        return precision/len(x['raceId'].unique())

    def find_best_param_lasso(self, x, y):

        self.x = x
        self.y = y

        splitter = customCrossValidation().split(x)
        
        hyper_params = [{'alpha': [0.1, 1, 5, 10, 100, 1000, 2000], 'selection': ['cyclic', 'random']}]

        model_lasso = Lasso()
        model_cv = GridSearchCV(estimator=model_lasso, param_grid=hyper_params, scoring=self.my_scoring, cv = splitter, return_train_score=True, verbose = 3)
        model_cv.fit(x, y)
        self.lasso_params = model_cv.best_params_

    def fit_lasso(self, x, y):
        model = Lasso(**self.lasso_params)
        model.fit(x, y)
        self.model = model
        return
    

In [199]:
lasso = lassoRegression()

In [200]:
lasso.find_best_param_lasso(df_train.drop(['results_positionOrder'], axis = 1), df_train['results_positionOrder'])

Fitting 5 folds for each of 14 candidates, totalling 70 fits
[CV 1/5] END alpha=0.1, selection=cyclic;, score=(train=0.822, test=0.973) total time=   0.5s
[CV 2/5] END alpha=0.1, selection=cyclic;, score=(train=0.841, test=0.857) total time=   0.5s
[CV 3/5] END alpha=0.1, selection=cyclic;, score=(train=0.833, test=0.903) total time=   0.4s
[CV 4/5] END alpha=0.1, selection=cyclic;, score=(train=0.831, test=0.852) total time=   0.5s
[CV 5/5] END alpha=0.1, selection=cyclic;, score=(train=0.843, test=0.797) total time=   0.5s
[CV 1/5] END alpha=0.1, selection=random;, score=(train=0.822, test=0.973) total time=   0.4s
[CV 2/5] END alpha=0.1, selection=random;, score=(train=0.841, test=0.857) total time=   0.4s
[CV 3/5] END alpha=0.1, selection=random;, score=(train=0.833, test=0.903) total time=   0.3s
[CV 4/5] END alpha=0.1, selection=random;, score=(train=0.831, test=0.852) total time=   0.4s
[CV 5/5] END alpha=0.1, selection=random;, score=(train=0.843, test=0.797) total time=   0.6s

In [201]:
lasso.lasso_params

{'alpha': 1, 'selection': 'cyclic'}

In [202]:
lasso.fit_lasso(df_train.drop(['results_positionOrder'], axis = 1), df_train['results_positionOrder'])

In [203]:
lasso.my_scoring(lasso.model, df_test.drop(['results_positionOrder'], axis = 1), df_test['results_positionOrder'])

0.6981132075471698

In [204]:
pd.DataFrame(lasso.ridge_metrics, index = ['lasso'], columns = ['precision', 'accuracy', 'recall', 'f1'])

Unnamed: 0,precision,accuracy,recall,f1
lasso,0.698113,0.88822,0.232704,0.349057


##### Decision Tree Regression

In [221]:
class CustomDecisionTreeRegressor():
    def my_scoring(self, model, x, y):

        precision = 0
        accuracy = 0
        recall = 0
        f1 = 0
        for i in x['raceId'].unique():
            prediction_df = pd.DataFrame(model.predict(x[x['raceId'] == i]), columns = ['results'])
            Y_test = y[x[x['raceId'] == i].index]

            
            prediction_df['podium'] = Y_test.reset_index(drop = True)
            prediction_df['actual'] = prediction_df.podium.map(lambda x: 1 if x in [1, 2, 3] else 0)
            prediction_df.sort_values('results', ascending = True, inplace = True)
            prediction_df.reset_index(inplace = True, drop = True)
            prediction_df['predicted'] = prediction_df.index
            prediction_df['predicted'] = prediction_df.predicted.map(lambda x: 1 if x == 0 else 0)

            precision += precision_score(prediction_df.actual, prediction_df.predicted)
            accuracy += accuracy_score(prediction_df.actual, prediction_df.predicted)
            recall += recall_score(prediction_df.actual, prediction_df.predicted)
            f1 += f1_score(prediction_df.actual, prediction_df.predicted)
        
        self.dt_metrics = {'precision': precision/len(x['raceId'].unique()), 'accuracy': accuracy/len(x['raceId'].unique()), 'recall': recall/len(x['raceId'].unique()), 'f1': f1/len(x['raceId'].unique())}
        return precision/len(x['raceId'].unique())

    def find_best_param_dt(self, x, y):
            
            self.x = x
            self.y = y
    
            splitter = customCrossValidation().split(x)
            
            hyper_params = [{'criterion': ["squared_error", "friedman_mse", "absolute_error", "poisson"], 'splitter': ['best', 'random'], 'max_depth': [4, 5, 6, 7, 8, 9, 10], 'max_features': ['sqrt', 'log2']}]
    
            model_dt = DecisionTreeRegressor()
            model_cv = GridSearchCV(estimator=model_dt, param_grid=hyper_params, scoring=self.my_scoring, cv = splitter, return_train_score=True, verbose = 3)
            model_cv.fit(x, y)
            self.dt_params = model_cv.best_params_
    
    def fit_dt(self, x, y):
        model = DecisionTreeRegressor(**self.dt_params)
        model.fit(x, y)
        self.model = model
        return
            

In [222]:
dt_regressor = CustomDecisionTreeRegressor()

In [223]:
dt_regressor.find_best_param_dt(df_train.drop(['results_positionOrder'], axis = 1), df_train['results_positionOrder'])

Fitting 5 folds for each of 112 candidates, totalling 560 fits
[CV 1/5] END criterion=squared_error, max_depth=4, max_features=sqrt, splitter=best;, score=(train=0.876, test=0.769) total time=   0.5s
[CV 2/5] END criterion=squared_error, max_depth=4, max_features=sqrt, splitter=best;, score=(train=0.913, test=0.810) total time=   0.6s
[CV 3/5] END criterion=squared_error, max_depth=4, max_features=sqrt, splitter=best;, score=(train=0.905, test=0.918) total time=   0.5s
[CV 4/5] END criterion=squared_error, max_depth=4, max_features=sqrt, splitter=best;, score=(train=0.740, test=0.809) total time=   0.5s
[CV 5/5] END criterion=squared_error, max_depth=4, max_features=sqrt, splitter=best;, score=(train=0.877, test=0.828) total time=   0.5s
[CV 1/5] END criterion=squared_error, max_depth=4, max_features=sqrt, splitter=random;, score=(train=0.674, test=0.571) total time=   0.5s
[CV 2/5] END criterion=squared_error, max_depth=4, max_features=sqrt, splitter=random;, score=(train=0.494, test=

In [228]:
dt_regressor.dt_params

{'criterion': 'squared_error',
 'max_depth': 9,
 'max_features': 'sqrt',
 'splitter': 'best'}

In [225]:
dt_regressor.fit_dt(df_train.drop(['results_positionOrder'], axis = 1), df_train['results_positionOrder'])

In [226]:
dt_regressor.my_scoring(dt_regressor.model, df_test.drop(['results_positionOrder'], axis = 1), df_test['results_positionOrder'])

0.8301886792452831

In [227]:
pd.DataFrame(dt_regressor.dt_metrics, index = ['dt_regressor'], columns = ['precision', 'accuracy', 'recall', 'f1'])

Unnamed: 0,precision,accuracy,recall,f1
dt_regressor,0.830189,0.900052,0.27673,0.415094


##### Random Forest Regressor

In [233]:
class CustomRandomFroestRegressor():
    def my_scoring(self, model, x, y):

        precision = 0
        accuracy = 0
        recall = 0
        f1 = 0
        for i in x['raceId'].unique():
            prediction_df = pd.DataFrame(model.predict(x[x['raceId'] == i]), columns = ['results'])
            Y_test = y[x[x['raceId'] == i].index]

            
            prediction_df['podium'] = Y_test.reset_index(drop = True)
            prediction_df['actual'] = prediction_df.podium.map(lambda x: 1 if x in [1, 2, 3] else 0)
            prediction_df.sort_values('results', ascending = True, inplace = True)
            prediction_df.reset_index(inplace = True, drop = True)
            prediction_df['predicted'] = prediction_df.index
            prediction_df['predicted'] = prediction_df.predicted.map(lambda x: 1 if x == 0 else 0)

            precision += precision_score(prediction_df.actual, prediction_df.predicted)
            accuracy += accuracy_score(prediction_df.actual, prediction_df.predicted)
            recall += recall_score(prediction_df.actual, prediction_df.predicted)
            f1 += f1_score(prediction_df.actual, prediction_df.predicted)
        
        self.dt_metrics = {'precision': precision/len(x['raceId'].unique()), 'accuracy': accuracy/len(x['raceId'].unique()), 'recall': recall/len(x['raceId'].unique()), 'f1': f1/len(x['raceId'].unique())}
        return precision/len(x['raceId'].unique())

    def find_best_param_rf(self, x, y):
                
            self.x = x
            self.y = y
    
            splitter = customCrossValidation().split(x)
            
            hyper_params = [{'criterion': ["squared_error", "absolute_error", "poisson"], 'max_depth': [4, 5, 8, 9, 10], 'max_features': ['sqrt', 'log2'], 'n_estimators': [100, 150, 200, 300]}]
    
            model_rf = RandomForestRegressor()
            model_cv = GridSearchCV(estimator=model_rf, param_grid=hyper_params, scoring=self.my_scoring, cv = splitter, return_train_score=True, verbose = 3)
            model_cv.fit(x, y)
            self.rf_params = model_cv.best_params_

    def fit_rf(self, x, y):
        model = RandomForestRegressor(**self.rf_params)
        model.fit(x, y)
        self.model = model
        return

In [234]:
rf_regressor = CustomRandomFroestRegressor()

In [235]:
rf_regressor.find_best_param_rf(df_train.drop(['results_positionOrder'], axis = 1), df_train['results_positionOrder'])

Fitting 5 folds for each of 120 candidates, totalling 600 fits
[CV 1/5] END criterion=squared_error, max_depth=4, max_features=sqrt, n_estimators=100;, score=(train=0.920, test=0.932) total time=   1.5s
[CV 2/5] END criterion=squared_error, max_depth=4, max_features=sqrt, n_estimators=100;, score=(train=0.919, test=0.897) total time=   1.2s
[CV 3/5] END criterion=squared_error, max_depth=4, max_features=sqrt, n_estimators=100;, score=(train=0.913, test=0.935) total time=   1.2s
[CV 4/5] END criterion=squared_error, max_depth=4, max_features=sqrt, n_estimators=100;, score=(train=0.919, test=0.944) total time=   1.2s
[CV 5/5] END criterion=squared_error, max_depth=4, max_features=sqrt, n_estimators=100;, score=(train=0.921, test=0.925) total time=   1.2s
[CV 1/5] END criterion=squared_error, max_depth=4, max_features=sqrt, n_estimators=150;, score=(train=0.918, test=0.920) total time=   1.9s
[CV 2/5] END criterion=squared_error, max_depth=4, max_features=sqrt, n_estimators=150;, score=(t

KeyboardInterrupt: 

In [None]:
rf_regressor.rf_params

In [None]:
rf_regressor.fit_rf(df_train.drop(['results_positionOrder'], axis = 1), df_train['results_positionOrder'])

In [None]:
rf_regressor.my_scoring(rf_regressor.model, df_test.drop(['results_positionOrder'], axis = 1), df_test['results_positionOrder'])

In [None]:
pd.dataframe(rf_regressor.dt_metrics, index = ['rf_regressor'], columns = ['precision', 'accuracy', 'recall', 'f1'])