In [117]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
import plotly.express as px
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, accuracy_score, make_scorer
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.model_selection import KFold, cross_val_score, cross_val_predict, GridSearchCV
from sklearn.feature_selection import RFE
from sklearn.pipeline import Pipeline

In [2]:
df_train = pd.read_csv('./data/train.csv').drop_duplicates()
df_test = pd.read_csv('./data/test.csv').drop_duplicates()
df_test

Unnamed: 0,raceId,year,circuitId,weather_warm,weather_cold,weather_dry,weather_wet,weather_cloudy,driverId,constructorId,...,milliseconds,statusId,circuit_country,constructor_position,constructor_wins,constructor_nationality,driver_nationality,driver_wins,driver_age,results_positionOrder
0,900,2014,-1.383720,0.0,0.0,0.0,0.0,0.0,-0.908232,-0.904002,...,0.180212,1,2,1.0,-0.311669,0.035664,33,-0.311669,0.695652,3
1,901,2014,-1.338027,0.0,0.0,0.0,0.0,0.0,-0.908232,-0.904002,...,0.196924,1,17,2.0,-0.311669,0.035664,33,-0.311669,0.695652,6
2,903,2014,-1.468601,0.0,0.0,1.0,0.0,0.0,-0.908232,-0.904002,...,1.000000,0,9,5.0,-0.311669,0.035664,33,-0.311669,0.695652,11
3,902,2014,-1.639754,0.0,0.0,1.0,0.0,0.0,-0.908232,-0.904002,...,1.000000,0,5,3.0,-0.311669,0.035664,33,-0.311669,0.695652,17
4,904,2014,-0.814644,1.0,0.0,0.0,0.0,0.0,-0.908232,-0.904002,...,1.000000,0,28,6.0,-0.311669,0.035664,33,-0.311669,0.695652,11
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1809,760,1959,-0.780444,0.0,0.0,1.0,0.0,0.0,0.126607,0.043757,...,1.000000,0,33,4.0,-0.311669,0.035664,33,-0.311669,0.869565,28
1810,764,1959,-0.008501,1.0,0.0,0.0,0.0,0.0,0.126607,0.920124,...,1.000000,0,34,13.0,-0.311669,0.035664,34,-0.311669,0.391304,18
1811,764,1959,-0.008501,1.0,0.0,0.0,0.0,0.0,0.126607,0.027990,...,1.000000,0,34,10.0,-0.311669,0.035664,34,-0.311669,1.000000,16
1812,764,1959,-0.008501,1.0,0.0,0.0,0.0,0.0,0.126607,-0.303509,...,1.000000,0,34,9.0,-0.311669,-0.166354,34,-0.311669,1.000000,19


In [3]:
def linearRegression(X_train, Y_train, X_test, Y_test, i):
    folds = KFold(n_splits = 5, shuffle = True, random_state = 100)
    hyper_params = [{'n_features_to_select': list(range(1, 20))}]

    model = LinearRegression()
    # model.fit(X_train, Y_train)
    print(i)
    rfe = RFE(model)

    
    model_cv = GridSearchCV(estimator=rfe, param_grid=hyper_params, scoring='r2', cv = 5, return_train_score=True, verbose = 1)    
    predicted = model_cv.fit(X_train, Y_train).predict(X_test)
    # predicted = cross_val_predict(model, X_test, Y_test, cv = folds)

    return predicted

In [48]:
class customCrossValidation():

    def split(self, x):
        indices = []
        for i in range(5):
            years = x['year'].unique()
            np.random.shuffle(years)
            val_years = years[:6]
            train_years = years[6:]

            indices.append((x[x['year'].isin(train_years)].index, x[x['year'].isin(val_years)].index))
        return indices

In [166]:
class linearRegression() :

    def my_scoring(self, model, x, y):
        
        score = 0
        for i in x['raceId'].unique():
            # X_test = x[x['raceId'] == i].drop(['results_postio'])
            prediction_df = pd.DataFrame(model.predict(x[x['raceId'] == i]), columns = ['results'])
            # X_test = x[x['raceId'] == i].drop(columns = ['results_positionOrder'])
            Y_test = y[x[x['raceId'] == i].index]

            
            prediction_df['podium'] = Y_test.reset_index(drop = True)
            prediction_df['actual'] = prediction_df.podium.map(lambda x: 1 if x in [1, 2, 3] else 0)
            prediction_df.sort_values('results', ascending = True, inplace = True)
            prediction_df.reset_index(inplace = True, drop = True)
            prediction_df['predicted'] = prediction_df.index
            prediction_df['predicted'] = prediction_df.predicted.map(lambda x: 1 if x == 0 else 0)
            # print(prediction_df.actual, prediction_df.predicted)

            score += precision_score(prediction_df.actual, prediction_df.predicted)
        modelScore = score / x['raceId'].nunique()
        return modelScore

    def find_best_param_ridge(self, x, y):

        self.x = x
        self.y = y

        splitter = customCrossValidation().split(x)
        
        hyper_params = [{'alpha': [0.001, 0.01, 0.1, 0.5, 1, 5, 7, 10, 100, 1000], 'solver': ['svd', 'cholesky', 'lsqr', 'sparse_cg', 'saga']}]

        model_ridge = Ridge()
        model_cv = GridSearchCV(estimator=model_ridge, param_grid=hyper_params, scoring=self.my_scoring, cv = splitter, return_train_score=True, verbose = 3)
        model_cv.fit(x, y)
        self.ridge_params = model_cv.best_params_

    def fit_ridge(self, x, y):
        model = Ridge(**self.ridge_params)
        model.fit(x, y)
        self.model = model
        return

    def predict_ridge(self, x):
        return self.model.predict(x)
        


In [167]:
lr = linearRegression()

In [168]:
# print(df_train)
lr.find_best_param_ridge(df_train.drop(['results_positionOrder'], axis = 1), df_train['results_positionOrder'])

Fitting 5 folds for each of 50 candidates, totalling 250 fits
[CV 1/5] END alpha=0.001, solver=svd;, score=(train=0.748, test=0.667) total time=   0.3s
[CV 2/5] END alpha=0.001, solver=svd;, score=(train=0.751, test=0.486) total time=   0.3s
[CV 3/5] END alpha=0.001, solver=svd;, score=(train=0.732, test=0.707) total time=   0.2s
[CV 4/5] END alpha=0.001, solver=svd;, score=(train=0.730, test=0.742) total time=   0.2s
[CV 5/5] END alpha=0.001, solver=svd;, score=(train=0.731, test=0.789) total time=   0.3s
[CV 1/5] END alpha=0.001, solver=cholesky;, score=(train=0.748, test=0.667) total time=   0.2s
[CV 2/5] END alpha=0.001, solver=cholesky;, score=(train=0.751, test=0.486) total time=   0.3s
[CV 3/5] END alpha=0.001, solver=cholesky;, score=(train=0.732, test=0.707) total time=   0.2s
[CV 4/5] END alpha=0.001, solver=cholesky;, score=(train=0.730, test=0.742) total time=   0.2s
[CV 5/5] END alpha=0.001, solver=cholesky;, score=(train=0.731, test=0.789) total time=   0.4s
[CV 1/5] END 

In [169]:
lr.ridge_params

{'alpha': 1000, 'solver': 'saga'}

In [170]:
lr.fit_ridge(df_train.drop(['results_positionOrder'], axis = 1), df_train['results_positionOrder'])

In [171]:
lr.predict_ridge(df_test.drop(['results_positionOrder'], axis = 1))

array([ 5.01043596,  5.45527227, 12.99576305, ..., 13.58415996,
       15.62503674, 13.02132398])

In [172]:
def my_scoring(model, x, y):
        
    score = 0
    for i in x['raceId'].unique():
        # X_test = x[x['raceId'] == i].drop(['results_postio'])
        prediction_df = pd.DataFrame(model.predict(x[x['raceId'] == i]), columns = ['results'])
        # X_test = x[x['raceId'] == i].drop(columns = ['results_positionOrder'])
        Y_test = y[x[x['raceId'] == i].index]

        
        prediction_df['podium'] = Y_test.reset_index(drop = True)
        prediction_df['actual'] = prediction_df.podium.map(lambda x: 1 if x in [1, 2, 3] else 0)
        prediction_df.sort_values('results', ascending = True, inplace = True)
        prediction_df.reset_index(inplace = True, drop = True)
        prediction_df['predicted'] = prediction_df.index
        prediction_df['predicted'] = prediction_df.predicted.map(lambda x: 1 if x == 0 else 0)
        # print(prediction_df.actual, prediction_df.predicted)

        score += precision_score(prediction_df.actual, prediction_df.predicted)
    modelScore = score / x['raceId'].nunique()
    return modelScore

In [173]:
my_scoring(lr.model, df_test.drop(['results_positionOrder'], axis = 1), df_test['results_positionOrder'])

0.8769230769230769