In [24]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
import plotly.express as px
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

In [25]:
df_train = pd.read_csv('./data_new/train_pre.csv').drop_duplicates()
df_test = pd.read_csv('./data_new/test_pre.csv').drop_duplicates()

In [26]:
df_train.columns

Index(['raceId', 'year', 'circuitId', 'weather_warm', 'weather_cold',
       'weather_dry', 'weather_wet', 'weather_cloudy', 'driverId',
       'constructorId', 'grid', 'results_positionOrder', 'circuit_country',
       'constructor_wins', 'constructor_nationality', 'driver_nationality',
       'driver_wins', 'driver_age', 'results_points'],
      dtype='object')

In [27]:
df_train['results_positionOrder'].value_counts()

20    4976
4      980
3      978
11     978
8      977
5      977
7      977
6      977
9      976
2      976
12     975
10     975
1      972
13     971
14     967
15     965
16     953
17     946
18     932
19     910
Name: results_positionOrder, dtype: int64

## Regression

### Linear Regression

In [28]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold, cross_val_score, cross_val_predict, GridSearchCV
from sklearn.feature_selection import RFE
def linearRegression(X_train, Y_train):
    model = LinearRegression()
    model.fit(X_train, Y_train)        

    return model

In [29]:
def get_Linear_Regression_Score():
    # X_train = df_train.drop(columns = ['results_points'])
    # Y_train = df_train['results_points']
    # model = linearRegression(X_train, Y_train)

    # precision = 0
    # accuracy = 0
    # recall = 0
    # f1 = 0
    # for i in df_test['raceId'].unique():
    #     X_test = df_test[df_test['raceId'] == i].drop(columns = ['results_points'])
    #     Y_test = df_test[df_test['raceId'] == i]['results_points'].to_numpy()

    #     predicted_scores = model.predict(X_test)
    #     predicted_podium = np.argsort(predicted_scores)[::-1]
    #     actual_podium = np.argsort(Y_test)[::-1]
    #     # print(predicted_scores)

    #     predictions = np.zeros(len(Y_test))
    #     # predictions[predicted_podium] = 1
    #     actual = np.zeros(len(Y_test))
    #     # actual[actual_podium] = 1

    #     for i in range(len(Y_test)):
    #         predictions[predicted_podium[i]] = i + 1
    #         actual[actual_podium[i]] = i + 1

    #     precision += precision_score(actual, predictions, average = 'micro')
    #     accuracy += accuracy_score(actual, predictions)
    #     recall += recall_score(actual, predictions, average = 'micro')
    #     f1 += f1_score(actual, predictions, average = 'micro')  

    X_train = df_train.drop(columns = ['results_points'])
    Y_train = df_train['results_points']
    model = linearRegression(X_train.drop(columns = ['results_positionOrder']), Y_train)

    precision = 0
    accuracy = 0
    recall = 0
    f1 = 0
    for i in df_test['raceId'].unique():
        X_test = df_test[df_test['raceId'] == i].drop(columns = ['results_points'])
        Y_test = df_test[df_test['raceId'] == i]['results_points'].to_numpy()

        predicted_scores = model.predict(X_test.drop(columns = ['results_positionOrder']))
        predicted_podium = np.argsort(predicted_scores)[::-1]
        actual_podium = X_test['results_positionOrder'].to_numpy()
        actual_podium = np.argsort(actual_podium)
        # print(predicted_scores)

        predictions = np.zeros(len(Y_test))
        # predictions[predicted_podium] = 1
        actual = np.zeros(len(Y_test))
        # actual[actual_podium] = 1

        for i in range(3):
            predictions[predicted_podium[i]] = 1
            actual[actual_podium[i]] = 1

        precision += precision_score(actual, predictions)
        accuracy += accuracy_score(actual, predictions)
        recall += recall_score(actual, predictions)
        f1 += f1_score(actual, predictions)   


    return precision/len(df_test['raceId'].unique()), accuracy/len(df_test['raceId'].unique()), recall/len(df_test['raceId'].unique()), f1/len(df_test['raceId'].unique())


### Decision Tree regressor

In [30]:
from sklearn.tree import DecisionTreeRegressor
def DTregressor(X_train, Y_train):
    model = DecisionTreeRegressor()
    model.fit(X_train, Y_train)

    return model

In [31]:
from sklearn.metrics import precision_score

def get_DT_Regression_Score():
    X_train = df_train.drop(columns = ['results_points'])
    Y_train = df_train['results_points']
    model = DTregressor(X_train.drop(columns = ['results_positionOrder']), Y_train)

    precision = 0
    accuracy = 0
    recall = 0
    f1 = 0
    for i in df_test['raceId'].unique():
        X_test = df_test[df_test['raceId'] == i].drop(columns = ['results_points'])
        Y_test = df_test[df_test['raceId'] == i]['results_points'].to_numpy()

        predicted_scores = model.predict(X_test.drop(columns = ['results_positionOrder']))
        predicted_podium = np.argsort(predicted_scores)[::-1]
        actual_podium = X_test['results_positionOrder'].to_numpy()
        actual_podium = np.argsort(actual_podium)
        # print(predicted_scores)

        predictions = np.zeros(len(Y_test))
        # predictions[predicted_podium] = 1
        actual = np.zeros(len(Y_test))
        # actual[actual_podium] = 1

        for i in range(3):
            predictions[predicted_podium[i]] = 1
            actual[actual_podium[i]] = 1

        precision += precision_score(actual, predictions)
        accuracy += accuracy_score(actual, predictions)
        recall += recall_score(actual, predictions)
        f1 += f1_score(actual, predictions)   
        # print(confusion_matrix(actual, predictions))

    return precision/len(df_test['raceId'].unique()), accuracy/len(df_test['raceId'].unique()), recall/len(df_test['raceId'].unique()), f1/len(df_test['raceId'].unique())

### Random Forest regressor

In [32]:
from sklearn.ensemble import RandomForestRegressor
def RFregressor(X_train, Y_train):
    model = RandomForestRegressor()
    model.fit(X_train, Y_train)

    return model

In [33]:
from sklearn.metrics import precision_score

def get_RF_Regression_Score():
    # X_train = df_train.drop(columns = ['results_points'])
    # Y_train = df_train['results_points']
    # model = RFregressor(X_train, Y_train)

    # precision = 0
    # accuracy = 0
    # recall = 0
    # f1 = 0
    # for i in df_test['raceId'].unique():
    #     X_test = df_test[df_test['raceId'] == i].drop(columns = ['results_points'])
    #     Y_test = df_test[df_test['raceId'] == i]['results_points'].to_numpy()

    #     predicted_scores = model.predict(X_test)
    #     predicted_podium = np.argsort(predicted_scores)[::-1]
    #     actual_podium = np.argsort(Y_test)[::-1]
    #     # print(predicted_scores)

    #     predictions = np.zeros(len(Y_test))
    #     # predictions[predicted_podium] = 1
    #     actual = np.zeros(len(Y_test))
    #     # actual[actual_podium] = 1

    #     for i in range(len(Y_test)):
    #         predictions[predicted_podium[i]] = i + 1
    #         actual[actual_podium[i]] = i + 1

    #     precision += precision_score(actual, predictions, average = 'micro')
    #     accuracy += accuracy_score(actual, predictions)
    #     recall += recall_score(actual, predictions, average = 'micro')
    #     f1 += f1_score(actual, predictions, average = 'micro')   
    X_train = df_train.drop(columns = ['results_points'])
    Y_train = df_train['results_points']
    model = RFregressor(X_train.drop(columns = ['results_positionOrder']), Y_train)

    precision = 0
    accuracy = 0
    recall = 0
    f1 = 0
    for i in df_test['raceId'].unique():
        X_test = df_test[df_test['raceId'] == i].drop(columns = ['results_points'])
        Y_test = df_test[df_test['raceId'] == i]['results_points'].to_numpy()

        predicted_scores = model.predict(X_test.drop(columns = ['results_positionOrder']))
        predicted_podium = np.argsort(predicted_scores)[::-1]
        actual_podium = X_test['results_positionOrder'].to_numpy()
        actual_podium = np.argsort(actual_podium)
        # print(predicted_scores)

        predictions = np.zeros(len(Y_test))
        # predictions[predicted_podium] = 1
        actual = np.zeros(len(Y_test))
        # actual[actual_podium] = 1

        for i in range(3):
            predictions[predicted_podium[i]] = 1
            actual[actual_podium[i]] = 1

        precision += precision_score(actual, predictions)
        accuracy += accuracy_score(actual, predictions)
        recall += recall_score(actual, predictions)
        f1 += f1_score(actual, predictions)   

    return precision/len(df_test['raceId'].unique()), accuracy/len(df_test['raceId'].unique()), recall/len(df_test['raceId'].unique()), f1/len(df_test['raceId'].unique())

## Compiling all regressions

In [34]:
regression_scores = {}
regression_scores['Linear Regression'] = get_Linear_Regression_Score()
regression_scores['DT Regression'] = get_DT_Regression_Score()
regression_scores['RF Regression'] = get_RF_Regression_Score()

In [35]:
pd.DataFrame(regression_scores, index = ['Precision', 'Accuracy', 'Recall', 'F1'])

Unnamed: 0,Linear Regression,DT Regression,RF Regression
Precision,0.563333,0.466667,0.613333
Accuracy,0.887122,0.862207,0.900881
Recall,0.563333,0.466667,0.613333
F1,0.563333,0.466667,0.613333


## Classification

In [58]:
def get_score(pred, actual, margin):
    score = 0
    for i in range(len(pred)):
        if pred[i] == actual[i]:
            score += 1
        elif abs(pred[i] - actual[i]) <= margin:
            score += 1
    return score/len(pred)

### Logistic Regression

In [59]:
from sklearn.linear_model import LogisticRegression

def logisticRegression(X_train, Y_train):
    model = LogisticRegression(max_iter=2500, multi_class = 'ovr')
    model.fit(X_train, Y_train)

    return model

In [60]:
from sklearn.metrics import precision_score

def get_Logistic_Regression_Score():
    X_train = df_train
    Y_train = df_train['results_positionOrder']
    model = logisticRegression(X_train.drop(columns = ['results_positionOrder']), Y_train)

    score = 0
    for i in df_test['raceId'].unique():
        X_test = df_test[df_test['raceId'] == i]
        Y_test = df_test[df_test['raceId'] == i]['results_positionOrder']

        # prediction_df = pd.DataFrame(model.predict_proba(X_test), columns = ['results0', 'results1'])        
        # prediction_df['actual_position'] = Y_test.reset_index(drop = True)
        # prediction_df['actual_podium'] = prediction_df.actual_position.map(lambda x: 1 if x in [1,2,3] else 0)
        # prediction_df.sort_values('results1', ascending = False, inplace = True)
        # prediction_df.reset_index(inplace = True, drop = True)                
        # prediction_df['predicted_podium'] = prediction_df.index.map(lambda x: 1 if x == 0 else 0)            
        # # print(prediction_df)            
        # precision += precision_score(prediction_df.actual_podium, prediction_df.predicted_podium)
        # accuracy += accuracy_score(prediction_df.actual_podium, prediction_df.predicted_podium)
        # recall += recall_score(prediction_df.actual_podium, prediction_df.predicted_podium)
        # f1 += f1_score(prediction_df.actual_podium, prediction_df.predicted_podium)

        prediction = model.predict(X_test.drop(columns = ['results_positionOrder']))
        actual = Y_test.to_numpy()
        score += get_score(prediction, actual, 3)

    # return precision/len(df_test['raceId'].unique()), accuracy/len(df_test['raceId'].unique()), recall/len(df_test['raceId'].unique()), f1/len(df_test['raceId'].unique())
    return score/len(df_test['raceId'].unique())

### Naive Bayes Classifier

In [61]:
from sklearn.naive_bayes import GaussianNB

def gaussianNB(X_train, Y_train):
    model = GaussianNB()
    model.fit(X_train, Y_train)

    return model

In [62]:
from sklearn.metrics import precision_score

def get_Gaussian_NB_Score():
    # X_train = df_train.drop(columns = ['results_positionOrder'])
    # Y_train = df_train['results_positionOrder'].to_frame()    
    # Y_train['results_positionOrder'] = Y_train['results_positionOrder'].map(lambda x: 1 if x in [1,2,3] else 0)    
    # model = gaussianNB(X_train, Y_train.values.ravel())

    # precision = 0
    # accuracy = 0
    # recall = 0
    # f1 = 0
    # for i in df_test['raceId'].unique():
    #     X_test = df_test[df_test['raceId'] == i].drop(columns = ['results_positionOrder'])
    #     Y_test = df_test[df_test['raceId'] == i]['results_positionOrder']

    #     prediction_df = pd.DataFrame(model.predict_proba(X_test), columns = ['results0', 'results1'])        
    #     prediction_df['actual_position'] = Y_test.reset_index(drop = True)
    #     prediction_df['actual_podium'] = prediction_df.actual_position.map(lambda x: 1 if x in [1,2,3] else 0)
    #     prediction_df.sort_values('results1', ascending = False, inplace = True)
    #     prediction_df.reset_index(inplace = True, drop = True)                
    #     prediction_df['predicted_podium'] = prediction_df.index.map(lambda x: 1 if x == 0 else 0)            
    #     # print(prediction_df)            
    #     precision += precision_score(prediction_df.actual_podium, prediction_df.predicted_podium)
    #     accuracy += accuracy_score(prediction_df.actual_podium, prediction_df.predicted_podium)
    #     recall += recall_score(prediction_df.actual_podium, prediction_df.predicted_podium)
    #     f1 += f1_score(prediction_df.actual_podium, prediction_df.predicted_podium)

    # return precision/len(df_test['raceId'].unique()), accuracy/len(df_test['raceId'].unique()), recall/len(df_test['raceId'].unique()), f1/len(df_test['raceId'].unique())
    X_train = df_train
    Y_train = df_train['results_positionOrder']
    model = gaussianNB(X_train.drop(columns = ['results_positionOrder']), Y_train)

    score = 0
    for i in df_test['raceId'].unique():
        X_test = df_test[df_test['raceId'] == i]
        Y_test = df_test[df_test['raceId'] == i]['results_positionOrder']

        # prediction_df = pd.DataFrame(model.predict_proba(X_test), columns = ['results0', 'results1'])        
        # prediction_df['actual_position'] = Y_test.reset_index(drop = True)
        # prediction_df['actual_podium'] = prediction_df.actual_position.map(lambda x: 1 if x in [1,2,3] else 0)
        # prediction_df.sort_values('results1', ascending = False, inplace = True)
        # prediction_df.reset_index(inplace = True, drop = True)                
        # prediction_df['predicted_podium'] = prediction_df.index.map(lambda x: 1 if x == 0 else 0)            
        # # print(prediction_df)            
        # precision += precision_score(prediction_df.actual_podium, prediction_df.predicted_podium)
        # accuracy += accuracy_score(prediction_df.actual_podium, prediction_df.predicted_podium)
        # recall += recall_score(prediction_df.actual_podium, prediction_df.predicted_podium)
        # f1 += f1_score(prediction_df.actual_podium, prediction_df.predicted_podium)

        prediction = model.predict(X_test.drop(columns = ['results_positionOrder']))
        actual = Y_test.to_numpy()
        score += get_score(prediction, actual, 3)

    # return precision/len(df_test['raceId'].unique()), accuracy/len(df_test['raceId'].unique()), recall/len(df_test['raceId'].unique()), f1/len(df_test['raceId'].unique())
    return score/len(df_test['raceId'].unique())

### Decision Tree Classifier

In [63]:
from sklearn.tree import DecisionTreeClassifier

def DT_classifier(X_train, Y_train):
    model = DecisionTreeClassifier()
    model.fit(X_train, Y_train)

    return model

In [64]:
from sklearn.metrics import precision_score

def get_DT_classifier_Score():
    # X_train = df_train.drop(columns = ['results_positionOrder'])
    # Y_train = df_train['results_positionOrder'].to_frame()    
    # Y_train['results_positionOrder'] = Y_train['results_positionOrder'].map(lambda x: 1 if x in [1,2,3] else 0)    
    # model = DT_classifier(X_train, Y_train.values.ravel())

    # precision = 0
    # accuracy = 0
    # recall = 0
    # f1 = 0
    # for i in df_test['raceId'].unique():
    #     X_test = df_test[df_test['raceId'] == i].drop(columns = ['results_positionOrder'])
    #     Y_test = df_test[df_test['raceId'] == i]['results_positionOrder']

    #     prediction_df = pd.DataFrame(model.predict_proba(X_test), columns = ['results0', 'results1'])        
    #     prediction_df['actual_position'] = Y_test.reset_index(drop = True)
    #     prediction_df['actual_podium'] = prediction_df.actual_position.map(lambda x: 1 if x in [1,2,3] else 0)
    #     prediction_df.sort_values('results1', ascending = False, inplace = True)
    #     prediction_df.reset_index(inplace = True, drop = True)                
    #     prediction_df['predicted_podium'] = prediction_df.index.map(lambda x: 1 if x == 0 else 0)            
    #     # print(prediction_df)            
    #     precision += precision_score(prediction_df.actual_podium, prediction_df.predicted_podium)
    #     accuracy += accuracy_score(prediction_df.actual_podium, prediction_df.predicted_podium)
    #     recall += recall_score(prediction_df.actual_podium, prediction_df.predicted_podium)
    #     f1 += f1_score(prediction_df.actual_podium, prediction_df.predicted_podium)

    # return precision/len(df_test['raceId'].unique()), accuracy/len(df_test['raceId'].unique()), recall/len(df_test['raceId'].unique()), f1/len(df_test['raceId'].unique())
    X_train = df_train
    Y_train = df_train['results_positionOrder']
    model = DT_classifier(X_train.drop(columns = ['results_positionOrder']), Y_train)

    score = 0
    for i in df_test['raceId'].unique():
        X_test = df_test[df_test['raceId'] == i]
        Y_test = df_test[df_test['raceId'] == i]['results_positionOrder']

        # prediction_df = pd.DataFrame(model.predict_proba(X_test), columns = ['results0', 'results1'])        
        # prediction_df['actual_position'] = Y_test.reset_index(drop = True)
        # prediction_df['actual_podium'] = prediction_df.actual_position.map(lambda x: 1 if x in [1,2,3] else 0)
        # prediction_df.sort_values('results1', ascending = False, inplace = True)
        # prediction_df.reset_index(inplace = True, drop = True)                
        # prediction_df['predicted_podium'] = prediction_df.index.map(lambda x: 1 if x == 0 else 0)            
        # # print(prediction_df)            
        # precision += precision_score(prediction_df.actual_podium, prediction_df.predicted_podium)
        # accuracy += accuracy_score(prediction_df.actual_podium, prediction_df.predicted_podium)
        # recall += recall_score(prediction_df.actual_podium, prediction_df.predicted_podium)
        # f1 += f1_score(prediction_df.actual_podium, prediction_df.predicted_podium)

        prediction = model.predict(X_test.drop(columns = ['results_positionOrder']))
        actual = Y_test.to_numpy()
        score += get_score(prediction, actual, 3)

    # return precision/len(df_test['raceId'].unique()), accuracy/len(df_test['raceId'].unique()), recall/len(df_test['raceId'].unique()), f1/len(df_test['raceId'].unique())
    return score/len(df_test['raceId'].unique())

### Random Forest Classifier

In [65]:
from sklearn.ensemble import RandomForestClassifier

def RF_classifier(X_train, Y_train):
    model = RandomForestClassifier()
    model.fit(X_train, Y_train)

    return model

In [66]:
from sklearn.metrics import precision_score

def get_RF_classifier_Score():
    # X_train = df_train.drop(columns = ['results_positionOrder'])
    # Y_train = df_train['results_positionOrder'].to_frame()    
    # Y_train['results_positionOrder'] = Y_train['results_positionOrder'].map(lambda x: 1 if x in [1,2,3] else 0)    
    # model = RF_classifier(X_train, Y_train.values.ravel())

    # precision = 0
    # accuracy = 0
    # recall = 0
    # f1 = 0
    # for i in df_test['raceId'].unique():
    #     X_test = df_test[df_test['raceId'] == i].drop(columns = ['results_positionOrder'])
    #     Y_test = df_test[df_test['raceId'] == i]['results_positionOrder']

    #     prediction_df = pd.DataFrame(model.predict_proba(X_test), columns = ['results0', 'results1'])        
    #     prediction_df['actual_position'] = Y_test.reset_index(drop = True)
    #     prediction_df['actual_podium'] = prediction_df.actual_position.map(lambda x: 1 if x in [1,2,3] else 0)
    #     prediction_df.sort_values('results1', ascending = False, inplace = True)
    #     prediction_df.reset_index(inplace = True, drop = True)                
    #     prediction_df['predicted_podium'] = prediction_df.index.map(lambda x: 1 if x == 0 else 0)            
    #     # print(prediction_df)            
    #     precision += precision_score(prediction_df.actual_podium, prediction_df.predicted_podium)
    #     accuracy += accuracy_score(prediction_df.actual_podium, prediction_df.predicted_podium)
    #     recall += recall_score(prediction_df.actual_podium, prediction_df.predicted_podium)
    #     f1 += f1_score(prediction_df.actual_podium, prediction_df.predicted_podium)

    # return precision/len(df_test['raceId'].unique()), accuracy/len(df_test['raceId'].unique()), recall/len(df_test['raceId'].unique()), f1/len(df_test['raceId'].unique())
    X_train = df_train
    Y_train = df_train['results_positionOrder']
    model = RF_classifier(X_train.drop(columns = ['results_positionOrder']), Y_train)

    score = 0
    for i in df_test['raceId'].unique():
        X_test = df_test[df_test['raceId'] == i]
        Y_test = df_test[df_test['raceId'] == i]['results_positionOrder']

        # prediction_df = pd.DataFrame(model.predict_proba(X_test), columns = ['results0', 'results1'])        
        # prediction_df['actual_position'] = Y_test.reset_index(drop = True)
        # prediction_df['actual_podium'] = prediction_df.actual_position.map(lambda x: 1 if x in [1,2,3] else 0)
        # prediction_df.sort_values('results1', ascending = False, inplace = True)
        # prediction_df.reset_index(inplace = True, drop = True)                
        # prediction_df['predicted_podium'] = prediction_df.index.map(lambda x: 1 if x == 0 else 0)            
        # # print(prediction_df)            
        # precision += precision_score(prediction_df.actual_podium, prediction_df.predicted_podium)
        # accuracy += accuracy_score(prediction_df.actual_podium, prediction_df.predicted_podium)
        # recall += recall_score(prediction_df.actual_podium, prediction_df.predicted_podium)
        # f1 += f1_score(prediction_df.actual_podium, prediction_df.predicted_podium)

        prediction = model.predict(X_test.drop(columns = ['results_positionOrder']))
        actual = Y_test.to_numpy()
        score += get_score(prediction, actual, 3)

    # return precision/len(df_test['raceId'].unique()), accuracy/len(df_test['raceId'].unique()), recall/len(df_test['raceId'].unique()), f1/len(df_test['raceId'].unique())
    return score/len(df_test['raceId'].unique())

## Compiling all classifications

In [67]:
classification_scores = {}
classification_scores['Logistic Regression'] = get_Logistic_Regression_Score()
classification_scores['NB classification'] = get_Gaussian_NB_Score()
classification_scores['DT classification'] = get_DT_classifier_Score()
classification_scores['RF classification'] = get_RF_classifier_Score()

In [68]:
pd.DataFrame(classification_scores, index = ['score'])

Unnamed: 0,Logistic Regression,NB classification,DT classification,RF classification
score,0.651798,0.646984,0.674035,0.693992
