In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC, SVR
from sklearn.neural_network import MLPRegressor, MLPClassifier
from sklearn.ensemble import BaggingRegressor, BaggingClassifier, GradientBoostingClassifier, GradientBoostingRegressor

In [2]:
df_train = pd.read_csv('./data_new/train_pre.csv').drop_duplicates()
df_test = pd.read_csv('./data_new/test_pre.csv').drop_duplicates()

In [3]:
df_train.columns

Index(['raceId', 'year', 'circuitId', 'weather_warm', 'weather_cold',
       'weather_dry', 'weather_wet', 'weather_cloudy', 'driverId',
       'constructorId', 'grid', 'results_positionOrder', 'circuit_country',
       'constructor_wins', 'constructor_nationality', 'driver_nationality',
       'driver_wins', 'driver_age', 'results_points'],
      dtype='object')

In [4]:
df_train['results_positionOrder'].value_counts()

20    4976
4      980
3      978
11     978
8      977
5      977
7      977
6      977
9      976
2      976
12     975
10     975
1      972
13     971
14     967
15     965
16     953
17     946
18     932
19     910
Name: results_positionOrder, dtype: int64

## Regression

### Linear Regression

In [5]:
def linearRegression(X_train, Y_train):
    model = LinearRegression()
    model.fit(X_train, Y_train)        

    return model

In [6]:
def get_Linear_Regression_Score():
    X_train = df_train.drop(columns = ['results_points'])
    Y_train = df_train['results_points']
    model = linearRegression(X_train.drop(columns = ['results_positionOrder']), Y_train)

    precision = 0
    accuracy = 0
    recall = 0
    f1 = 0
    for i in df_test['raceId'].unique():
        X_test = df_test[df_test['raceId'] == i].drop(columns = ['results_points'])
        Y_test = df_test[df_test['raceId'] == i]['results_points'].to_numpy()

        predicted_scores = model.predict(X_test.drop(columns = ['results_positionOrder']))
        predicted_podium = np.argsort(predicted_scores)[::-1]
        actual_podium = X_test['results_positionOrder'].to_numpy()
        actual_podium = np.argsort(actual_podium)
        # print(predicted_scores)

        predictions = np.zeros(len(Y_test))
        # predictions[predicted_podium] = 1
        actual = np.zeros(len(Y_test))
        # actual[actual_podium] = 1

        for i in range(3):
            predictions[predicted_podium[i]] = 1
            actual[actual_podium[i]] = 1

        precision += precision_score(actual, predictions)
        accuracy += accuracy_score(actual, predictions)
        recall += recall_score(actual, predictions)
        f1 += f1_score(actual, predictions)   


    return precision/len(df_test['raceId'].unique()), accuracy/len(df_test['raceId'].unique()), recall/len(df_test['raceId'].unique()), f1/len(df_test['raceId'].unique())


### Decision Tree regressor

In [7]:
def DTregressor(X_train, Y_train):
    model = DecisionTreeRegressor()
    model.fit(X_train, Y_train)

    return model

In [8]:
def get_DT_Regression_Score():
    X_train = df_train.drop(columns = ['results_points'])
    Y_train = df_train['results_points']
    model = DTregressor(X_train.drop(columns = ['results_positionOrder']), Y_train)

    precision = 0
    accuracy = 0
    recall = 0
    f1 = 0
    for i in df_test['raceId'].unique():
        X_test = df_test[df_test['raceId'] == i].drop(columns = ['results_points'])
        Y_test = df_test[df_test['raceId'] == i]['results_points'].to_numpy()

        predicted_scores = model.predict(X_test.drop(columns = ['results_positionOrder']))
        predicted_podium = np.argsort(predicted_scores)[::-1]
        actual_podium = X_test['results_positionOrder'].to_numpy()
        actual_podium = np.argsort(actual_podium)
        # print(predicted_scores)

        predictions = np.zeros(len(Y_test))
        # predictions[predicted_podium] = 1
        actual = np.zeros(len(Y_test))
        # actual[actual_podium] = 1

        for i in range(3):
            predictions[predicted_podium[i]] = 1
            actual[actual_podium[i]] = 1

        precision += precision_score(actual, predictions)
        accuracy += accuracy_score(actual, predictions)
        recall += recall_score(actual, predictions)
        f1 += f1_score(actual, predictions)   
        # print(confusion_matrix(actual, predictions))

    return precision/len(df_test['raceId'].unique()), accuracy/len(df_test['raceId'].unique()), recall/len(df_test['raceId'].unique()), f1/len(df_test['raceId'].unique())

### Random Forest regressor

In [9]:
def RFregressor(X_train, Y_train):
    model = RandomForestRegressor()
    model.fit(X_train, Y_train)

    return model

In [10]:
def get_RF_Regression_Score():
    X_train = df_train.drop(columns = ['results_points'])
    Y_train = df_train['results_points']
    model = RFregressor(X_train.drop(columns = ['results_positionOrder']), Y_train)

    precision = 0
    accuracy = 0
    recall = 0
    f1 = 0
    for i in df_test['raceId'].unique():
        X_test = df_test[df_test['raceId'] == i].drop(columns = ['results_points'])
        Y_test = df_test[df_test['raceId'] == i]['results_points'].to_numpy()

        predicted_scores = model.predict(X_test.drop(columns = ['results_positionOrder']))
        predicted_podium = np.argsort(predicted_scores)[::-1]
        actual_podium = X_test['results_positionOrder'].to_numpy()
        actual_podium = np.argsort(actual_podium)
        # print(predicted_scores)

        predictions = np.zeros(len(Y_test))
        # predictions[predicted_podium] = 1
        actual = np.zeros(len(Y_test))
        # actual[actual_podium] = 1

        for i in range(3):
            predictions[predicted_podium[i]] = 1
            actual[actual_podium[i]] = 1

        precision += precision_score(actual, predictions)
        accuracy += accuracy_score(actual, predictions)
        recall += recall_score(actual, predictions)
        f1 += f1_score(actual, predictions)   

    return precision/len(df_test['raceId'].unique()), accuracy/len(df_test['raceId'].unique()), recall/len(df_test['raceId'].unique()), f1/len(df_test['raceId'].unique())

### Support Vector regressor

In [11]:
def SVregressor(X_train, Y_train):
    model = SVR()
    model.fit(X_train, Y_train)

    return model

In [12]:
def get_SV_Regression_Score():
    X_train = df_train.drop(columns = ['results_points'])
    Y_train = df_train['results_points']
    model = SVregressor(X_train.drop(columns = ['results_positionOrder']), Y_train)

    precision = 0
    accuracy = 0
    recall = 0
    f1 = 0
    for i in df_test['raceId'].unique():
        X_test = df_test[df_test['raceId'] == i].drop(columns = ['results_points'])
        Y_test = df_test[df_test['raceId'] == i]['results_points'].to_numpy()

        predicted_scores = model.predict(X_test.drop(columns = ['results_positionOrder']))
        predicted_podium = np.argsort(predicted_scores)[::-1]
        actual_podium = X_test['results_positionOrder'].to_numpy()
        actual_podium = np.argsort(actual_podium)
        # print(predicted_scores)

        predictions = np.zeros(len(Y_test))
        # predictions[predicted_podium] = 1
        actual = np.zeros(len(Y_test))
        # actual[actual_podium] = 1

        for i in range(3):
            predictions[predicted_podium[i]] = 1
            actual[actual_podium[i]] = 1

        precision += precision_score(actual, predictions)
        accuracy += accuracy_score(actual, predictions)
        recall += recall_score(actual, predictions)
        f1 += f1_score(actual, predictions)   

    return precision/len(df_test['raceId'].unique()), accuracy/len(df_test['raceId'].unique()), recall/len(df_test['raceId'].unique()), f1/len(df_test['raceId'].unique())

### Multilayer Perceptron regressor

In [13]:
def MLPregressor(X_train, Y_train):
    model = MLPRegressor()
    model.fit(X_train, Y_train)

    return model

In [14]:
def get_MLP_Regression_Score():
    X_train = df_train.drop(columns = ['results_points'])
    Y_train = df_train['results_points']
    model = MLPregressor(X_train.drop(columns = ['results_positionOrder']), Y_train)

    precision = 0
    accuracy = 0
    recall = 0
    f1 = 0
    for i in df_test['raceId'].unique():
        X_test = df_test[df_test['raceId'] == i].drop(columns = ['results_points'])
        Y_test = df_test[df_test['raceId'] == i]['results_points'].to_numpy()

        predicted_scores = model.predict(X_test.drop(columns = ['results_positionOrder']))
        predicted_podium = np.argsort(predicted_scores)[::-1]
        actual_podium = X_test['results_positionOrder'].to_numpy()
        actual_podium = np.argsort(actual_podium)
        # print(predicted_scores)

        predictions = np.zeros(len(Y_test))
        # predictions[predicted_podium] = 1
        actual = np.zeros(len(Y_test))
        # actual[actual_podium] = 1

        for i in range(3):
            predictions[predicted_podium[i]] = 1
            actual[actual_podium[i]] = 1

        precision += precision_score(actual, predictions)
        accuracy += accuracy_score(actual, predictions)
        recall += recall_score(actual, predictions)
        f1 += f1_score(actual, predictions)   

    return precision/len(df_test['raceId'].unique()), accuracy/len(df_test['raceId'].unique()), recall/len(df_test['raceId'].unique()), f1/len(df_test['raceId'].unique())

### Bagging regressor

In [15]:
def Baggingregressor(X_train, Y_train):
    model = BaggingRegressor()
    model.fit(X_train, Y_train)

    return model

In [16]:
def get_Bagging_Regression_Score():
    X_train = df_train.drop(columns = ['results_points'])
    Y_train = df_train['results_points']
    model = Baggingregressor(X_train.drop(columns = ['results_positionOrder']), Y_train)

    precision = 0
    accuracy = 0
    recall = 0
    f1 = 0
    for i in df_test['raceId'].unique():
        X_test = df_test[df_test['raceId'] == i].drop(columns = ['results_points'])
        Y_test = df_test[df_test['raceId'] == i]['results_points'].to_numpy()

        predicted_scores = model.predict(X_test.drop(columns = ['results_positionOrder']))
        predicted_podium = np.argsort(predicted_scores)[::-1]
        actual_podium = X_test['results_positionOrder'].to_numpy()
        actual_podium = np.argsort(actual_podium)
        # print(predicted_scores)

        predictions = np.zeros(len(Y_test))
        # predictions[predicted_podium] = 1
        actual = np.zeros(len(Y_test))
        # actual[actual_podium] = 1

        for i in range(3):
            predictions[predicted_podium[i]] = 1
            actual[actual_podium[i]] = 1

        precision += precision_score(actual, predictions)
        accuracy += accuracy_score(actual, predictions)
        recall += recall_score(actual, predictions)
        f1 += f1_score(actual, predictions)   

    return precision/len(df_test['raceId'].unique()), accuracy/len(df_test['raceId'].unique()), recall/len(df_test['raceId'].unique()), f1/len(df_test['raceId'].unique())

### Gradient Boosting regressor

In [17]:
def GradientBoostingregressor(X_train, Y_train):
    model = GradientBoostingRegressor()
    model.fit(X_train, Y_train)

    return model

In [18]:
def get_Gradient_Boosting_Regression_Score():
    X_train = df_train.drop(columns = ['results_points'])
    Y_train = df_train['results_points']
    model = GradientBoostingregressor(X_train.drop(columns = ['results_positionOrder']), Y_train)

    precision = 0
    accuracy = 0
    recall = 0
    f1 = 0
    for i in df_test['raceId'].unique():
        X_test = df_test[df_test['raceId'] == i].drop(columns = ['results_points'])
        Y_test = df_test[df_test['raceId'] == i]['results_points'].to_numpy()

        predicted_scores = model.predict(X_test.drop(columns = ['results_positionOrder']))
        predicted_podium = np.argsort(predicted_scores)[::-1]
        actual_podium = X_test['results_positionOrder'].to_numpy()
        actual_podium = np.argsort(actual_podium)
        # print(predicted_scores)

        predictions = np.zeros(len(Y_test))
        # predictions[predicted_podium] = 1
        actual = np.zeros(len(Y_test))
        # actual[actual_podium] = 1

        for i in range(3):
            predictions[predicted_podium[i]] = 1
            actual[actual_podium[i]] = 1

        precision += precision_score(actual, predictions)
        accuracy += accuracy_score(actual, predictions)
        recall += recall_score(actual, predictions)
        f1 += f1_score(actual, predictions)   

    return precision/len(df_test['raceId'].unique()), accuracy/len(df_test['raceId'].unique()), recall/len(df_test['raceId'].unique()), f1/len(df_test['raceId'].unique())

## Compiling all regressions

In [19]:
regression_scores = {}
regression_scores['Linear Regression'] = get_Linear_Regression_Score()
regression_scores['DT Regression'] = get_DT_Regression_Score()
regression_scores['RF Regression'] = get_RF_Regression_Score()
regression_scores['SV Regression'] = get_SV_Regression_Score()
regression_scores['MLP Regression'] = get_MLP_Regression_Score()
regression_scores['Bagging Regression'] = get_Bagging_Regression_Score()
regression_scores['Gradient Boosting Regression'] = get_Gradient_Boosting_Regression_Score()

In [20]:
pd.DataFrame(regression_scores, index = ['Precision', 'Accuracy', 'Recall', 'F1'])

Unnamed: 0,Linear Regression,DT Regression,RF Regression,SV Regression,MLP Regression,Bagging Regression,Gradient Boosting Regression
Precision,0.563333,0.466667,0.59,0.553333,0.58,0.56,0.64
Accuracy,0.887122,0.862655,0.894624,0.884472,0.89223,0.886713,0.909106
Recall,0.563333,0.466667,0.59,0.553333,0.58,0.56,0.64
F1,0.563333,0.466667,0.59,0.553333,0.58,0.56,0.64


## Classification

In [21]:
def get_score(pred, actual, margin):
    score = 0
    for i in range(len(pred)):
        if pred[i] == actual[i]:
            score += 1
        elif abs(pred[i] - actual[i]) <= margin:
            score += 1
    return score/len(pred)

### Logistic Regression

In [22]:
def logisticRegression(X_train, Y_train):
    model = LogisticRegression(max_iter=2500, multi_class = 'ovr')
    model.fit(X_train, Y_train)

    return model

In [23]:
def get_Logistic_Regression_Score():
    X_train = df_train
    Y_train = df_train['results_positionOrder']
    model = logisticRegression(X_train.drop(columns = ['results_positionOrder']), Y_train)

    score = 0
    for i in df_test['raceId'].unique():
        X_test = df_test[df_test['raceId'] == i]
        Y_test = df_test[df_test['raceId'] == i]['results_positionOrder']

        prediction = model.predict(X_test.drop(columns = ['results_positionOrder']))
        actual = Y_test.to_numpy()
        score += get_score(prediction, actual, 3)

    # return precision/len(df_test['raceId'].unique()), accuracy/len(df_test['raceId'].unique()), recall/len(df_test['raceId'].unique()), f1/len(df_test['raceId'].unique())
    return score/len(df_test['raceId'].unique())

### Naive Bayes Classifier

In [24]:
def gaussianNB(X_train, Y_train):
    model = GaussianNB()
    model.fit(X_train, Y_train)

    return model

In [25]:
def get_Gaussian_NB_Score():
    X_train = df_train
    Y_train = df_train['results_positionOrder']
    model = gaussianNB(X_train.drop(columns = ['results_positionOrder']), Y_train)

    score = 0
    for i in df_test['raceId'].unique():
        X_test = df_test[df_test['raceId'] == i]
        Y_test = df_test[df_test['raceId'] == i]['results_positionOrder']

        prediction = model.predict(X_test.drop(columns = ['results_positionOrder']))
        actual = Y_test.to_numpy()
        score += get_score(prediction, actual, 3)

    # return precision/len(df_test['raceId'].unique()), accuracy/len(df_test['raceId'].unique()), recall/len(df_test['raceId'].unique()), f1/len(df_test['raceId'].unique())
    return score/len(df_test['raceId'].unique())

### Decision Tree Classifier

In [26]:
def DT_classifier(X_train, Y_train):
    model = DecisionTreeClassifier()
    model.fit(X_train, Y_train)

    return model

In [27]:
def get_DT_classifier_Score():
    X_train = df_train
    Y_train = df_train['results_positionOrder']
    model = DT_classifier(X_train.drop(columns = ['results_positionOrder']), Y_train)

    score = 0
    for i in df_test['raceId'].unique():
        X_test = df_test[df_test['raceId'] == i]
        Y_test = df_test[df_test['raceId'] == i]['results_positionOrder']

        prediction = model.predict(X_test.drop(columns = ['results_positionOrder']))
        actual = Y_test.to_numpy()
        score += get_score(prediction, actual, 3)

    # return precision/len(df_test['raceId'].unique()), accuracy/len(df_test['raceId'].unique()), recall/len(df_test['raceId'].unique()), f1/len(df_test['raceId'].unique())
    return score/len(df_test['raceId'].unique())

### Random Forest Classifier

In [28]:
def RF_classifier(X_train, Y_train):
    model = RandomForestClassifier()
    model.fit(X_train, Y_train)

    return model

In [29]:
def get_RF_classifier_Score():
    X_train = df_train
    Y_train = df_train['results_positionOrder']
    model = RF_classifier(X_train.drop(columns = ['results_positionOrder']), Y_train)

    score = 0
    for i in df_test['raceId'].unique():
        X_test = df_test[df_test['raceId'] == i]
        Y_test = df_test[df_test['raceId'] == i]['results_positionOrder']

        prediction = model.predict(X_test.drop(columns = ['results_positionOrder']))
        actual = Y_test.to_numpy()
        score += get_score(prediction, actual, 3)

    # return precision/len(df_test['raceId'].unique()), accuracy/len(df_test['raceId'].unique()), recall/len(df_test['raceId'].unique()), f1/len(df_test['raceId'].unique())
    return score/len(df_test['raceId'].unique())

### Support Vector Classifier

In [30]:
def SV_classifier(X_train, Y_train):
    model = SVC()
    model.fit(X_train, Y_train)

    return model

In [31]:
def get_SV_classifier_Score():
    X_train = df_train
    Y_train = df_train['results_positionOrder']
    model = SV_classifier(X_train.drop(columns = ['results_positionOrder']), Y_train)

    score = 0
    for i in df_test['raceId'].unique():
        X_test = df_test[df_test['raceId'] == i]
        Y_test = df_test[df_test['raceId'] == i]['results_positionOrder']

        prediction = model.predict(X_test.drop(columns = ['results_positionOrder']))
        actual = Y_test.to_numpy()
        score += get_score(prediction, actual, 3)

    # return precision/len(df_test['raceId'].unique()), accuracy/len(df_test['raceId'].unique()), recall/len(df_test['raceId'].unique()), f1/len(df_test['raceId'].unique())
    return score/len(df_test['raceId'].unique())

### Multilayer Perceptron Classifier

In [32]:
def MLP_classifier(X_train, Y_train):
    model = MLPClassifier()
    model.fit(X_train, Y_train)

    return model

In [33]:
def get_MLP_classifier_Score():
    X_train = df_train
    Y_train = df_train['results_positionOrder']
    model = MLP_classifier(X_train.drop(columns = ['results_positionOrder']), Y_train)

    score = 0
    for i in df_test['raceId'].unique():
        X_test = df_test[df_test['raceId'] == i]
        Y_test = df_test[df_test['raceId'] == i]['results_positionOrder']

        prediction = model.predict(X_test.drop(columns = ['results_positionOrder']))
        actual = Y_test.to_numpy()
        score += get_score(prediction, actual, 3)

    # return precision/len(df_test['raceId'].unique()), accuracy/len(df_test['raceId'].unique()), recall/len(df_test['raceId'].unique()), f1/len(df_test['raceId'].unique())
    return score/len(df_test['raceId'].unique())

### Bagging Classifier

In [38]:
def Bagging_classifier(X_train, Y_train):
    model = BaggingClassifier()
    model.fit(X_train, Y_train)

    return model

In [39]:
def get_Bagging_classifier_Score():
    X_train = df_train
    Y_train = df_train['results_positionOrder']
    model = Bagging_classifier(X_train.drop(columns = ['results_positionOrder']), Y_train)

    score = 0
    for i in df_test['raceId'].unique():
        X_test = df_test[df_test['raceId'] == i]
        Y_test = df_test[df_test['raceId'] == i]['results_positionOrder']

        prediction = model.predict(X_test.drop(columns = ['results_positionOrder']))
        actual = Y_test.to_numpy()
        score += get_score(prediction, actual, 3)

    # return precision/len(df_test['raceId'].unique()), accuracy/len(df_test['raceId'].unique()), recall/len(df_test['raceId'].unique()), f1/len(df_test['raceId'].unique())
    return score/len(df_test['raceId'].unique())

### Gradient Boosting Classifier

In [40]:
def Gradient_Boosting_classifier(X_train, Y_train):
    model = GradientBoostingClassifier()
    model.fit(X_train, Y_train)

    return model

In [41]:
def get_Gradient_Boosting_classifier_Score():
    X_train = df_train
    Y_train = df_train['results_positionOrder']
    model = Gradient_Boosting_classifier(X_train.drop(columns = ['results_positionOrder']), Y_train)

    score = 0
    for i in df_test['raceId'].unique():
        X_test = df_test[df_test['raceId'] == i]
        Y_test = df_test[df_test['raceId'] == i]['results_positionOrder']

        prediction = model.predict(X_test.drop(columns = ['results_positionOrder']))
        actual = Y_test.to_numpy()
        score += get_score(prediction, actual, 3)

    # return precision/len(df_test['raceId'].unique()), accuracy/len(df_test['raceId'].unique()), recall/len(df_test['raceId'].unique()), f1/len(df_test['raceId'].unique())
    return score/len(df_test['raceId'].unique())

## Compiling all classifications

In [42]:
classification_scores = {}
classification_scores['Logistic Regression'] = get_Logistic_Regression_Score()
classification_scores['NB classification'] = get_Gaussian_NB_Score()
classification_scores['DT classification'] = get_DT_classifier_Score()
classification_scores['RF classification'] = get_RF_classifier_Score()
classification_scores['SV classification'] = get_SV_classifier_Score()
classification_scores['MLP classification'] = get_MLP_classifier_Score()
classification_scores['Bagging classification'] = get_Bagging_classifier_Score()
classification_scores['Gradient Boosting classification'] = get_Gradient_Boosting_classifier_Score()

In [43]:
pd.DataFrame(classification_scores, index = ['score'])

Unnamed: 0,Logistic Regression,NB classification,DT classification,RF classification,SV classification,MLP classification,Bagging classification,Gradient Boosting classification
score,0.653183,0.646984,0.671507,0.704199,0.291274,0.501531,0.678781,0.664715
