In [463]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC, SVR

In [464]:
df_train = pd.read_csv('./data_new/train.csv').drop_duplicates()
df_test = pd.read_csv('./data_new/test.csv').drop_duplicates()

In [465]:
df_train.columns

Index(['raceId', 'year', 'circuitId', 'weather_warm', 'weather_cold',
       'weather_dry', 'weather_wet', 'weather_cloudy', 'driverId',
       'constructorId', 'grid', 'results_positionOrder', 'milliseconds',
       'statusId', 'circuit_country', 'constructor_points',
       'constructor_position', 'constructor_wins', 'constructor_nationality',
       'driver_nationality', 'driver_points', 'driver_wins', 'driver_age',
       'results_points'],
      dtype='object')

In [466]:
df_train['results_positionOrder'].value_counts()

20    4976
4      980
3      978
11     978
8      977
5      977
7      977
6      977
9      976
2      976
12     975
10     975
1      972
13     971
14     967
15     965
16     953
17     946
18     932
19     910
Name: results_positionOrder, dtype: int64

## Regression

### Linear Regression

In [467]:
def linearRegression(X_train, Y_train):
    model = LinearRegression()
    model.fit(X_train, Y_train)        

    return model

In [468]:
def get_Linear_Regression_Score():
    X_train = df_train.drop(columns = ['results_points'])
    Y_train = df_train['results_points']
    model = linearRegression(X_train.drop(columns = ['results_positionOrder']), Y_train)

    precision = 0
    accuracy = 0
    recall = 0
    f1 = 0
    for i in df_test['raceId'].unique():
        X_test = df_test[df_test['raceId'] == i].drop(columns = ['results_points'])
        Y_test = df_test[df_test['raceId'] == i]['results_points'].to_numpy()

        predicted_scores = model.predict(X_test.drop(columns = ['results_positionOrder']))
        predicted_podium = np.argsort(predicted_scores)[::-1]
        actual_podium = X_test['results_positionOrder'].to_numpy()
        actual_podium = np.argsort(actual_podium)
        # print(predicted_scores)

        predictions = np.zeros(len(Y_test))
        # predictions[predicted_podium] = 1
        actual = np.zeros(len(Y_test))
        # actual[actual_podium] = 1

        for i in range(3):
            predictions[predicted_podium[i]] = 1
            actual[actual_podium[i]] = 1

        precision += precision_score(actual, predictions)
        accuracy += accuracy_score(actual, predictions)
        recall += recall_score(actual, predictions)
        f1 += f1_score(actual, predictions)   


    return precision/len(df_test['raceId'].unique()), accuracy/len(df_test['raceId'].unique()), recall/len(df_test['raceId'].unique()), f1/len(df_test['raceId'].unique())


### Decision Tree regressor

In [469]:
def DTregressor(X_train, Y_train):
    model = DecisionTreeRegressor()
    model.fit(X_train, Y_train)

    return model

In [470]:
from sklearn.metrics import precision_score

def get_DT_Regression_Score():
    X_train = df_train.drop(columns = ['results_points'])
    Y_train = df_train['results_points']
    model = DTregressor(X_train.drop(columns = ['results_positionOrder']), Y_train)

    precision = 0
    accuracy = 0
    recall = 0
    f1 = 0
    for i in df_test['raceId'].unique():
        X_test = df_test[df_test['raceId'] == i].drop(columns = ['results_points'])
        Y_test = df_test[df_test['raceId'] == i]['results_points'].to_numpy()

        predicted_scores = model.predict(X_test.drop(columns = ['results_positionOrder']))
        predicted_podium = np.argsort(predicted_scores)[::-1]
        actual_podium = X_test['results_positionOrder'].to_numpy()
        actual_podium = np.argsort(actual_podium)
        # print(predicted_scores)

        predictions = np.zeros(len(Y_test))
        # predictions[predicted_podium] = 1
        actual = np.zeros(len(Y_test))
        # actual[actual_podium] = 1

        for i in range(3):
            predictions[predicted_podium[i]] = 1
            actual[actual_podium[i]] = 1

        precision += precision_score(actual, predictions)
        accuracy += accuracy_score(actual, predictions)
        recall += recall_score(actual, predictions)
        f1 += f1_score(actual, predictions)   
        # print(confusion_matrix(actual, predictions))

    return precision/len(df_test['raceId'].unique()), accuracy/len(df_test['raceId'].unique()), recall/len(df_test['raceId'].unique()), f1/len(df_test['raceId'].unique())

### Random Forest regressor

In [471]:
def RFregressor(X_train, Y_train):
    model = RandomForestRegressor()
    model.fit(X_train, Y_train)

    return model

In [472]:
from sklearn.metrics import precision_score

def get_RF_Regression_Score():
    X_train = df_train.drop(columns = ['results_points'])
    Y_train = df_train['results_points']
    model = RFregressor(X_train.drop(columns = ['results_positionOrder']), Y_train)

    precision = 0
    accuracy = 0
    recall = 0
    f1 = 0
    for i in df_test['raceId'].unique():
        X_test = df_test[df_test['raceId'] == i].drop(columns = ['results_points'])
        Y_test = df_test[df_test['raceId'] == i]['results_points'].to_numpy()

        predicted_scores = model.predict(X_test.drop(columns = ['results_positionOrder']))
        predicted_podium = np.argsort(predicted_scores)[::-1]
        actual_podium = X_test['results_positionOrder'].to_numpy()
        actual_podium = np.argsort(actual_podium)
        # print(predicted_scores)

        predictions = np.zeros(len(Y_test))
        # predictions[predicted_podium] = 1
        actual = np.zeros(len(Y_test))
        # actual[actual_podium] = 1

        for i in range(3):
            predictions[predicted_podium[i]] = 1
            actual[actual_podium[i]] = 1

        precision += precision_score(actual, predictions)
        accuracy += accuracy_score(actual, predictions)
        recall += recall_score(actual, predictions)
        f1 += f1_score(actual, predictions)   

    return precision/len(df_test['raceId'].unique()), accuracy/len(df_test['raceId'].unique()), recall/len(df_test['raceId'].unique()), f1/len(df_test['raceId'].unique())

## Compiling all regressions

In [473]:
regression_scores = {}
regression_scores['Linear Regression'] = get_Linear_Regression_Score()
regression_scores['DT Regression'] = get_DT_Regression_Score()
regression_scores['RF Regression'] = get_RF_Regression_Score()

In [474]:
pd.DataFrame(regression_scores, index = ['Precision', 'Accuracy', 'Recall', 'F1'])

Unnamed: 0,Linear Regression,DT Regression,RF Regression
Precision,0.733333,0.656667,0.776667
Accuracy,0.929237,0.911477,0.941004
Recall,0.733333,0.656667,0.776667
F1,0.733333,0.656667,0.776667


## Classification

In [497]:
def get_score(pred, actual, margin):
    score = 0
    for i in range(len(pred)):
        if pred[i] == actual[i]:
            score += 1
        elif abs(pred[i] - actual[i]) <= margin:
            score += 1
    return score/len(pred)

### Logistic Regression

In [498]:
def logisticRegression(X_train, Y_train):
    model = LogisticRegression(max_iter=2500, multi_class = 'ovr')
    model.fit(X_train, Y_train)

    return model

In [499]:
def get_Logistic_Regression_Score():
    X_train = df_train
    Y_train = df_train['results_positionOrder']
    model = logisticRegression(X_train.drop(columns = ['results_positionOrder']), Y_train)

    score = 0
    for i in df_test['raceId'].unique():
        X_test = df_test[df_test['raceId'] == i]
        Y_test = df_test[df_test['raceId'] == i]['results_positionOrder']

        prediction = model.predict(X_test.drop(columns = ['results_positionOrder']))
        actual = Y_test.to_numpy()
        score += get_score(prediction, actual, 3)

    # return precision/len(df_test['raceId'].unique()), accuracy/len(df_test['raceId'].unique()), recall/len(df_test['raceId'].unique()), f1/len(df_test['raceId'].unique())
    return score/len(df_test['raceId'].unique())

### Naive Bayes Classifier

In [500]:
def gaussianNB(X_train, Y_train):
    model = GaussianNB()
    model.fit(X_train, Y_train)

    return model

In [501]:
def get_Gaussian_NB_Score():
    X_train = df_train
    Y_train = df_train['results_positionOrder']
    model = gaussianNB(X_train.drop(columns = ['results_positionOrder']), Y_train)

    score = 0
    for i in df_test['raceId'].unique():
        X_test = df_test[df_test['raceId'] == i]
        Y_test = df_test[df_test['raceId'] == i]['results_positionOrder']

        prediction = model.predict(X_test.drop(columns = ['results_positionOrder']))
        actual = Y_test.to_numpy()
        score += get_score(prediction, actual, 3)

    # return precision/len(df_test['raceId'].unique()), accuracy/len(df_test['raceId'].unique()), recall/len(df_test['raceId'].unique()), f1/len(df_test['raceId'].unique())
    return score/len(df_test['raceId'].unique())

### Decision Tree Classifier

In [502]:
def DT_classifier(X_train, Y_train):
    model = DecisionTreeClassifier()
    model.fit(X_train, Y_train)

    return model

In [503]:
def get_DT_classifier_Score():
    X_train = df_train
    Y_train = df_train['results_positionOrder']
    model = DT_classifier(X_train.drop(columns = ['results_positionOrder']), Y_train)

    score = 0
    for i in df_test['raceId'].unique():
        X_test = df_test[df_test['raceId'] == i]
        Y_test = df_test[df_test['raceId'] == i]['results_positionOrder']

        prediction = model.predict(X_test.drop(columns = ['results_positionOrder']))
        actual = Y_test.to_numpy()
        score += get_score(prediction, actual, 3)

    # return precision/len(df_test['raceId'].unique()), accuracy/len(df_test['raceId'].unique()), recall/len(df_test['raceId'].unique()), f1/len(df_test['raceId'].unique())
    return score/len(df_test['raceId'].unique())

### Random Forest Classifier

In [504]:
def RF_classifier(X_train, Y_train):
    model = RandomForestClassifier()
    model.fit(X_train, Y_train)

    return model

In [505]:
def get_RF_classifier_Score():
    X_train = df_train
    Y_train = df_train['results_positionOrder']
    model = RF_classifier(X_train.drop(columns = ['results_positionOrder']), Y_train)

    score = 0
    for i in df_test['raceId'].unique():
        X_test = df_test[df_test['raceId'] == i]
        Y_test = df_test[df_test['raceId'] == i]['results_positionOrder']

        prediction = model.predict(X_test.drop(columns = ['results_positionOrder']))
        actual = Y_test.to_numpy()
        score += get_score(prediction, actual, 3)

    # return precision/len(df_test['raceId'].unique()), accuracy/len(df_test['raceId'].unique()), recall/len(df_test['raceId'].unique()), f1/len(df_test['raceId'].unique())
    return score/len(df_test['raceId'].unique())

## Compiling all classifications

In [506]:
classification_scores = {}
classification_scores['Logistic Regression'] = get_Logistic_Regression_Score()
classification_scores['NB classification'] = get_Gaussian_NB_Score()
classification_scores['DT classification'] = get_DT_classifier_Score()
classification_scores['RF classification'] = get_RF_classifier_Score()

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [507]:
pd.DataFrame(classification_scores, index = ['score'])

Unnamed: 0,Logistic Regression,NB classification,DT classification,RF classification
score,0.692951,0.628397,0.727267,0.722302
