# Classifiers with Feature Engineering

In [62]:
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, VotingClassifier
from sklearn.model_selection import StratifiedKFold, GridSearchCV 
from sklearn.linear_model import LogisticRegression
from src.utils import preprocess, feature_engineering

from keras.models import Sequential
from keras.layers import SimpleRNN, Dropout, Dense, Flatten, Activation
from keras.utils import np_utils

## Loading data

In [2]:
# Open files
Data_X_train = pd.read_csv('data/challenge_fichier_dentrees_dentrainement_challenge_nba/train.csv')
Data_Y_train = pd.read_csv('data/challenge_fichier_de_sortie_dentrainement_challenge_nba.csv', sep=';')

## Preprocessing

In [3]:
Data_X_train = feature_engineering(Data_X_train)
Data_X_train.head()

100%|██████████████████████████████████████████████████████████████████████████████| 1439/1439 [00:54<00:00, 26.45it/s]
100%|██████████████████████████████████████████████████████████████████████████████| 1440/1440 [03:40<00:00,  6.54it/s]
100%|██████████████████████████████████████████████████████████████████████████████| 1440/1440 [02:47<00:00,  8.58it/s]


Unnamed: 0,ID,score_1,offensive rebound_1,defensive rebound_1,offensive foul_1,defensive foul_1,assist_1,lost ball_1,steals_1,bad pass_1,...,assist_1440,lost ball_1440,steals_1440,bad pass_1440,block_1440,miss_1440,two pts_1440,three pts_1440,fg_1440,total rebound_1440
0,14186,-2,0,0,0,0,0,0,0,0,...,-3,3,3,-2,1,9,-6,-1,-7,4
1,13013,0,0,-1,0,0,0,0,0,0,...,1,0,0,1,-2,-1,-7,3,-4,6
2,7102,0,0,0,0,0,0,0,1,1,...,0,5,5,-2,3,-5,1,1,2,-6
3,7637,-2,0,0,0,0,0,0,0,0,...,-1,-1,2,2,-1,-1,-2,0,-2,10
4,12350,0,0,0,0,0,0,0,0,0,...,4,3,2,1,1,3,-2,2,0,5


In [4]:
nb_games, col = Data_X_train.shape
nb_features = int((col-1)/1440)

**Aggregate by 10 seconds**

In [87]:
X = Data_X_train.as_matrix()[:,1:]
Y = Data_Y_train.as_matrix()[:,1:].reshape(nb_games,)

In [83]:
X = X.reshape((nb_games, nb_features, 10, -1), order = 'F')
X = X.mean(axis = 2)
X = X.reshape(nb_games, nb_features*144)

## Add Score at the end to postprocess
score_end = Data_X_train['score_1440'].as_matrix()
X_tot = np.zeros((X.shape[0],X.shape[1]+1))
X_tot[:,:-1] = X
X_tot[:,-1] = score_end

**Split train/val**

In [84]:
portion_train = 0.7
n_train = int(portion_train*nb_games)
n_val = nb_games - n_train
id_train = np.random.choice(nb_games, n_train, replace=False)

# Def train and validation data
X_train = X_tot[id_train,:]
Y_train = Y[id_train,:].reshape(n_train,)
X_val = np.delete(X_tot, id_train, axis = 0)
Y_val = np.delete(Y, id_train, axis = 0).reshape(n_val,)

In [69]:
# # Find score column
# Data_X_train['score_1440']
# a = Data_X_train.as_matrix()
# a[:,21586]

In [70]:
# If the score difference is greater than 8 points (20147 = score_1140)
def postprocess(X, Y) :
    Y[X[:,-1] > 9] = 1
    Y[X[:,-1] < -9] = 0
    return Y

In [71]:
def score_function(y_true, y_pred):
    score = 0
    length1 = y_true.shape[0]
    for i in range(length1):
        if y_pred[i] == y_true[i]:
            score += 1
    return float(score)/float(length1)

In [89]:
def evaluate_model(model, X, Y, skf) :
    accs_train = []
    accs_val = []
    accs_val_post = []
    for train, val in skf.split(X, Y) :
        model.fit(X[train], Y[train])
        acc_train = model.score(X[train], Y[train])
        acc_val = model.score(X[val], Y[val])
        accs_train.append(acc_train)
        accs_val.append(acc_val)
        Y_post_val = postprocess(X[val],model.predict(X[val]))
        acc_val_post = score_function(Y[val], Y_post_val)
        accs_val_post.append(acc_val_post)
        print('(Training, Validation, Validation with postprocesing) accuracies: ({:.2f},{:.2f},{:.2f})'
                          .format(100*acc_train, 100*acc_val, 100*acc_val_post))

    print('Mean Training Accuracy: {0:.2f} +/- {1:.2f}'.format(100*np.mean(accs_train), 100*np.std(accs_train)))
    print('Mean Validation Accuracy: {0:.2f} +/- {1:.2f}'.format(100*np.mean(accs_val), 100*np.std(accs_val)))  
    print('Mean Validation Accuracy with postprocessing: {0:.2f} +/- {1:.2f}'.format(100*np.mean(accs_val_post), 100*np.std(accs_val_post)))  

## XGBoost classifier

In [60]:
xgb = GradientBoostingClassifier(max_depth=10, n_estimators = 1000)
xgb.fit(X_train, Y_train)
print('Training accurary : {0:.2f}'.format(xgb.score(X_train, Y_train)*100))
print('Validation accurary : {0:.2f}'.format(xgb.score(X_val, Y_val)*100))

## Logistic Regression

In [61]:
LR = LogisticRegression(C=0.00001)
LR.fit(X_train, Y_train)
print('Training accurary : {0:.2f}'.format(LR.score(X_train, Y_train)))
print('Validation accurary : {0:.2f}'.format(LR.score(X_val, Y_val)))

## Random Forest

**Grid Search**

In [63]:
parameters = {'n_estimators': 200, 'max_depth': None, 'max_features': 15, 
               'min_samples_split': 15, 'min_samples_leaf': 2, 'bootstrap': True, 
               'oob_score': True, 'criterion': 'entropy'}

In [77]:
RandomForest = RandomForestClassifier(**parameters)
RandomForest.fit(X_train, Y_train)

print('Training accurary : {0:.2f}'.format(RandomForest.score(X_train, Y_train)*100))

print('Validation accurary : {0:.2f}'.format(RandomForest.score(X_val, Y_val)*100))
Y_val_post = postprocess(X_val, eclf.predict(X_val))
print('Validation accurary with post-processing : {0:.2f}'.format(score_function(Y_val, Y_val_post)*100))

Training accurary : 96.97
Validation accurary : 73.50
Validation accurary with post-processing : 74.24


**With K-folds**

In [85]:
skf = StratifiedKFold(n_splits = 5, shuffle = True, random_state = 7)

In [90]:
evaluate_model(RandomForest, X_tot, Y, skf)

(Training, Validation, Validation with postprocesing) accuracies: (96.92,75.52,75.40)
(Training, Validation, Validation with postprocesing) accuracies: (97.24,74.56,74.28)
(Training, Validation, Validation with postprocesing) accuracies: (96.97,73.52,73.16)
(Training, Validation, Validation with postprocesing) accuracies: (97.10,74.47,74.16)
(Training, Validation, Validation with postprocesing) accuracies: (96.86,71.92,71.80)
Mean Training Accuracy: 97.02 +/- 0.14
Mean Validation Accuracy: 74.00 +/- 1.22
Mean Validation Accuracy with postprocessing: 73.76 +/- 1.21


## Combination of those models

In [73]:
xgb = GradientBoostingClassifier(max_depth=10, n_estimators = 1000)
LogReg = LogisticRegression(C=0.00001)
RandomForest = RandomForestClassifier(**parameters)

eclf = VotingClassifier(estimators=[
                ('lr', LogReg), ('xgb', xgb), ('RandomForest', RandomForest)], voting='soft')
eclf.fit(X_train, Y_train)

VotingClassifier(estimators=[('lr', LogisticRegression(C=1e-05, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)), ('xgb', Grad...imators=200, n_jobs=1,
            oob_score=True, random_state=None, verbose=0, warm_start=False))],
         flatten_transform=None, n_jobs=1, voting='soft', weights=None)

In [78]:
print('Training accurary : {0:.2f}'.format(eclf.score(X_train, Y_train)*100))
print('Validation accurary : {0:.2f}'.format(eclf.score(X_val, Y_val)*100))

Training accurary : 99.70
Validation accurary : 74.56
