# Models Classifier :
**Classifier on 4 best models**

In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from src.utils import split_train_val, feature_engineering
from sklearn.externals import joblib

from keras.utils import np_utils
from keras.models import Sequential
from keras.layers import Dense, LSTM, Bidirectional, Dropout
from keras import losses
from keras import backend as K

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


## Loading data

In [2]:
# Open files
Data_X_train = pd.read_csv('data/challenge_fichier_dentrees_dentrainement_challenge_nba/train.csv')
Data_Y_train = pd.read_csv('data/challenge_fichier_de_sortie_dentrainement_challenge_nba.csv', sep=';')

## Preprocessing

In [3]:
Data_X_train = feature_engineering(Data_X_train, two_points = True)

In [4]:
nb_games, col = Data_X_train.shape
nb_features = int((col-1)/1440)

In [5]:
X = Data_X_train.as_matrix()[:,1:]
Y = Data_Y_train.as_matrix()[:,1:]

**Split train/val**

In [6]:
portion_train = 0.7
n_train = int(portion_train*nb_games)
n_val = nb_games - n_train
id_train = np.random.choice(nb_games, n_train, replace=False)

# Def train and validation data
X_train = X[id_train,:]
Y_train = Y[id_train,:].reshape(n_train,)
X_val = np.delete(X, id_train, axis = 0)
Y_val = np.delete(Y, id_train, axis = 0).reshape(n_val,)

In [7]:
nb_games_train = len(X_train)
nb_games_val = len(X_val)

In [18]:
del Data_X_train

# XGboost, Random Forest & Logistic Regression

**Aggregate by 10 seconds**

In [20]:
score_end = X_train[:,25903]

X_train_models = X_train.reshape((nb_games_train, nb_features, 10, -1), order = 'F')
X_train_models = X_train_models.mean(axis = 2)
X_train_models = X_train_models.reshape(nb_games_train, nb_features*144)

# Add final score
X_train_tot = np.zeros((X_train_models.shape[0],X_train_models.shape[1]+1))
X_train_tot[:,:-1] = X_train_models
X_train_tot[:,-1] = score_end

## Validation
score_end_val = X_val[:,25903]

X_val_models = X_val.reshape((nb_games_val, nb_features, 10, -1), order = 'F')
X_val_models = X_val_models.mean(axis = 2)
X_val_models = X_val_models.reshape(nb_games_val, nb_features*144)

# Add final score
X_val_tot = np.zeros((X_val_models.shape[0],X_val_models.shape[1]+1))
X_val_tot[:,:-1] = X_val_models
X_val_tot[:,-1] = score_end_val

In [21]:
del X_train_models, X_val_models

### Random Forest

In [22]:
parameters = {'n_estimators': 200, 'max_depth': None, 'max_features': 15, 
               'min_samples_split': 15, 'min_samples_leaf': 2, 'bootstrap': True, 
               'oob_score': True, 'criterion': 'entropy'}

In [25]:
RandomForest = RandomForestClassifier(**parameters)
RandomForest.fit(X_train_tot, Y_train)

acc_train, acc_val = RandomForest.score(X_train_tot, Y_train), RandomForest.score(X_val_tot, Y_val)
print('Train: {0:.2f} - Val: {1:.2f}'.format(100*acc_train, 100*acc_val))

Y_pred_RandomForest = RandomForest.predict(X_val_tot)

Train: 97.01 - Val: 73.65


### XGBoost Classifier

In [26]:
xgb = GradientBoostingClassifier(max_depth=10, n_estimators = 1000)
xgb.fit(X_train_tot, Y_train)

acc_train, acc_val = xgb.score(X_train_tot, Y_train), xgb.score(X_val_tot, Y_val)
print('Train: {0:.2f} - Val: {1:.2f}'.format(100*acc_train, 100*acc_val))

Y_pred_xgb = xgb.predict(X_val_tot)

Train: 100.00 - Val: 73.18


### Logistic Regression

In [34]:
LR = LogisticRegression(C=0.0001)
LR.fit(X_train_tot, Y_train)

acc_train, acc_val = LR.score(X_train_tot, Y_train), LR.score(X_val_tot, Y_val)
print('Train: {0:.2f} - Val: {1:.2f}'.format(100*acc_train, 100*acc_val))

Y_pred_LR = LR.predict(X_val_tot)

Train: 73.74 - Val: 71.59


# Bi-directional_LSTM

In [35]:
X_train_lstm = X_train.reshape((nb_games_train, nb_features, 15, -1), order = 'F')
X_train_lstm = X_train_lstm.mean(axis = 2)
Y_train_lstm = np_utils.to_categorical(Y_train, 2)

X_val_lstm = X_val.reshape((nb_games_val, nb_features, 15, -1), order = 'F')
X_val_lstm = X_val_lstm.mean(axis = 2)

In [36]:
bi_lstm_model = Sequential()
bi_lstm_model.add(Bidirectional(LSTM(175, recurrent_dropout = 0.25), 
                                input_shape = (18, 96)))
bi_lstm_model.add(Dropout(0.75))
bi_lstm_model.add(Dense(units = 150, 
                        activation = 'relu'))
bi_lstm_model.add(Dropout(0.5))
bi_lstm_model.add(Dense(units = 2, 
                        activation='softmax'))
bi_lstm_model.compile(loss = losses.categorical_crossentropy,
                      optimizer = 'adam',
                      metrics = ['accuracy'])

#Training
bi_lstm_model.fit(X_train_lstm, Y_train_lstm, 
                  epochs = 15, batch_size = 64, verbose = False)

extract_cnn_features = K.function([bi_lstm_model.layers[0].input, K.learning_phase()],
                                  [bi_lstm_model.layers[3].output])
X_train_features = extract_cnn_features([X_train_lstm, 0])[0]
X_val_features = extract_cnn_features([X_val_lstm, 0])[0]

parameters = {'n_estimators': 500, 'max_depth': 40, 'min_samples_leaf': 2}
clf = RandomForestClassifier(**parameters)
clf.fit(X_train_features, Y_train.ravel())

acc_train, acc_val = clf.score(X_train_features, Y_train), clf.score(X_val_features, Y_val)
print('Train: {0:.2f} - Val: {1:.2f}'.format(100*acc_train, 100*acc_val))

Y_pred_lstm = clf.predict(X_val_features)

Train: 99.59 - Val: 73.50


# Concatenation

In [37]:
from sklearn.model_selection import cross_validate

In [84]:
Y_preds = np.column_stack((Y_pred_RandomForest, Y_pred_LR, Y_pred_xgb, Y_pred_lstm))

C = [0.00001 , 0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1, 0.5, 1]
for c in C :
    LR = LogisticRegression(C=c)
    scores = cross_validate(LR, Y_preds, Y_val, cv = 5, return_train_score = True)
    print('C={0} - Train: {1:.2f} +/-{2:.2f} - Val: {3:.2f} +/-{4:.2f}'.format(c,
                                                                              100*np.mean(scores['train_score']),
                                                                              100*np.std(scores['train_score']),
                                                                              100*np.mean(scores['test_score']),
                                                                              100*np.std(scores['test_score'])))
print('Uniform Mean Score: {:.2f}'.format(100*np.mean(Y_preds.mean(axis=1).astype(int)==Y_val)))

C=1e-05 - Train: 71.93 +/-0.61 - Val: 71.93 +/-2.43
C=0.0001 - Train: 72.20 +/-0.40 - Val: 72.17 +/-2.81
C=0.0005 - Train: 73.71 +/-0.65 - Val: 73.71 +/-2.61
C=0.001 - Train: 73.71 +/-0.65 - Val: 73.71 +/-2.61
C=0.005 - Train: 73.83 +/-0.61 - Val: 73.79 +/-2.55
C=0.01 - Train: 73.87 +/-0.64 - Val: 73.87 +/-2.56
C=0.1 - Train: 73.87 +/-0.64 - Val: 73.87 +/-2.56
C=0.5 - Train: 73.87 +/-0.64 - Val: 73.87 +/-2.56
C=1 - Train: 73.87 +/-0.64 - Val: 73.87 +/-2.56
Uniform Mean Score: 72.33


In [85]:
Y_preds = np.column_stack((Y_pred_RandomForest, Y_pred_xgb, Y_pred_lstm))

C = [0.00001 , 0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1, 0.5, 1]
for c in C :
    LR = LogisticRegression(C=c)
    scores = cross_validate(LR, Y_preds, Y_val, cv = 5, return_train_score = True)
    print('C={0} - Train: {1:.2f} +/-{2:.2f} - Val: {3:.2f} +/-{4:.2f}'.format(c,
                                                                              100*np.mean(scores['train_score']),
                                                                              100*np.std(scores['train_score']),
                                                                              100*np.mean(scores['test_score']),
                                                                              100*np.std(scores['test_score'])))
print('Uniform Mean Score: {:.2f}'.format(100*np.mean(Y_preds.mean(axis=1).astype(int)==Y_val)))

C=1e-05 - Train: 73.26 +/-0.62 - Val: 73.26 +/-2.47
C=0.0001 - Train: 73.26 +/-0.62 - Val: 73.26 +/-2.47
C=0.0005 - Train: 73.65 +/-0.52 - Val: 73.63 +/-2.95
C=0.001 - Train: 73.87 +/-0.64 - Val: 73.87 +/-2.56
C=0.005 - Train: 73.87 +/-0.64 - Val: 73.87 +/-2.56
C=0.01 - Train: 73.87 +/-0.64 - Val: 73.87 +/-2.56
C=0.1 - Train: 73.87 +/-0.64 - Val: 73.87 +/-2.56
C=0.5 - Train: 73.87 +/-0.64 - Val: 73.87 +/-2.56
C=1 - Train: 73.87 +/-0.64 - Val: 73.87 +/-2.56
Uniform Mean Score: 73.20


In [89]:
Y_preds = np.column_stack((Y_pred_RandomForest, Y_pred_xgb, Y_pred_lstm, score_end_val))

C = [0.00001 , 0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1, 0.5, 1]
for c in C :
    LR = LogisticRegression(C=c)
    scores = cross_validate(LR, Y_preds, Y_val, cv = 5, return_train_score = True)
    print('C={0} - Train: {1:.2f} +/-{2:.2f} - Val: {3:.2f} +/-{4:.2f}'.format(c,
                                                                              100*np.mean(scores['train_score']),
                                                                              100*np.std(scores['train_score']),
                                                                              100*np.mean(scores['test_score']),
                                                                              100*np.std(scores['test_score'])))

C=1e-05 - Train: 69.43 +/-3.32 - Val: 69.86 +/-4.93
C=0.0001 - Train: 72.32 +/-1.42 - Val: 72.43 +/-4.48
C=0.0005 - Train: 73.55 +/-0.63 - Val: 73.57 +/-2.65
C=0.001 - Train: 73.87 +/-0.61 - Val: 73.97 +/-2.51
C=0.005 - Train: 73.87 +/-0.64 - Val: 73.87 +/-2.56
C=0.01 - Train: 73.87 +/-0.64 - Val: 73.87 +/-2.56
C=0.1 - Train: 73.87 +/-0.64 - Val: 73.87 +/-2.56
C=0.5 - Train: 73.87 +/-0.64 - Val: 73.87 +/-2.56
C=1 - Train: 73.87 +/-0.64 - Val: 73.87 +/-2.56


In [91]:
from sklearn.externals import joblib
#Save Logistic Regression on top of other predictions
Y_preds = np.column_stack((Y_pred_RandomForest, Y_pred_xgb, Y_pred_lstm, score_end_val))
LR = LogisticRegression(C=c)
LR.fit(Y_preds, Y_val)
joblib.dump(LR, 'models/Log_Reg_On_Other_Clfs_Predictions.pkl')
LR.coef_, LR.intercept_

(array([[ 0.81162191,  0.70716802,  0.84358252, -0.0025334 ]]),
 array([-1.21358256]))