# Model Ensemble Classifier
**Stacked classifier built on the predictions of the 4 best models**

In [None]:
import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from src.utils import split_train_val, feature_engineering
from sklearn.externals import joblib

from keras.utils import np_utils
from keras.models import Sequential
from keras.layers import Dense, LSTM, Bidirectional, Dropout
from keras import losses
from keras import backend as K

## Loading data

In [None]:
# Read the training inputs and the training labels from disk.
# The label file is semicolon-separated; the input file uses pandas' default
# comma separator.
Data_X_train = pd.read_csv(
    'data/challenge_fichier_dentrees_dentrainement_challenge_nba/train.csv'
)
Data_Y_train = pd.read_csv(
    'data/challenge_fichier_de_sortie_dentrainement_challenge_nba.csv',
    sep=';',
)

## Preprocessing

In [None]:
# Replace the raw input frame with the output of the project helper
# `feature_engineering` (imported from src.utils).
# NOTE(review): the helper is not visible here; the code below assumes it
# returns a DataFrame whose first column is an identifier and whose remaining
# columns are a multiple of 1440 -- confirm against src/utils.
Data_X_train = feature_engineering(Data_X_train)

In [None]:
# Number of rows (one per game) and total number of columns of the
# engineered input frame.
nb_games = Data_X_train.shape[0]
col = Data_X_train.shape[1]

# One column is excluded (the first column is dropped when X is built) and
# the remainder is divided by 1440.
# NOTE(review): 1440 is presumably the number of time steps recorded per
# feature -- confirm against the dataset description.
nb_features = int((col - 1) / 1440)

In [None]:
# Convert both frames to NumPy arrays and drop their first column.
# `.values` replaces `DataFrame.as_matrix()`, which was deprecated in
# pandas 0.23 and removed in pandas 1.0; `.values` returns the same array
# and is available in both old and current pandas releases.
# NOTE(review): the dropped first column is presumably the game ID -- confirm.
X = Data_X_train.values[:, 1:]
Y = Data_Y_train.values[:, 1:]

**Split train/val**

In [None]:
# Random 70 % / 30 % split of the games into training and validation sets.
portion_train = 0.7
n_train = int(portion_train * nb_games)
n_val = nb_games - n_train

# Row indices of the training games, drawn without replacement.
id_train = np.random.choice(nb_games, size=n_train, replace=False)

# Boolean membership mask: True for rows that were drawn for training.
in_train = np.zeros(nb_games, dtype=bool)
in_train[id_train] = True

# Training rows follow the (shuffled) order of `id_train`; validation rows
# are every remaining row, kept in their original order.
X_train = X[id_train, :]
Y_train = Y[id_train, :].reshape(n_train,)
X_val = X[~in_train]
Y_val = Y[~in_train].reshape(n_val,)

In [None]:
# Number of games (rows) in the training and validation matrices.
nb_games_train = X_train.shape[0]
nb_games_val = X_val.shape[0]

# Gradient Boosting, Random Forest & Logistic Regression

**Aggregate the features over 10-second windows**

In [None]:
def _aggregate_by_10_steps(games, n_games):
    """Average every feature over blocks of 10 time steps and append column 15829.

    Returns a tuple ``(score, averaged, with_score)``:
    ``score`` is column 15829 of ``games``; ``averaged`` is the block-averaged
    data flattened to ``(n_games, nb_features * 144)``; ``with_score`` is
    ``averaged`` with ``score`` appended as one extra last column.
    """
    # NOTE(review): column 15829 is what the original comments call the
    # "final score"; which feature/time step it maps to depends on the column
    # layout produced by feature_engineering -- confirm.
    score = games[:, 15829]

    # Fortran-order reshape to (game, feature, step-within-block, block),
    # then average over the 10 steps inside each block.
    blocks = games.reshape((n_games, nb_features, 10, -1), order='F')
    averaged = blocks.mean(axis=2)
    # 144 blocks per feature -- assumes 1440 time steps per feature.
    averaged = averaged.reshape(n_games, nb_features * 144)

    # Float buffer with one extra column; every cell is overwritten below.
    with_score = np.empty((averaged.shape[0], averaged.shape[1] + 1))
    with_score[:, :-1] = averaged
    with_score[:, -1] = score
    return score, averaged, with_score


# Training set
score_end, X_train_models, X_train_tot = _aggregate_by_10_steps(
    X_train, nb_games_train)

## Validation
score_end_val, X_val_models, X_val_tot = _aggregate_by_10_steps(
    X_val, nb_games_val)

### Random Forest

In [None]:
# Keyword arguments forwarded to RandomForestClassifier.
parameters = dict(
    n_estimators=200,
    max_depth=None,
    max_features=15,
    min_samples_split=15,
    min_samples_leaf=2,
    bootstrap=True,
    oob_score=True,
    criterion='entropy',
)

In [None]:
# Fit a random forest on the aggregated training features (fit() returns the
# estimator itself), then predict the validation labels.
RandomForest = RandomForestClassifier(**parameters).fit(X_train_tot, Y_train)
Y_pred_RandomForest = RandomForest.predict(X_val_tot)

### Gradient Boosting Classifier (scikit-learn `GradientBoostingClassifier`, not the XGBoost library)

In [None]:
# Gradient-boosted trees on the aggregated features. Despite the variable
# name, this is scikit-learn's GradientBoostingClassifier, not XGBoost.
xgb = GradientBoostingClassifier(n_estimators=1000, max_depth=10)
xgb = xgb.fit(X_train_tot, Y_train)  # fit() returns the estimator itself
Y_pred_xgb = xgb.predict(X_val_tot)

### Logistic Regression

In [None]:
# Logistic regression with a very small C (i.e. strong regularisation) on
# the aggregated features, followed by validation predictions.
LR = LogisticRegression(C=0.00001).fit(X_train_tot, Y_train)
Y_pred_LR = LR.predict(X_val_tot)

# Bidirectional LSTM

In [None]:
def _average_over_15_steps(games, n_games):
    """Return `games` averaged over blocks of 15 consecutive time steps.

    The flat rows are reshaped in Fortran order to
    (game, feature, step-within-block, block) and the step axis is averaged
    out, giving an array of shape (n_games, nb_features, n_blocks).
    """
    blocks = games.reshape((n_games, nb_features, 15, -1), order='F')
    return blocks.mean(axis=2)


# Sequence inputs for the recurrent model.
X_train_lstm = _average_over_15_steps(X_train, nb_games_train)
X_val_lstm = _average_over_15_steps(X_val, nb_games_val)

# One-hot encode the training labels into 2 classes for the softmax output.
Y_train_lstm = np_utils.to_categorical(Y_train, 2)

In [None]:
# Bidirectional LSTM classifier, declared as a layer list:
#   [0] Bidirectional(LSTM(125))  [1] Dropout(0.75)  [2] Dense(150, relu)
#   [3] Dropout(0.5)              [4] Dense(2, softmax)
# NOTE(review): input_shape=(16, 96) is hard-coded; it presumably requires
# nb_features == 16 and 96 averaged time blocks per game -- confirm.
bi_lstm_model = Sequential([
    Bidirectional(LSTM(125, recurrent_dropout=0.25), input_shape=(16, 96)),
    Dropout(0.75),
    Dense(150, activation='relu'),
    Dropout(0.5),
    Dense(2, activation='softmax'),
])
bi_lstm_model.compile(optimizer='adam',
                      loss=losses.categorical_crossentropy,
                      metrics=['accuracy'])

# Training (silent): 15 epochs with mini-batches of 64 games.
bi_lstm_model.fit(X_train_lstm, Y_train_lstm,
                  batch_size=64, epochs=15, verbose=False)

# Backend function mapping the network input to the output of layers[3]
# (the second Dropout layer). Passing 0 for K.learning_phase() runs the
# network in inference mode, so dropout is inactive and the returned values
# are the activations of the 150-unit Dense layer.
# (The "cnn" in the name is historical: the model has no convolutional layer.)
extract_cnn_features = K.function([bi_lstm_model.layers[0].input, K.learning_phase()],
                                  [bi_lstm_model.layers[3].output])
X_train_features = extract_cnn_features([X_train_lstm, 0])[0]
# Fix: the original read the undefined name `X_test_lstm` (NameError).
# The held-out data prepared for this model is `X_val_lstm`.
X_val_features = extract_cnn_features([X_val_lstm, 0])[0]
# Legacy name kept in case other cells still refer to it.
X_test_features = X_val_features

# Random forest trained on the features learned by the network.
parameters = {'n_estimators': 500, 'max_depth': 40, 'min_samples_leaf': 2}
clf = RandomForestClassifier(**parameters)
clf.fit(X_train_features, Y_train.ravel())

# Fix: predict from the extracted validation features. The original passed
# the raw 3-D `X_val_lstm`, which does not match the 2-D feature matrix the
# forest was fitted on.
Y_pred_lstm = clf.predict(X_val_features)

# Concatenation

In [None]:
# Stack the four models' validation predictions side by side:
# one row per validation game, one column per model.
Y_pred = np.c_[Y_pred_RandomForest, Y_pred_LR, Y_pred_xgb, Y_pred_lstm]

In [None]:
# Meta-classifier: logistic regression on the four base-model predictions,
# swept over a range of inverse regularisation strengths C.
#
# Fix: the original fitted on (Y_pred, Y_val) and scored on the very same
# rows, so the printed accuracy was an in-sample (optimistic) figure.
# The printed value is now the mean 5-fold cross-validated accuracy, which
# scores each fold with a model that never saw that fold.
from sklearn.model_selection import cross_val_score

C = [0.00001, 0.0001, 0.001, 0.01, 0.1, 0.5, 1, 10, 100, 1000]
for c in C:
    LR = LogisticRegression(C=c)
    # cross_val_score clones the estimator, so `LR` itself stays unfitted here.
    cv_scores = cross_val_score(LR, Y_pred, Y_val, cv=5)
    print(cv_scores.mean())
    # Keep `LR` fitted on all stacked predictions, as the original loop did.
    LR.fit(Y_pred, Y_val)