# Submitted Models

In [1]:
import numpy as np
import pandas as pd
import random
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from src.utils import *
random.seed(7)

## Loading Data

In [2]:
# Open files
Data_X_train = pd.read_csv('data/challenge_fichier_dentrees_dentrainement_challenge_nba/train.csv')
Data_Y_train = pd.read_csv('data/challenge_fichier_de_sortie_dentrainement_challenge_nba.csv', sep=';')
Data_X_test = pd.read_csv('data/challenge_fichiers_dentrees_de_test_challenge_nba/test.csv')

**Data Preprocessing**

In [None]:
# X_train, Y_train, _, _ = preprocess(Data_X_train, Data_Y_train, portion_train=1)
# Raw feature / label matrices. Column 0 of each frame is dropped (for the test frame it
# is the ID written to the submission files; presumably the same for the other frames).
# `.values` is used because `DataFrame.as_matrix()` was removed in pandas 1.0.
X_train = Data_X_train.values[:, 1:]
X_test = Data_X_test.values[:, 1:]
# Still 2-D after the column slice; flatten with ravel() where 1-D labels are required.
Y_train = Data_Y_train.values[:, 1:]

## First model : Logistic Regression

**Logistic Regression - 01/03 - Test score : 71.626%** <br>

In [None]:
# Inverse regularisation strength: a very small C means a very strong penalty.
c = 1e-5
model_LogisticRegression = LogisticRegression(C=c)
# Y_train is a 2-D column slice; scikit-learn expects a 1-D label vector and otherwise
# emits a DataConversionWarning before flattening it itself.
model_LogisticRegression.fit(X_train, np.ravel(Y_train))
Y_test = model_LogisticRegression.predict(X_test)

In [None]:
ID_test = Data_X_test.iloc[:,0].as_matrix()
d = {'ID': ID_test, 'label': Y_test}
Results_test = pd.DataFrame(data=d)

In [None]:
# Write in file
Results_test.to_csv('data/challenge_fichiers_de_sorties_de_test_challenge_nba/LogisticRegression_0103.csv',sep=';', index=False)

## Second model: CNN

**CNN on raw images - 01/03 - Test score : 72.20%**

In [9]:
from keras.utils import np_utils
from keras.models import Sequential
from keras.layers import Conv2D, BatchNormalization, Dropout, Flatten, Dense, LSTM
from keras import losses

Using TensorFlow backend.


In [None]:
# Turn every flat feature row into an (11, 1440, 1) single-channel array for Conv2D.
# With order='F' the first index varies fastest, so entry [i, f, t, 0] is read from
# column f + 11*t of row i.
# NOTE(review): presumably 11 event types x 1440 time steps - confirm the column layout of train.csv.
X_train_cnn = X_train.reshape((len(X_train), 11, 1440, 1), order = 'F')
# One-hot encode the labels into 2 classes for the softmax / categorical cross-entropy model below.
Y_train_cnn = np_utils.to_categorical(Y_train, 2)
X_test_cnn = X_test.reshape((len(X_test), 11, 1440, 1), order = 'F')

In [None]:
conv_model = Sequential()
conv_model.add(BatchNormalization(axis=1, 
                                  input_shape = (11, 1440, 1)))
conv_model.add(Conv2D(filters = 16, 
                 kernel_size = (11, 10), 
                 activation = 'relu'))
conv_model.add(Dropout(0.75))
conv_model.add(Flatten())
conv_model.add(Dense(units = 50, activation = 'relu'))
conv_model.add(Dropout(0.5))
conv_model.add(Dense(units = 2, 
                activation='softmax'))
conv_model.compile(loss = losses.categorical_crossentropy,
                     optimizer = 'adam',
                     metrics = ['accuracy'])
conv_model.fit(X_train_cnn, Y_train_cnn, epochs = 200, batch_size = 32, verbose = False)
Y_test_cnn = conv_model.predict(X_test_cnn)

In [None]:
Y_test = np.argmax(Y_test_cnn, axis = 1)

ID_test = Data_X_test.iloc[:,0].as_matrix()
d = {'ID': ID_test, 'label': Y_test}
Results_test = pd.DataFrame(data=d)

Results_test.to_csv('data/challenge_fichiers_de_sorties_de_test_challenge_nba/CNN_0103.csv',sep=';', index=False)

# Third model: LSTM
**4 main features - 05/03 - Test score: 68.81%**

In [None]:
X_train, Y_train, _, _ = preprocess(Data_X_train, Data_Y_train, portion_train=1, main_feature = True)

X_train_seq_10 = X_train.reshape((len(X_train), 4, 10, -1), order = 'F')
X_train_seq_10 = X_train_seq_10.mean(axis = 2)

In [None]:
X_test = extract_main_features(Data_X_test)

In [None]:
X_test_seq_10 = X_test.reshape((len(X_test), 4, 10, -1), order = 'F')
X_test_seq_10 = X_test_seq_10.mean(axis = 2)

In [None]:
Y_train = np_utils.to_categorical(Y_train, 2)

In [None]:
lstm_model = Sequential()
lstm_model.add(LSTM(75, input_shape = (4, 144), dropout = 0.5))
lstm_model.add(Dense(units = 10,
                     activation = 'relu'))
lstm_model.add(Dropout(0.5))
lstm_model.add(Dense(units = 2, 
                activation='softmax'))

#Construct Loss
lstm_model.compile(loss = losses.categorical_crossentropy,
                     optimizer = 'adam',
                     metrics = ['accuracy'])

#Train
training = lstm_model.fit(X_train_seq_10, Y_train, epochs = 200, batch_size = 32, verbose = False)

lstm_model.save('models/lstm_seq_10_4_main_features.h5')

In [None]:
Y_test = lstm_model.predict(X_test_seq_10)
Y_test = np.argmax(Y_test, axis = 1)

ID_test = Data_X_test.iloc[:,0].as_matrix()
d = {'ID': ID_test, 'label': Y_test}
Results_test = pd.DataFrame(data=d)

Results_test.to_csv('data/challenge_fichiers_de_sorties_de_test_challenge_nba/LSTM_0503.csv',sep=';', index=False)

# Fourth Model: CNN
**CNN on raw images - 06/03 - Test score: 71.38%**

In [None]:
# Rebuild the raw inputs from the loaded frames. X_train / X_test were overwritten by the
# LSTM section above (4 main features only) and Y_train is already one-hot there, so
# reshaping them to 11 x 1440 or encoding the labels again would fail or give a wrong shape.
X_train_raw = Data_X_train.values[:, 1:]
X_test_raw = Data_X_test.values[:, 1:]

X_train_cnn = X_train_raw.reshape((len(X_train_raw), 11, 1440, 1), order = 'F')
Y_train_cnn = np_utils.to_categorical(Data_Y_train.values[:, 1:], 2)
X_test_cnn = X_test_raw.reshape((len(X_test_raw), 11, 1440, 1), order = 'F')

In [None]:
# Encode straight from the label file so that this cell is idempotent: re-encoding a
# Y_train that an earlier section already one-hot encoded yields a wrongly shaped target.
Y_train = np_utils.to_categorical(Data_Y_train.values[:, 1:], 2)

In [None]:
conv_model = Sequential()
conv_model.add(BatchNormalization(axis=1, 
                                  input_shape = (11, 1440, 1)))
conv_model.add(Conv2D(filters = 32, 
                 kernel_size = (11, 10), 
                 activation = 'relu',
                 strides = (1, 2)))
conv_model.add(Dropout(0.5))
conv_model.add(Flatten())
conv_model.add(Dense(units = 50, 
                     activation = 'relu'))
conv_model.add(Dropout(0.5))
conv_model.add(Dense(units = 2, 
                activation='softmax'))

#Construct Loss
conv_model.compile(loss = losses.categorical_crossentropy,
                     optimizer = 'adam',
                     metrics = ['accuracy'])

#Train
training = conv_model.fit(X_train_cnn, Y_train, epochs = 200, batch_size = 32, verbose = False)
conv_model.save('models/cnn_7.h5')

In [None]:
Y_test = conv_model.predict(X_test_cnn)
Y_test = np.argmax(Y_test, axis = 1)

ID_test = Data_X_test.iloc[:,0].as_matrix()
d = {'ID': ID_test, 'label': Y_test}
Results_test = pd.DataFrame(data=d)

Results_test.to_csv('data/challenge_fichiers_de_sorties_de_test_challenge_nba/CNN_0603.csv',sep=';', index=False)

# Fifth model: CNN
**CNN on raw images full features - 07/03 - Test score: 71.44%**

In [None]:
# Rebuild the raw inputs from the loaded frames instead of relying on X_train / X_test /
# Y_train, which earlier sections overwrite (4-feature matrices, one-hot labels).
X_train_raw = Data_X_train.values[:, 1:]
X_test_raw = Data_X_test.values[:, 1:]

X_train_cnn = X_train_raw.reshape((len(X_train_raw), 11, 1440, 1), order = 'F')
X_test_cnn = X_test_raw.reshape((len(X_test_raw), 11, 1440, 1), order = 'F')

# One-hot encode exactly once; the training cell below fits on `Y_train`.
Y_train_cnn = np_utils.to_categorical(Data_Y_train.values[:, 1:], 2)
Y_train = Y_train_cnn

In [None]:
conv_model = Sequential()
conv_model.add(BatchNormalization(axis=1, 
                                  input_shape = (11, 1440, 1)))
conv_model.add(Conv2D(filters = 16, 
                 kernel_size = (11, 10), 
                 activation = 'relu',
                 strides = (1, 10)))
conv_model.add(Dropout(0.75))
conv_model.add(Flatten())
conv_model.add(Dense(units = 50, 
                     activation = 'relu'))
conv_model.add(Dropout(0.5))
conv_model.add(Dense(units = 2, 
                activation='softmax'))

#Construct Loss
conv_model.compile(loss = losses.categorical_crossentropy,
                     optimizer = 'adam',
                     metrics = ['accuracy'])

#Train
training = conv_model.fit(X_train_cnn, Y_train, epochs = 200, batch_size = 32, verbose = False)
conv_model.save('models/cnn_8.h5')

In [None]:
Y_test = conv_model.predict(X_test_cnn)
Y_test = np.argmax(Y_test, axis = 1)

ID_test = Data_X_test.iloc[:,0].as_matrix()
d = {'ID': ID_test, 'label': Y_test}
Results_test = pd.DataFrame(data=d)

Results_test.to_csv('data/challenge_fichiers_de_sorties_de_test_challenge_nba/CNN_0703.csv',sep=';', index=False)

# Sixth model: CNN features + Logistic Regression
**Extract last layers features from CNN and apply logistic regression classifier- 07/03 - Test score: 72.19%**

In [None]:
from keras import backend as K
from keras.models import load_model

In [None]:
cnn_model = load_model('models/cnn_8.h5')

In [None]:
# Feed the saved CNN the raw 11 x 1440 inputs, rebuilt from the loaded frames because
# X_train / X_test are overwritten by earlier sections of this notebook.
X_train_raw = Data_X_train.values[:, 1:]
X_test_raw = Data_X_test.values[:, 1:]
X_train_cnn = X_train_raw.reshape((len(X_train_raw), 11, 1440, 1), order = 'F')
X_test_cnn = X_test_raw.reshape((len(X_test_raw), 11, 1440, 1), order = 'F')

In [None]:
extract_cnn_features = K.function([cnn_model.layers[0].input, K.learning_phase()],
                                [cnn_model.layers[5].output])

In [None]:
X_train_cnn_features = extract_cnn_features([X_train_cnn, 0])[0]

In [None]:
X_test_cnn_features = extract_cnn_features([X_test_cnn, 0])[0]

In [None]:
# Logistic regression on the features extracted from the saved CNN.
cnn_lg = LogisticRegression()
# Use the 1-D class labels from the label file: the CNN sections above replace Y_train
# with a one-hot matrix, which LogisticRegression cannot be fitted on.
cnn_lg.fit(X_train_cnn_features, Data_Y_train.values[:, 1:].ravel())
Y_test = cnn_lg.predict(X_test_cnn_features)

In [None]:
Y_test.shape

In [None]:
ID_test = Data_X_test.iloc[:,0].as_matrix()
d = {'ID': ID_test, 'label': Y_test}
Results_test = pd.DataFrame(data=d)

Results_test.to_csv('data/challenge_fichiers_de_sorties_de_test_challenge_nba/CNN_logreg_0703.csv',sep=';', index=False)

# Seventh model : CNN with time and frequency features
**Time and frequency features - 08/03 - Test score: 71.62%**

In [None]:
# Frequency-domain features: for every feature name, keep the magnitude of the first
# `nb_freq` FFT coefficients computed across that feature's columns, row by row.
nb_freq = 200

X_FF = []
X_FF_test = []
features = ['miss', 'score', 'assist', 'offensive rebound', 'defensive rebound', 'offensive foul', 'defensive foul',
           'lost ball', 'steals', 'bad pass', 'block']
for feat in features :
    # Columns are picked by name prefix; the train column names are reused for the test frame.
    filter_col = [col for col in Data_X_train if col.startswith(feat)]
    X_feat = Data_X_train[filter_col].as_matrix()
    X_feat_test =  Data_X_test[filter_col].as_matrix()
    # np.fft.fft transforms along the last axis, i.e. across the selected columns of each row.
    X_feat_fft = np.fft.fft(X_feat)[:,:nb_freq]
    X_feat_fft_test = np.fft.fft(X_feat_test)[:,:nb_freq]
    # abs() keeps the magnitude of each complex coefficient and drops the phase.
    X_FF.append(abs(X_feat_fft))
    X_FF_test.append(abs(X_feat_fft_test))
# Per-feature blocks are laid side by side: nb_freq columns per feature.
X_fft = np.hstack(X_FF)
X_fft_test = np.hstack(X_FF_test)

In [None]:
# Time-domain part, rebuilt from the loaded frames because X_train / X_test are overwritten
# by earlier sections. The raw 11 x 1440 columns are grouped by 10 (mean over axis 2),
# which leaves 11 * 144 values per game, then the FFT magnitudes are appended.
# NOTE(review): the flatten below is C-order while the CNN reshape in the next cell uses
# order='F' - confirm that the resulting row layout is the intended one.
X_train_raw = Data_X_train.values[:, 1:]
nb_games = len(X_train_raw)
X_seq = X_train_raw.reshape((nb_games, 11, 10, -1), order = 'F')
X_seq = X_seq.mean(axis = 2)
X_seq = X_seq.reshape(nb_games, 11 * 144)
X_tot = np.concatenate((X_seq, X_fft), axis = 1)

X_test_raw = Data_X_test.values[:, 1:]
nb_games_test = len(X_test_raw)
X_seq_test = X_test_raw.reshape((nb_games_test, 11, 10, -1), order = 'F')
X_seq_test = X_seq_test.mean(axis = 2)
X_seq_test = X_seq_test.reshape(nb_games_test, 11 * 144)
X_tot_test = np.concatenate((X_seq_test, X_fft_test), axis = 1)

In [None]:
# 11 * 144 time-domain values + 11 * 200 FFT magnitudes = 11 * 344 values per game.
X_cnn = X_tot.reshape((len(X_tot), 11, 344, 1), order = 'F')
# Encode from the label file: Y_train may already be one-hot from an earlier section,
# and encoding it a second time produces a wrongly shaped target.
Y_cnn = np_utils.to_categorical(Data_Y_train.values[:, 1:], 2)

conv_model = Sequential()

conv_model.add(BatchNormalization(axis=1, 
                                  input_shape = (11, 344, 1)))
conv_model.add(Conv2D(filters = 16, 
                 kernel_size = (11, 10), 
                 activation = 'relu'))
conv_model.add(Dropout(0.75))
conv_model.add(Flatten())
conv_model.add(Dense(units = 50, activation = 'relu'))
conv_model.add(Dropout(0.5))
conv_model.add(Dense(units = 2, 
                activation='softmax'))

conv_model.compile(loss = losses.categorical_crossentropy,
                     optimizer = 'adam',
                     metrics = ['accuracy'])

training = conv_model.fit(X_cnn, Y_cnn, epochs = 200, batch_size = 32, verbose = False)
conv_model.save('models/cnn_9.h5')

In [None]:
X_test_cnn = X_tot_test.reshape((len(X_tot_test), 11, 344, 1), order = 'F')

Y_test = conv_model.predict(X_test_cnn)
Y_test = np.argmax(Y_test, axis = 1)

ID_test = Data_X_test.iloc[:,0].as_matrix()
d = {'ID': ID_test, 'label': Y_test}
Results_test = pd.DataFrame(data=d)

Results_test.to_csv('data/challenge_fichiers_de_sorties_de_test_challenge_nba/CNN_0803.csv',sep=';', index=False)

# Eighth model : LSTM + Log Reg
**Last layer features from LSTM + Logistic Regression - 10/03 - Test score: 69.67%**

In [None]:
from keras import backend as K
from keras.models import load_model
from sklearn.linear_model import LogisticRegression

In [None]:
# Aggregate the raw 11 x 1440 inputs in groups of 10 (mean over axis 2) -> (n_games, 11, 144).
# The matrices are rebuilt from the loaded frames so that the cell can be re-run safely:
# the former `X_train = X_train.reshape(...)` consumed its own output and depended on
# whatever an earlier section had left in X_train / X_test.
X_train_raw = Data_X_train.values[:, 1:]
X_train = X_train_raw.reshape((len(X_train_raw), 11, 10, -1), order = 'F')
X_train = X_train.mean(axis = 2)

X_test_raw = Data_X_test.values[:, 1:]
X_test = X_test_raw.reshape((len(X_test_raw), 11, 10, -1), order = 'F')
X_test = X_test.mean(axis = 2)

In [None]:
model = load_model('models/LSTMs/lstm_11.h5')

extract_cnn_features = K.function([model.layers[0].input, K.learning_phase()],
                                  [model.layers[1].output])
X_train_features = extract_cnn_features([X_train, 0])[0]
X_test_features = extract_cnn_features([X_test, 0])[0]

clf = LogisticRegression()
# Take the 1-D labels from the label file: if an earlier section has one-hot encoded
# Y_train, flattening it would give two entries per game instead of one.
clf.fit(X_train_features, Data_Y_train.values[:, 1:].ravel())

Y_test = clf.predict(X_test_features)

In [None]:
ID_test = Data_X_test.iloc[:,0].as_matrix()
d = {'ID': ID_test, 'label': Y_test}
Results_test = pd.DataFrame(data=d)

Results_test.to_csv('data/challenge_fichiers_de_sorties_de_test_challenge_nba/LSTM_logreg_1003.csv',sep=';', index=False)

# Ninth model: Average of 5 best models
**Average of CNN_0803, CNN_logreg_0703, CNN_0703, CNN_0603, CNN_0103 = 5 best submissions - 10/03 - Test score: 72.48%**

In [None]:
import functools

In [None]:
sub1 = pd.read_csv('data/challenge_fichiers_de_sorties_de_test_challenge_nba/CNN_0103.csv', sep = ";")
sub2 = pd.read_csv('data/challenge_fichiers_de_sorties_de_test_challenge_nba/CNN_0603.csv', sep = ";")
sub3 = pd.read_csv('data/challenge_fichiers_de_sorties_de_test_challenge_nba/CNN_0703.csv', sep = ";")
sub4 = pd.read_csv('data/challenge_fichiers_de_sorties_de_test_challenge_nba/CNN_logreg_0703.csv', sep = ";")
sub5 = pd.read_csv('data/challenge_fichiers_de_sorties_de_test_challenge_nba/CNN_0803.csv', sep = ";")

In [None]:
# Majority vote over the five submissions.
# Each frame's `label` column gets a unique name before merging: merging several frames
# that all carry a column called `label` makes pandas generate clashing `_x` / `_y`
# suffixes (duplicate column names, rejected by recent pandas versions).
subs = [sub1, sub2, sub3, sub4, sub5]
renamed = [sub.rename(columns = {'label': 'label_%d' % k}) for k, sub in enumerate(subs)]
votes = functools.reduce(lambda left, right: pd.merge(left, right, on = 'ID'), renamed)

# Column 0 is ID; the mean of the 0/1 votes, rounded, is the majority label
# (an odd number of voters means the mean is never exactly 0.5).
avg = votes.iloc[:, 1:].mean(axis = 1).round().astype(int)

# Build the result as a fresh frame rather than renaming a slice in place, which
# triggers SettingWithCopyWarning.
submission = pd.DataFrame({'ID': votes['ID'], 'label': avg})

submission.to_csv('data/challenge_fichiers_de_sorties_de_test_challenge_nba/average_5_best_submissions_1003.csv',sep=';', index=False)

# Tenth model: LSTM + XGBoost
**Last Layer from LSTM (20 secs) + XGBoost - Test score: 70.58%**

In [None]:
from keras import backend as K
from keras.models import load_model
from xgboost import XGBClassifier

In [None]:
# Aggregate the raw 11 x 1440 inputs in groups of 20 (mean over axis 2) -> (n_games, 11, 72).
# The matrices are rebuilt from the loaded frames: X_train / X_test may already hold the
# 10-step aggregation of an earlier section, which cannot be reshaped into groups of 20.
X_train_raw = Data_X_train.values[:, 1:]
X_train = X_train_raw.reshape((len(X_train_raw), 11, 20, -1), order = 'F')
X_train = X_train.mean(axis = 2)

X_test_raw = Data_X_test.values[:, 1:]
X_test = X_test_raw.reshape((len(X_test_raw), 11, 20, -1), order = 'F')
X_test = X_test.mean(axis = 2)

In [None]:
model = load_model('models/LSTMs/lstm_14.h5')

extract_cnn_features = K.function([model.layers[0].input, K.learning_phase()],
                                  [model.layers[1].output])
X_train_features = extract_cnn_features([X_train, 0])[0]
X_test_features = extract_cnn_features([X_test, 0])[0]

clf = XGBClassifier(n_estimators = 1000, max_depth = 2)
# Take the 1-D labels from the label file: if an earlier section has one-hot encoded
# Y_train, flattening it would give two entries per game instead of one.
clf.fit(X_train_features, Data_Y_train.values[:, 1:].ravel())

Y_test = clf.predict(X_test_features)

In [None]:
ID_test = Data_X_test.iloc[:,0].as_matrix()
d = {'ID': ID_test, 'label': Y_test}
Results_test = pd.DataFrame(data=d)

Results_test.to_csv('data/challenge_fichiers_de_sorties_de_test_challenge_nba/LSTM_xgboost_1303.csv',sep=';', index=False)

## Eleventh model: Logistic Regression + feature engineering
**Test score : 71.43%**

In [None]:
X_train = feature_engineering(Data_X_train)
X_test = feature_engineering(Data_X_test)

X_train = X_train.as_matrix()[:,1:]
X_test = X_test.as_matrix()[:,1:]
Y_train = Data_Y_train.as_matrix()[:,1:]

In [None]:
LR = LogisticRegression(C=0.00001)
LR.fit(X_train, Y_train)

In [None]:
Y_test = LR.predict(X_test)

ID_test = Data_X_test.iloc[:,0].as_matrix()
d = {'ID': ID_test, 'label': Y_test}
Results_test = pd.DataFrame(data=d)

In [None]:
Results_test.to_csv('data/challenge_fichiers_de_sorties_de_test_challenge_nba/LogisticRegression_1403.csv',sep=';', index=False)

## Twelfth model: Feature Engineering + Random Forest
**Test score - 72.24%**

In [None]:
def postprocess(X, Y):
    """Override predictions wherever column 20147 of X is decisive.

    Rows whose value in column 20147 is strictly greater than 8 are forced to
    label 1, rows whose value is strictly less than -8 are forced to label 0.
    The original note identifies column 20147 as `score_1140`.
    NOTE(review): possibly a typo for `score_1440` - confirm against the frame.

    Parameters
    ----------
    X : 2-D numpy array with at least 20148 columns.
    Y : 1-D numpy array of labels, one per row of X; modified in place.

    Returns
    -------
    The same array object `Y`, after the overrides.
    """
    margin = X[:, 20147]
    wins = margin > 8
    losses = margin < -8
    Y[wins] = 1
    Y[losses] = 0
    return Y

In [None]:
X_train = feature_engineering(Data_X_train)
X_test = feature_engineering(Data_X_test)

X_train = X_train.as_matrix()[:,1:]
X_test = X_test.as_matrix()[:,1:]
Y_train = Data_Y_train.as_matrix()[:,1:]

In [None]:
parameters = {'n_estimators': 150, 'max_depth': None, 'max_features': 15, 
               'min_samples_split': 15, 'min_samples_leaf': 2, 'bootstrap': True, 
               'oob_score': True, 'criterion': 'entropy'}
RandomForest = RandomForestClassifier(**parameters)
RandomForest.fit(X_train, Y_train.reshape(len(X_train),))

In [None]:
# PostProcessing
Y_test = RandomForest.predict(X_test)
Y_test = postprocess(X_test,Y_test)

In [None]:
ID_test = Data_X_test.iloc[:,0].as_matrix()
d = {'ID': ID_test, 'label': Y_test}
Results_test = pd.DataFrame(data=d)

In [None]:
Results_test.to_csv('data/challenge_fichiers_de_sorties_de_test_challenge_nba/RandomForest_1603.csv',sep=';', index=False)

## Thirteenth model:
**Aggregation of features + pre and post processing - score 73.53%**

In [None]:
def postprocess(X, Y):
    """Override predictions wherever the last column of X is decisive.

    Rows whose last-column value is strictly greater than 9 are forced to
    label 1, rows whose last-column value is strictly less than -9 are forced
    to label 0; all other predictions are left untouched.

    Parameters
    ----------
    X : 2-D numpy array; only its last column is read.
    Y : 1-D numpy array of labels, one per row of X; modified in place.

    Returns
    -------
    The same array object `Y`, after the overrides.
    """
    last_col = X[:, -1]
    force_one = last_col > 9
    force_zero = last_col < -9
    Y[force_one] = 1
    Y[force_zero] = 0
    return Y

In [None]:
Data_X_train = feature_engineering(Data_X_train)
Data_X_test = feature_engineering(Data_X_test)

X_train = Data_X_train.as_matrix()[:,1:]
X_test = Data_X_test.as_matrix()[:,1:]
Y_train = Data_Y_train.as_matrix()[:,1:].reshape(len(Data_X_train),)

In [None]:
nb_games_train, col = Data_X_train.shape
nb_games_test, _ = Data_X_test.shape
nb_features = int((col-1)/1440)

## Aggregate features
X_train = X_train.reshape((nb_games_train, nb_features, 10, -1), order = 'F')
X_train = X_train.mean(axis = 2)
X_train = X_train.reshape(nb_games_train, nb_features*144)

## Add Score at the end to postprocess
score_end_train = Data_X_train['score_1440'].as_matrix()
X_tot_train = np.zeros((X_train.shape[0],X_train.shape[1]+1))
X_tot_train[:,:-1] = X_train
X_tot_train[:,-1] = score_end_train


## Aggregate features
X_test = X_test.reshape((nb_games_test, nb_features, 10, -1), order = 'F')
X_test = X_test.mean(axis = 2)
X_test = X_test.reshape(nb_games_test, nb_features*144)

## Add Score at the end to postprocess
score_end_test = Data_X_test['score_1440'].as_matrix()
X_tot_test = np.zeros((X_test.shape[0],X_test.shape[1]+1))
X_tot_test[:,:-1] = X_test
X_tot_test[:,-1] = score_end_test

In [None]:
parameters = {'n_estimators': 200, 'max_depth': None, 'max_features': 15, 
               'min_samples_split': 15, 'min_samples_leaf': 2, 'bootstrap': True, 
               'oob_score': True, 'criterion': 'entropy'}
RandomForest = RandomForestClassifier(**parameters)
RandomForest.fit(X_tot_train, Y_train)

In [None]:
# PostProcessing
Y_test = RandomForest.predict(X_tot_test)
Y_test = postprocess(X_tot_test,Y_test)

In [None]:
ID_test = Data_X_test.iloc[:,0].as_matrix()
d = {'ID': ID_test, 'label': Y_test}
Results_test = pd.DataFrame(data=d)

In [None]:
Results_test.to_csv('data/challenge_fichiers_de_sorties_de_test_challenge_nba/RandomForest_1703.csv',sep=';', index=False)

## Fourteenth model :
**Combination of Random Forest, XGBoost and LogisticRegression - score 74.34%**

In [3]:
Data_X_train = feature_engineering(Data_X_train, two_points = True)
Data_X_test = feature_engineering(Data_X_test, two_points = True)

X_train = Data_X_train.as_matrix()[:,1:]
X_test = Data_X_test.as_matrix()[:,1:]
Y_train = Data_Y_train.as_matrix()[:,1:].reshape(len(Data_X_train),)

In [4]:
nb_games_train, col = Data_X_train.shape
nb_games_test, _ = Data_X_test.shape
nb_features = int((col-1)/1440)

## Aggregate features
X_train = X_train.reshape((nb_games_train, nb_features, 10, -1), order = 'F')
X_train = X_train.mean(axis = 2)
X_train = X_train.reshape(nb_games_train, nb_features*144)

## Add Score at the end
score_end_train = Data_X_train['score_1440'].as_matrix()
X_tot_train = np.zeros((X_train.shape[0],X_train.shape[1]+1))
X_tot_train[:,:-1] = X_train
X_tot_train[:,-1] = score_end_train


## Aggregate features
X_test = X_test.reshape((nb_games_test, nb_features, 10, -1), order = 'F')
X_test = X_test.mean(axis = 2)
X_test = X_test.reshape(nb_games_test, nb_features*144)

## Add Score at the end to postprocess
score_end_test = Data_X_test['score_1440'].as_matrix()
X_tot_test = np.zeros((X_test.shape[0],X_test.shape[1]+1))
X_tot_test[:,:-1] = X_test
X_tot_test[:,-1] = score_end_test

In [5]:
# Soft-voting ensemble of three classifiers fitted on the aggregated features
# plus the end-of-game score column.
parameters_randomforest = {'n_estimators': 200, 'max_depth': None, 'max_features': 15, 
               'min_samples_split': 15, 'min_samples_leaf': 2, 'bootstrap': True, 
               'oob_score': True, 'criterion': 'entropy'}

# NOTE: despite its name, `xgb` is scikit-learn's GradientBoostingClassifier,
# not a model from the xgboost library.
xgb = GradientBoostingClassifier(max_depth=10, n_estimators = 1000)
LogReg = LogisticRegression(C=0.00001)
RandomForest = RandomForestClassifier(**parameters_randomforest)

# voting='soft': the ensemble prediction is based on the averaged class
# probabilities of the three estimators (equal weights, none are given).
eclf = VotingClassifier(estimators=[
                ('lr', LogReg), ('xgb', xgb), ('RandomForest', RandomForest)], voting='soft')
eclf.fit(X_tot_train, Y_train)

VotingClassifier(estimators=[('lr', LogisticRegression(C=1e-05, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)), ('xgb', Grad...imators=200, n_jobs=1,
            oob_score=True, random_state=None, verbose=0, warm_start=False))],
         flatten_transform=None, n_jobs=1, voting='soft', weights=None)

In [6]:
Y_test = eclf.predict(X_tot_test)
ID_test = Data_X_test.iloc[:,0].as_matrix()
d = {'ID': ID_test, 'label': Y_test}
Results_test = pd.DataFrame(data=d)

In [7]:
Results_test.to_csv('data/challenge_fichiers_de_sorties_de_test_challenge_nba/CombinationModels_2203.csv',sep=';', index=False)

# Fifteenth model: 
**Bidirectional LSTM - Last Layer - Overfitted Random Forest - Submission score: 74.10%**

In [None]:
from keras.utils import np_utils
from keras.models import Sequential
from keras.layers import Dense, LSTM, Bidirectional, Dropout
from keras import losses
from keras import backend as K

from sklearn.ensemble import RandomForestClassifier

In [None]:
Data_X_train = feature_engineering(Data_X_train)
Data_X_test = feature_engineering(Data_X_test)

X_train = Data_X_train.as_matrix()[:,1:]
X_test = Data_X_test.as_matrix()[:,1:]
Y_train = Data_Y_train.as_matrix()[:,1:]

del Data_X_train

X_train_lstm = X_train.reshape((len(X_train), 16, 15, -1), order = 'F')
X_train_lstm = X_train_lstm.mean(axis = 2)
Y_train_lstm = np_utils.to_categorical(Y_train, 2)

X_test_lstm = X_test.reshape((len(X_test), 16, 15, -1), order = 'F')
X_test_lstm = X_test_lstm.mean(axis = 2)

In [None]:
bi_lstm_model = Sequential()
bi_lstm_model.add(Bidirectional(LSTM(125, recurrent_dropout = 0.25), 
                                input_shape = (16, 96)))
bi_lstm_model.add(Dropout(0.75))
bi_lstm_model.add(Dense(units = 150, 
                        activation = 'relu'))
bi_lstm_model.add(Dropout(0.5))
bi_lstm_model.add(Dense(units = 2, 
                        activation='softmax'))
bi_lstm_model.compile(loss = losses.categorical_crossentropy,
                      optimizer = 'adam',
                      metrics = ['accuracy'])

#Training
bi_lstm_model.fit(X_train_lstm, Y_train_lstm, 
                  epochs = 15, batch_size = 64, verbose = False)

bi_lstm_model.save('models/bi_lstm.h5')

In [None]:
extract_cnn_features = K.function([bi_lstm_model.layers[0].input, K.learning_phase()],
                                  [bi_lstm_model.layers[3].output])
X_train_features = extract_cnn_features([X_train_lstm, 0])[0]
X_test_features = extract_cnn_features([X_test_lstm, 0])[0]

parameters = {'n_estimators': 500, 'max_depth': 40, 'min_samples_leaf': 2}
clf = RandomForestClassifier(**parameters)
clf.fit(X_train_features, Y_train.ravel())

In [None]:
Y_test = clf.predict(X_test_features)
ID_test = Data_X_test.iloc[:,0].as_matrix()
d = {'ID': ID_test, 'label': Y_test}
Results_test = pd.DataFrame(data=d)

Results_test.to_csv('data/challenge_fichiers_de_sorties_de_test_challenge_nba/BiLSTM_1903.csv',sep=';', index=False)

# 16th model
**Bidirectional LSTM  on 18 features - Last Layer - Overfitted Random Forest - Submission score: 74.48%**

In [None]:
from keras.utils import np_utils
from keras.models import Sequential
from keras.layers import Dense, LSTM, Bidirectional, Dropout
from keras import losses
from keras import backend as K

from sklearn.ensemble import RandomForestClassifier

In [None]:
# Open files
Data_X_train = pd.read_csv('data/challenge_fichier_dentrees_dentrainement_challenge_nba/train.csv')
Data_Y_train = pd.read_csv('data/challenge_fichier_de_sortie_dentrainement_challenge_nba.csv', sep=';')
Data_X_test = pd.read_csv('data/challenge_fichiers_dentrees_de_test_challenge_nba/test.csv')

In [None]:
Data_X_train = feature_engineering(Data_X_train, two_points = True)
Data_X_test = feature_engineering(Data_X_test, two_points = True)

X_train = Data_X_train.as_matrix()[:,1:]
X_test = Data_X_test.as_matrix()[:,1:]
Y_train = Data_Y_train.as_matrix()[:,1:]

del Data_X_train

In [None]:
X_train_lstm = X_train.reshape((len(X_train), 18, 15, -1), order = 'F')
X_train_lstm = X_train_lstm.mean(axis = 2)
Y_train_lstm = np_utils.to_categorical(Y_train, 2)

X_test_lstm = X_test.reshape((len(X_test), 18, 15, -1), order = 'F')
X_test_lstm = X_test_lstm.mean(axis = 2)

In [None]:
#Model
bi_lstm_model = Sequential()
bi_lstm_model.add(Bidirectional(LSTM(175, recurrent_dropout = 0.25), 
                                input_shape = (18, 96)))
bi_lstm_model.add(Dropout(0.75))
bi_lstm_model.add(Dense(units = 150, 
                        activation = 'relu'))
bi_lstm_model.add(Dropout(0.5))
bi_lstm_model.add(Dense(units = 2, 
                        activation='softmax'))
bi_lstm_model.compile(loss = losses.categorical_crossentropy,
                      optimizer = 'adam',
                      metrics = ['accuracy'])

#Training
bi_lstm_model.fit(X_train_lstm, Y_train_lstm,
                  epochs = 15, batch_size = 64, verbose = False)


bi_lstm_model.save('models/bi_lstm_2103.h5')

In [None]:
extract_cnn_features = K.function([bi_lstm_model.layers[0].input, K.learning_phase()],
                                  [bi_lstm_model.layers[3].output])
X_train_features = extract_cnn_features([X_train_lstm, 0])[0]
X_test_features = extract_cnn_features([X_test_lstm, 0])[0]

parameters = {'n_estimators': 300, 'max_depth': 60, 'min_samples_leaf': 2}
clf = RandomForestClassifier(**parameters)
clf.fit(X_train_features, Y_train.ravel())

In [None]:
Data_X_test = pd.read_csv('data/challenge_fichiers_dentrees_de_test_challenge_nba/test.csv')

In [None]:
Y_test = clf.predict(X_test_features)
ID_test = Data_X_test.iloc[:,0].as_matrix()
d = {'ID': ID_test, 'label': Y_test}
Results_test = pd.DataFrame(data=d)

Results_test.to_csv('data/challenge_fichiers_de_sorties_de_test_challenge_nba/BiLSTM_2103.csv',sep=';', index=False)

# 17th model
**Log Reg on Xgboost + Random Forest + BiLSTM Predictions - Submission Score: 74.77%**

In [8]:
from keras.utils import np_utils
from keras.models import Sequential
from keras.layers import Dense, LSTM, Bidirectional, Dropout
from keras import losses
from keras import backend as K
from keras.models import load_model

from sklearn.ensemble import RandomForestClassifier
from sklearn.externals import joblib

In [3]:
# Open files
Data_X_train = pd.read_csv('data/challenge_fichier_dentrees_dentrainement_challenge_nba/train.csv')
Data_Y_train = pd.read_csv('data/challenge_fichier_de_sortie_dentrainement_challenge_nba.csv', sep=';')
Data_X_test = pd.read_csv('data/challenge_fichiers_dentrees_de_test_challenge_nba/test.csv')

In [11]:
Data_X_train = feature_engineering(Data_X_train, two_points = True)
Data_X_test = feature_engineering(Data_X_test, two_points = True)

nb_games_train, col = Data_X_train.shape
nb_games_test, _ = Data_X_test.shape
nb_features = int((col-1)/1440)

X_train = Data_X_train.as_matrix()[:,1:]
X_test = Data_X_test.as_matrix()[:,1:]
Y_train = Data_Y_train.as_matrix()[:,1:]

del Data_X_train

In [13]:
# Column 25903 is kept aside as an extra feature. Data_X_train was deleted in the previous
# cell, so the column can no longer be looked up by name here.
# NOTE(review): 25903 = 18*1439 + 1; the Bi-LSTM cell below reshapes this matrix with 18
# features, so under the order='F' mapping this would be feature index 1 at the last of
# 1440 steps - presumably `score_1440`, which the 14th-model cell reads by name. TODO confirm.
score_end = X_train[:,25903]
# order='F' reshape followed by a mean over axis 2: values are aggregated in groups of 10,
# leaving nb_features * 144 values per game after the flatten.
X_train_models = X_train.reshape((nb_games_train, nb_features, 10, -1), order = 'F')
X_train_models = X_train_models.mean(axis = 2)
X_train_models = X_train_models.reshape(nb_games_train, nb_features*144)

# Append the kept column as the last feature.
X_train_tot = np.zeros((X_train_models.shape[0],X_train_models.shape[1]+1))
X_train_tot[:,:-1] = X_train_models
X_train_tot[:,-1] = score_end

## Test
# Same construction for the test matrix.
score_end_test = X_test[:,25903]
X_test_models = X_test.reshape((nb_games_test, nb_features, 10, -1), order = 'F')
X_test_models = X_test_models.mean(axis = 2)
X_test_models = X_test_models.reshape(nb_games_test, nb_features*144)

X_test_tot = np.zeros((X_test_models.shape[0],X_test_models.shape[1]+1))
X_test_tot[:,:-1] = X_test_models
X_test_tot[:,-1] = score_end_test

# Free the intermediate matrices; only the *_tot versions are used afterwards.
del X_train_models, X_test_models

In [15]:
#Random forest
parameters = {'n_estimators': 200, 'max_depth': None, 'max_features': 15, 
               'min_samples_split': 15, 'min_samples_leaf': 2, 'bootstrap': True, 
               'oob_score': True, 'criterion': 'entropy'}

RandomForest = RandomForestClassifier(**parameters)
RandomForest.fit(X_train_tot, Y_train.ravel())
joblib.dump(RandomForest, 'models/Random_Forest_2103.pkl')

Y_pred_RandomForest = RandomForest.predict(X_test_tot)

In [16]:
#XGboost
xgb = GradientBoostingClassifier(max_depth=10, n_estimators = 1000)
xgb.fit(X_train_tot, Y_train.ravel())

joblib.dump(xgb, 'models/XGBoost_2103.pkl')

Y_pred_xgb = xgb.predict(X_test_tot)

In [17]:
#Bi-LSTM
X_train_lstm = X_train.reshape((len(X_train), 18, 15, -1), order = 'F')
X_train_lstm = X_train_lstm.mean(axis = 2)
Y_train_lstm = np_utils.to_categorical(Y_train, 2)

X_test_lstm = X_test.reshape((len(X_test), 18, 15, -1), order = 'F')
X_test_lstm = X_test_lstm.mean(axis = 2)


bi_lstm_model = load_model('models/bi_lstm_2103.h5')


extract_cnn_features = K.function([bi_lstm_model.layers[0].input, K.learning_phase()],
                                  [bi_lstm_model.layers[3].output])
X_train_features = extract_cnn_features([X_train_lstm, 0])[0]
X_test_features = extract_cnn_features([X_test_lstm, 0])[0]


parameters = {'n_estimators': 300, 'max_depth': 60, 'min_samples_leaf': 2}
clf = RandomForestClassifier(**parameters)
clf.fit(X_train_features, Y_train.ravel())
joblib.dump(clf, 'models/Random_Forest_on_BiLSTM_2103.pkl')


Y_pred_lstm = clf.predict(X_test_features)

In [18]:
#Logistic Regression - Meta Classifier
LR = joblib.load('models/Log_Reg_On_Other_Clfs_Predictions.pkl')

Y_features = np.column_stack((Y_pred_RandomForest, Y_pred_xgb, Y_pred_lstm, score_end_test))
Y_test = LR.predict(Y_features)

ID_test = Data_X_test.iloc[:,0].as_matrix()
d = {'ID': ID_test, 'label': Y_test}
Results_test = pd.DataFrame(data=d)

Results_test.to_csv('data/challenge_fichiers_de_sorties_de_test_challenge_nba/MetaClassifier_2103.csv',sep=';', index=False)

# 18th model
**Fourier Transform with preprocessing - 73.19%**

In [10]:
from keras.utils import np_utils
from keras.models import Sequential
from keras.layers import Conv2D, BatchNormalization, Dropout, Flatten, Dense, LSTM
from keras import losses

In [3]:
Data_X_train = feature_engineering(Data_X_train)
Data_X_test = feature_engineering(Data_X_test)

X_train = Data_X_train.as_matrix()[:,1:]
X_test = Data_X_test.as_matrix()[:,1:]
Y_train = Data_Y_train.as_matrix()[:,1:]

In [4]:
nb_games_train, col = Data_X_train.shape
nb_games_test, _ = Data_X_test.shape
nb_features = int((col-1)/1440)

In [6]:
nb_freq = 200

X_FF = []
X_FF_test = []
features = ['total rebound', 'fied goals', 'total foul', 'three points','free throws', 'miss', 'score', 'assist', 'diff points',
            'offensive rebound', 'defensive rebound', 'offensive foul', 'defensive foul','lost ball', 'steals', 'bad pass', 'block']
for feat in features :
    filter_col = [col for col in Data_X_train if col.startswith(feat)]
    X_feat = Data_X_train[filter_col].as_matrix()
    X_feat_test =  Data_X_test[filter_col].as_matrix()
    X_feat_fft = np.fft.fft(X_feat)[:,:nb_freq]
    X_feat_fft_test = np.fft.fft(X_feat_test)[:,:nb_freq]
    X_FF.append(abs(X_feat_fft))
    X_FF_test.append(abs(X_feat_fft_test))
X_fft = np.hstack(X_FF)
X_fft_test = np.hstack(X_FF_test)

In [7]:
X_seq = X_train.reshape((nb_games_train, nb_features, 10, -1), order = 'F')

X_seq = X_seq.mean(axis = 2)
X_seq = X_seq.reshape(nb_games_train, nb_features*144)
X_tot = np.concatenate((X_seq, X_fft), axis = 1)

X_seq_test = X_test.reshape((nb_games_test, nb_features, 10, -1), order = 'F')
X_seq_test = X_seq_test.mean(axis = 2)
X_seq_test = X_seq_test.reshape(nb_games_test, nb_features*144)
X_tot_test = np.concatenate((X_seq_test, X_fft_test), axis = 1)

In [13]:
#Random forest
parameters = {'n_estimators': 200, 'max_depth': None, 'max_features': 15, 
               'min_samples_split': 15, 'min_samples_leaf': 2, 'bootstrap': True, 
               'oob_score': True, 'criterion': 'entropy'}

RandomForest = RandomForestClassifier(**parameters)
RandomForest.fit(X_tot, Y_train.ravel())

Y_pred_RandomForest = RandomForest.predict(X_tot_test)

In [15]:
ID_test = Data_X_test.iloc[:,0].as_matrix()
d = {'ID': ID_test, 'label': Y_pred_RandomForest}
Results_test = pd.DataFrame(data=d)

Results_test.to_csv('data/challenge_fichiers_de_sorties_de_test_challenge_nba/Fourier_RandomForest_2203.csv',sep=';', index=False)

# 19th model
**Wavelet with preprocessing - 73.19%**

In [None]:
from keras.utils import np_utils
from keras.models import Sequential
from keras.layers import Conv2D, BatchNormalization, Dropout, Flatten, Dense, LSTM
from keras import losses

In [None]:
Data_X_train = feature_engineering(Data_X_train)
Data_X_test = feature_engineering(Data_X_test)

X_train = Data_X_train.as_matrix()[:,1:]
X_test = Data_X_test.as_matrix()[:,1:]
Y_train = Data_Y_train.as_matrix()[:,1:]

In [None]:
nb_games_train, col = Data_X_train.shape
nb_games_test, _ = Data_X_test.shape
# Number of feature groups, assuming one leading non-feature column and 1440 columns per feature.
nb_features = int((col-1)/1440)

# Window length, in columns, of each wavelet transform.
# Original note: 4 min - assumes one column per second, TODO confirm.
tw = 240
# Upper bound on the number of approximation and of detail coefficients kept per window
# (create_wavelet_features slices both with [:, :nb_freq]).
# NOTE(review): the original comment said "50 first features", which does not match the value 100.
nb_freq = 100

In [None]:
def create_wavelet_features(wav_type, data_X_train, features, nb_freq, tw, nb_steps = 1440) :
    """Build windowed discrete-wavelet-transform features for the given frame.

    For every feature name, the columns of `data_X_train` whose name starts with
    that name are selected, cut into consecutive windows of `tw` columns, and each
    window is passed to `pywt.dwt`. The first `nb_freq` approximation coefficients
    and the first `nb_freq` detail coefficients of every window are kept and all
    pieces are stacked side by side.

    Parameters
    ----------
    wav_type : wavelet identifier forwarded to `pywt.dwt` (e.g. 'db1').
    data_X_train : pandas DataFrame to extract features from. Despite its name it
        can be any frame (train or test); columns are now selected from this frame
        and no longer from the global `Data_X_train`.
    features : iterable of column-name prefixes.
    nb_freq : maximum number of approximation / detail coefficients kept per window.
    tw : window length, in columns.
    nb_steps : number of columns per feature (defaults to the previously
        hard-coded 1440); `int(nb_steps / tw)` windows are produced.

    Returns
    -------
    2-D numpy array with one row per row of `data_X_train`.

    Notes
    -----
    `pywt` must already be in scope; no import of it is visible in this notebook.
    NOTE(review): presumably provided by `from src.utils import *` - confirm.
    """
    nb_windows = int(nb_steps / tw)
    X_DWT = []
    for feat in features :
        filter_col = [col for col in data_X_train if col.startswith(feat)]
        # Extract the matrix once per feature instead of once per window.
        # `.values` replaces `as_matrix()`, which was removed in pandas 1.0.
        X_feat_all = data_X_train[filter_col].values
        for i in range(nb_windows) :
            X_feat = X_feat_all[:, i*tw:(i+1)*tw]
            cA, cD = pywt.dwt(X_feat, wav_type)
            X_DWT.append(cA[:,:nb_freq])
            X_DWT.append(cD[:,:nb_freq])
    X_wt = np.hstack(X_DWT)
    return X_wt

In [None]:
features = ['total rebound', 'fied goals', 'total foul', 'three points','free throws', 'miss', 'score', 'assist', 'diff points',
            'offensive rebound', 'defensive rebound', 'offensive foul', 'defensive foul','lost ball', 'steals', 'bad pass', 'block']
X_wt = create_wavelet_features('db1', Data_X_train, features, nb_freq, tw)

In [None]:
# Time-domain part of the training matrix (groups of 10 averaged over axis 2), followed by
# the wavelet features. `X` and `nb_games` are not defined in this section; the matrices
# prepared above are `X_train` and `nb_games_train`.
X_seq = X_train.reshape((nb_games_train, nb_features, 10, -1), order = 'F')
X_seq = X_seq.mean(axis = 2)
X_seq = X_seq.reshape(nb_games_train, nb_features*144)
X_tot = np.concatenate((X_seq, X_wt), axis = 1)

In [None]:
#Random forest
parameters = {'n_estimators': 200, 'max_depth': None, 'max_features': 15, 
               'min_samples_split': 15, 'min_samples_leaf': 2, 'bootstrap': True, 
               'oob_score': True, 'criterion': 'entropy'}

RandomForest = RandomForestClassifier(**parameters)


In [None]:
# Build the test matrix exactly like the training matrix X_tot: aggregated time-domain
# values followed by the wavelet features of the test frame.
X_wt_test = create_wavelet_features('db1', Data_X_test, features, nb_freq, tw)
X_seq_test = X_test.reshape((nb_games_test, nb_features, 10, -1), order = 'F')
X_seq_test = X_seq_test.mean(axis = 2)
X_seq_test = X_seq_test.reshape(nb_games_test, nb_features*144)
X_tot_test = np.concatenate((X_seq_test, X_wt_test), axis = 1)

# Fit on the wavelet matrix of this section. `X_train_tot` / `X_test_tot` are not built
# here; they are left-overs of the meta-classifier section and hold no wavelet features.
RandomForest.fit(X_tot, Y_train.ravel())

Y_pred_RandomForest = RandomForest.predict(X_tot_test)

In [None]:
ID_test = Data_X_test.iloc[:,0].as_matrix()
d = {'ID': ID_test, 'label': Y_pred_RandomForest}
Results_test = pd.DataFrame(data=d)

Results_test.to_csv('data/challenge_fichiers_de_sorties_de_test_challenge_nba/Wavelet_RandomForest_2203.csv',sep=';', index=False)