# Submitted Models

In [1]:
import random

import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression

# cross_val_score lives in sklearn.model_selection since scikit-learn 0.18, and the
# old sklearn.cross_validation module was removed in 0.20. Try the current location
# first and fall back so older environments keep working.
try:
    from sklearn.model_selection import cross_val_score
except ImportError:
    from sklearn.cross_validation import cross_val_score

from src.utils import preprocess
# Seed both generators: the stdlib `random` module and NumPy's global RNG keep
# separate state, so seeding only one of them leaves the other unseeded.
random.seed(7)
np.random.seed(7)



## Loading Data

In [2]:
# Read the three challenge CSVs: training inputs, training outputs
# (';'-separated) and test inputs.
Data_X_train = pd.read_csv(
    'data/challenge_fichier_dentrees_dentrainement_challenge_nba/train.csv'
)
Data_Y_train = pd.read_csv(
    'data/challenge_fichier_de_sortie_dentrainement_challenge_nba.csv',
    sep=';',
)
Data_X_test = pd.read_csv(
    'data/challenge_fichiers_dentrees_de_test_challenge_nba/test.csv'
)

**Data Preprocessing**

In [4]:
# Build the training arrays from the raw frames; the last two return values of
# preprocess() are discarded.
# NOTE(review): portion_train=1 presumably keeps every sample for training —
# confirm against src.utils.preprocess.
X_train, Y_train, _, _ = preprocess(Data_X_train, Data_Y_train, portion_train=1)

# DataFrame.as_matrix() was deprecated in pandas 0.23 and removed in 1.0; .values
# returns the same ndarray on every pandas version. Column 0 is dropped here and is
# reused further down as the submission ID.
# NOTE(review): the test features bypass preprocess(); confirm it applies no
# transformation that the test set would also need.
X_test = Data_X_test.values[:, 1:]

## First model: Logistic Regression

**Logistic Regression - 01/03 - Test score : 71.626%** <br>

In [8]:
# Fit a logistic regression with C = 1e-5 on the training arrays, then predict
# hard labels for the test matrix.
c = 1e-5
model_LogisticRegression = LogisticRegression(C=c).fit(X_train, Y_train)
Y_test = model_LogisticRegression.predict(X_test)

In [43]:
# Assemble the submission table: the first column of the test CSV as 'ID' next to
# the predicted 'label'.
# .values replaces as_matrix(), which was removed in pandas 1.0.
ID_test = Data_X_test.iloc[:, 0].values
d = {'ID': ID_test, 'label': Y_test}
Results_test = pd.DataFrame(data=d)

In [53]:
# Save the logistic-regression submission as a ';'-separated CSV without the index.
Results_test.to_csv(
    'data/challenge_fichiers_de_sorties_de_test_challenge_nba/LogisticRegression_0103.csv',
    sep=';',
    index=False,
)

## Second model: CNN

**CNN on raw images - 01/03 - Test score : 72.20%**

In [5]:
from keras.utils import np_utils
from keras.models import Sequential
from keras.layers import Conv2D, BatchNormalization, Dropout, Flatten, Dense, LSTM
from keras import losses

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [6]:
# Column-major reshape of every flat sample into an 11 x 1440 single-channel
# "image", and one-hot encoding of the two classes for the softmax output.
X_train_cnn = np.reshape(X_train, (len(X_train), 11, 1440, 1), order='F')
Y_train_cnn = np_utils.to_categorical(Y_train, 2)
X_test_cnn = np.reshape(X_test, (len(X_test), 11, 1440, 1), order='F')

In [9]:
# First CNN: batch-norm on axis 1, one convolution whose kernel spans all 11 rows,
# then a small dense head with dropout and a 2-way softmax.
conv_model = Sequential([
    BatchNormalization(axis=1, input_shape=(11, 1440, 1)),
    Conv2D(filters=16, kernel_size=(11, 10), activation='relu'),
    Dropout(0.75),
    Flatten(),
    Dense(units=50, activation='relu'),
    Dropout(0.5),
    Dense(units=2, activation='softmax'),
])

# Categorical cross-entropy on the one-hot labels, optimised with Adam.
conv_model.compile(
    loss=losses.categorical_crossentropy,
    optimizer='adam',
    metrics=['accuracy'],
)

# Train silently on the full training set, then score the test set
# (one row of class probabilities per sample).
conv_model.fit(
    X_train_cnn,
    Y_train_cnn,
    epochs=200,
    batch_size=32,
    verbose=False,
)
Y_test_cnn = conv_model.predict(X_test_cnn)

In [10]:
# Turn the softmax outputs into hard labels (index of the largest probability).
Y_test = np.argmax(Y_test_cnn, axis=1)

# Submission table: first column of the test CSV as 'ID' next to the 'label'.
# .values replaces as_matrix(), which was removed in pandas 1.0.
ID_test = Data_X_test.iloc[:, 0].values
d = {'ID': ID_test, 'label': Y_test}
Results_test = pd.DataFrame(data=d)

Results_test.to_csv('data/challenge_fichiers_de_sorties_de_test_challenge_nba/CNN_0103.csv', sep=';', index=False)

## Third model: LSTM
**4 main features - 05/03 - Test score: 68.81%**

In [None]:
# Re-run preprocessing with main_feature=True. This rebinds X_train and Y_train,
# so cells that need the earlier arrays must not run after this one.
X_train, Y_train, _, _ = preprocess(
    Data_X_train,
    Data_Y_train,
    portion_train=1,
    main_feature=True,
)

# Column-major reshape to (samples, 4, 10, -1), then average over the axis of
# length 10.
# NOTE(review): presumably this averages each of the 4 features over windows of
# 10 consecutive time steps — confirm against the column layout of X_train.
X_train_seq_10 = X_train.reshape((len(X_train), 4, 10, -1), order='F').mean(axis=2)

In [27]:
# NOTE(review): extract_main_features is not imported in any visible cell (only
# preprocess is imported from src.utils) — confirm where it comes from; as written
# this raises NameError on a fresh kernel.
# This also rebinds X_test, replacing the raw test matrix built earlier.
X_test = extract_main_features(Data_X_test)

In [28]:
# Same transformation as for the training features: column-major reshape to
# (samples, 4, 10, -1), then the mean over the axis of length 10.
X_test_seq_10 = X_test.reshape((len(X_test), 4, 10, -1), order='F').mean(axis=2)

In [12]:
# One-hot encode the labels for the 2-way softmax output. The guard makes the cell
# idempotent: re-running it (or running it after Y_train was already encoded) must
# not encode an (n, 2) one-hot array a second time.
if not (np.ndim(Y_train) == 2 and np.shape(Y_train)[1] == 2):
    Y_train = np_utils.to_categorical(Y_train, 2)

In [14]:
# LSTM classifier: one recurrent layer with dropout, a small dense head and a
# 2-way softmax.
# NOTE(review): Keras reads input_shape as (timesteps, features), so this model
# sees 4 steps of 144 values each — confirm that orientation is intended.
lstm_model = Sequential([
    LSTM(75, input_shape=(4, 144), dropout=0.5),
    Dense(units=10, activation='relu'),
    Dropout(0.5),
    Dense(units=2, activation='softmax'),
])

# Loss and optimiser
lstm_model.compile(
    loss=losses.categorical_crossentropy,
    optimizer='adam',
    metrics=['accuracy'],
)

# Train silently and keep the object returned by fit() in `training`
training = lstm_model.fit(
    X_train_seq_10,
    Y_train,
    epochs=200,
    batch_size=32,
    verbose=False,
)

# Persist the trained network
lstm_model.save('models/lstm_seq_10_4_main_features.h5')

In [31]:
# Predict class probabilities and keep the index of the most likely class.
Y_test = np.argmax(lstm_model.predict(X_test_seq_10), axis=1)

# Submission table: first column of the test CSV as 'ID' next to the 'label'.
# .values replaces as_matrix(), which was removed in pandas 1.0.
ID_test = Data_X_test.iloc[:, 0].values
d = {'ID': ID_test, 'label': Y_test}
Results_test = pd.DataFrame(data=d)

Results_test.to_csv('data/challenge_fichiers_de_sorties_de_test_challenge_nba/LSTM_0503.csv', sep=';', index=False)

## Fourth model: CNN
**CNN on raw images - 06/03 - Test score: 71.38%**

In [6]:
# The LSTM section rebinds X_train, Y_train and X_test (main-feature arrays and
# one-hot labels), so a top-to-bottom run would reach this cell with arrays that
# cannot be reshaped to 11 x 1440. Rebuild the raw inputs here, exactly as the
# "Data Preprocessing" cell does, under names no other cell overwrites.
X_train_raw, Y_train_raw, _, _ = preprocess(Data_X_train, Data_Y_train, portion_train=1)
X_test_raw = Data_X_test.values[:, 1:]  # drop the ID column

# Column-major reshape of every flat sample into an 11 x 1440 single-channel
# "image", and one-hot encoding of the two classes for the softmax output.
X_train_cnn = X_train_raw.reshape((len(X_train_raw), 11, 1440, 1), order='F')
Y_train_cnn = np_utils.to_categorical(Y_train_raw, 2)
X_test_cnn = X_test_raw.reshape((len(X_test_raw), 11, 1440, 1), order='F')

In [7]:
# One-hot encode the labels for the 2-way softmax output. The guard makes the cell
# idempotent: if Y_train is already an (n, 2) one-hot array (this cell re-run, or an
# earlier section already encoded it), encoding again would yield a wrong shape.
if not (np.ndim(Y_train) == 2 and np.shape(Y_train)[1] == 2):
    Y_train = np_utils.to_categorical(Y_train, 2)

In [9]:
# Second CNN: same layout as the first one, but with 32 filters, a stride of 2
# along the 1440-long axis and a lighter dropout after the convolution.
conv_model = Sequential()
conv_model.add(BatchNormalization(axis=1,
                                  input_shape=(11, 1440, 1)))
conv_model.add(Conv2D(filters=32,
                      kernel_size=(11, 10),  # kernel spans all 11 rows
                      activation='relu',
                      strides=(1, 2)))
conv_model.add(Dropout(0.5))
conv_model.add(Flatten())
conv_model.add(Dense(units=50,
                     activation='relu'))
conv_model.add(Dropout(0.5))
conv_model.add(Dense(units=2,
                     activation='softmax'))

# Loss and optimiser
conv_model.compile(loss=losses.categorical_crossentropy,
                   optimizer='adam',
                   metrics=['accuracy'])

# Train on the labels prepared together with X_train_cnn (Y_train_cnn), as the
# first CNN does, instead of the global Y_train that other sections rebind.
training = conv_model.fit(X_train_cnn, Y_train_cnn, epochs=200, batch_size=32, verbose=False)

# Persist the trained network
conv_model.save('models/cnn_7.h5')

In [12]:
# Predict class probabilities and keep the index of the most likely class.
Y_test = np.argmax(conv_model.predict(X_test_cnn), axis=1)

# Submission table: first column of the test CSV as 'ID' next to the 'label'.
# .values replaces as_matrix(), which was removed in pandas 1.0.
ID_test = Data_X_test.iloc[:, 0].values
d = {'ID': ID_test, 'label': Y_test}
Results_test = pd.DataFrame(data=d)

Results_test.to_csv('data/challenge_fichiers_de_sorties_de_test_challenge_nba/CNN_0603.csv', sep=';', index=False)