In [23]:
import os
import sys

# `sys` has no chdir(); the working-directory call lives in `os`.
# Move up one level so the relative `data/...` paths used below resolve.
# NOTE: the path is relative to the current directory, so re-running this
# cell moves up one more level each time.
os.chdir('../')

import numpy as np
import pandas as pd
import random
import time 

from keras.utils import np_utils
from keras.models import Sequential
from keras.layers import Conv2D, MaxPooling2D, BatchNormalization, Dropout, Flatten, Dense
from keras.optimizers import SGD
from keras import losses
from keras import regularizers
from keras.constraints import max_norm
from keras.wrappers.scikit_learn import KerasClassifier

from sklearn.model_selection import StratifiedKFold

# Seed both global RNGs. random.seed() only covers Python's `random` module;
# NumPy's global generator (used e.g. for batch shuffling in Keras 2.x) is
# separate and must be seeded explicitly.
# NOTE(review): the Keras backend (e.g. TensorFlow) has its own seed that is
# not set here, so runs may still differ slightly — confirm if exact
# reproducibility is required.
random.seed(7)
np.random.seed(7)

%matplotlib inline
import matplotlib.pyplot as plt

from src.utils import preprocess

# Load Data

In [4]:
# Training inputs (default comma separator) and training labels
# (semicolon-separated), read relative to the current working directory.
X_train_pd = pd.read_csv(
    'data/challenge_fichier_dentrees_dentrainement_challenge_nba/train.csv'
)
Y_train_pd = pd.read_csv(
    'data/challenge_fichier_de_sortie_dentrainement_challenge_nba.csv',
    sep=';',
)

In [5]:
# Turn the raw frames into model inputs X and labels Y. `preprocess` comes from
# src.utils and is not shown in this notebook; the meaning of its third
# argument (1) and of the two discarded return values is not visible here —
# TODO confirm against src/utils.
X, Y, _, _ = preprocess(X_train_pd, Y_train_pd, 1)

In [30]:
# Reshape to (n_samples, 11, 1440, 1), the input_shape the models below declare.
# order='F' fills the first axis fastest, so X[i, a, b, 0] receives the value
# that was at flat column a + 11*b of row i — assuming X is a 2-D
# (n_samples, 15840) array at this point; TODO confirm against preprocess.
X = X.reshape((len(X), 11, 1440, 1), order = 'F')

# Defining the k-fold cross-validation and evaluation functions

In [22]:
def train_and_evaluate(model, X_train, Y_train, X_val, Y_val, epochs, batch_size, verbose):
    """Fit ``model`` on the training split and score it on the validation split.

    Parameters
    ----------
    model : compiled Keras-style model exposing ``fit`` and ``evaluate``; it
        must have been compiled with an accuracy metric as its first metric.
    X_train, Y_train : training inputs and targets passed to ``model.fit``.
    X_val, Y_val : validation inputs and targets passed to ``model.evaluate``.
    epochs, batch_size : forwarded to ``model.fit``.
    verbose : verbosity flag forwarded to both ``fit`` and ``evaluate``.

    Returns
    -------
    (train_acc, val_acc) : the accuracy recorded in the fit history for the
        last epoch, and element 1 of ``model.evaluate``'s result (the first
        compiled metric, element 0 being the loss).
    """
    training = model.fit(X_train, Y_train, epochs = epochs, batch_size = batch_size, verbose = verbose)
    # Forward `verbose` so a silent run does not still print evaluation progress.
    evalu = model.evaluate(X_val, Y_val, verbose = verbose)

    # Older Keras records the metric under 'acc', newer releases under
    # 'accuracy'; accept either instead of raising KeyError.
    history = training.history
    acc_key = 'acc' if 'acc' in history else 'accuracy'

    return history[acc_key][-1], evalu[1]

In [25]:
# 5-fold stratified splitter; shuffling with a fixed random_state keeps the
# fold assignment identical from run to run.
skf = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=7,
)

In [46]:
def kfold_print(skf_splitter, X, Y, create_model_func, epochs, batch_size, verbose):
    """Cross-validate a freshly built model on every fold and print accuracies.

    Parameters
    ----------
    skf_splitter : object whose ``split(X, Y)`` yields
        ``(train_indices, val_indices)`` pairs (e.g. a ``StratifiedKFold``).
    X, Y : arrays indexable by the index arrays the splitter yields. The
        labels in ``Y`` are one-hot encoded into 2 classes per fold.
    create_model_func : zero-argument callable returning a new compiled model;
        it is called once per fold, so each fold trains its own model.
    epochs, batch_size, verbose : passed through to ``train_and_evaluate``.

    Prints one "(Training, Validation)" line per fold, followed by the mean
    and standard deviation (in percent) across folds. Returns None.
    """
    accs_train = []
    accs_val = []
    # Iterate over the splitter that was passed in; the original looped over
    # the notebook-level `skf` and silently ignored this argument.
    for train, val in skf_splitter.split(X, Y):
        model = create_model_func()
        Y_train = np_utils.to_categorical(Y[train], 2)
        Y_val = np_utils.to_categorical(Y[val], 2)
        acc_train, acc_val = train_and_evaluate(model,
                                                X[train], Y_train, X[val], Y_val,
                                                epochs = epochs, batch_size = batch_size, verbose = verbose)

        accs_train.append(acc_train)
        accs_val.append(acc_val)
        print('(Training, Validation) accuracies: ({0:.2f},{1:.2f})'.format(100*acc_train, 100*acc_val))

    print('Mean Training Accuracy: {0:.2f} +/- {1:.2f}'.format(100*np.mean(accs_train), 100*np.std(accs_train)))
    print('Mean Validation Accuracy: {0:.2f} +/- {1:.2f}'.format(100*np.mean(accs_val), 100*np.std(accs_val)))

# Train CNN with k-fold cross-validation

In [40]:
def create_model():
    """Build and compile the baseline CNN (no hidden dense layer).

    Stack: BatchNormalization(axis=1) -> Conv2D(16 filters, 11x15 kernel, ReLU)
    -> Dropout(0.75) -> Flatten -> Dense(2, softmax).
    Compiled with categorical cross-entropy, the Adam optimizer and accuracy.
    """
    stack = [
        BatchNormalization(axis=1, input_shape=(11, 1440, 1)),
        Conv2D(filters=16, kernel_size=(11, 15), activation='relu'),
        Dropout(0.75),
        Flatten(),
        Dense(units=2, activation='softmax'),
    ]
    net = Sequential(stack)
    net.compile(loss=losses.categorical_crossentropy,
                optimizer='adam',
                metrics=['accuracy'])
    return net

kfold_print(skf, X, Y, create_model, epochs=200, batch_size=32, verbose=False)

(Training, Validation) accuracies: (75.12,70.79)
(Training, Validation) accuracies: (73.69,69.00)
(Training, Validation) accuracies: (74.62,71.69)
(Training, Validation) accuracies: (74.04,69.70)
(Training, Validation) accuracies: (74.24,72.55)
Mean Training Accuracy: 74.34 +/- 0.49
Mean Validation Accuracy: 70.75 +/- 1.29


In [42]:
def create_model():
    """Build and compile a CNN with an 11x10 kernel and stride (1, 10).

    Stack: BatchNormalization(axis=1) -> Conv2D(16 filters, 11x10 kernel,
    strides (1, 10), ReLU) -> Dropout(0.75) -> Flatten -> Dense(50, ReLU)
    -> Dropout(0.5) -> Dense(2, softmax).
    Compiled with categorical cross-entropy, the Adam optimizer and accuracy.
    """
    stack = [
        BatchNormalization(axis=1, input_shape=(11, 1440, 1)),
        Conv2D(filters=16, kernel_size=(11, 10), strides=(1, 10), activation='relu'),
        Dropout(0.75),
        Flatten(),
        Dense(units=50, activation='relu'),
        Dropout(0.5),
        Dense(units=2, activation='softmax'),
    ]
    net = Sequential(stack)
    net.compile(loss=losses.categorical_crossentropy,
                optimizer='adam',
                metrics=['accuracy'])
    return net

kfold_print(skf, X, Y, create_model, epochs=200, batch_size=32, verbose=False)

(Training, Validation) accuracies: (73.62,70.75)
(Training, Validation) accuracies: (74.04,71.03)
(Training, Validation) accuracies: (73.49,72.92)
(Training, Validation) accuracies: (73.61,72.01)
(Training, Validation) accuracies: (73.47,72.00)
Mean Training Accuracy: 73.65 +/- 0.20
Mean Validation Accuracy: 71.74 +/- 0.78


In [48]:
def create_model():
    """Build and compile a CNN with an 11x10 kernel and stride (1, 5).

    Stack: BatchNormalization(axis=1) -> Conv2D(16 filters, 11x10 kernel,
    strides (1, 5), ReLU) -> Dropout(0.75) -> Flatten -> Dense(50, ReLU)
    -> Dropout(0.5) -> Dense(2, softmax).
    Compiled with categorical cross-entropy, the Adam optimizer and accuracy.
    """
    stack = [
        BatchNormalization(axis=1, input_shape=(11, 1440, 1)),
        Conv2D(filters=16, kernel_size=(11, 10), strides=(1, 5), activation='relu'),
        Dropout(0.75),
        Flatten(),
        Dense(units=50, activation='relu'),
        Dropout(0.5),
        Dense(units=2, activation='softmax'),
    ]
    net = Sequential(stack)
    net.compile(loss=losses.categorical_crossentropy,
                optimizer='adam',
                metrics=['accuracy'])
    return net

kfold_print(skf, X, Y, create_model, epochs=200, batch_size=32, verbose=False)

(Training, Validation) accuracies: (74.66,71.50)
(Training, Validation) accuracies: (75.49,70.95)
(Training, Validation) accuracies: (75.07,71.81)
(Training, Validation) accuracies: (74.52,72.41)
(Training, Validation) accuracies: (74.20,72.35)
Mean Training Accuracy: 74.79 +/- 0.45
Mean Validation Accuracy: 71.80 +/- 0.55


In [49]:
def create_model():
    """Build and compile a CNN with an 11x15 kernel and stride (1, 8).

    Stack: BatchNormalization(axis=1) -> Conv2D(16 filters, 11x15 kernel,
    strides (1, 8), ReLU) -> Dropout(0.75) -> Flatten -> Dense(50, ReLU)
    -> Dropout(0.5) -> Dense(2, softmax).
    Compiled with categorical cross-entropy, the Adam optimizer and accuracy.
    """
    stack = [
        BatchNormalization(axis=1, input_shape=(11, 1440, 1)),
        Conv2D(filters=16, kernel_size=(11, 15), strides=(1, 8), activation='relu'),
        Dropout(0.75),
        Flatten(),
        Dense(units=50, activation='relu'),
        Dropout(0.5),
        Dense(units=2, activation='softmax'),
    ]
    net = Sequential(stack)
    net.compile(loss=losses.categorical_crossentropy,
                optimizer='adam',
                metrics=['accuracy'])
    return net

kfold_print(skf, X, Y, create_model, epochs=200, batch_size=32, verbose=False)

(Training, Validation) accuracies: (73.56,70.99)
(Training, Validation) accuracies: (74.89,70.47)
(Training, Validation) accuracies: (73.89,71.57)
(Training, Validation) accuracies: (74.27,71.21)
(Training, Validation) accuracies: (72.95,73.31)
Mean Training Accuracy: 73.91 +/- 0.65
Mean Validation Accuracy: 71.51 +/- 0.97


In [50]:
def create_model():
    """Build and compile a CNN with an 11x20 kernel and stride (1, 5).

    Stack: BatchNormalization(axis=1) -> Conv2D(16 filters, 11x20 kernel,
    strides (1, 5), ReLU) -> Dropout(0.75) -> Flatten -> Dense(50, ReLU)
    -> Dropout(0.5) -> Dense(2, softmax).
    Compiled with categorical cross-entropy, the Adam optimizer and accuracy.
    """
    stack = [
        BatchNormalization(axis=1, input_shape=(11, 1440, 1)),
        Conv2D(filters=16, kernel_size=(11, 20), strides=(1, 5), activation='relu'),
        Dropout(0.75),
        Flatten(),
        Dense(units=50, activation='relu'),
        Dropout(0.5),
        Dense(units=2, activation='softmax'),
    ]
    net = Sequential(stack)
    net.compile(loss=losses.categorical_crossentropy,
                optimizer='adam',
                metrics=['accuracy'])
    return net

kfold_print(skf, X, Y, create_model, epochs=200, batch_size=32, verbose=False)

(Training, Validation) accuracies: (75.54,71.07)
(Training, Validation) accuracies: (75.42,70.43)
(Training, Validation) accuracies: (76.24,71.45)
(Training, Validation) accuracies: (74.56,71.33)
(Training, Validation) accuracies: (75.87,71.60)
Mean Training Accuracy: 75.52 +/- 0.56
Mean Validation Accuracy: 71.18 +/- 0.41


In [51]:
def create_model():
    """Build and compile a CNN with an 11x20 kernel and stride (1, 10).

    Stack: BatchNormalization(axis=1) -> Conv2D(16 filters, 11x20 kernel,
    strides (1, 10), ReLU) -> Dropout(0.75) -> Flatten -> Dense(50, ReLU)
    -> Dropout(0.5) -> Dense(2, softmax).
    Compiled with categorical cross-entropy, the Adam optimizer and accuracy.
    """
    stack = [
        BatchNormalization(axis=1, input_shape=(11, 1440, 1)),
        Conv2D(filters=16, kernel_size=(11, 20), strides=(1, 10), activation='relu'),
        Dropout(0.75),
        Flatten(),
        Dense(units=50, activation='relu'),
        Dropout(0.5),
        Dense(units=2, activation='softmax'),
    ]
    net = Sequential(stack)
    net.compile(loss=losses.categorical_crossentropy,
                optimizer='adam',
                metrics=['accuracy'])
    return net

kfold_print(skf, X, Y, create_model, epochs=200, batch_size=32, verbose=False)

(Training, Validation) accuracies: (73.82,70.67)
(Training, Validation) accuracies: (73.81,70.23)
(Training, Validation) accuracies: (74.11,71.73)
(Training, Validation) accuracies: (73.43,72.25)
(Training, Validation) accuracies: (73.76,71.96)
Mean Training Accuracy: 73.79 +/- 0.21
Mean Validation Accuracy: 71.37 +/- 0.78


In [45]:
def create_model():
    """Build and compile a wider CNN: 32 filters, 11x10 kernel, stride (1, 2).

    Stack: BatchNormalization(axis=1) -> Conv2D(32 filters, 11x10 kernel,
    strides (1, 2), ReLU) -> Dropout(0.75) -> Flatten -> Dense(50, ReLU)
    -> Dropout(0.5) -> Dense(2, softmax).
    Compiled with categorical cross-entropy, the Adam optimizer and accuracy.
    """
    stack = [
        BatchNormalization(axis=1, input_shape=(11, 1440, 1)),
        Conv2D(filters=32, kernel_size=(11, 10), strides=(1, 2), activation='relu'),
        Dropout(0.75),
        Flatten(),
        Dense(units=50, activation='relu'),
        Dropout(0.5),
        Dense(units=2, activation='softmax'),
    ]
    net = Sequential(stack)
    net.compile(loss=losses.categorical_crossentropy,
                optimizer='adam',
                metrics=['accuracy'])
    return net

# This run trains for 250 epochs, unlike the 200 used by the other cells.
kfold_print(skf, X, Y, create_model, epochs=250, batch_size=32, verbose=False)

(Training, Validation) accuracies: (78.82,69.52)
(Training, Validation) accuracies: (79.17,69.32)
(Training, Validation) accuracies: (78.49,72.68)
(Training, Validation) accuracies: (78.49,72.13)
(Training, Validation) accuracies: (78.78,70.45)
Mean Training Accuracy: 78.75 +/- 0.25
Mean Validation Accuracy: 70.82 +/- 1.36


In [52]:
def create_model():
    """Build and compile a CNN with two hidden dense layers (50 then 10 units).

    Stack: BatchNormalization(axis=1) -> Conv2D(16 filters, 11x10 kernel,
    strides (1, 5), ReLU) -> Dropout(0.75) -> Flatten -> Dense(50, ReLU)
    -> Dropout(0.5) -> Dense(10, ReLU) -> Dropout(0.5) -> Dense(2, softmax).
    Compiled with categorical cross-entropy, the Adam optimizer and accuracy.
    """
    stack = [
        BatchNormalization(axis=1, input_shape=(11, 1440, 1)),
        Conv2D(filters=16, kernel_size=(11, 10), strides=(1, 5), activation='relu'),
        Dropout(0.75),
        Flatten(),
        Dense(units=50, activation='relu'),
        Dropout(0.5),
        Dense(units=10, activation='relu'),
        Dropout(0.5),
        Dense(units=2, activation='softmax'),
    ]
    net = Sequential(stack)
    net.compile(loss=losses.categorical_crossentropy,
                optimizer='adam',
                metrics=['accuracy'])
    return net

kfold_print(skf, X, Y, create_model, epochs=200, batch_size=32, verbose=False)

(Training, Validation) accuracies: (72.32,69.99)
(Training, Validation) accuracies: (73.45,70.63)
(Training, Validation) accuracies: (73.07,72.33)
(Training, Validation) accuracies: (72.33,71.73)
(Training, Validation) accuracies: (72.17,72.12)
Mean Training Accuracy: 72.67 +/- 0.50
Mean Validation Accuracy: 71.36 +/- 0.90


In [53]:
def create_model():
    """Build and compile a CNN with a 75-unit hidden dense layer.

    Stack: BatchNormalization(axis=1) -> Conv2D(16 filters, 11x10 kernel,
    strides (1, 5), ReLU) -> Dropout(0.75) -> Flatten -> Dense(75, ReLU)
    -> Dropout(0.5) -> Dense(2, softmax).
    Compiled with categorical cross-entropy, the Adam optimizer and accuracy.
    """
    stack = [
        BatchNormalization(axis=1, input_shape=(11, 1440, 1)),
        Conv2D(filters=16, kernel_size=(11, 10), strides=(1, 5), activation='relu'),
        Dropout(0.75),
        Flatten(),
        Dense(units=75, activation='relu'),
        Dropout(0.5),
        Dense(units=2, activation='softmax'),
    ]
    net = Sequential(stack)
    net.compile(loss=losses.categorical_crossentropy,
                optimizer='adam',
                metrics=['accuracy'])
    return net

kfold_print(skf, X, Y, create_model, epochs=200, batch_size=32, verbose=False)

(Training, Validation) accuracies: (76.10,70.47)
(Training, Validation) accuracies: (76.30,70.59)
(Training, Validation) accuracies: (76.06,71.89)
(Training, Validation) accuracies: (75.97,70.97)
(Training, Validation) accuracies: (75.24,72.35)
Mean Training Accuracy: 75.93 +/- 0.36
Mean Validation Accuracy: 71.25 +/- 0.74


In [55]:
def create_model():
    """Build and compile a CNN with a 25-unit hidden dense layer.

    Stack: BatchNormalization(axis=1) -> Conv2D(16 filters, 11x10 kernel,
    strides (1, 5), ReLU) -> Dropout(0.75) -> Flatten -> Dense(25, ReLU)
    -> Dropout(0.5) -> Dense(2, softmax).
    Compiled with categorical cross-entropy, the Adam optimizer and accuracy.
    """
    stack = [
        BatchNormalization(axis=1, input_shape=(11, 1440, 1)),
        Conv2D(filters=16, kernel_size=(11, 10), strides=(1, 5), activation='relu'),
        Dropout(0.75),
        Flatten(),
        Dense(units=25, activation='relu'),
        Dropout(0.5),
        Dense(units=2, activation='softmax'),
    ]
    net = Sequential(stack)
    net.compile(loss=losses.categorical_crossentropy,
                optimizer='adam',
                metrics=['accuracy'])
    return net

kfold_print(skf, X, Y, create_model, epochs=200, batch_size=32, verbose=False)

(Training, Validation) accuracies: (73.98,71.14)
(Training, Validation) accuracies: (74.27,70.35)
(Training, Validation) accuracies: (73.95,72.33)
(Training, Validation) accuracies: (74.34,71.33)
(Training, Validation) accuracies: (73.40,72.08)
Mean Training Accuracy: 73.99 +/- 0.33
Mean Validation Accuracy: 71.45 +/- 0.70


# Submitted models: verifying that this cross-validation is a reasonable proxy for estimating test accuracy

In [47]:
# 72% on test
def create_model():
    """Build and compile the CNN with an 11x10 kernel and default strides.

    Stack: BatchNormalization(axis=1) -> Conv2D(16 filters, 11x10 kernel, ReLU)
    -> Dropout(0.75) -> Flatten -> Dense(50, ReLU) -> Dropout(0.5)
    -> Dense(2, softmax).
    Compiled with categorical cross-entropy, the Adam optimizer and accuracy.
    """
    stack = [
        BatchNormalization(axis=1, input_shape=(11, 1440, 1)),
        Conv2D(filters=16, kernel_size=(11, 10), activation='relu'),
        Dropout(0.75),
        Flatten(),
        Dense(units=50, activation='relu'),
        Dropout(0.5),
        Dense(units=2, activation='softmax'),
    ]
    net = Sequential(stack)
    net.compile(loss=losses.categorical_crossentropy,
                optimizer='adam',
                metrics=['accuracy'])
    return net

kfold_print(skf, X, Y, create_model, epochs=200, batch_size=32, verbose=False)

(Training, Validation) accuracies: (81.35,70.75)
(Training, Validation) accuracies: (81.10,70.15)
(Training, Validation) accuracies: (78.31,71.57)
(Training, Validation) accuracies: (80.51,71.57)
(Training, Validation) accuracies: (78.42,70.84)
Mean Training Accuracy: 79.94 +/- 1.31
Mean Validation Accuracy: 70.98 +/- 0.54


In [54]:
# 71.38% on test
def create_model():
    """Build and compile the 32-filter CNN with 0.5 dropout after the conv layer.

    Stack: BatchNormalization(axis=1) -> Conv2D(32 filters, 11x10 kernel,
    strides (1, 2), ReLU) -> Dropout(0.5) -> Flatten -> Dense(50, ReLU)
    -> Dropout(0.5) -> Dense(2, softmax).
    Compiled with categorical cross-entropy, the Adam optimizer and accuracy.
    """
    stack = [
        BatchNormalization(axis=1, input_shape=(11, 1440, 1)),
        Conv2D(filters=32, kernel_size=(11, 10), strides=(1, 2), activation='relu'),
        Dropout(0.5),
        Flatten(),
        Dense(units=50, activation='relu'),
        Dropout(0.5),
        Dense(units=2, activation='softmax'),
    ]
    net = Sequential(stack)
    net.compile(loss=losses.categorical_crossentropy,
                optimizer='adam',
                metrics=['accuracy'])
    return net

kfold_print(skf, X, Y, create_model, epochs=200, batch_size=32, verbose=False)

(Training, Validation) accuracies: (81.98,70.75)
(Training, Validation) accuracies: (79.49,69.79)
(Training, Validation) accuracies: (82.34,72.49)
(Training, Validation) accuracies: (82.42,70.54)
(Training, Validation) accuracies: (79.68,71.76)
Mean Training Accuracy: 81.18 +/- 1.31
Mean Validation Accuracy: 71.06 +/- 0.95
