In [1]:
%matplotlib inline

import pandas as pd
import numpy as np
from sklearn import utils
from sklearn.preprocessing import StandardScaler

import matplotlib.pyplot as plt

In [2]:
def load_train(subject, list_series):
    data = []
    for se in list_series:
        file_name = 'data/train/subj' + str(subject) + "_series" + str(se)
        
        eeg = pd.read_csv(file_name + '_data.csv')
        eeg.drop("id", axis = 1, inplace=True)

        evt = pd.read_csv(file_name + '_events.csv')
        evt.drop("id", axis = 1, inplace=True)
        
        data.append( pd.concat( [eeg, evt], axis=1 ) )
        
    all_data = pd.concat( data, ignore_index = True ).values  
    return all_data[:, :-6].astype(float), all_data[:, -6:].astype(bool)
    

In [3]:
def target_idx( y, target = None ):
    if(target is None) : return target_idx(y, np.zeros(6) )
    
    assert isinstance(target, (int, list, tuple, np.ndarray ) ), 'wrong type for state'
    
    if type(target) == int:
        return np.argwhere( y[:, target] == 1 )[:,0]
    elif isinstance(target, (list, tuple, np.ndarray) ):
        return np.argwhere( ( y == np.array( target ) ).all( axis = 1 )  )[:,0]

In [4]:
def get_windows(X, indices, win_size, sampling):
    spl_size = sampling*(win_size//sampling)
    
    X_win = np.zeros( ( len(indices), win_size//sampling,) + X.shape[1:]  )
    for i, end in enumerate(indices):
        X_win[i] = X[ end - spl_size + 1 : end+1 : sampling]
        
    return X_win

In [5]:
def batch_generator(X, y, win_size, sampling, batch_size):
    
    categs_idx = []
    for c in xrange(7):
        if c == 6 : c = None
        
        categ_idx = target_idx(y, c) 
        #remove indices < winsize
        categ_idx = categ_idx[categ_idx >= win_size-1]
        
        categs_idx.append( categ_idx ) 
    
    #Total batch size
    size = sum( batch_size )
    
    while True:
        y_batch = np.zeros( (size,) + y.shape[1:]  )
        X_batch = np.zeros( (size, win_size//sampling,) + X.shape[1:]  )
        
        batch_start = 0
        for i, categ_idx in enumerate(categs_idx):
            categ_idx_sample = np.random.choice( categ_idx, batch_size[i], replace = False )
            y_sample = y[ categ_idx_sample ]
            X_sample = get_windows(X, categ_idx_sample, win_size, sampling)

            batch_end = batch_start + batch_size[i]
            y_batch[batch_start : batch_end] = y_sample
            X_batch[batch_start : batch_end] = X_sample
            batch_start = batch_end
            
        yield X_batch, y_batch

In [6]:
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Activation
from keras.layers import Dropout
from keras.layers.convolutional import Convolution1D
from keras.layers.convolutional import AtrousConvolution2D
from keras.layers.pooling import MaxPooling2D

# (uncalibrated) 0.87145 !
def create_bilayer(shape):
    nn = Sequential()

    #Dense output
    nn.add( Flatten( input_shape=shape ) )
    nn.add( Dense(output_dim = 128 ) )
    nn.add( Dense(output_dim = 6 ) )
    nn.add( Activation('sigmoid') )
    
    #Compile
    nn.compile(loss='binary_crossentropy', optimizer='adam')
    
    return nn

#only one output layer scores (uncalibrated) 0.84894 !
def create_dense(shape):
    nn = Sequential()

    #Dense output
    nn.add( Flatten( input_shape=shape ) )
    nn.add( Dense(output_dim = 6 ) )
    nn.add( Activation('sigmoid') )
    
    #Compile
    nn.compile(loss='binary_crossentropy', optimizer='adam')
    
    return nn

Using Theano backend.


In [7]:
from keras.callbacks import Callback
from sklearn.metrics import roc_auc_score

class roc_auc_callback(Callback):
    def __init__(self, X_test, y_test):
        self.X_test = X_test
        self.y_test = y_test
        
    def on_epoch_end(self, epoch, logs={} ):
        pred = self.model.predict( self.X_test, batch_size=64 )
        logs['val_roc_auc'] = roc_auc_score( self.y_test , pred, average = 'micro' )
        print '\n -', 'val roc auc : ', logs['val_roc_auc']

In [8]:
from keras.callbacks import EarlyStopping
early_stopping = EarlyStopping(monitor='val_roc_auc', min_delta=0, patience=10, verbose=True, mode='max')

def train(subject, w, subsampling, batch_size):
    X_train, y_train = load_train(subject, xrange(1,8) )
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    
    X_test, y_test = load_train(subject, [8] )
    X_test = scaler.transform(X_test)
    X_test, y_test = batch_generator(X_test, y_test, w, subsampling, batch_size ).next()

    model = create_conv( (w//subsampling, 32) )
    gen = batch_generator(X_train, y_train, w, subsampling, batch_size )
    fit = model.fit_generator(generator = gen, samples_per_epoch = 50*sum(batch_size), nb_epoch = 50,
                              validation_data=(X_test, y_test), verbose=True,
                              callbacks=[roc_auc_callback(X_test, y_test), early_stopping ] )
    
    return scaler, model

In [9]:
def test_batch_generator(X, win_size, sampling, batch_size):
    start = win_size - 1

    while True:
        end = min( start+batch_size, len(X) )
        X_batch = get_windows(X, range(start, end), win_size, sampling)
        start = end
        
        yield X_batch

In [10]:
columns_names = ['HandStart', 'FirstDigitTouch', 'BothStartLoadPhase', 'LiftOff', 'Replace', 'BothReleased']

def predict(subject, serie, win_size, subsampling, scaler, model):
    folder = 'train'
    if serie>8 : folder = 'test'
    file_name = 'data/'+ folder +'/subj' + str(subject) + "_series" + str(serie)  
    
    eeg = pd.read_csv(file_name + '_data.csv')
    ids = eeg.loc[:, 'id']
    eeg.drop("id", axis = 1, inplace=True)
    
    X_test = eeg.values.astype(float)
    X_test = scaler.transform(X_test)
    
    pred = np.zeros( (X_test.shape[0], 6) )
    
    gen = test_batch_generator( X_test, w, subsampling, 64)
    pred[win_size-1:] = model.predict_generator( gen, val_samples = X_test.shape[0] - win_size + 1)

    result = pd.DataFrame( pred, columns = columns_names )
    result.insert(0, 'id', ids)
    
    return result

In [11]:
w = 500
subsampling = 16

batch_size = 20 * np.array( [1, 1, 1, 1, 1, 1, 1] )

In [12]:
models = []
for subject in xrange(1,13):
    print 'Training for subject:', subject
    scaler, model = train(subject, w, subsampling, batch_size)
    models.append( {'scaler': scaler, 'model': model} )

Training for subject: 1


NameError: global name 'create_conv' is not defined

In [None]:
def make_predictions( series ):
    preds = []
    for subject in xrange(1,13):
        print 'Predicting for subject:', subject
        for serie in series:
            print '\t Predicting for serie:', serie
            pred = predict(subject, serie, w, subsampling, models[subject-1]['scaler'], models[subject-1]['model'])
            preds.append(pred)
    
    submission = pd.concat( preds, ignore_index = True )
    
    str_series = ''
    for serie in series: str_series += '_' + str(serie)
    submission.to_csv('predictions'+ str_series +'.csv', index = False)

In [None]:
make_predictions([8])

In [None]:
make_predictions([9, 10])