## Load data

In [1]:
import os, sys, time
import cPickle as pk

import numpy as np
from tqdm import tqdm

np.random.seed(1) # to be reproducible

from keras.layers import Dense, Input, Flatten, Dropout, Merge
from keras.layers import merge # `Merge` is for model, while `merge` is for tensor.
from keras.layers import Conv1D, MaxPooling1D, Embedding
from keras.models import Sequential, load_model, Model
from keras.callbacks import Callback, EarlyStopping, TensorBoard
import keras.backend as K

Using TensorFlow backend.


In [2]:
# paths
# Pickle file holding the preprocessed arrays and embedding matrices read by the loading cell.
PK_FPATH = './data/processed_data_sidhid.pk'
# Directory where compile_fit_evaluate saves trained models as `<mmdd>_<model name>.h5`.
MODEL_PATH = './models/'
# Parent directory of the per-model TensorBoard log folders.
LOG_PATH = './logs/'
# constants
# Width of the final sigmoid layer of every model (one unit per label).
N_LABELS = 50
# NOTE(review): not referenced in the code visible here. The value equals the combined
# train + validation row count printed by the loading cell (46663 + 11665), so it is
# presumably the total number of samples - TODO confirm.
N_SIDHID = 58328
# `output_dim` of every Embedding layer (size of one word vector).
EMBEDDING_DIM = 200
# learning configurations
# NOTE(review): not referenced in the code visible here; the validation set is loaded
# ready-made from the pickle. Presumably the ratio used when that split was made - TODO confirm.
VALIDATION_SPLIT = 0.2
# Upper bound on training epochs (`nb_epoch`); an EarlyStopping callback may stop sooner.
N_EPOCHS = 20
# Mini-batch size used for the full training run.
SZ_BATCH = 512

In [3]:
# Load the preprocessed dataset.
# NOTE(review): unpickling can execute arbitrary code - only load files from a trusted source.
# The `with` block closes the file handle as soon as the data has been read.
with open(PK_FPATH, 'rb') as pk_file:
    pk_data = pk.load(pk_file)
# print pk_data.keys()  # uncomment to inspect the entries stored in the pickle

# Embedding matrices, used as initial weights of the Embedding layers.
embedding_w2v = pk_data['embedding_w2v']
embedding_glove = pk_data['embedding_glove']

# Model inputs (X) and multi-label targets (Y) for the training and validation sets.
X_train, Y_train = pk_data['X_train'], pk_data['Y_train']
X_val, Y_val = pk_data['X_val'], pk_data['Y_val']

# Sequence length fed to the models, and number of rows of the embedding matrix.
INPUT_SEQ_LEN = X_train.shape[1]
EMBEDDING_INPUT_DIM = embedding_w2v.shape[0]

# Formatted explicitly so the output is the same under Python 2 and Python 3.
print('%s %s' % (X_train.shape, Y_train.shape))
print('%s %s' % (X_val.shape, Y_val.shape))

(46663, 1000) (46663, 50)
(11665, 1000) (11665, 50)


### Modify sample weight, and use larger batch size

In [4]:
# Per-label weight: 1e6 * (number of positive training samples) ** -1.5,
# so that rare labels weigh more than frequent ones.
label_counts = Y_train.sum(axis=0)
# A label without any positive sample would give 0 ** -1.5 = inf, and inf * 0 = nan
# would then turn *every* sample weight into NaN. Such labels get weight 0 instead
# (they never occur in a row, so they cannot contribute anyway).
has_positives = label_counts > 0
safe_counts = np.where(has_positives, label_counts, 1)
inv_freq = np.where(has_positives, 1e6 * safe_counts ** (-1.5), 0.0)

# Weight of a sample = sum of the weights of the labels it carries.
sample_weight = (inv_freq * Y_train).sum(axis=1)
print(sample_weight.shape)
sample_weight

(46663,)


array([ 12.01673109,  17.46144513,   0.1048649 , ...,  15.49525083,
        14.95808575,  43.77665432])

## Define evaluation metrics

In [5]:
    def multlabel_prec(y_true, y_pred):
        """Sample-averaged precision of a multi-label prediction.

        The last label column is left out of the computation. Predictions are
        clipped to [0, 1] and rounded to hard 0/1 decisions.

        Args:
            y_true: 2-D backend tensor of 0/1 targets, one row per sample.
            y_pred: 2-D backend tensor of predicted scores, same layout.

        Returns:
            Scalar tensor: mean over samples of (true positives / predicted
            positives). A sample without any predicted positive counts as 0.
        """
        y_true, y_pred = y_true[:, :-1], y_pred[:, :-1]  # test without last column considered
        y_pred = K.round(K.clip(y_pred, 0, 1))  # turn to 0/1
        tp = K.sum(y_true * y_pred, axis=-1)
        sum_pred = K.sum(y_pred, axis=-1)
        return K.mean(tp / (sum_pred + 1e-10))  # epsilon avoids a NaN precision

    def multlabel_recall(y_true, y_pred):
        """Sample-averaged recall of a multi-label prediction.

        The last label column is left out of the computation. Predictions are
        clipped to [0, 1] and rounded to hard 0/1 decisions.

        Args:
            y_true: 2-D backend tensor of 0/1 targets, one row per sample.
            y_pred: 2-D backend tensor of predicted scores, same layout.

        Returns:
            Scalar tensor: mean over samples of (true positives / actual
            positives). A sample without any actual positive counts as 0.
        """
        y_true, y_pred = y_true[:, :-1], y_pred[:, :-1]  # test without last column considered
        y_pred = K.round(K.clip(y_pred, 0, 1))  # turn to 0/1
        tp = K.sum(y_true * y_pred, axis=-1)
        sum_true = K.sum(y_true, axis=-1)
        return K.mean(tp / (sum_true + 1e-10))  # epsilon avoids a NaN recall

    def multlabel_F1(y_true, y_pred):
        """Sample-averaged F1 score of a multi-label prediction.

        Ignores the last label column and thresholds the predictions by
        clipping them to [0, 1] and rounding. Per sample the score is
        2 * tp / (actual positives + predicted positives); the mean over
        samples is returned as a scalar tensor.
        """
        truth = y_true[:, :-1]  # last column is not evaluated
        hard_pred = K.round(K.clip(y_pred[:, :-1], 0, 1))  # scores -> 0/1
        hits = K.sum(truth * hard_pred, axis=-1)
        n_true = K.sum(truth, axis=-1)
        n_pred = K.sum(hard_pred, axis=-1)
        # epsilon keeps the ratio finite when a row has no positives at all
        return 2 * K.mean(hits / (n_true + n_pred + 1e-10))

    def multlabel_acc(y_true, y_pred):
        """Sample-averaged multi-label accuracy (intersection over union).

        Ignores the last label column and thresholds the predictions by
        clipping them to [0, 1] and rounding. Per sample the score is the
        number of labels that are both actual and predicted, divided by the
        number of labels that are actual or predicted; the mean over samples
        is returned as a scalar tensor.
        """
        truth = y_true[:, :-1]  # last column is not evaluated
        hard_pred = K.round(K.clip(y_pred[:, :-1], 0, 1))  # scores -> 0/1
        n_common = K.sum(truth * hard_pred, axis=-1)
        # clipping the sum to [0, 1] marks labels present in either set
        n_either = K.sum(K.clip(truth + hard_pred, 0, 1), axis=-1)
        return K.mean(n_common / (n_either + 1e-10))

In [6]:
def evaluate_model(model):
    """Print what `model.evaluate` returns for the training and validation sets.

    Reads the notebook-level arrays X_train/Y_train and X_val/Y_val and
    evaluates with a batch size of 128. Returns None.
    """
    datasets = (
        ('evaluation on training set:', X_train, Y_train),
        ('evaluation on validation set:', X_val, Y_val),
    )
    for header, inputs, targets in datasets:
        print(header)
        print(model.evaluate(inputs, targets, batch_size=128))

In [7]:
# wraps up operations on models
def compile_fit_evaluate(model, quick_test=False, print_summary=True,
                         save_log=True, save_model=True, del_model=False):
    """Compile `model`, train it, print its evaluation and optionally save it.

    The model is compiled with binary cross-entropy, RMSprop and the four
    `multlabel_*` metrics, then trained on the notebook-level X_train/Y_train
    with X_val/Y_val as validation data, `sample_weight` as per-sample weights
    and early stopping on `val_loss` (patience 2).

    Args:
        model: Keras model to compile and train (modified in place).
        quick_test: if true, fit a single epoch on the first 100 training
            samples and return right away - no callbacks, sample weights,
            evaluation or saving.
        print_summary: if true, print the layer summary after compiling.
        save_log: if true, write TensorBoard logs to
            LOG_PATH/<mmdd>_<model.name> (created when missing).
        save_model: if true, save the trained model to
            MODEL_PATH/<mmdd>_<model.name>.h5 (MODEL_PATH is created when
            missing).
        del_model: if true, drop this function's own reference to the model
            at the end. NOTE: this only unbinds the local name; the memory is
            released only once the caller holds no reference either.

    Returns:
        None.
    """
    model.compile(loss='binary_crossentropy',
                  optimizer='rmsprop',
                  metrics=[multlabel_prec, multlabel_recall, multlabel_F1, multlabel_acc])
    if print_summary:
        # summary() prints the table itself and returns None, so wrapping it
        # in `print` would add a stray "None" line to the output.
        model.summary()

    if quick_test: # use tiny data for quick testing
        print('(quick test mode)')
        model.fit(X_train[:100], Y_train[:100], nb_epoch=1)
        return

    _callbacks = [EarlyStopping(monitor='val_loss', patience=2)]
    if save_log:
        logdir = os.path.join(LOG_PATH, time.strftime('%m%d') + '_' + str(model.name))
        if not os.path.exists(logdir):
            os.makedirs(logdir)
        _callbacks.append(TensorBoard(log_dir=logdir))
        print('run "tensorboard --logdir=%s" to launch tensorboard' % logdir)

    # Make sure the output directory exists *before* the long training run, so
    # that a missing or unwritable MODEL_PATH cannot discard the trained model.
    if save_model and not os.path.exists(MODEL_PATH):
        os.makedirs(MODEL_PATH)

    model.fit(X_train, Y_train,
              validation_data=(X_val, Y_val),
              nb_epoch=N_EPOCHS, batch_size=SZ_BATCH,
              sample_weight=sample_weight,
              callbacks=_callbacks)

    print('evaluating model...')
    evaluate_model(model)

    if save_model:
        model_fpath = os.path.join(MODEL_PATH, time.strftime('%m%d') + '_%s.h5' % str(model.name))
        model.save(model_fpath)

    if del_model:
        del model # only unbinds the local name, see the docstring

In [8]:
# ***NOTE***
# To load a saved model from file, the custom `multlabel_XXX` metric functions
# must first be added to the `metrics.py` of the installed Keras package at:
# `/local/XW/SOFT/anaconda2/envs/thesis_nb/lib/python2.7/site-packages/keras`
# Otherwise `load_model` throws an exception.
#
# See issue: https://github.com/fchollet/keras/issues/3911
#
# m = load_model(os.path.sep.join([MODEL_PATH, 'model_1conv1d.h5']))

## Best model: 2 conv layers and 2 FC 

In [9]:
# Passed as `quick_test` to compile_fit_evaluate by the model cells below: a truthy
# value runs the one-epoch, 100-sample smoke test; 0 runs the full training.
flag_quick_test = 0

In [10]:
# Two frozen (non-trainable) embedding tables over the same token ids:
# one initialised from word2vec, the other from GloVe.
embed1_w2v = Embedding(input_dim=EMBEDDING_INPUT_DIM, output_dim=EMBEDDING_DIM,
                       weights=[embedding_w2v], input_length=INPUT_SEQ_LEN, trainable=False)
embed2_glove = Embedding(input_dim=EMBEDDING_INPUT_DIM, output_dim=EMBEDDING_DIM,
                         weights=[embedding_glove], input_length=INPUT_SEQ_LEN, trainable=False)

input_layer = Input(shape=(INPUT_SEQ_LEN,), dtype='int32', name='main_input')

# Branch 1: word2vec embedding followed by its own convolution.
embed1 = embed1_w2v(input_layer)
conv_embed1 = Conv1D(128, 5, activation='relu')(embed1)

# Branch 2: GloVe embedding followed by its own convolution.
embed2 = embed2_glove(input_layer)
conv_embed2 = Conv1D(128, 5, activation='relu')(embed2)

# Element-wise sum of the two branches. `merge` (lower case, imported with the
# other Keras layers in the import cell) works on tensors; `Merge` is for models.
merge_layer = merge([conv_embed1, conv_embed2], mode='sum')

# Shared trunk: pooling, a second convolution, then two fully connected layers.
x = MaxPooling1D(5)(merge_layer)
x = Conv1D(128, 5, activation='relu')(x)
x = MaxPooling1D(5)(x)
x = Flatten()(x)
x = Dropout(p=0.5)(x)
x = Dense(500, activation='relu')(x)
x = Dropout(p=0.5)(x)
output_layer = Dense(N_LABELS, activation='sigmoid')(x)  # one sigmoid unit per label

model_2embed_2conv1d_2FC = Model(input=input_layer, output=output_layer,
                                 name='model_2embed_2conv1d_2FC')

compile_fit_evaluate(model_2embed_2conv1d_2FC, flag_quick_test)

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
main_input (InputLayer)          (None, 1000)          0                                            
____________________________________________________________________________________________________
embedding_1 (Embedding)          (None, 1000, 200)     0           main_input[0][0]                 
____________________________________________________________________________________________________
embedding_2 (Embedding)          (None, 1000, 200)     0           main_input[0][0]                 
____________________________________________________________________________________________________
convolution1d_1 (Convolution1D)  (None, 996, 128)      128128      embedding_1[0][0]                
___________________________________________________________________________________________

In [11]:
# word2vec variant: frozen embedding -> 2 x (Conv1D + MaxPooling1D) -> 2 dense layers.
# Layers are created and added one by one, in the same order as the stack is applied.
model_2conv1d_2FC = Sequential(name='model_2conv1d_2FC')
model_2conv1d_2FC.add(Embedding(input_dim=EMBEDDING_INPUT_DIM,
                                output_dim=EMBEDDING_DIM,
                                weights=[embedding_w2v],
                                input_length=INPUT_SEQ_LEN,
                                trainable=False))
# convolutional feature extractor
model_2conv1d_2FC.add(Conv1D(128, 5, activation='relu'))
model_2conv1d_2FC.add(MaxPooling1D(5))
model_2conv1d_2FC.add(Conv1D(128, 5, activation='relu'))
model_2conv1d_2FC.add(MaxPooling1D(5))
# classifier head with dropout before each dense layer
model_2conv1d_2FC.add(Flatten())
model_2conv1d_2FC.add(Dropout(p=0.5))
model_2conv1d_2FC.add(Dense(500, activation='relu'))
model_2conv1d_2FC.add(Dropout(p=0.5))
model_2conv1d_2FC.add(Dense(N_LABELS, activation='sigmoid'))

compile_fit_evaluate(model_2conv1d_2FC, flag_quick_test)

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
embedding_3 (Embedding)          (None, 1000, 200)     0           embedding_input_1[0][0]          
____________________________________________________________________________________________________
convolution1d_4 (Convolution1D)  (None, 996, 128)      128128      embedding_3[0][0]                
____________________________________________________________________________________________________
maxpooling1d_3 (MaxPooling1D)    (None, 199, 128)      0           convolution1d_4[0][0]            
____________________________________________________________________________________________________
convolution1d_5 (Convolution1D)  (None, 195, 128)      82048       maxpooling1d_3[0][0]             
___________________________________________________________________________________________

In [12]:
# GloVe variant: frozen embedding -> 2 x (Conv1D + MaxPooling1D) -> 2 dense layers.
# Layers are created and added one by one, in the same order as the stack is applied.
model_2conv1d_2FC_glove = Sequential(name='model_2conv1d_2FC_glove')
model_2conv1d_2FC_glove.add(Embedding(input_dim=EMBEDDING_INPUT_DIM,
                                      output_dim=EMBEDDING_DIM,
                                      weights=[embedding_glove],
                                      input_length=INPUT_SEQ_LEN,
                                      trainable=False))
# convolutional feature extractor
model_2conv1d_2FC_glove.add(Conv1D(128, 5, activation='relu'))
model_2conv1d_2FC_glove.add(MaxPooling1D(5))
model_2conv1d_2FC_glove.add(Conv1D(128, 5, activation='relu'))
model_2conv1d_2FC_glove.add(MaxPooling1D(5))
# classifier head with dropout before each dense layer
model_2conv1d_2FC_glove.add(Flatten())
model_2conv1d_2FC_glove.add(Dropout(p=0.5))
model_2conv1d_2FC_glove.add(Dense(500, activation='relu'))
model_2conv1d_2FC_glove.add(Dropout(p=0.5))
model_2conv1d_2FC_glove.add(Dense(N_LABELS, activation='sigmoid'))

compile_fit_evaluate(model_2conv1d_2FC_glove, flag_quick_test)

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
embedding_4 (Embedding)          (None, 1000, 200)     0           embedding_input_2[0][0]          
____________________________________________________________________________________________________
convolution1d_6 (Convolution1D)  (None, 996, 128)      128128      embedding_4[0][0]                
____________________________________________________________________________________________________
maxpooling1d_5 (MaxPooling1D)    (None, 199, 128)      0           convolution1d_6[0][0]            
____________________________________________________________________________________________________
convolution1d_7 (Convolution1D)  (None, 195, 128)      82048       maxpooling1d_5[0][0]             
___________________________________________________________________________________________

In [13]:
# Deeper variant: frozen word2vec embedding -> 3 x (Conv1D + MaxPooling1D) with
# shrinking filter counts -> dropout -> sigmoid output (no hidden dense layer).
# Layers are created and added one by one, in the same order as the stack is applied.
model_3conv1d_dropout = Sequential(name='model_3conv1d_dropout')
model_3conv1d_dropout.add(Embedding(input_dim=EMBEDDING_INPUT_DIM,
                                    output_dim=EMBEDDING_DIM,
                                    weights=[embedding_w2v],
                                    input_length=INPUT_SEQ_LEN,
                                    trainable=False))
# convolutional feature extractor
model_3conv1d_dropout.add(Conv1D(256, 5, activation='relu'))
model_3conv1d_dropout.add(MaxPooling1D(5))
model_3conv1d_dropout.add(Conv1D(128, 5, activation='relu'))
model_3conv1d_dropout.add(MaxPooling1D(5))
model_3conv1d_dropout.add(Conv1D(64, 2, activation='relu'))
model_3conv1d_dropout.add(MaxPooling1D(5))
# classifier head
model_3conv1d_dropout.add(Flatten())
model_3conv1d_dropout.add(Dropout(p=0.5))
model_3conv1d_dropout.add(Dense(N_LABELS, activation='sigmoid'))

compile_fit_evaluate(model_3conv1d_dropout, flag_quick_test)

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
embedding_5 (Embedding)          (None, 1000, 200)     0           embedding_input_3[0][0]          
____________________________________________________________________________________________________
convolution1d_8 (Convolution1D)  (None, 996, 256)      256256      embedding_5[0][0]                
____________________________________________________________________________________________________
maxpooling1d_7 (MaxPooling1D)    (None, 199, 256)      0           convolution1d_8[0][0]            
____________________________________________________________________________________________________
convolution1d_9 (Convolution1D)  (None, 195, 128)      163968      maxpooling1d_7[0][0]             
___________________________________________________________________________________________