In [1]:
import os, sys, time
import numpy as np
from tqdm import tqdm
import cPickle as pk
np.random.seed(1)  # fix NumPy's global seed so that runs are reproducible

Using TensorFlow backend.


In [4]:
# File locations
PK_FPATH = "data/processed_data_sidhid.pk"  # pickled train / validation arrays
MODEL_FPATH = "./models/1124_model_2embed_2conv1d_2FC.h5"  # best trained model

# Dataset constants
N_LABELS = 50  # number of label columns; one classifier is fitted per label
# NOTE(review): 58328 equals the 46663 training + 11665 validation rows reported
# further down, so this is presumably the total sample count -- confirm.
N_SIDHID = 58328

### Load data

In [5]:
# Load the pickled dataset and unpack the train / validation splits.
# NOTE: unpickling can execute arbitrary code -- only load files from a trusted source.
with open(PK_FPATH, 'rb') as pk_file:  # the context manager closes the handle even if loading fails
    pk_data = pk.load(pk_file)
X_train, Y_train = pk_data['X_train'], pk_data['Y_train']
X_val, Y_val = pk_data['X_val'], pk_data['Y_val']

### Load the best model: 2 embedding layers, 2 conv layers and 2 FC layers

In [8]:
# ***NOTE***
# The saved model was compiled with a custom metric function. For `load_model`
# to succeed, that function had to be added to keras' own metrics.py, located in:
# `/local/XW/SOFT/anaconda2/envs/thesis_nb/lib/python2.7/site-packages/keras/`
# Without that patch `load_model` throws an exception.
# cf issue: https://github.com/fchollet/keras/issues/3911
# NOTE(review): `load_model` also takes a `custom_objects` argument, which may
# avoid patching site-packages -- confirm it resolves custom metrics in the
# installed keras version before relying on it.
from keras.models import load_model
model = load_model(MODEL_FPATH)

In [10]:
# Print the layer-by-layer architecture of the loaded model.
model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
main_input (InputLayer)          (None, 1000)          0                                            
____________________________________________________________________________________________________
embedding_3 (Embedding)          (None, 1000, 200)     0           main_input[0][0]                 
____________________________________________________________________________________________________
embedding_4 (Embedding)          (None, 1000, 200)     0           main_input[0][0]                 
____________________________________________________________________________________________________
convolution1d_4 (Convolution1D)  (None, 996, 128)      128128      embedding_3[0][0]                
___________________________________________________________________________________________

## Extract embedding vectors and use an SVM to predict

In [11]:
# Inspect the two tensors that delimit the feature extractor built next:
# the network input (layer 0) and the output of layer 11, which the recorded
# output shows to be a 500-wide ReLU activation.
print model.layers[0].input
print model.layers[11].output

Tensor("main_input_1:0", shape=(?, 1000), dtype=int32)
Tensor("Relu_7:0", shape=(?, 500), dtype=float32)


In [14]:
# Build a backend function mapping an input batch to the activations of layer 11,
# i.e. a feature extractor cut out of the loaded model.
from keras import backend as K
get_embedvec = K.function(
    [model.layers[0].input, K.learning_phase()],
    [model.layers[11].output],
)

def embedvec(X):
    """Return the layer-11 activations for batch X, evaluated with learning phase 0 (test mode)."""
    return get_embedvec([X, 0])[0]

In [15]:
# output in test mode = 0
layer_output = embedvec(X_train[:10])
print layer_output.shape

(10, 500)


In [16]:
def to_embedvec(X, batch_size=128):
    """Embed every row of X by running `embedvec` over consecutive batches.

    Parameters
    ----------
    X : array-like exposing `.shape[0]` and row slicing
        Inputs to embed.
        NOTE(review): presumably laid out like `X_train` / `X_val` -- confirm.
    batch_size : int, optional
        Number of rows passed to `embedvec` per call. Defaults to 128, the
        value that used to be hard-coded inside the function.

    Returns
    -------
    numpy.ndarray
        The per-batch outputs of `embedvec`, stacked vertically in input order.

    Raises
    ------
    ValueError
        If `batch_size` is not positive, or (raised by `np.vstack`) if X has
        no rows and therefore yields no batches.
    """
    if batch_size <= 0:
        raise ValueError('batch_size must be a positive integer, got %r' % (batch_size,))
    n_rows = X.shape[0]
    embedded = []
    for start in tqdm(xrange(0, n_rows, batch_size)):
        # clamp the final (possibly shorter) batch to the end of X
        stop = min(start + batch_size, n_rows)
        embedded.append(embedvec(X[start:stop]))
    return np.vstack(embedded)

In [17]:
# Embed the whole training set batch by batch (the recorded run took about 17 minutes).
Xembed_train = to_embedvec(X_train)
print Xembed_train.shape

100%|██████████| 365/365 [16:46<00:00,  2.32s/it]

(46663, 500)





In [19]:
# Embed the validation set the same way (the recorded run took about 4 minutes).
Xembed_val = to_embedvec(X_val)
print Xembed_val.shape

100%|██████████| 92/92 [04:14<00:00,  2.08s/it]

(11665, 500)





In [20]:
def multilabel_evaluate(y_pred, y_true=None):
    """Print sample-averaged multilabel metrics for two indicator matrices.

    The last column of both matrices is dropped before scoring. For every
    remaining row the function counts the true positives (element-wise
    product), the number of true labels, the number of predicted labels and
    the size of their union, then prints the mean over rows of:

    * precision = tp / predicted
    * recall    = tp / true
    * F1        = 2 * tp / (true + predicted)
    * acc       = tp / union (intersection over union)

    A small epsilon is added to each denominator so rows with an empty
    denominator contribute 0 instead of dividing by zero.

    Parameters
    ----------
    y_pred : 2-D numpy array
        Predicted labels.
        NOTE(review): assumed to hold 0/1 values of shape (n_samples, n_labels) -- confirm.
    y_true : 2-D numpy array, optional
        Ground-truth labels with the same shape as `y_pred`. When omitted, the
        notebook-level `Y_val` is used. It is looked up at call time, so the
        function can be defined before `Y_val` exists and always sees its
        current value.

    Returns
    -------
    None
        The metrics are only printed.
    """
    if y_true is None:
        y_true = Y_val
    # score without the last label column
    y_pred, y_true = y_pred[:, :-1], y_true[:, :-1]
    eps = 1e-10  # guards against division by zero on empty rows

    tp = np.sum(y_true * y_pred, axis=-1)
    sum_true = np.sum(y_true, axis=-1)
    sum_pred = np.sum(y_pred, axis=-1)
    union = np.sum(np.clip(y_true + y_pred, 0, 1), axis=-1)

    def _report(label, value):
        # Single-argument print(...) emits "<label> <value>" identically under
        # the Python 2 print statement and the Python 3 print function.
        print('%s %s' % (label, value))

    _report('precision =', np.mean(tp / (sum_pred + eps)))
    _report('recall = ', np.mean(tp / (sum_true + eps)))
    _report('F1 = ', 2 * np.mean(tp / (sum_true + sum_pred + eps)))
    _report('acc = ', np.mean(tp / (union + eps)))

In [26]:
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV
parameters = {
              'alpha': [1, 1e-1, 3e-1, 1e-2, 3e-2, 1e-3, 3e-3, 1e-4, 3e-4],
              'n_iter': [10, 50, 200]}
clfs = []
for i in tqdm(xrange(N_LABELS)):
    sgd =  SGDClassifier(loss='hinge', penalty='l2', random_state=1, class_weight='balanced')
    clf = GridSearchCV(sgd, parameters, n_jobs=-1)
    clf.fit(Xembed_train, Y_train[:,i]) 
    clfs.append(clf)

preds = [clfs[i].predict(Xembed_val) for i in xrange(N_LABELS)]    
pred_svm = np.vstack(preds).T
print pred_svm.shape
print map(int, pred_svm.sum(axis=0))
multilabel_evaluate(y_pred=pred_svm, y_true=Y_val)

100%|██████████| 50/50 [1:59:34<00:00, 130.37s/it]


(11665, 50)
[8156, 3626, 4116, 3338, 5974, 7801, 9148, 2789, 5914, 7374, 2083, 3, 0, 6504, 5911, 3544, 7386, 3984, 2913, 2301, 2868, 2817, 2607, 21, 3413, 10611, 4317, 7362, 3841, 1923, 3673, 1922, 5737, 2849, 7128, 2815, 3792, 4, 1867, 1497, 2577, 2445, 4341, 5908, 1909, 2230, 3842, 3, 5706, 11513]
precision = 0.197625869081
recall =  0.660431292459
F1 =  0.290633777991
acc =  0.180832980669
