In [1]:
import json
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import SimpleRNN, GRU, LSTM
from keras.layers.core import Dense, Dropout
from keras.layers.wrappers import TimeDistributed
from keras.layers import Convolution1D, MaxPooling1D
import itertools
import ast
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
import numpy as np
import os

import progressbar
from metrics.accuracy import conlleval

import tensorflow as tf
# Restrict CUDA to a single GPU: devices are ordered by PCI bus id and only
# device "0" is made visible to this process.
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"   # see issue #152
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

Using TensorFlow backend.


In [2]:
def read_data(rootdir):
    """Load (tokens, labels) pairs from every file found under ``rootdir``.

    Each file is read two lines at a time: the first line of a pair is the
    Python literal for a token sequence, the second the literal for its label
    sequence. Pairs are kept only when the token sequence has more than two
    elements. Pairs that cannot be parsed are skipped (best effort), as is a
    dangling final line without a partner.

    Args:
        rootdir: directory that is walked recursively with ``os.walk``.

    Returns:
        A tuple ``(text_l, label_l)`` of two parallel lists.
    """
    # izip_longest (Python 2) was renamed zip_longest in Python 3.
    zip_longest = getattr(itertools, 'zip_longest', None) or itertools.izip_longest

    text_l = []
    label_l = []
    for subdir, dirs, files in os.walk(rootdir):
        for fname in files:
            filepath = os.path.join(subdir, fname)
            # The with-block closes the file; no explicit close() is needed.
            with open(filepath) as f:
                # Passing the same iterator twice yields consecutive line pairs.
                for line1, line2 in zip_longest(f, f):
                    if line2 is None:
                        # Odd number of lines: last text line has no labels.
                        continue
                    try:
                        text = ast.literal_eval(line1)
                        label = ast.literal_eval(line2)
                        keep = len(text) > 2
                    except (ValueError, SyntaxError, TypeError):
                        # Malformed literal or unsized value: skip this pair
                        # without hiding unrelated errors (e.g. Ctrl-C).
                        continue
                    if keep:
                        text_l.append(text)
                        label_l.append(label)
    return text_l, label_l

In [3]:
# Load the raw corpus: parallel lists of token sequences and label sequences.
X_text, y_text = read_data('../data/')

print("{} {}".format(len(X_text), len(y_text)))

# --- Vocabulary: map every distinct token to an integer id ---
text = list(set(itertools.chain(*X_text)))

word_le = LabelEncoder()
word_le.fit(text)

w2idx = dict(zip(word_le.classes_, word_le.transform(word_le.classes_)))
idx2w = {idx: word for word, idx in w2idx.items()}

X = [word_le.transform(sentence) for sentence in X_text]

# --- Label set: map every distinct label to an integer id ---
label = list(set(itertools.chain(*y_text)))

# `le` stays bound to the *label* encoder because later cells read le.classes_.
le = LabelEncoder()
le.fit(label)

labels2idx = dict(zip(le.classes_, le.transform(le.classes_)))
idx2la = {idx: tag for tag, idx in labels2idx.items()}

y = [le.transform(tags) for tags in y_text]

print(idx2la)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

# Decode the splits back to strings for reporting. The loop variables are
# deliberately not named `y`: on Python 2 a list-comprehension variable leaks
# into the enclosing scope and would overwrite the encoded label list above.
words_val = [[idx2w[i] for i in sent_ids] for sent_ids in X_test]
groundtruth_val = [[idx2la[i] for i in tag_ids] for tag_ids in y_test]  # y values test
words_train = [[idx2w[i] for i in sent_ids] for sent_ids in X_train]
groundtruth_train = [[idx2la[i] for i in tag_ids] for tag_ids in y_train]  # y values train

n_classes = len(idx2la)
n_vocab = len(idx2w)

5426 5426
{0: 'GO:0000267', 1: 'GO:0000502', 2: 'GO:0000775', 3: 'GO:0000785', 4: 'GO:0000786', 5: 'GO:0000791', 6: 'GO:0000792', 7: 'GO:0000795', 8: 'GO:0000805', 9: 'GO:0000806', 10: 'GO:0000811', 11: 'GO:0001669', 12: 'GO:0001750', 13: 'GO:0001917', 14: 'GO:0005575', 15: 'GO:0005576', 16: 'GO:0005577', 17: 'GO:0005581', 18: 'GO:0005585', 19: 'GO:0005610', 20: 'GO:0005622', 21: 'GO:0005623', 22: 'GO:0005634', 23: 'GO:0005643', 24: 'GO:0005654', 25: 'GO:0005656', 26: 'GO:0005657', 27: 'GO:0005675', 28: 'GO:0005694', 29: 'GO:0005712', 30: 'GO:0005730', 31: 'GO:0005737', 32: 'GO:0005739', 33: 'GO:0005764', 34: 'GO:0005768', 35: 'GO:0005773', 36: 'GO:0005776', 37: 'GO:0005777', 38: 'GO:0005783', 39: 'GO:0005792', 40: 'GO:0005813', 41: 'GO:0005819', 42: 'GO:0005829', 43: 'GO:0005833', 44: 'GO:0005835', 45: 'GO:0005840', 46: 'GO:0005856', 47: 'GO:0005871', 48: 'GO:0005874', 49: 'GO:0005883', 50: 'GO:0005886', 51: 'GO:0005901', 52: 'GO:0005902', 53: 'GO:0005912', 54: 'GO:0005929', 55: 'GO:0

In [4]:
# Show the label vocabulary known to the label encoder `le`.
print(le.classes_)

['GO:0000267' 'GO:0000502' 'GO:0000775' 'GO:0000785' 'GO:0000786'
 'GO:0000791' 'GO:0000792' 'GO:0000795' 'GO:0000805' 'GO:0000806'
 'GO:0000811' 'GO:0001669' 'GO:0001750' 'GO:0001917' 'GO:0005575'
 'GO:0005576' 'GO:0005577' 'GO:0005581' 'GO:0005585' 'GO:0005610'
 'GO:0005622' 'GO:0005623' 'GO:0005634' 'GO:0005643' 'GO:0005654'
 'GO:0005656' 'GO:0005657' 'GO:0005675' 'GO:0005694' 'GO:0005712'
 'GO:0005730' 'GO:0005737' 'GO:0005739' 'GO:0005764' 'GO:0005768'
 'GO:0005773' 'GO:0005776' 'GO:0005777' 'GO:0005783' 'GO:0005792'
 'GO:0005813' 'GO:0005819' 'GO:0005829' 'GO:0005833' 'GO:0005835'
 'GO:0005840' 'GO:0005856' 'GO:0005871' 'GO:0005874' 'GO:0005883'
 'GO:0005886' 'GO:0005901' 'GO:0005902' 'GO:0005912' 'GO:0005929'
 'GO:0005966' 'GO:0008091' 'GO:0008305' 'GO:0009986' 'GO:0010369'
 'GO:0014069' 'GO:0016020' 'GO:0016021' 'GO:0016028' 'GO:0016234'
 'GO:0016459' 'GO:0016528' 'GO:0017086' 'GO:0019814' 'GO:0030016'
 'GO:0030054' 'GO:0030056' 'GO:0030286' 'GO:0030424' 'GO:0030425'
 'GO:00308

In [5]:
# Show one training example in both decoded (string) and encoded (id) form.
print("Example sentence : {}".format(words_train[0]))
print("Encoded form: {}".format(X_train[0]))
print("")  # blank separator line; a bare `print` is a no-op on Python 3
print("It's label : {}".format(groundtruth_train[0]))
print("Encoded form: {}".format(y_train[0]))

Example sentence : ['Total', 'RNA', 'was', 'extracted', 'from', 'the', 'entire', 'small', 'intestine', 'as', 'previously', 'described', '', 'Expression', 'of', 'the', 'major', 'intestinal', 'mucin', 'Muc2', 'was', 'also', 'measured', 'using', 'the', 'forward', '5aposGAC', 'TTC', 'GAT', 'GGA', 'CAC', 'TGC', 'TC3apos', 'and', 'reverse', '5aposCAC', 'GGT', 'GTT', 'TAT', 'CTA', 'CCA', 'AC3apos', 'primers']
Encoded form: [10996  9386 23920 15902 16311 23060 15569 22251 17663 12628 20680 14746
     0  5372 19504 23060 18344 17661 18968  8007 23920 12108 18480 23728
 23060 16264  1597 10741  5803  5861  3656 10646 10611 12218 21605  1584
  5871  5922 10597  3832  3691  2323 20695]

It's label : ['NA', 'NA', 'NA', 'NA', 'NA', 'NA', 'NA', 'NA', 'NA', 'NA', 'NA', 'NA', 'NA', 'NA', 'NA', 'NA', 'NA', 'NA', 'NA', 'NA', 'NA', 'NA', 'NA', 'NA', 'NA', 'NA', 'NA', 'NA', 'NA', 'NA', 'NA', 'NA', 'NA', 'NA', 'NA', 'NA', 'NA', 'NA', 'NA', 'NA', 'NA', 'NA', 'NA']
Encoded form: [103 103 103 103 103 103 103 1

In [6]:
print('Build model...')
# Per-token tagger: embedding -> 1D convolution -> GRU -> per-timestep softmax.
model = Sequential()
model.add(Embedding(n_vocab, 100))
# `padding` is the Keras 2 name of the deprecated `border_mode` argument;
# 'same' keeps the sequence length unchanged so each token still gets a tag.
model.add(Convolution1D(64, 5, padding='same', activation='relu'))
model.add(Dropout(0.25))
model.add(GRU(100, return_sequences=True))
model.add(TimeDistributed(Dense(n_classes, activation='softmax')))
model.compile(optimizer='rmsprop', loss='categorical_crossentropy')

Build model...


  after removing the cwd from sys.path.


In [None]:
# model.fit needs rectangular arrays, and categorical_crossentropy needs
# one-hot targets, but X_train / y_train are lists of variable-length id
# arrays. Pad every sentence to the longest one and one-hot encode the labels.
# NOTE(review): padded positions use word id 0 and the 'NA' label, so they
# contribute to the loss like ordinary 'NA' tokens - confirm this is acceptable.
max_len = max(len(sent_ids) for sent_ids in X_train)
X_train_pad = sequence.pad_sequences(X_train, maxlen=max_len, padding='post')
y_train_pad = sequence.pad_sequences(y_train, maxlen=max_len, padding='post',
                                     value=labels2idx['NA'])
y_train_onehot = np.eye(n_classes, dtype='float32')[y_train_pad]

# `epochs` is the Keras 2 name of the deprecated `nb_epoch` argument.
history = model.fit(X_train_pad, y_train_onehot, validation_split=0.0, epochs=100,
                    batch_size=100, verbose=1)

In [None]:
### Training
n_epochs = 50

# The conlleval-based precision/recall/F1 tracking and best-model
# checkpointing are disabled in this notebook; these three names are kept
# for when that is re-enabled.
train_f_scores = []
val_f_scores = []
best_val_f1 = 0

# Row i is the one-hot vector of class i; built once rather than per sentence.
one_hot = np.eye(n_classes)


def run_epoch(sentences, tag_ids, train):
    """Run the global `model` over every sentence, one sentence per batch.

    Args:
        sentences: sequence of 1-D arrays of word ids.
        tag_ids: sequence of 1-D arrays of label ids, parallel to `sentences`.
        train: if True update the weights (train_on_batch), otherwise only
            evaluate the loss (test_on_batch).

    Returns:
        (predictions, avg_loss): one array of predicted label ids per
        sentence, and the mean loss over the sentences that were scored
        (0.0 when none were).
    """
    step = model.train_on_batch if train else model.test_on_batch
    predictions = []
    total_loss = 0.0
    n_scored = 0

    bar = progressbar.ProgressBar(maxval=len(sentences))
    for idx, sent in bar(enumerate(sentences)):
        target = one_hot[tag_ids[idx]][np.newaxis, :]
        sent = sent[np.newaxis, :]

        if sent.shape[1] > 1:  # some bug in keras
            total_loss += step(sent, target)
            n_scored += 1

        pred = model.predict_on_batch(sent)
        predictions.append(np.argmax(pred, -1)[0])

    # Average over the batches that actually produced a loss. The previous
    # divisor was the last loop index, which is off by one and is zero for a
    # single sentence.
    avg_loss = total_loss / n_scored if n_scored else 0.0
    return predictions, avg_loss


for i in range(n_epochs):
    print("Epoch {}".format(i))

    print("Training =>")
    train_pred_label, avgLoss = run_epoch(X_train, y_train, train=True)
    predword_train = [[idx2la[t] for t in tags] for tags in train_pred_label]
    print("Loss = {}".format(avgLoss))

    print("Validating =>")
    val_pred_label, avgLoss = run_epoch(X_test, y_test, train=False)
    predword_val = [[idx2la[t] for t in tags] for tags in val_pred_label]
    print("Loss = {}".format(avgLoss))

# Per-label report on the validation predictions of the final epoch.
print(classification_report(list(itertools.chain.from_iterable(groundtruth_val)),
                            list(itertools.chain.from_iterable(predword_val))))

Epoch 0
Training =>


100% |########################################################################|
  0% |                                                                        |

Validating =>


 95% |####################################################################    |