In [27]:
import json
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import SimpleRNN, GRU, LSTM
from keras.layers.core import Dense, Dropout
from keras.layers.wrappers import TimeDistributed
from keras.layers import Convolution1D, MaxPooling1D
import itertools
import ast
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
import numpy as np

import progressbar
from metrics.accuracy import conlleval

In [9]:
def read_data(fname):
    """Read alternating text / label lines from ``fname``.

    The file is consumed two lines at a time. The first line of each pair is
    parsed with ``ast.literal_eval`` as the text entry and the second line as
    its label entry. A pair is skipped entirely when either line cannot be
    parsed, which includes a trailing line that has no partner.

    Args:
        fname: path of the file to read.

    Returns:
        (text_l, label_l): two parallel lists holding the parsed text entries
        and the parsed label entries, in file order.
    """
    # izip_longest (Python 2) was renamed zip_longest in Python 3; accept both.
    zip_longest = getattr(itertools, 'zip_longest', None) or itertools.izip_longest

    text_l = []
    label_l = []
    # The context manager closes the file; no explicit close() is needed.
    with open(fname) as f:
        # Passing the same file iterator twice yields consecutive line pairs;
        # an unpaired final line arrives with line2 set to None.
        for line1, line2 in zip_longest(*[f] * 2):
            try:
                text = ast.literal_eval(line1)
                label = ast.literal_eval(line2)
            except (ValueError, SyntaxError, TypeError):
                # Best-effort parsing: drop malformed pairs, but do not hide
                # KeyboardInterrupt or unrelated errors as a bare except would.
                continue
            text_l.append(text)
            label_l.append(label)
    return text_l, label_l

In [10]:
X_text, y_text = read_data('../data/11532192.txt')


def _decode(sequences, index_to_name):
    """Map every index of every sequence back to its name via ``index_to_name``."""
    # Running the comprehension inside a function keeps its loop variables out
    # of the notebook namespace. Under Python 2 a top-level list comprehension
    # leaks them, and a loop variable named ``y`` would overwrite the encoded
    # label list ``y`` built below.
    return [[index_to_name[idx] for idx in seq] for seq in sequences]


# --- Words <-> integer ids -------------------------------------------------
# Vocabulary: every distinct token that appears in any sentence.
text = list(set(itertools.chain(*X_text)))

# Kept under its own name so the word encoder is not lost when the label
# encoder is created further down.
word_encoder = LabelEncoder()
word_encoder.fit(text)

w2idx = dict(zip(word_encoder.classes_, word_encoder.transform(word_encoder.classes_)))
idx2w = {w2idx[k]: k for k in w2idx}

# One integer array per sentence.
X = list(map(word_encoder.transform, X_text))

# --- Labels <-> integer ids ------------------------------------------------
label = list(set(itertools.chain(*y_text)))

# ``le`` ends up bound to the label encoder, as it did before.
le = LabelEncoder()
le.fit(label)

labels2idx = dict(zip(le.classes_, le.transform(le.classes_)))
idx2la = {labels2idx[k]: k for k in labels2idx}

# One integer array per label sequence, aligned with X.
y = list(map(le.transform, y_text))

# w2idx, labels2idx = dicts['words2idx'], dicts['labels2idx']

# Fixed random_state so the 70/30 split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

# Human-readable views of both splits, used for inspection and reporting.
words_val = _decode(X_test, idx2w)
groundtruth_val = _decode(y_test, idx2la)  # y values test
words_train = _decode(X_train, idx2w)
groundtruth_train = _decode(y_train, idx2la)  # y values train

n_classes = len(idx2la)
n_vocab = len(idx2w)

In [11]:
# Show one training example in decoded and encoded form.
print("Example sentence : {}".format(words_train[0]))
print("Encoded form: {}".format(X_train[0]))
# An empty string prints a blank line on both Python 2 and 3; a bare print()
# under the Python 2 print statement would emit "()" instead.
print("")
# Use the training labels so they belong to the sentence printed above and
# match the encoded form on the next line (previously the validation split).
print("It's label : {}".format(groundtruth_train[0]))
print("Encoded form: {}".format(y_train[0]))

Example sentence : ['In', 'strain', '129P3J', 'IOP', 'did', 'not', 'differ', 'significantly', 'with', 'age', 'between', '3', 'and', '14', 'months', 'but', 'was', 'lower', 'in', '18', 'month', 'old', 'mice', 'P', 'lt', '0001', 'compared', 'to', 'all', 'younger', 'ages', 'Figure', '4', 'Despite', 'a', '1', 'mmHg', 'dip', 'in', 'IOP', 'at', '8', 'months', 'there', 'were', 'no', 'significant', 'IOP', 'differences', 'between', 'C3HHeJ', 'mice', 'at', 'each', 'age', 'tested', 'P', '', '02', 'for', 'age', 'Although', 'the', 'effect', 'of', 'age', 'has', 'not', 'been', 'thoroughly', 'assessed', 'in', 'other', 'strains', 'no', 'obvious', 'agerelated', 'differences', 'have', 'been', 'identified', 'in', 'other', 'strains', 'analyzed', 'at', 'multiple', 'ages', 'except', 'for', 'the', 'glaucomatous', 'DBA2J', 'and', 'AKXD28Ty', 'strains']
Encoded form: [ 227 1095   31  222  582  887  584 1073 1204  361  443   75  396   43  864
  454 1187  815  749   54  863  907  849  280  817    3  501 1146  371 

In [12]:
print('Build model...')

# Sequence tagger, listed in the order the data flows through it:
# token ids -> 100-d embeddings -> 64 convolution filters of width 5
# -> dropout -> GRU returning one state per time step
# -> softmax over the n_classes labels applied at every time step.
layer_stack = [
    Embedding(n_vocab, 100),
    Convolution1D(64, 5, border_mode='same', activation='relu'),
    Dropout(0.25),
    GRU(100, return_sequences=True),
    TimeDistributed(Dense(n_classes, activation='softmax')),
]

model = Sequential()
for layer in layer_stack:
    model.add(layer)

# Targets are one-hot vectors per token, hence categorical cross-entropy.
model.compile(optimizer='rmsprop', loss='categorical_crossentropy')

Build model...




In [35]:
### Training
n_epochs = 100

# NOTE: these three are only updated by the conlleval-based scoring that is
# currently disabled (see the TODO at the end of the epoch loop).
train_f_scores = []
val_f_scores = []
best_val_f1 = 0


def _run_pass(sentences, tag_sequences, step_fn):
    """Feed the sentences to the model one at a time (batch size 1).

    Args:
        sentences: encoded sentences, each a 1-D integer array.
        tag_sequences: encoded label sequences aligned with ``sentences``.
        step_fn: ``model.train_on_batch`` or ``model.test_on_batch``. It is
            called with a (1, length) input and a (1, length, n_classes)
            one-hot target and must return the loss for that sentence.

    Returns:
        (predictions, mean_loss): the argmax label ids for every sentence, and
        the loss averaged over the sentences actually passed to ``step_fn``
        (0.0 when none were).
    """
    one_hot_rows = np.eye(n_classes)  # built once, not once per sentence
    predictions = []
    total_loss = 0.0
    n_steps = 0

    bar = progressbar.ProgressBar(maxval=len(sentences))
    for position, sent in bar(enumerate(sentences)):
        target = one_hot_rows[tag_sequences[position]][np.newaxis, :]
        batch = sent[np.newaxis, :]

        # Single-token sentences skip the loss step (original note: "some bug
        # in keras") but are still predicted below.
        if batch.shape[1] > 1:
            total_loss += step_fn(batch, target)
            n_steps += 1

        scores = model.predict_on_batch(batch)
        predictions.append(np.argmax(scores, -1)[0])

    # Divide by the number of loss steps taken. Dividing by the last loop
    # index was off by one, counted skipped sentences and failed with a
    # ZeroDivisionError when there was only one sentence.
    mean_loss = total_loss / n_steps if n_steps else 0.0
    return predictions, mean_loss


def _to_label_names(id_sequences):
    """Translate sequences of label ids into sequences of label names."""
    return [[idx2la[label_id] for label_id in seq] for seq in id_sequences]


def _print_report(truth, predicted):
    """Print a per-label classification report over all tokens."""
    # print(...) with a single argument behaves the same on Python 2 and 3.
    print(classification_report(list(itertools.chain.from_iterable(truth)),
                                list(itertools.chain.from_iterable(predicted))))


for i in range(n_epochs):
    print("Epoch {}".format(i))

    print("Training =>")
    train_pred_label, avgLoss = _run_pass(X_train, y_train, model.train_on_batch)
    predword_train = _to_label_names(train_pred_label)
    print("Loss = {}".format(avgLoss))
    _print_report(groundtruth_train, predword_train)

    print("Validating =>")
    val_pred_label, avgLoss = _run_pass(X_test, y_test, model.test_on_batch)
    predword_val = _to_label_names(val_pred_label)
    print("Loss = {}".format(avgLoss))
    _print_report(groundtruth_val, predword_val)

    # TODO: conlleval scoring and best-model checkpointing are disabled, so no
    # F1 history is recorded and no weights are saved. Original logic:
    #     con_dict = conlleval(predword_val, groundtruth_val, words_val, 'r.txt')
    #     val_f_scores.append(con_dict['f1'])
    #     if con_dict['f1'] > best_val_f1:
    #         best_val_f1 = con_dict['f1']
    #         open('model_architecture.json','w').write(model.to_json())
    #         model.save_weights('best_model_weights.h5',overwrite=True)
    #         print("Best validation F1 score = {}".format(best_val_f1))

Epoch 0
Training =>


100% |########################################################################|
  'precision', 'predicted', average, warn_for)
  8% |######                                                                  |

             precision    recall  f1-score   support

 GO:0005623       0.00      0.00      0.00         2
 GO:0005694       0.00      0.00      0.00         1
 GO:0030424       0.00      0.00      0.00         1
         NA       1.00      1.00      1.00      3749

avg / total       1.00      1.00      1.00      3753

Validating =>


100% |########################################################################|
  0% |                                                                        |

             precision    recall  f1-score   support

 GO:0005623       0.00      0.00      0.00         1
 GO:0005694       0.00      0.00      0.00         2
         NA       1.00      1.00      1.00      1485

avg / total       1.00      1.00      1.00      1488

Epoch 1
Training =>


100% |########################################################################|
  8% |######                                                                  |

             precision    recall  f1-score   support

 GO:0005623       0.00      0.00      0.00         2
 GO:0005694       0.00      0.00      0.00         1
 GO:0030424       0.00      0.00      0.00         1
         NA       1.00      1.00      1.00      3749

avg / total       1.00      1.00      1.00      3753

Validating =>


100% |########################################################################|
  0% |                                                                        |

             precision    recall  f1-score   support

 GO:0005623       0.00      0.00      0.00         1
 GO:0005694       0.00      0.00      0.00         2
         NA       1.00      1.00      1.00      1485

avg / total       1.00      1.00      1.00      1488

Epoch 2
Training =>


100% |########################################################################|
  8% |######                                                                  |

             precision    recall  f1-score   support

 GO:0005623       0.00      0.00      0.00         2
 GO:0005694       0.00      0.00      0.00         1
 GO:0030424       0.00      0.00      0.00         1
         NA       1.00      1.00      1.00      3749

avg / total       1.00      1.00      1.00      3753

Validating =>


100% |########################################################################|
  0% |                                                                        |

             precision    recall  f1-score   support

 GO:0005623       0.00      0.00      0.00         1
 GO:0005694       0.00      0.00      0.00         2
         NA       1.00      1.00      1.00      1485

avg / total       1.00      1.00      1.00      1488

Epoch 3
Training =>


100% |########################################################################|
  8% |######                                                                  |

             precision    recall  f1-score   support

 GO:0005623       0.00      0.00      0.00         2
 GO:0005694       0.00      0.00      0.00         1
 GO:0030424       0.00      0.00      0.00         1
         NA       1.00      1.00      1.00      3749

avg / total       1.00      1.00      1.00      3753

Validating =>


100% |########################################################################|
  0% |                                                                        |

             precision    recall  f1-score   support

 GO:0005623       0.00      0.00      0.00         1
 GO:0005694       0.00      0.00      0.00         2
         NA       1.00      1.00      1.00      1485

avg / total       1.00      1.00      1.00      1488

Epoch 4
Training =>


100% |########################################################################|
  8% |######                                                                  |

             precision    recall  f1-score   support

 GO:0005623       0.00      0.00      0.00         2
 GO:0005694       0.00      0.00      0.00         1
 GO:0030424       0.00      0.00      0.00         1
         NA       1.00      1.00      1.00      3749

avg / total       1.00      1.00      1.00      3753

Validating =>


100% |########################################################################|
  0% |                                                                        |

             precision    recall  f1-score   support

 GO:0005623       0.00      0.00      0.00         1
 GO:0005694       0.00      0.00      0.00         2
         NA       1.00      1.00      1.00      1485

avg / total       1.00      1.00      1.00      1488

Epoch 5
Training =>


100% |########################################################################|
  8% |######                                                                  |

             precision    recall  f1-score   support

 GO:0005623       0.00      0.00      0.00         2
 GO:0005694       0.00      0.00      0.00         1
 GO:0030424       0.00      0.00      0.00         1
         NA       1.00      1.00      1.00      3749

avg / total       1.00      1.00      1.00      3753

Validating =>


100% |########################################################################|
  0% |                                                                        |

             precision    recall  f1-score   support

 GO:0005623       0.00      0.00      0.00         1
 GO:0005694       0.00      0.00      0.00         2
         NA       1.00      1.00      1.00      1485

avg / total       1.00      1.00      1.00      1488

Epoch 6
Training =>


100% |########################################################################|
  8% |######                                                                  |

             precision    recall  f1-score   support

 GO:0005623       0.00      0.00      0.00         2
 GO:0005694       0.00      0.00      0.00         1
 GO:0030424       0.00      0.00      0.00         1
         NA       1.00      1.00      1.00      3749

avg / total       1.00      1.00      1.00      3753

Validating =>


100% |########################################################################|
  0% |                                                                        |

             precision    recall  f1-score   support

 GO:0005623       0.00      0.00      0.00         1
 GO:0005694       0.00      0.00      0.00         2
         NA       1.00      1.00      1.00      1485

avg / total       1.00      1.00      1.00      1488

Epoch 7
Training =>


100% |########################################################################|
  8% |######                                                                  |

             precision    recall  f1-score   support

 GO:0005623       0.00      0.00      0.00         2
 GO:0005694       0.00      0.00      0.00         1
 GO:0030424       0.00      0.00      0.00         1
         NA       1.00      1.00      1.00      3749

avg / total       1.00      1.00      1.00      3753

Validating =>


100% |########################################################################|
  0% |                                                                        |

             precision    recall  f1-score   support

 GO:0005623       0.00      0.00      0.00         1
 GO:0005694       0.00      0.00      0.00         2
         NA       1.00      1.00      1.00      1485

avg / total       1.00      1.00      1.00      1488

Epoch 8
Training =>


 37% |###########################                                             |

KeyboardInterrupt: 