In [66]:
import collections
import nltk
import copy

import pandas as pd
import numpy as np

from keras.preprocessing.text import one_hot
from keras.preprocessing import sequence

from keras.models import Sequential

from keras.layers.recurrent import GRU, LSTM
from keras.layers.core import Activation, Dense, Dropout, RepeatVector, SpatialDropout1D
from keras.layers.embeddings import Embedding
from keras.layers.wrappers import TimeDistributed

import keras.layers as layers

from sklearn.model_selection import train_test_split

In [67]:
import tensorflow as tf
tf.test.gpu_device_name()

'/device:GPU:0'

In [68]:
sentences = []
s = []

with open('../../data/gene-trainF17.txt') as f:
    for line in f:
        if line != '\n':
            s.append(line)
        else:
            sentences.append(s)
            s = []

In [69]:
sentences[:2]

[['1\tComparison\tO\n',
  '2\twith\tO\n',
  '3\talkaline\tB\n',
  '4\tphosphatases\tI\n',
  '5\tand\tO\n',
  '6\t5\tB\n',
  '7\t-\tI\n',
  '8\tnucleotidase\tI\n',
  '9\t.\tO\n'],
 ['1\tPharmacologic\tO\n',
  '2\taspects\tO\n',
  '3\tof\tO\n',
  '4\tneonatal\tO\n',
  '5\thyperbilirubinemia\tO\n',
  '6\t.\tO\n']]

In [70]:
len(sentences)

13795

In [71]:
sentences_processed = []
tags_processed = []

for arr in sentences:
    tmp_s = []
    tmp_t = []
    for line in arr:
        line = line.replace('\n', '').split('\t')
        tmp_s.append(line[1])
        tmp_t.append(line[2])
        
    sentences_processed.append(' '.join(tmp_s))
    tags_processed.append(' '.join(tmp_t))

In [72]:
sentences_processed[0], tags_processed[0]

('Comparison with alkaline phosphatases and 5 - nucleotidase .',
 'O O B I O B I I O')

In [73]:
VOCAB_LEN = 100

In [74]:
one_hot_sentences = []

for sent in sentences_processed:
    one_hot_sentences.append(one_hot(sent,
                                     VOCAB_LEN,
                                     filters='',
                                     lower=True,
                                     split=' '))
    

In [75]:
tags_processed = [t.split(' ') for t in tags_processed]

In [76]:
one_hot_sentences[0], tags_processed[0]

([80, 13, 82, 88, 45, 68, 66, 11, 29],
 ['O', 'O', 'B', 'I', 'O', 'B', 'I', 'I', 'O'])

In [77]:
one_hot_tags = []

for arr in tags_processed:
    tmp = []
    for t in arr:
        if t == 'O':
            tmp.append(0)
        elif t == 'B':
            tmp.append(1)
        #elif t == 'I':
            #tmp.append(3)
    one_hot_tags.append(tmp)

In [78]:
len(one_hot_sentences), len(one_hot_tags)

(13795, 13795)

In [79]:
len(one_hot_sentences[0]), len(one_hot_tags[0])

(9, 6)

In [80]:
X_train, X_test, y_train, y_test = one_hot_sentences[:11036], one_hot_sentences[11036:], one_hot_tags[:11036], one_hot_tags[11036:]

In [81]:
len(X_train[0]), len(y_train[0])

(9, 6)

In [82]:
X_train = sequence.pad_sequences(X_train, maxlen=VOCAB_LEN)
y_train = sequence.pad_sequences(y_train, maxlen=VOCAB_LEN)

X_test = sequence.pad_sequences(X_test, maxlen=VOCAB_LEN)
y_test = sequence.pad_sequences(y_test, maxlen=VOCAB_LEN)

In [83]:
X_train.shape, y_train.shape

((11036, 100), (11036, 100))

In [61]:
X_train[0], y_train[0]

(array([ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0, 80, 13, 82, 88, 45, 68, 66, 11, 29], dtype=int32),
 array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
        1, 2, 3, 1, 2, 3, 3, 1], dtype=int32))

In [84]:
model = Sequential()
model.add(Embedding(20000, 100))
model.add(layers.SpatialDropout1D(rate=0.2))
model.add(LSTM(100, recurrent_dropout=0.2, dropout=0.2))
model.add(Dense(100, activation='sigmoid'))

In [85]:
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

In [86]:
model.fit(X_train, y_train, batch_size=32, epochs=2)

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7efd5e6a9f28>

In [114]:
pred = model.predict(X_train)

In [115]:
pred[0]

array([ 0.00061274,  0.0004118 ,  0.00023229,  0.00054413,  0.00042412,
        0.00034046,  0.00039539,  0.00029955,  0.00022793,  0.00045904,
        0.00026651,  0.00023481,  0.00043725,  0.00027319,  0.00056219,
        0.00031066,  0.0002515 ,  0.00052577,  0.00037056,  0.00035301,
        0.00021549,  0.00027685,  0.00038887,  0.00025538,  0.00047931,
        0.00055766,  0.00027968,  0.00095912,  0.00057123,  0.00096723,
        0.00035761,  0.00035325,  0.00093486,  0.00067629,  0.00092027,
        0.00080224,  0.00064375,  0.00056723,  0.00095558,  0.00075516,
        0.00086278,  0.00072632,  0.00071401,  0.00072003,  0.00130736,
        0.00121851,  0.00088507,  0.00157245,  0.00077896,  0.0017687 ,
        0.00242277,  0.00145118,  0.0023219 ,  0.00141712,  0.00320582,
        0.00270173,  0.00289237,  0.0016919 ,  0.0036343 ,  0.0034896 ,
        0.00690444,  0.00357192,  0.01140114,  0.00867703,  0.00756624,
        0.00641707,  0.00680938,  0.01143492,  0.00900083,  0.01

In [116]:
pred_rounded = np.round(pred)

In [138]:
pred_rounded[11]

array([ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.], dtype=float32)

In [137]:
y_train[11]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0], dtype=int32)

In [258]:
def write_gold_standard(s):
    with open('gold_lstm.txt', 'w') as f:
        for arr in s:
            for line in arr:
                f.write(line)     

In [None]:
def write_submission(s):

In [259]:
write_gold_standard(sentences[11036:])