In [478]:
import collections
import nltk
import copy

import pandas as pd
import numpy as np

from keras.preprocessing.text import one_hot
from keras.preprocessing import sequence

from keras.models import Sequential

from keras.layers.recurrent import GRU, LSTM
from keras.layers.core import Activation, Dense, Dropout, RepeatVector, SpatialDropout1D
from keras.layers.embeddings import Embedding

import keras.layers as layers
from sklearn.model_selection import train_test_split

In [479]:
# Read the training file into a list of sentences.
# Each sentence is the list of its raw lines; a line consisting only of '\n'
# marks the end of the current sentence.
sentences = []
s = []

with open('../../data/gene-trainF17.txt') as f:
    for line in f:
        if line != '\n':
            s.append(line)
        else:
            sentences.append(s)
            s = []

# Fix: if the file does not end with a blank line, the final sentence was
# collected in `s` but never stored. Flush it so no data is silently lost.
if s:
    sentences.append(s)
    s = []

In [480]:
# Peek at the first two raw sentences; each is a list of "index\ttoken\ttag\n" lines.
sentences[:2]

[['1\tComparison\tO\n',
  '2\twith\tO\n',
  '3\talkaline\tB\n',
  '4\tphosphatases\tI\n',
  '5\tand\tO\n',
  '6\t5\tB\n',
  '7\t-\tI\n',
  '8\tnucleotidase\tI\n',
  '9\t.\tO\n'],
 ['1\tPharmacologic\tO\n',
  '2\taspects\tO\n',
  '3\tof\tO\n',
  '4\tneonatal\tO\n',
  '5\thyperbilirubinemia\tO\n',
  '6\t.\tO\n']]

In [481]:
# Number of sentences collected from the training file.
len(sentences)

13795

In [482]:
# Turn every raw sentence into two space-joined strings: one with its tokens
# (second tab-separated field) and one with its tags (third field).
sentences_processed = []
tags_processed = []

for raw_sentence in sentences:
    # Drop the newline and split "index\ttoken\ttag" into its fields.
    fields = [raw.replace('\n', '').split('\t') for raw in raw_sentence]
    sentences_processed.append(' '.join(parts[1] for parts in fields))
    tags_processed.append(' '.join(parts[2] for parts in fields))

In [483]:
# Sanity check: the first sentence as space-joined tokens next to its space-joined tags.
sentences_processed[0], tags_processed[0]

('Comparison with alkaline phosphatases and 5 - nucleotidase .',
 'O O B I O B I I O')

In [484]:
# Vocabulary size handed to one_hot() when hashing tokens to integer ids.
# NOTE(review): the same constant is reused further down as `maxlen` for
# pad_sequences(), so it also fixes the padded sequence length — consider two
# separate constants. The Embedding layer uses its own input size (20000).
VOCAB_LEN = 100

In [485]:
# Encode each sentence as a list of integer token ids using Keras' one_hot().
# Nothing is filtered out, text is lower-cased and tokens are split on spaces.
# The ids come from hashing into VOCAB_LEN buckets, so distinct words can share an id.
one_hot_sentences = [
    one_hot(text, VOCAB_LEN, filters='', lower=True, split=' ')
    for text in sentences_processed
]
    

In [486]:
# Split each space-joined tag string into a list of single tags.
# Fix: the cell rebinds `tags_processed` from strings to lists, so running it a
# second time used to fail with AttributeError (lists have no .split). Entries
# that are already lists are now passed through, making the cell safe to re-run.
tags_processed = [t.split(' ') if isinstance(t, str) else t for t in tags_processed]

In [487]:
# Compare the encoded first sentence with its tag list.
one_hot_sentences[0], tags_processed[0]

([78, 69, 87, 99, 79, 42, 94, 88, 59],
 ['O', 'O', 'B', 'I', 'O', 'B', 'I', 'I', 'O'])

In [488]:
# Convert tags to integer ids: 'O' -> 1, while 'B' and 'I' share id 2
# (the disabled alternative gave 'I' its own id 3).
# Tags that are not in the table are skipped, which matches the previous if/elif chain.
tag_ids = {'O': 1, 'B': 2, 'I': 2}

one_hot_tags = [
    [tag_ids[tag] for tag in sentence_tags if tag in tag_ids]
    for sentence_tags in tags_processed
]

In [489]:
# Both lists should hold one entry per sentence.
len(one_hot_sentences), len(one_hot_tags)

(13795, 13795)

In [490]:
# The first sentence should have as many tag ids as token ids.
len(one_hot_sentences[0]), len(one_hot_tags[0])

(9, 9)

In [491]:
# Fixed, unshuffled hold-out split: the first 11036 sentences are used for
# training, everything after that for testing.
split_at = 11036
X_train, y_train = one_hot_sentences[:split_at], one_hot_tags[:split_at]
X_test, y_test = one_hot_sentences[split_at:], one_hot_tags[split_at:]

In [492]:
# Token/tag alignment of the first training example, before padding.
len(X_train[0]), len(y_train[0])

(9, 9)

In [493]:
# Bring all four sequence collections to a common length of VOCAB_LEN
# using pad_sequences() with its default settings.
X_train, y_train, X_test, y_test = (
    sequence.pad_sequences(seqs, maxlen=VOCAB_LEN)
    for seqs in (X_train, y_train, X_test, y_test)
)

In [494]:
# After padding, inputs and targets are 2-D arrays with identical shapes.
X_train.shape, y_train.shape

((11036, 100), (11036, 100))

In [495]:
# Show the first padded example; the padding zeros are placed in front of the real values.
X_train[0], y_train[0]

(array([ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0, 78, 69, 87, 99, 79, 42, 94, 88, 59], dtype=int32),
 array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
        1, 2, 2, 1, 2, 2, 2, 1], dtype=int32))

In [499]:
# Network: embedding -> spatial dropout -> LSTM -> dense layer with 100 sigmoid units.
model = Sequential()
# Maps integer token ids to 100-dimensional vectors.
# NOTE(review): the embedding accepts ids up to 20000 although the tokens were
# hashed with VOCAB_LEN buckets — confirm whether the larger size is intended.
model.add(Embedding(20000, 100))
# Fix: the dropout layer used to be constructed and immediately discarded,
# so it never took part in the model. It must be added like every other layer.
model.add(layers.SpatialDropout1D(rate=0.2))
model.add(LSTM(100, recurrent_dropout=0.2, dropout=0.2))
# NOTE(review): the 100 sigmoid outputs are trained against the padded tag-id
# vector (values 0/1/2). A per-token classifier (LSTM with return_sequences=True
# followed by a softmax over the tag classes) is probably what is wanted — confirm.
model.add(Dense(100, activation='sigmoid'))

In [500]:
# Configure training: categorical cross-entropy loss, Adam optimizer, accuracy metric.
# NOTE(review): y_train holds integer tag ids per position (0 = padding,
# 1 = 'O', 2 = 'B'/'I'), not one-hot class vectors — confirm that
# categorical_crossentropy is really the intended loss for these targets.
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

In [501]:
# Train for a single epoch in mini-batches of 32; no validation data is passed.
model.fit(X_train, y_train, batch_size=32, epochs=1)

Epoch 1/1


<keras.callbacks.History at 0x159d0def0>

In [502]:
# Run the trained model on the padded test inputs.
pred = model.predict(X_test)

In [503]:
# Raw output vector for the first test sentence.
pred[0]

array([ 0.00399625,  0.00451629,  0.00470455,  0.00431235,  0.00452046,
        0.00478798,  0.00479583,  0.00579276,  0.00764863,  0.00862107,
        0.00853349,  0.00974358,  0.01058936,  0.01062682,  0.01134948,
        0.01254676,  0.01336103,  0.01342479,  0.01382928,  0.01415101,
        0.01460239,  0.01547332,  0.01564126,  0.0155693 ,  0.01631906,
        0.01625492,  0.01724258,  0.01747582,  0.01776485,  0.01812462,
        0.0194335 ,  0.0181213 ,  0.02008784,  0.02200219,  0.02020856,
        0.02260959,  0.02501399,  0.0258617 ,  0.02491872,  0.03179868,
        0.0297458 ,  0.03418303,  0.03678126,  0.03691932,  0.04687715,
        0.04643258,  0.04995934,  0.05935208,  0.06266194,  0.07038938,
        0.08200552,  0.08241323,  0.08180037,  0.09249543,  0.10478768,
        0.12683056,  0.13947581,  0.15672916,  0.17497657,  0.18590653,
        0.19969314,  0.22793089,  0.24542642,  0.25714716,  0.29192111,
        0.31625396,  0.33217794,  0.36404753,  0.38144052,  0.42

In [511]:
# Round every output to the nearest integer.
# NOTE(review): the output layer uses a sigmoid, so values lie in [0, 1] and
# rounding can only produce 0 or 1 — the tag id 2 can never appear here.
pred_rounded = np.round(pred)

In [514]:
# Print every prediction row that contains at least one value equal to 2.
# Fix: the nested loop printed the same row once per matching element; each
# matching row is now printed exactly once.
# NOTE(review): pred_rounded comes from rounding sigmoid outputs, which yields
# only 0 or 1, so with the current model this never prints anything.
for arr in pred_rounded:
    if np.any(arr == 2.):
        print(arr)

In [258]:
def write_gold_standard(s):
    """Write all lines of all sentences in ``s`` to ``gold_lstm.txt``.

    ``s`` is an iterable of sentences, each an iterable of strings. The
    strings are written verbatim, one after another, with nothing inserted
    between lines or between sentences. An existing file is overwritten.
    """
    with open('gold_lstm.txt', 'w') as out:
        out.writelines(raw for sentence in s for raw in sentence)

In [None]:
def write_submission(s):
    """Placeholder for writing a submission file from ``s``.

    The original cell contained only the signature, which is a SyntaxError.
    The stub keeps the name and parameter and fails loudly when called.

    TODO: presumably this should mirror write_gold_standard() but emit the
    predicted tags — confirm the expected output format before implementing.

    Raises:
        NotImplementedError: always, until the function is implemented.
    """
    raise NotImplementedError('write_submission is not implemented yet')

In [259]:
# Write the raw lines of the held-out sentences to the gold-standard file.
# The cut-off 11036 is the same index used for the train/test split.
# NOTE(review): this cell's execution count (259) is lower than the cells
# above it — verify that it still works on a fresh top-to-bottom run.
write_gold_standard(sentences[11036:])