In [1]:
import copy
import re

from tqdm import tqdm_notebook

from nltk.stem.snowball import SnowballStemmer

import numpy as np
import keras.layers as layers

from scipy.sparse import issparse

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelBinarizer

from keras.models import Sequential
from keras.wrappers.scikit_learn import KerasClassifier
from keras.layers.core import Activation, Dense, Dropout, RepeatVector, SpatialDropout1D
from keras.layers.embeddings import Embedding
from keras.layers import Reshape, Bidirectional, CuDNNLSTM, LSTM, SimpleRNN, Conv1D, MaxPooling1D, Flatten, RepeatVector

Using TensorFlow backend.


# Evaluation

In [2]:
def write_gold_standard(t, name):
    """Write the gold-standard lines of every sentence to ``name + '.txt'``.

    Parameters
    ----------
    t : iterable of list of str
        Sentences; each is a list of raw lines, optionally containing the
        '<s>' / '</s>' boundary markers, which are not written.
    name : str
        Output path without the '.txt' extension.

    The input lists are left untouched, so the function can be called
    repeatedly on the same data (the markers used to be removed in place,
    which made a second call raise ValueError).
    """
    markers = ('<s>', '</s>')

    with open(name + '.txt', 'w') as f:
        for arr in t:
            # Lines are written verbatim; they carry their own newline.
            f.writelines(line for line in arr if line not in markers)

In [3]:
def write_submission(t, name):
    """Write predicted (token, tag) pairs to ``name + '.txt'``.

    Every pair becomes one tab-separated line ``1<TAB>token<TAB>tag``;
    sentences are written back to back with no separator line.
    """
    with open(name + '.txt', 'w') as out:
        for sentence in t:
            out.writelines(
                '\t'.join(['1', pair[0], pair[1]]) + '\n' for pair in sentence
            )

In [4]:
def eval(keys, predictions):
    """Print entity-level precision, recall and F1 of `predictions` against `keys`.

    Both arguments are iterables of text lines (e.g. open files) that are
    parsed with `taggedData` and turned into entity spans with `findEntities`.
    An entity counts as correct only when its (begin, end) span matches exactly.

    A score whose denominator is zero (no entities returned, no gold entities,
    or no overlap at all) is reported as 0.0 instead of raising
    ZeroDivisionError.

    NOTE: this function shadows the builtin `eval`; the name is kept because
    later cells call it.
    """
    goldStandardEntities = findEntities(taggedData(keys))     # entities in the gold standard
    systemEntities = findEntities(taggedData(predictions))    # entities in the system output

    numEntities = len(goldStandardEntities)                   # number of entities there should be
    numReturned = len(systemEntities)                         # number actually tagged by system
    numTruePositives = len(goldStandardEntities & systemEntities)    # number of those that were right

    precision = float(numTruePositives)/numReturned if numReturned else 0.0
    recall = float(numTruePositives)/numEntities if numEntities else 0.0
    if precision + recall:
        f1 = 2 * (precision * recall)/(precision + recall)
    else:
        f1 = 0.0

    print(numEntities, " entities in gold standard.")
    print(numReturned, " total entities found.")
    print(numTruePositives, " of which were correct.")

    print("Precision: ", precision, "Recall: ", recall, "F1-measure: ", f1)

def findEntities(data):
    """Find all IOB-delimited entities in `data`.

    Parameters
    ----------
    data : iterable of dict
        Each dict maps a word to its tag; tokens are numbered from 1 in
        iteration order.

    Returns
    -------
    set of (begin, end) tuples
        Inclusive token positions of every entity. An entity starts at a 'B'
        tag and ends just before the next 'B' or 'O' tag, or at the last token
        when the input ends while an entity is still open.
    """
    entities = set()

    entityStart = 0
    insideEntity = False
    count = 0

    for arr in data:
        for word, tag in arr.items():
            count += 1
            if insideEntity:
                if tag == "B":
                    # A new entity begins right after the current one.
                    entities.add((entityStart, count - 1))
                    entityStart = count
                elif tag == "O":
                    entities.add((entityStart, count - 1))
                    insideEntity = False
            elif tag == "B":
                insideEntity = True
                entityStart = count

    if insideEntity:
        # The open entity runs to the final token; previously a stale end
        # index left over from an earlier entity (or 0) was recorded here.
        entities.add((entityStart, count))

    return entities

def taggedData(file):
    """Yield one single-entry ``{token: tag}`` dict per line of `file`.

    The first whitespace-separated field of a line is skipped; the second and
    third are used as token and tag. A blank line yields ``{'</s>': 'O'}``.
    """
    for raw in file:
        stripped = raw.strip()
        if not stripped:
            yield {'</s>': 'O'}
            continue
        fields = stripped.split()
        token, tag = fields[1], fields[2]
        yield {token: tag}

# Create Datasets

In [5]:
# Read the corpus: one token per line, sentences separated by a blank line.
sentences = []
s = []

with open('../../data/gene-trainF17.txt') as f:
    for line in f:
        if line != '\n':
            s.append(line)
        else:
            sentences.append(s)
            s = []

# Flush the final sentence when the file does not end with a blank line;
# it used to be dropped silently.
if s:
    sentences.append(s)
    s = []

# Add explicit sentence boundary markers.
for sentence in sentences:
    sentence.insert(0, '<s>')
    sentence.append('</s>')

# Fixed seed so the 80/20 split is reproducible.
train, test = train_test_split(sentences, test_size=.20, random_state=2)

# Keep an untouched copy of the held-out sentences (with their tags) for scoring.
gold_test = copy.deepcopy(test)

# Strip the tag column from the held-out sentences, keeping only the first
# two tab-separated fields of each line.
for arr in test:
    for i in range(len(arr)):
        if arr[i] != '<s>' and arr[i] != '</s>':
            spl = arr[i].split('\t')
            arr[i] = '\t'.join((spl[0], spl[1]))

In [6]:
# Convert every training sentence into a list of (token, tag) pairs taken from
# the second and third tab-separated fields; boundary markers are dropped.
new_train = [
    [
        (fields[1], fields[2])
        for fields in (
            row.replace('\n', '').split('\t')
            for row in sentence
            if row not in ('<s>', '</s>')
        )
    ]
    for sentence in train
]

In [7]:
# Spot-check one converted training sentence; each entry is a (token, tag) pair.
new_train[3]

[('To', 'O'),
 ('confirm', 'O'),
 ('the', 'O'),
 ('binding', 'O'),
 ('of', 'O'),
 ('protein', 'O'),
 ('to', 'O'),
 ('these', 'O'),
 ('sites', 'O'),
 ('in', 'O'),
 ('cells', 'O'),
 (',', 'O'),
 ('we', 'O'),
 ('carried', 'O'),
 ('out', 'O'),
 ('an', 'O'),
 ('in', 'O'),
 ('vivo', 'O'),
 ('genomic', 'O'),
 ('footprinting', 'O'),
 ('analysis', 'O'),
 ('of', 'O'),
 ('this', 'O'),
 ('portion', 'O'),
 ('of', 'O'),
 ('the', 'O'),
 ('TGF', 'B'),
 ('alpha', 'I'),
 ('promoter', 'I'),
 ('in', 'O'),
 ('normal', 'O'),
 ('and', 'O'),
 ('transformed', 'O'),
 ('rat', 'O'),
 ('liver', 'O'),
 ('epithelial', 'O'),
 ('cell', 'O'),
 ('lines', 'O'),
 ('that', 'O'),
 ('express', 'O'),
 ('the', 'O'),
 ('endogenous', 'O'),
 ('gene', 'O'),
 ('at', 'O'),
 ('varying', 'O'),
 ('levels', 'O'),
 ('.', 'O')]

In [8]:
# Rebuild every held-out sentence as one space-joined string of its tokens
# (second tab-separated field of each line); boundary markers are dropped.
new_test = [
    ' '.join(
        row.replace('\n', '').split('\t')[1]
        for row in sentence
        if row not in ('<s>', '</s>')
    )
    for sentence in test
]

In [9]:
# Spot-check the first held-out sentence after its tokens were joined with spaces.
new_test[0]

'The results showed that MIP , MMIF , FIC , Wimax , P0 . 1 and minute ventilation ( Vr ) were significantly increased after administration of methylphenidatum and aminophylline .'

# RNN

In [10]:
# Ordered (pattern, label) pairs: the first pattern that matches wins.
# Raw strings avoid invalid-escape warnings for sequences such as '\W'.
_SHAPE_PATTERNS = tuple(
    (re.compile(pattern), label)
    for pattern, label in (
        (r'[A-Z][a-z]+$', 'capital_letter'),
        (r'[A-Z]+$', 'uppercase'),
        (r'[a-z]+$', 'lowercase'),
        (r'[A-Z][a-z]+[A-Z][a-z]+[A-Za-z]*$', 'camelcase'),
        (r'[A-Za-z]+$', 'mixedcase'),
        (r'\W+$', 'punc'),
        # Both alternatives are grouped so that '$' anchors each of them;
        # ungrouped, any token merely starting with a digit ('5-HT', '1a')
        # was labelled 'number'.
        (r'(?:[0-9]+(?:\.[0-9]*)?|[0-9]*\.[0-9]+)$', 'number'),
        (r'[A-Za-z0-9]+\.[A-Za-z0-9\.]+\.$', 'abbrev'),
        (r'[A-Za-z0-9]+\-[A-Za-z0-9\-]+.*$', 'contains_hyphen'),
        (r'[A-Za-z0-9]+\.$', 'word_dot'),
        (r'__.+__$', 'other'),
    )
)


def get_shape(word):
    """Return a coarse orthographic shape label for `word`.

    The token is tested against an ordered list of regular expressions and
    the label of the first match is returned; 'unk' is returned when nothing
    matches (including for the empty string).
    """
    for pattern, label in _SHAPE_PATTERNS:
        if pattern.match(word):
            return label

    return 'unk'

In [11]:
def _get_stemmer():
    """Return the notebook-level `stemmer`, creating it on first use if no cell defined one."""
    global stemmer
    try:
        return stemmer
    except NameError:
        # No visible cell defines `stemmer`, so a fresh kernel used to fail here.
        # NOTE(review): 'english' is assumed from the corpus text - confirm.
        stemmer = SnowballStemmer('english')
        return stemmer


def get_features(sent, index, look_back):
    """Build the feature dict for the token at position `index` of `sent`.

    Parameters
    ----------
    sent : list of tuple
        The sentence; only element 0 (the token) of each tuple is read.
    index : int
        Position of the current token in `sent`.
    look_back : list of str
        Tags already assigned to the tokens before `index`.

    Returns
    -------
    dict
        Word, stem and shape of the current token and of the two tokens on
        each side, plus the first character of the two previous tags.
    """
    stem = _get_stemmer().stem

    # Pad both ends so the +/-2 window never runs off the sentence.
    index += 2
    sent = [('<s2>', '<s2>'), ('<s1>', '<s1>')] + sent + [('</s1>', '</s1>'), ('</s2>', '</s2>')]
    look_back = ['<s1>', '<s1>'] + look_back

    word = sent[index][0]
    n_word = sent[index + 1][0]
    nn_word = sent[index + 2][0]
    p_word = sent[index - 1][0]
    pp_word = sent[index - 2][0]
    # Only the first character of each previous tag is kept ('<' for padding).
    p_iob = look_back[-1][0]
    pp_iob = look_back[-2][0]

    template = {
            'word': word, 'stem': stem(word), 'shape': get_shape(word),
            'n_word': n_word, 'n_stem': stem(n_word), 'n_shape': get_shape(n_word),
            'nn_word': nn_word, 'nn_stem': stem(nn_word), 'nn_shape': get_shape(nn_word),
            'p_word': p_word, 'p_stem': stem(p_word), 'p_iob': p_iob, 'p_shape': get_shape(p_word),
            'pp_word': pp_word, 'pp_stem': stem(pp_word), 'pp_iob': pp_iob, 'pp_shape': get_shape(pp_word)
    }

    return template

In [12]:
class RNNClassifer:
    """Token-level IOB tagger built from a DictVectorizer and a small Keras network.

    Feature dicts produced by `get_features` are vectorized, passed through a
    Dense layer, repeated 20 times and fed to an LSTM followed by a 3-way
    softmax over the tags 'O', 'B' and 'I'.

    Parameters
    ----------
    input_dim : int, optional
        Width of the vectorized feature rows. It must equal the number of
        features the DictVectorizer learns from the first training batch;
        the default is the value that used to be hard-coded.
    """

    def __init__(self, input_dim=10989):
        self._input_dim = input_dim
        self._clf = self.build_nn()
        self._dvec = DictVectorizer(sparse=False)
        self._onehot = LabelBinarizer()

    def build_nn(self, input_dim=None):
        """Build, compile and summarize the network; returns the Keras model."""
        if input_dim is None:
            input_dim = self._input_dim

        #FF w/ LSTM end
        model = Sequential()
        model.add(Dense(2000, activation='relu', input_dim=input_dim))
        model.add(RepeatVector(20))
        model.add(CuDNNLSTM(50))
        model.add(Dense(3, activation='softmax'))

        model.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
        model.summary()

        # Alternative architecture kept for reference (disabled):
        # #Convolutional NN
        # model = Sequential()
        # model.add(Dense(10989, input_dim=10989))
        # model.add(RepeatVector(10))
        # model.add(Conv1D(111, kernel_size=4, strides=4,
        #                  activation='relu'))
        # model.add(Flatten())
        # model.add(Dense(1000, activation='relu'))
        # model.add(Dense(3, activation='softmax'))
        # model.compile(loss='categorical_crossentropy',
        #               optimizer='rmsprop', metrics=['acc'])
        # model.summary()
        return model

    def _featurize(self, sentences):
        """Return (feature dicts, one-hot label rows), one entry per token of `sentences`."""
        X, y = [], []
        for tagged in sentences:
            iob_tags = [pair[1] for pair in tagged]

            for index in range(len(tagged)):
                # Gold tags of the preceding tokens serve as the tag history.
                X.append(get_features(tagged, index, iob_tags[:index]))
                y.append(self._onehot.transform([iob_tags[index]])[0])

        return X, y

    def online_training(self, train, batch_size):
        """Train the network on `train` one batch of sentences at a time.

        Parameters
        ----------
        train : list of list of (token, tag)
            Tagged training sentences.
        batch_size : int
            Number of sentences per batch; must be between 1 and 1000.

        The DictVectorizer is fitted on the first batch only, so features
        that first appear in later batches are ignored. The final batch
        absorbs whatever remains after the full batches.
        """
        #limit batch size to 1000
        assert 0 < batch_size <= 1000
        self._onehot.fit(['O', 'B', 'I'])

        vectorizer_fitted = False
        start = 0
        end = batch_size
        for _ in tqdm_notebook(range(len(train)//batch_size - 1)):
            X, y = self._featurize(train[start:end])

            if not vectorizer_fitted:
                self._dvec.fit(X)
                vectorizer_fitted = True

            self._clf.train_on_batch(self._dvec.transform(X), np.array(y))

            start += batch_size
            end += batch_size

            print('Upper batch index: ', end)

        #train on last batch
        print('Last batch start index: ', start)
        print('Number of samples in last batch: ', len(train[start:]))
        X, y = self._featurize(train[start:])

        if not X:
            # Nothing left to train on (e.g. empty training set).
            return

        if not vectorizer_fitted:
            # Fewer than two full batches: the loop above never ran, so the
            # vectorizer still has to be fitted before it can transform.
            self._dvec.fit(X)

        self._clf.train_on_batch(self._dvec.transform(X), np.array(y))

    def predict_test(self, sentences):
        """Tag every sentence token by token, feeding predicted tags back as history.

        Parameters
        ----------
        sentences : list of list of tuple
            Untagged sentences; element 0 of each tuple is the token.

        Returns
        -------
        list of list of (token, tag)
            Predicted tag for every token, grouped by sentence.
        """
        predictions = []

        for arr in tqdm_notebook(sentences):
            # The tag history restarts with every sentence, as in training.
            look_back = []
            iob_tagged_tokens = []
            for index, word in enumerate(arr):
                feat = self._dvec.transform(get_features(arr, index, look_back))
                iob_tag = self._onehot.inverse_transform(self._clf.predict(feat))
                tag = str(iob_tag[0])
                # Previously appended to an undefined name `history`, so the
                # call failed and the history was never updated.
                look_back.append(tag)
                iob_tagged_tokens.append((word[0], tag))

            predictions.append(iob_tagged_tokens)

        return predictions

In [13]:
# Build the classifier; constructing it also prints the Keras model summary.
model = RNNClassifer()

# Train on the tagged training sentences, 100 sentences per batch.
model.online_training(new_train, batch_size=100)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 2000)              21980000  
_________________________________________________________________
repeat_vector_1 (RepeatVecto (None, 20, 2000)          0         
_________________________________________________________________
cu_dnnlstm_1 (CuDNNLSTM)     (None, 50)                410400    
_________________________________________________________________
dense_2 (Dense)              (None, 3)                 153       
Total params: 22,390,553
Trainable params: 22,390,553
Non-trainable params: 0
_________________________________________________________________


Upper batch index:  200
Upper batch index:  300
Upper batch index:  400
Upper batch index:  500
Upper batch index:  600
Upper batch index:  700
Upper batch index:  800
Upper batch index:  900
Upper batch index:  1000
Upper batch index:  1100
Upper batch index:  1200
Upper batch index:  1300
Upper batch index:  1400
Upper batch index:  1500
Upper batch index:  1600
Upper batch index:  1700
Upper batch index:  1800
Upper batch index:  1900
Upper batch index:  2000
Upper batch index:  2100
Upper batch index:  2200
Upper batch index:  2300
Upper batch index:  2400
Upper batch index:  2500
Upper batch index:  2600
Upper batch index:  2700
Upper batch index:  2800
Upper batch index:  2900
Upper batch index:  3000
Upper batch index:  3100
Upper batch index:  3200
Upper batch index:  3300
Upper batch index:  3400
Upper batch index:  3500
Upper batch index:  3600
Upper batch index:  3700
Upper batch index:  3800
Upper batch index:  3900
Upper batch index:  4000
Upper batch index:  4100
Upper ba

In [None]:
# Wrap every held-out token in a 1-tuple so that it can be read as token[0].
prep_test = [[(token,) for token in sentence.split()] for sentence in new_test]
prep_test[0]

[('The',),
 ('results',),
 ('showed',),
 ('that',),
 ('MIP',),
 (',',),
 ('MMIF',),
 (',',),
 ('FIC',),
 (',',),
 ('Wimax',),
 (',',),
 ('P0',),
 ('.',),
 ('1',),
 ('and',),
 ('minute',),
 ('ventilation',),
 ('(',),
 ('Vr',),
 (')',),
 ('were',),
 ('significantly',),
 ('increased',),
 ('after',),
 ('administration',),
 ('of',),
 ('methylphenidatum',),
 ('and',),
 ('aminophylline',),
 ('.',)]

In [None]:
# Tag every held-out sentence with the trained model.
predictions = model.predict_test(prep_test)

In [None]:
# Inspect the model's output for the first held-out sentence.
predictions[0]

In [None]:
# Write the gold-standard lines and the model's predictions to
# 'gold_nng.txt' and 'submit_keras_nng.txt' for scoring.
write_gold_standard(gold_test, 'gold_nng')
write_submission(predictions, 'submit_keras_nng')

In [None]:
# Score the submission against the gold standard. The context manager closes
# both files, and separate names keep the `predictions` list from being
# replaced by a file handle (which broke re-running the cell that writes it).
with open('gold_nng.txt') as gold_file, open('submit_keras_nng.txt') as submission_file:
    eval(gold_file, submission_file)

In [None]:
# Show the fields after the first column for each line of the first gold sentence.
[row.replace('\n', '').split('\t')[1:] for row in gold_test[0] if row not in ('<s>', '</s>')]

In [None]:
#max(x_0), max(x_1), #(9002, 25480, 9002)