In [1]:
import copy
import re

from nltk.stem.snowball import SnowballStemmer

from tqdm import tqdm_notebook

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import PassiveAggressiveClassifier

In [2]:
def write_gold_standard(t, name):
    with open(name + '.txt', 'w') as f:
        for arr in t:
            arr.remove('<s>')
            arr.remove('</s>')
            for line in arr:
                f.write(line)
            
    f.close()

In [3]:
def write_submission(t, name):
    with open(name + '.txt', 'w') as f:
        for arr in t:
            for tup in arr:
                line = '\t'.join(['1', tup[0], tup[1]])
                line += '\n'
                f.write(line)
            
    f.close()

In [4]:
def eval(keys, predictions):
    """ Given a stream of gold standard word/tag pairs and a stream of system pairs. Figure out the the recall, precision and F1 """


    goldStandardEntities = findEntities(taggedData(keys))     # get the entities in the gold standard
    systemEntities = findEntities(taggedData(predictions))    # and the entities in the system output

    numEntities = len(goldStandardEntities)                   # number of entities there should be
    numReturned = len(systemEntities)                         # number actually tagged by system
    numTruePositives = len(set.intersection(goldStandardEntities,systemEntities))    # number of those that were right

    precision = float(numTruePositives)/numReturned
    recall = float(numTruePositives)/numEntities
    f1 = 2 * (precision * recall)/(precision + recall)

    print(numEntities, " entities in gold standard.")
    print(numReturned, " total entities found.")
    print(numTruePositives, " of which were correct.")
    
    print("Precision: ", precision, "Recall: ", recall, "F1-measure: ", f1)

def findEntities(data):
    """ Find all the IOB delimited entities in the data.  Return as a set of (begin, end) tuples. Data is sequence of word, tag pairs. """

    entities = set()

    entityStart = 0
    entityEnd = 0
    
    currentState = "Q0"
    count = 0

    for arr in list(data):
        for word, tag in arr.items():
            count = count + 1
            if currentState == "Q0":
                if tag == 'B':
                    currentState = "Q1"
                    entityStart = count
            elif currentState == "Q1":
                if tag == "B":
                    entityEnd = count - 1
                    entities.add((entityStart, entityEnd))
                    entityStart = count
                if tag == "O":
                    entityEnd = count - 1
                    entities.add((entityStart, entityEnd))
                    currentState = "Q0"

    if currentState == "Q1":
        entities.add((entityStart, entityEnd))

    return entities

def taggedData(file):
    for line in file:
        if line.strip() == '':
            yield({'</s>': 'O'})
        else:
            spl = line.strip().split()[1:]
            yield({spl[0] : spl[1]})

In [5]:
sentences = []
s = []

with open('../../data/gene-trainF17.txt') as f:
    for line in f:
        if line != '\n':
            s.append(line)
        else:
            sentences.append(s)
            s = []
            
for i in range(len(sentences)):
    sentences[i].insert(0, '<s>')
    sentences[i].append('</s>')
    
train, test = train_test_split(sentences, test_size=.20, random_state=108)

gold_test = copy.deepcopy(test)

for arr in test:
    for i in range(len(arr)):
        if arr[i] != '<s>' and arr[i] != '</s>':
            spl = arr[i].split('\t')
            arr[i] = '\t'.join((spl[0], spl[1]))

In [6]:
gold_test

[['<s>',
  '1\tMarked\tO\n',
  '2\tsedative\tO\n',
  '3\tactivity\tO\n',
  '4\twas\tO\n',
  '5\tobserved\tO\n',
  '6\twith\tO\n',
  '7\t9\tO\n',
  '8\t-\tO\n',
  '9\tmethoxy\tO\n',
  '10\t-\tO\n',
  '11\t5\tO\n',
  '12\t-\tO\n',
  '13\tphenylpyrrolo\tO\n',
  '14\t[\tO\n',
  '15\t2\tO\n',
  '16\t,\tO\n',
  '17\t1\tO\n',
  '18\t-\tO\n',
  '19\td\tO\n',
  '20\t][\tO\n',
  '21\t1\tO\n',
  '22\t,\tO\n',
  '23\t5\tO\n',
  '24\t]\tO\n',
  '25\tbenzothiazepin\tO\n',
  '26\t-\tO\n',
  '27\t6\tO\n',
  '28\t,\tO\n',
  '29\t6\tO\n',
  '30\t-\tO\n',
  '31\tdioxide\tO\n',
  '32\t(\tO\n',
  '33\tNF19\tO\n',
  '34\t)\tO\n',
  '35\tand\tO\n',
  '36\t9\tO\n',
  '37\t-\tO\n',
  '38\tchloro\tO\n',
  '39\t-\tO\n',
  '40\t5\tO\n',
  '41\t-\tO\n',
  '42\tp\tO\n',
  '43\t-\tO\n',
  '44\tnitrophenylpyrrolo\tO\n',
  '45\t[\tO\n',
  '46\t2\tO\n',
  '47\t,\tO\n',
  '48\t1\tO\n',
  '49\t-\tO\n',
  '50\td\tO\n',
  '51\t][\tO\n',
  '52\t1\tO\n',
  '53\t,\tO\n',
  '54\t5\tO\n',
  '55\t]\tO\n',
  '56\tbenzothiazepin\t

In [7]:
new_train = []

for arr in train:
    tmp = []
    for i in arr:
        if i != '<s>' and i != '</s>':
            line = i.replace('\n', '').split('\t')
            tmp.append((line[1], line[2]))
        else:
            pass
        
    new_train.append(tmp)

In [8]:
new_train[3]

[('We', 'O'),
 ('discuss', 'O'),
 ('these', 'O'),
 ('results', 'O'),
 ('with', 'O'),
 ('respect', 'O'),
 ('to', 'O'),
 ('the', 'O'),
 ('transcriptional', 'O'),
 ('induction', 'O'),
 ('of', 'O'),
 ('the', 'O'),
 ('HNF', 'B'),
 ('-', 'I'),
 ('3', 'I'),
 ('alpha', 'I'),
 ('gene', 'I'),
 ('in', 'O'),
 ('respiratory', 'O'),
 ('epithelium', 'O'),
 ('during', 'O'),
 ('embryogenesis', 'O'),
 ('.', 'O')]

In [9]:
new_test = []

for arr in test:
    tmp = []
    for i in arr:
        if i != '<s>' and i != '</s>':
            tmp.append(i.replace('\n', '').split('\t')[1])
        else:
            pass
        
    new_test.append(' '.join(tmp))

In [10]:
new_test[0]

'Marked sedative activity was observed with 9 - methoxy - 5 - phenylpyrrolo [ 2 , 1 - d ][ 1 , 5 ] benzothiazepin - 6 , 6 - dioxide ( NF19 ) and 9 - chloro - 5 - p - nitrophenylpyrrolo [ 2 , 1 - d ][ 1 , 5 ] benzothiazepin - 6 , 6 - dioxide ( NF20 ).'

# Sklearn: Passive Agressive Classifier
> http://jmlr.csail.mit.edu/papers/volume7/crammer06a/crammer06a.pdf

In [11]:
stemmer = SnowballStemmer('english')

In [12]:
def get_shape(word):
    if re.match('[A-Z][a-z]+$', word):
        shape = 'capital_letter'
    elif re.match('[A-Z]+$', word):
        shape = 'uppercase'
    elif re.match('[a-z]+$', word):
        shape = 'lowercase'
    elif re.match('[A-Z][a-z]+[A-Z][a-z]+[A-Za-z]*$', word):
        shape = 'camelcase'
    elif re.match('[A-Za-z]+$', word):
        shape = 'mixedcase'
    elif re.match('\W+$', word):
        shape = 'punc'
    elif re.match('[0-9]+(\.[0-9]*)?|[0-9]*\.[0-9]+$', word):
        shape = 'number'
    elif re.match('[A-Za-z0-9]+\.[A-Za-z0-9\.]+\.$', word):
        shape = 'abbrev'
    elif re.match('[A-Za-z0-9]+\-[A-Za-z0-9\-]+.*$', word):
        shape = 'contains_hyphen'
    elif re.match('[A-Za-z0-9]+\.$', word):
        shape = 'word_dot'
    elif re.match('__.+__$', word):
        shape = 'other'
    else:
        shape = 'unk'
 
    return shape

In [13]:
def get_features(sent, index, look_back):
    #construct a feature template
    index += 2
    
    sent = [('<s2>', '<s2>'), ('<s1>', '<s1>')] + sent + [('</s1>', '</s1>'), ('</s2>', '</s2>')]
    look_back = ['<s1>', '<s1>'] + look_back
    
    word = sent[index][0]
    n_word = sent[index + 1][0]
    nn_word = sent[index + 2][0]
    p_word = sent[index - 1][0]
    pp_word = sent[index - 2][0]
    p_iob = look_back[-1][0]
    pp_iob = look_back[-2][0]
 
    template = {
            'word': word, 'stem': stemmer.stem(word), 'shape': get_shape(word),
            'n_word': n_word, 'n_stem': stemmer.stem(n_word), 'n_shape': get_shape(n_word),
            'nn_word': nn_word, 'nn_stem': stemmer.stem(nn_word), 'nn_shape': get_shape(nn_word),
            'p_word': p_word, 'p_stem': stemmer.stem(p_word), 'p_iob': p_iob, 'p_shape': get_shape(p_word),
            'pp_word': pp_word, 'pp_stem': stemmer.stem(pp_word), 'pp_iob': pp_iob, 'pp_shape': get_shape(pp_word)
    }
    
    return template

In [14]:
class PAClassifer:
    def __init__(self):
        self._clf = PassiveAggressiveClassifier(C=1.0, loss='hinge', verbose=1, n_jobs=-1)
        self._dvec = DictVectorizer(sparse=False)
        self._iob_labels = ['O', 'B', 'I']
        
    def online_training(self, train, batch_size):
        #limit batch size to 1000
        assert batch_size <= 1000
        
        first_batch = True
        start = 0
        end = batch_size
        for i in tqdm_notebook(range(len(train)//batch_size - 1)):
            sentences = train[start:end]
            
            X, y = [], []
            for tagged in sentences:
                iob_tags = [i[1] for i in tagged]
            
                for index in range(len(tagged)):
                    X.append(get_features(tagged, index, iob_tags[:index]))
                    y.append(iob_tags[index])
                    
            if first_batch == True:
                self._dvec.fit(X)
                X = self._dvec.transform(X)
                self._clf.partial_fit(X, y, self._iob_labels)
                
                first_batch = False
                start += batch_size
                end += batch_size
            else:
                X = self._dvec.transform(X)
                self._clf.partial_fit(X, y, self._iob_labels)
                
                start += batch_size
                end += batch_size
        
            print('Upper batch index: ', end)
        
        #train on last batch
        print('Last batch start index: ', start)
        print('Number of samples in last batch: ', len(train[start:]))
        sentences = train[start:]
            
        X, y = [], []
        for tagged in sentences:
            iob_tags = [i[1] for i in tagged]
            
            for index in range(len(tagged)):
                X.append(get_features(tagged, index, iob_tags[:index]))
                y.append(iob_tags[index])
                    
        X = self._dvec.transform(X)
            
        self._clf.partial_fit(X, y, self._iob_labels)
            
        #Need this to be able to apply predictions
        self._clf = Pipeline([
            ('vectorizer', self._dvec),
            ('classifier', self._clf)
        ])
            
    def predict_samples(self, test):
        test_parse = []
        for sent in test:
            tmp = []
            sent = sent.split()
            for i in sent:
                tmp.append((i, ))
                
            test_parse.append(tmp)
        
        history = []
        predictions = []
        
        for arr in tqdm_notebook(test_parse):
            iob_tagged_tokens = []
            for index, word in list(enumerate(arr)):
                iob_tag = self._clf.predict([get_features(arr, index, history)])[0]
                history.append(iob_tag)
                iob_tagged_tokens.append((word[0], iob_tag))
                
            predictions.append(iob_tagged_tokens)
 
        return predictions

In [15]:
model = PAClassifer()

model.online_training(new_train, batch_size=1000)

-- Epoch 1
-- Epoch 1-- Epoch 1

Norm: 8.57, NNZs: 7861, Bias: -0.298072, T: 28142, Avg. loss: 0.050313
Total training time: 25.29 seconds.
Norm: 10.81, NNZs: 14836, Bias: -0.798136, T: 28142, Avg. loss: 0.102032
Total training time: 25.74 seconds.
Norm: 13.13, NNZs: 18412, Bias: -0.213852, T: 28142, Avg. loss: 0.150338
Total training time: 25.74 seconds.
Upper batch index:  2000


[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:   25.8s finished


-- Epoch 1-- Epoch 1
-- Epoch 1

Norm: 10.80, NNZs: 9217, Bias: -0.294642, T: 27770, Avg. loss: 0.032816
Total training time: 22.13 seconds.
Norm: 14.47, NNZs: 17754, Bias: -1.148814, T: 27770, Avg. loss: 0.089690
Total training time: 22.57 seconds.
Norm: 17.52, NNZs: 21475, Bias: -0.171516, T: 27770, Avg. loss: 0.128113
Total training time: 22.63 seconds.
Upper batch index:  3000


[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:   22.6s finished


-- Epoch 1-- Epoch 1

-- Epoch 1
Norm: 12.62, NNZs: 10205, Bias: -0.406359, T: 28691, Avg. loss: 0.029826
Total training time: 22.84 seconds.
Norm: 17.16, NNZs: 19736, Bias: -1.481559, T: 28691, Avg. loss: 0.087003
Total training time: 23.28 seconds.
Norm: 20.68, NNZs: 23460, Bias: -0.244041, T: 28691, Avg. loss: 0.120796
Total training time: 23.31 seconds.
Upper batch index:  4000


[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:   23.3s finished


-- Epoch 1
-- Epoch 1
-- Epoch 1
Norm: 13.87, NNZs: 10970, Bias: -0.301254, T: 28378, Avg. loss: 0.028037
Total training time: 22.68 seconds.
Norm: 19.19, NNZs: 21139, Bias: -1.265046, T: 28378, Avg. loss: 0.079024
Total training time: 23.11 seconds.
Norm: 22.88, NNZs: 24997, Bias: -0.108921, T: 28378, Avg. loss: 0.111349
Total training time: 23.28 seconds.
Upper batch index:  

[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:   23.3s finished


5000
-- Epoch 1-- Epoch 1

-- Epoch 1
Norm: 15.01, NNZs: 11568, Bias: -0.374351, T: 27447, Avg. loss: 0.028692
Total training time: 22.63 seconds.
Norm: 20.89, NNZs: 22248, Bias: -1.339710, T: 27447, Avg. loss: 0.079126
Total training time: 22.98 seconds.
Norm: 24.79, NNZs: 26106, Bias: -0.347311, T: 27447, Avg. loss: 0.112502
Total training time: 23.11 seconds.
Upper batch index:  6000

[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:   23.1s finished



-- Epoch 1
-- Epoch 1-- Epoch 1

Norm: 15.90, NNZs: 12174, Bias: -0.388723, T: 27794, Avg. loss: 0.025159
Total training time: 22.11 seconds.
Norm: 22.49, NNZs: 23249, Bias: -1.400475, T: 27794, Avg. loss: 0.078028
Total training time: 22.28 seconds.
Norm: 26.54, NNZs: 27070, Bias: -0.108349, T: 27794, Avg. loss: 0.108588
Total training time: 22.43 seconds.
Upper batch index:  7000


[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:   22.4s finished


-- Epoch 1
-- Epoch 1-- Epoch 1

Norm: 16.82, NNZs: 12577, Bias: -0.521448, T: 27607, Avg. loss: 0.026077
Total training time: 26.84 seconds.
Norm: 23.98, NNZs: 24149, Bias: -1.484770, T: 27607, Avg. loss: 0.076499
Total training time: 26.85 seconds.
Norm: 28.34, NNZs: 27944, Bias: -0.307771, T: 27607, Avg. loss: 0.109358
Total training time: 26.98 seconds.
Upper batch index: 

[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:   27.0s finished


 8000
-- Epoch 1
-- Epoch 1
-- Epoch 1
Norm: 17.74, NNZs: 13103, Bias: -0.151247, T: 27535, Avg. loss: 0.028434
Total training time: 22.70 seconds.
Norm: 29.85, NNZs: 28704, Bias: -0.349569, T: 27535, Avg. loss: 0.111675
Total training time: 23.30 seconds.
Norm: 25.26, NNZs: 24880, Bias: -1.534863, T: 27535, Avg. loss: 0.076343
Total training time: 23.32 seconds.
Upper batch index:  9000


[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:   23.3s finished


-- Epoch 1
-- Epoch 1-- Epoch 1

Norm: 18.53, NNZs: 13557, Bias: -0.555264, T: 27941, Avg. loss: 0.026579
Total training time: 26.97 seconds.
Norm: 26.48, NNZs: 25512, Bias: -1.625634, T: 27941, Avg. loss: 0.073351
Total training time: 27.46 seconds.
Norm: 31.24, NNZs: 29323, Bias: -0.769162, T: 27941, Avg. loss: 0.107605
Total training time: 27.47 seconds.
Upper batch index:  10000


[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:   27.5s finished


-- Epoch 1-- Epoch 1

-- Epoch 1
Norm: 19.26, NNZs: 13908, Bias: -0.614833, T: 27864, Avg. loss: 0.025349
Total training time: 25.88 seconds.
Norm: 27.51, NNZs: 26113, Bias: -1.553780, T: 27864, Avg. loss: 0.071167
Total training time: 26.33 seconds.
Norm: 32.59, NNZs: 29862, Bias: -0.083304, T: 27864, Avg. loss: 0.104142
Total training time: 26.45 seconds.
Upper batch index:  11000


[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:   26.5s finished



Last batch start index:  10000
Number of samples in last batch:  1036
-- Epoch 1
-- Epoch 1
-- Epoch 1
Norm: 19.99, NNZs: 14300, Bias: -0.315786, T: 28142, Avg. loss: 0.022663
Total training time: 23.40 seconds.
Norm: 28.61, NNZs: 26691, Bias: -1.293059, T: 28142, Avg. loss: 0.072090
Total training time: 23.52 seconds.
Norm: 33.79, NNZs: 30403, Bias: -0.367450, T: 28142, Avg. loss: 0.103891
Total training time: 23.53 seconds.


[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:   23.5s finished


In [16]:
predictions = model.predict_samples(new_test)




In [17]:
write_gold_standard(gold_test, 'gold_pa')
write_submission(predictions, 'submit_pa')

In [18]:
keys = open('gold_pa.txt')
predictions = open('submit_pa.txt')
eval(keys, predictions)

3467  entities in gold standard.
4655  total entities found.
2003  of which were correct.
Precision:  0.43029001074113854 Recall:  0.5777329102970868 F1-measure:  0.49322826889928584
