# Mars Target Encyclopedia  - NER
Thamme Gowda (Thamme.Gowda@jpl.nasa.gov)

Named Entity Recognition / Sequence Tagging
This notebook contains NER tagging using CRF suite


### Notes:
 + Use python3, Reason: we need unicode strings, which is default in python3
 + install Python-crfsuite
 + Start CoreNLP Server

In [46]:
from itertools import chain
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import LabelBinarizer
import sklearn
import pycrfsuite
from pycorenlp import StanfordCoreNLP
from codecs import open as copen

import os, glob
import pickle

print(sklearn.__version__)

0.18.1


In [2]:
#accept_labels = set(['Element', 'Mineral', 'Target', 'Material', 'Locality', 'Site'])
accept_labels = set(['Target'])


class BratToCRFSuitFeaturizer(object):
    def __init__(self, corenlp_url='http://localhost:9000', iob=False):
        '''
        Create Converter for converting brat annotations to Core NLP NER CRF
        classifier training data.
        @param corenlp_url: URL to corenlp server.
                To start the server checkout: http://stanfordnlp.github.io/CoreNLP/corenlp-server.html#getting-started
        @param iob: set 'True' for IOB encoding
        '''
        self.corenlp = StanfordCoreNLP(corenlp_url)
        self.iob = iob

    def convert(self, text_file, ann_file):
        text, tree = self.parse(text_file, ann_file)
        props = { 'annotators': 'tokenize,ssplit,lemma,pos', 'outputFormat': 'json'}
        if text[0].isspace():
            text = '.' + text[1:]
            # Reason: some tools trim/strip off the white spaces
            # which will mismatch the character offsets
        output = self.corenlp.annotate(text, properties=props)
        records = []
        for sentence in output['sentences']:
            sent_features = []
            continue_ann, continue_ann_en = None, None
            for tok in sentence['tokens']:
                begin, tok_end = tok['characterOffsetBegin'], tok['characterOffsetEnd']
                label = 'O'
                if begin in tree:
                    node = tree[begin]
                    if len(node) > 1:
                        print("WARN: multiple starts at ", begin, node)
                        if tok_end in node:
                            node = {tok_end: node[tok_end]} # picking one
                            print("Chose:", node)

                    ann_end, labels = list(node.items())[0]
                    if not len(labels) == 1:
                        print("WARN: Duplicate labels for token: %s, label:%s.\
                              Using the first one!" % (tok['word'], str(labels)))
                    if accept_labels is not None and labels[0] in accept_labels:
                        label = labels[0]

                    if tok_end == ann_end: # annotation ends where token ends
                        continue_ann = None
                    elif tok_end < ann_end and label != 'O':
                        #print("Continue for the next %d chars" % (ann_end - tok_end))
                        continue_ann = label
                        continue_ann_end = ann_end 
                    if label != 'O' and self.iob:
                        label = "B-" + label
                elif continue_ann is not None and tok_end <= continue_ann_end:
                    #print("Continuing the annotation %s, %d:%d %d]" % 
                    #(continue_ann, begin, tok_end, continue_ann_end))
                    label = continue_ann            # previous label is this label
                    if continue_ann_end == tok_end: # continuation ends here
                        #print("End")
                        continue_ann = None
                    if self.iob:
                        label = "I-" + label
                sent_features.append([tok['word'], tok['lemma'], tok['pos'], label])
            yield sent_features


    def parse(self, txt_file, ann_file):
        with copen(ann_file, 'r', encoding='utf-8') as ann_file:
            with copen(txt_file, 'r', encoding='utf-8') as text_file:
                texts = text_file.read()
            anns = map(lambda x: x.strip().split('\t'), ann_file)
            anns = filter(lambda x: len(x) > 2, anns)
            # FIXME: ignoring the annotatiosn which are complex

            anns = filter(lambda x: ';' not in x[1], anns)
            # FIXME: some annotations' spread have been split into many, separated by ; ignoring them

            def __parse_ann(ann):
                spec = ann[1].split()
                name = spec[0]
                markers = list(map(lambda x: int(x), spec[1:]))
                #t = ' '.join([texts[begin:end] for begin,end in zip(markers[::2], markers[1::2])])
                t = texts[markers[0]:markers[1]]
                if not t == ann[2]:
                    print("Error: Annotation mis-match, file=%s, ann=%s" % (txt_file, str(ann)))
                    return None
                return (name, markers, t)
            anns = map(__parse_ann, anns) # format
            anns = filter(lambda x: x, anns) # skip None

            # building a tree index for easy accessing
            tree = {}
            for entity_type, pos, name in anns:
                begin, end = pos[0], pos[1]
                if begin not in tree:
                    tree[begin] = {}
                node = tree[begin]
                if end not in node:
                    node[end] = []
                node[end].append(entity_type)

            # Re-read file in without decoding it
            text_file = copen(txt_file, 'r', encoding='utf-8')
            texts = text_file.read()
            text_file.close()
            return texts, tree

def scan_dir(dir_name):
    items = glob.glob(dir_name + "/*.ann")
    items = map(lambda f: (f, f.replace(".ann", ".txt")), items)
    return items

def preprocess_all(list_file, out_file):
    featzr = BratToCRFSuitFeaturizer(iob=True)
    tokenized = []
    with open(list_file) as f:
        examples = map(lambda l:l.strip().split(','), f.readlines())
    for txt_file, ann_file in examples:
        sents = featzr.convert(txt_file, ann_file)
        tokenized.append(list(sents))

    pickle.dump(tokenized, open(out_file, 'wb'))
    print("Dumped %d docs to %s" % (len(tokenized), out_file))

## Parse and store the corpus

In this step, we pass the text through CoreNLP pipeline, tokenize and POS tag them. 
In addition, we lookup the annotations file and match the target annotations with the token. 

Since this step is expensive, we store the results in pickle file, so that we can later load and resume our analysis for feature engineering.

In [None]:
p_dir = "/Users/thammegr/work/mte/data/newcorpus/workspace"
train_list = p_dir + "/train_62r15_685k14_384k15.list"
dev_list= p_dir + "/development.list"
test_list = p_dir + "/test.list"

train_corpus_file = 'mte-corpus-train.pickle'
preprocess_all(train_list, train_corpus_file)

# Test and Development set
dev_corpus_file = 'mte-corpus-dev.pickle'
preprocess_all(dev_list, dev_corpus_file)
test_corpus_file = 'mte-corpus-test.pickle'
preprocess_all(test_list, test_corpus_file)

## Load the corpus
Here we load the corpus from pickle file

In [5]:
corpus_file = 'mte-corpus-train.pickle'
corpus = pickle.load(open(corpus_file, 'rb'))
corpus[0][10]

[['Hollow', 'hollow', 'JJ', 'O'],
 ['spherical', 'spherical', 'JJ', 'O'],
 ['feature', 'feature', 'NN', 'O'],
 ['observed', 'observe', 'VBN', 'O'],
 ['on', 'on', 'IN', 'O'],
 ['sol', 'sol', 'NN', 'O'],
 ['122', '122', 'CD', 'O'],
 ['in', 'in', 'IN', 'O'],
 ['the', 'the', 'DT', 'O'],
 ['Yellowknife', 'Yellowknife', 'NNP', 'O'],
 ['Bay', 'Bay', 'NNP', 'O'],
 ['area', 'area', 'NN', 'O'],
 ['.', '.', '.', 'O']]

Next, we start playing with the features of CRF Suite to build a sequence tagger.

In [80]:
%%time

def word2features(sent, idx):
    word = sent[idx]
    
    words = []
    feats = []

    # current word
    words.append(('', sent[idx])) 
    # look behind
    if idx > 0:
        words.append(('-1:', sent[idx-1]))
        # look behind one more
        #if idx > 1:
        #    words.append(('-2:', sent[idx-2]))
    else:
        feats.append('BOS') # begin of sequence
    # look ahead
    if idx < len(sent) - 1:
        words.append(('+1:', sent[idx+1]))
        # look ahead one more
        if idx < len(sent) - 2:
            words.append(('+2:', sent[idx+2]))
    else:
        feats.append('EOS') # end of sequence
    for prefix, word in words:
        feats.extend([
                '%sword.bias'% (prefix),
                #'%sword.lemma=%s' %(prefix, word[1]),
                '%sword.pos=%s' %(prefix, word[2]),
                #'%sword.lower=%s' %(prefix, word[0].lower()),
                '%sword[-3:]=%s' %(prefix, word[0][-3:]),
                '%sword[-2:]=%s' %(prefix, word[0][-2:]),
                #'%sword.isupper=%s' % (prefix, word[0].isupper()),
                #'%sword.istitle=%s' % (prefix, word[0].istitle()),
                '%sword.islower=%s' % (prefix, word[0].islower()),
                #'%sword.isdigit=%s' % (prefix, word[0].isdigit()),
            ])
    return feats

def seq2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]

def seq2labels(sent):
    return [tok[3] for tok in sent]

def merge_sequences(doc):
    '''
    document contains multiple sentences. here all sentences in document are merged to form one large sequence.
    '''
    res = []
    for seq in doc:
        res.extend(seq)
        res.append(['|', '|', '|', 'O']) # sentence end marker
    return res
    
trainer = pycrfsuite.Trainer(verbose=False)
#print a sample
flag = True
for doc in corpus:
    seq = merge_sequences(doc)
    x_seq = seq2features(seq)
    if flag:
        p = 5
        print("\n".join(map(str, seq[p-3:p+3])))
        print("\n".join(x_seq[p]))
        flag = False
    y_seq = seq2labels(seq)
    trainer.append(x_seq, y_seq)

trainer.set_params({
    'c1': 0.5,   # coefficient for L1 penalty
    'c2': 1e-3,  # coefficient for L2 penalty
    'max_iterations': 50,  # stop earlier
    # include transitions that are possible, but not observed
    'feature.possible_transitions': True
})

print(trainer.params())
model_file = 'jpl-mars-target-ner-model.crfsuite'
trainer.train(model_file)
print("Training Done")

!ls -alh 'jpl-mars-target-ner-model.crfsuite'
trainer.logparser.last_iteration

['|', '|', '|', 'O']
['PDF', 'PDF', 'NNP', 'O']
['CENTIMETER', 'CENTIMETER', 'NNP', 'O']
['TO', 'to', 'TO', 'O']
['DECIMETER', 'DECIMETER', 'NNP', 'O']
['SIZE', 'SIZE', 'NNP', 'O']
word.bias
word.pos=TO
word[-3:]=TO
word[-2:]=TO
word.islower=False
-1:word.bias
-1:word.pos=NNP
-1:word[-3:]=TER
-1:word[-2:]=ER
-1:word.islower=False
+1:word.bias
+1:word.pos=NNP
+1:word[-3:]=TER
+1:word[-2:]=ER
+1:word.islower=False
+2:word.bias
+2:word.pos=NNP
+2:word[-3:]=IZE
+2:word[-2:]=ZE
+2:word.islower=False
['feature.minfreq', 'feature.possible_states', 'feature.possible_transitions', 'c1', 'c2', 'max_iterations', 'num_memories', 'epsilon', 'period', 'delta', 'linesearch', 'max_linesearch']
Training Done
-rw-r--r--  1 thammegr  703763885   160K Feb 16 10:57 jpl-mars-target-ner-model.crfsuite
CPU times: user 1min 38s, sys: 251 ms, total: 1min 38s
Wall time: 1min 39s


# Using the model to predict

In [81]:
tagger = pycrfsuite.Tagger()
tagger.open(model_file)
doc = corpus[20]
seq = merge_sequences(doc)

y = seq2labels(seq)
y_ = tagger.tag(seq2features(seq))

c = 0
print("Truth, Predicted, [******]")
for idx, a,p, tok in zip(range(len(seq)), y, y_, seq):
    if a != 'O' or p != 'O':
        print(idx, a, p, tok, "<<<<<ERROR" if a != p else "")
        if a != p:
            c += 1
print(c)

Truth, Predicted, [******]
337 B-Target B-Target ['Confidence', 'Confidence', 'NNP', 'B-Target'] 
338 I-Target I-Target ['Hills', 'Hills', 'NNP', 'I-Target'] 
538 B-Target O ['Confidence', 'confidence', 'NN', 'B-Target'] <<<<<ERROR
539 I-Target O ['Hill', 'Hill', 'NNP', 'I-Target'] <<<<<ERROR
712 B-Target O ['Shoemaker', 'Shoemaker', 'NNP', 'B-Target'] <<<<<ERROR
714 B-Target B-Target ['Alexander', 'Alexander', 'NNP', 'B-Target'] 
715 I-Target I-Target ['Hills', 'Hills', 'NNP', 'I-Target'] 
718 B-Target B-Target ['Chinle', 'Chinle', 'NNP', 'B-Target'] 
735 B-Target B-Target ['Pink', 'Pink', 'NNP', 'B-Target'] 
736 I-Target I-Target ['Cliffs', 'Cliffs', 'NNPS', 'I-Target'] 
738 B-Target B-Target ['Alexander', 'Alexander', 'NNP', 'B-Target'] 
739 I-Target I-Target ['Hills', 'Hills', 'NNP', 'I-Target'] 
780 B-Target B-Target ['Pink', 'Pink', 'NNP', 'B-Target'] 
781 I-Target I-Target ['Cliffs', 'Cliffs', 'NNPS', 'I-Target'] 
783 B-Target B-Target ['Book', 'book', 'VB', 'B-Target'] 
784 I-T

# Evaluate

In [82]:
def bio_classification_report(y_true, y_pred):
    """
    Classification report for a list of BIO-encoded sequences.
    It computes token-level metrics and discards "O" labels.
    
    Note that it requires scikit-learn 0.15+ (or a version from github master)
    to calculate averages properly!
    """
    lb = LabelBinarizer()
    y_true_combined = lb.fit_transform(list(chain.from_iterable(y_true)))
    y_pred_combined = lb.transform(list(chain.from_iterable(y_pred)))
        
    tagset = set(lb.classes_) - {'O'}
    tagset = sorted(tagset, key=lambda tag: tag.split('-', 1)[::-1])
    #tagset.append('O')
    class_indices = {cls: idx for idx, cls in enumerate(lb.classes_)}
    
    return classification_report(
        y_true_combined,
        y_pred_combined,
        labels = [class_indices[cls] for cls in tagset],
        target_names = tagset,
    )

def evaluate(tagger, corpus_file):    
    corpus = pickle.load(open(corpus_file, 'rb'))
    y_pred = []
    y_true = []
    for doc in corpus:
        seq = merge_sequences(doc)
        y_true.append(seq2labels(seq))
        y_pred.append(tagger.tag(seq2features(seq)))
    return bio_classification_report(y_true, y_pred)


dev_corpus_file = 'mte-corpus-dev.pickle'
test_corpus_file = 'mte-corpus-test.pickle'
print("Development")
print(evaluate(tagger, dev_corpus_file))
print("Testing")
print(evaluate(tagger, test_corpus_file))

Development
             precision    recall  f1-score   support

   B-Target       0.86      0.17      0.28       147
   I-Target       0.75      0.43      0.55        14

avg / total       1.00      1.00      0.99     34970

Testing
             precision    recall  f1-score   support

   B-Target       0.92      0.25      0.39       194
   I-Target       0.89      0.40      0.55        20

avg / total       1.00      1.00      1.00     60630



# Learning: State Transitions

In [61]:
from collections import Counter
info = tagger.info()

def print_transitions(trans_features):
    for (label_from, label_to), weight in trans_features:
        print("%-6s -> %-7s %0.6f" % (label_from, label_to, weight))

print("Top likely transitions:")
print_transitions(Counter(info.transitions).most_common(15))

print("\nTop unlikely transitions:")
print_transitions(Counter(info.transitions).most_common()[-15:])

Top likely transitions:
O      -> O       2.140082
B-Target -> B-Target 1.862358
B-Target -> I-Target 1.775527
I-Target -> B-Target 1.398788
I-Target -> I-Target 0.141875
I-Target -> O       -0.285274
O      -> B-Target -0.765632
B-Target -> O       -0.982107
O      -> I-Target -9.054744

Top unlikely transitions:
O      -> O       2.140082
B-Target -> B-Target 1.862358
B-Target -> I-Target 1.775527
I-Target -> B-Target 1.398788
I-Target -> I-Target 0.141875
I-Target -> O       -0.285274
O      -> B-Target -0.765632
B-Target -> O       -0.982107
O      -> I-Target -9.054744


# Learning: State Features

In [62]:
def print_state_features(state_features):
    for (attr, label), weight in state_features:
        print("%0.6f %-6s %s" % (weight, label, attr))    

print("Top positive:")
print_state_features(Counter(info.state_features).most_common(20))

print("\nTop negative:")
print_state_features(Counter(info.state_features).most_common()[-20:])

Top positive:
8.374055 O      word.islower=True
4.385436 B-Target word[-3:]=h-5
4.237877 B-Target word[-3:]=bed
4.030836 B-Target word[-3:]=_RP
3.594240 B-Target word[-3:]=d_7
3.594240 B-Target word[-2:]=_7
3.569414 B-Target word[-3:]=h_5
3.551940 B-Target word[-3:]=kee
3.526859 B-Target word[-3:]=bud
3.503635 B-Target word[-3:]=Dam
3.391822 I-Target -1:word[-3:]=ind
3.321321 I-Target -1:word[-3:]=nce
3.316165 O      +1:word[-3:]=ter
3.294934 B-Target word[-3:]=kle
3.200349 I-Target -1:word[-2:]=rt
3.195360 B-Target word[-3:]=JK
3.193591 B-Target word[-2:]=JK
3.102621 B-Target word[-3:]=pau
3.082004 B-Target word[-3:]=_7P
3.079147 B-Target word[-3:]=k-5

Top negative:
-2.446953 O      word[-2:]=ah
-2.611947 O      word[-3:]=dox
-2.629782 O      +1:word[-3:]=hed
-2.683963 O      word[-3:]=aat
-2.692232 O      word[-3:]=WER
-2.726936 O      word[-2:]=ak
-2.734673 O      word[-3:]=hew
-2.743660 O      word[-2:]=_5
-2.796850 O      word[-3:]=let
-2.798843 O      word[-3:]=bed
-2.842340 O  

In [38]:
"AbCd".istitle()

False