# Mars Target Encyclopedia  - NER
Thamme Gowda (Thamme.Gowda@jpl.nasa.gov)

Named Entity Recognition / Sequence Tagging
This notebook contains NER tagging using CRF suite


### Notes:
 + Use python3, Reason: we need unicode strings, which is default in python3
 + install Python-crfsuite
 + Start CoreNLP Server

In [45]:
from itertools import chain
import nltk
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import LabelBinarizer
import sklearn
import pycrfsuite
from pycorenlp import StanfordCoreNLP
from codecs import open as copen

import os, glob
import pickle

print(sklearn.__version__)

0.18.1


In [51]:
#accept_labels = set(['Element', 'Mineral', 'Target', 'Material', 'Locality', 'Site'])
accept_labels = set(['Target'])


class BratToCRFSuitFeaturizer(object):
    def __init__(self, corenlp_url='http://localhost:9000', iob=False):
        '''
        Create Converter for converting brat annotations to Core NLP NER CRF
        classifier training data.
        @param corenlp_url: URL to corenlp server.
                To start the server checkout: http://stanfordnlp.github.io/CoreNLP/corenlp-server.html#getting-started
        @param iob: set 'True' for IOB encoding
        '''
        self.corenlp = StanfordCoreNLP(corenlp_url)
        self.iob = iob

    def convert(self, text_file, ann_file):
        text, tree = self.parse(text_file, ann_file)
        props = { 'annotators': 'tokenize,ssplit,lemma,pos', 'outputFormat': 'json'}
        if text[0].isspace():
            text = '.' + text[1:]
            # Reason: some tools trim/strip off the white spaces
            # which will mismatch the character offsets
        output = self.corenlp.annotate(text, properties=props)
        records = []
        for sentence in output['sentences']:
            sent_features = []
            continue_ann, continue_ann_en = None, None
            for tok in sentence['tokens']:
                begin, tok_end = tok['characterOffsetBegin'], tok['characterOffsetEnd']
                label = 'O'
                if begin in tree:
                    node = tree[begin]
                    if len(node) > 1:
                        print("WARN: multiple starts at ", begin, node)
                        if tok_end in node:
                            node = {tok_end: node[tok_end]} # picking one
                            print("Chose:", node)

                    ann_end, labels = list(node.items())[0]
                    if not len(labels) == 1:
                        print("WARN: Duplicate labels for token: %s, label:%s.\
                              Using the first one!" % (tok['word'], str(labels)))
                    if accept_labels is not None and labels[0] in accept_labels:
                        label = labels[0]

                    if tok_end == ann_end: # annotation ends where token ends
                        continue_ann = None
                    elif tok_end < ann_end and label != 'O':
                        #print("Continue for the next %d chars" % (ann_end - tok_end))
                        continue_ann = label
                        continue_ann_end = ann_end 
                    if label != 'O' and self.iob:
                        label = "B-" + label
                elif continue_ann is not None and tok_end <= continue_ann_end:
                    #print("Continuing the annotation %s, %d:%d %d]" % 
                    #(continue_ann, begin, tok_end, continue_ann_end))
                    label = continue_ann            # previous label is this label
                    if continue_ann_end == tok_end: # continuation ends here
                        #print("End")
                        continue_ann = None
                    if self.iob:
                        label = "I-" + label
                sent_features.append([tok['word'], tok['lemma'], tok['pos'], label])
            yield sent_features


    def parse(self, txt_file, ann_file):
        with copen(ann_file, 'r', encoding='utf-8') as ann_file:
            with copen(txt_file, 'r', encoding='utf-8') as text_file:
                texts = text_file.read()
            anns = map(lambda x: x.strip().split('\t'), ann_file)
            anns = filter(lambda x: len(x) > 2, anns)
            # FIXME: ignoring the annotatiosn which are complex

            anns = filter(lambda x: ';' not in x[1], anns)
            # FIXME: some annotations' spread have been split into many, separated by ; ignoring them

            def __parse_ann(ann):
                spec = ann[1].split()
                name = spec[0]
                markers = list(map(lambda x: int(x), spec[1:]))
                #t = ' '.join([texts[begin:end] for begin,end in zip(markers[::2], markers[1::2])])
                t = texts[markers[0]:markers[1]]
                if not t == ann[2]:
                    print("Error: Annotation mis-match, file=%s, ann=%s" % (txt_file, str(ann)))
                    return None
                return (name, markers, t)
            anns = map(__parse_ann, anns) # format
            anns = filter(lambda x: x, anns) # skip None

            # building a tree index for easy accessing
            tree = {}
            for entity_type, pos, name in anns:
                begin, end = pos[0], pos[1]
                if begin not in tree:
                    tree[begin] = {}
                node = tree[begin]
                if end not in node:
                    node[end] = []
                node[end].append(entity_type)

            # Re-read file in without decoding it
            text_file = copen(txt_file, 'r', encoding='utf-8')
            texts = text_file.read()
            text_file.close()
            return texts, tree

def scan_dir(dir_name):
    items = glob.glob(dir_name + "/*.ann")
    items = map(lambda f: (f, f.replace(".ann", ".txt")), items)
    return items

def preprocess_all(list_file, out_file):
    featzr = BratToCRFSuitFeaturizer(iob=True)
    tokenized = []
    with open(list_file) as f:
        examples = map(lambda l:l.strip().split(','), f.readlines())
    for txt_file, ann_file in examples:
        sents = featzr.convert(txt_file, ann_file)
        tokenized.append(list(sents))

    pickle.dump(tokenized, open(out_file, 'wb'))
    print("Dumped %d docs to %s" % (len(tokenized), out_file))

## Parse and store the corpus

In this step, we pass the text through CoreNLP pipeline, tokenize and POS tag them. 
In addition, we lookup the annotations file and match the target annotations with the token. 

Since this step is expensive, we store the results in pickle file, so that we can later load and resume our analysis for feature engineering.

In [52]:
p_dir = "/Users/thammegr/work/mte/data/newcorpus/workspace"
train_list = p_dir + "/train_62r15_685k14_384k15.list"
dev_list= p_dir + "/development.list"
test_list = p_dir + "/test.list"

train_corpus_file = 'mte-corpus-train.pickle'
preprocess_all(train_list, train_corpus_file)

# Test and Development set
dev_corpus_file = 'mte-corpus-dev.pickle'
preprocess_all(dev_list, dev_corpus_file)
test_corpus_file = 'mte-corpus-test.pickle'
preprocess_all(test_list, test_corpus_file)

WARN: multiple starts at  4183 {4224: ['Feature'], 4189: ['Element']}
Chose: {4189: ['Element']}
WARN: multiple starts at  5862 {5865: ['Material'], 5878: ['Feature']}
WARN: multiple starts at  2533 {2553: ['Member'], 2546: ['Target']}
Chose: {2546: ['Target']}
WARN: multiple starts at  2648 {2659: ['Member'], 2652: ['Target']}
Chose: {2652: ['Target']}
WARN: multiple starts at  2733 {2750: ['Member'], 2743: ['Target']}
Chose: {2743: ['Target']}
WARN: multiple starts at  2816 {2832: ['Member'], 2825: ['Target']}
Chose: {2825: ['Target']}
WARN: multiple starts at  2944 {2965: ['Member'], 2958: ['Site']}
WARN: multiple starts at  3052 {3065: ['Member'], 3058: ['Target']}
Chose: {3058: ['Target']}
WARN: multiple starts at  3270 {3281: ['Member'], 3274: ['Target']}
Chose: {3274: ['Target']}
WARN: multiple starts at  4443 {4459: ['Member'], 4452: ['Target']}
Chose: {4452: ['Target']}
WARN: multiple starts at  4743 {4749: ['Target'], 4758: ['Site']}
Chose: {4749: ['Target']}
WARN: multiple s

## Load the corpus
Here we load the corpus from pickle file

In [53]:
corpus_file = 'mte-corpus-train.pickle'
corpus = pickle.load(open(corpus_file, 'rb'))
corpus[0][10]

[['Hollow', 'hollow', 'JJ', 'O'],
 ['spherical', 'spherical', 'JJ', 'O'],
 ['feature', 'feature', 'NN', 'O'],
 ['observed', 'observe', 'VBN', 'O'],
 ['on', 'on', 'IN', 'O'],
 ['sol', 'sol', 'NN', 'O'],
 ['122', '122', 'CD', 'O'],
 ['in', 'in', 'IN', 'O'],
 ['the', 'the', 'DT', 'O'],
 ['Yellowknife', 'Yellowknife', 'NNP', 'O'],
 ['Bay', 'Bay', 'NNP', 'O'],
 ['area', 'area', 'NN', 'O'],
 ['.', '.', '.', 'O']]

Next, we start playing with the features of CRF Suite to build a sequence tagger.

In [84]:
%%time

def word2features(sent, idx):
    word = sent[idx]
    feats = [
        'word.bias',
        'word.lemma=' + word[1],
        'word.pos=' + word[2],
        'word.lower=' + word[0].lower(),
        'word[-3:]=' + word[0][-3:],
        'word[-2:]=' + word[0][-2:],
        'word.isupper=%s' % word[0].isupper(),
        'word.istitle=%s' % word[0].istitle(),
        'word.isdigit=%s' % word[0].isdigit(),
    ]
    if idx > 0:
        word = sent[idx-1]
        feats.extend([
            '-1:word.bias',
            '-1:word.lemma=' + word[1],
            '-1:word.pos=' + word[2],
            '-1:word.lower=' + word[0].lower(),
            '-1:word[-3:]=' + word[0][-3:],
            '-1:word[-2:]=' + word[0][-2:],
            '-1:word.isupper=%s' % word[0].isupper(),
            '-1:word.istitle=%s' % word[0].istitle(),
            '-1:word.isdigit=%s' % word[0].isdigit(),
        ])
    else:
        feats.append('BOS')
    if idx < len(sent) - 1:
        word = sent[idx + 1]
        feats.extend([
            '+1:word.bias',
            '+1:word.lemma=' + word[1],
            '+1:word.pos=' + word[2],
            '+1:word.lower=' + word[0].lower(),
            '+1:word[-3:]=' + word[0][-3:],
            '+1:word[-2:]=' + word[0][-2:],
            '+1:word.isupper=%s' % word[0].isupper(),
            '+1:word.istitle=%s' % word[0].istitle(),
            '+1:word.isdigit=%s' % word[0].isdigit(),
        ])
    else:
        feats.append('EOS')
    return feats

def seq2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]

def seq2labels(sent):
    return [tok[3] for tok in sent]

def merge_sequences(doc):
    '''
    document contains multiple sentences. here all sentences in document are merged to form one large sequence.
    '''
    res = []
    for seq in doc:
        res.extend(seq)
        res.append(['|', '|', '|', 'O']) # sentence end marker
    return res
    
trainer = pycrfsuite.Trainer(verbose=False)
for doc in corpus:
    seq = merge_sequences(doc)
    x_seq = seq2features(seq)
    y_seq = seq2labels(seq)
    trainer.append(x_seq, y_seq)

trainer.set_params({
    'c1': 0.5,   # coefficient for L1 penalty
    'c2': 1e-3,  # coefficient for L2 penalty
    'max_iterations': 50,  # stop earlier
    # include transitions that are possible, but not observed
    'feature.possible_transitions': True
})

print(trainer.params())
model_file = 'jpl-mars-target-ner-model.crfsuite'
trainer.train(model_file)
print("Training Done")

!ls -alh 'jpl-mars-target-ner-model.crfsuite'
trainer.logparser.last_iteration

['feature.minfreq', 'feature.possible_states', 'feature.possible_transitions', 'c1', 'c2', 'max_iterations', 'num_memories', 'epsilon', 'period', 'delta', 'linesearch', 'max_linesearch']
Training Done
-rw-r--r--  1 thammegr  703763885   214K Feb 15 16:53 jpl-mars-target-ner-model.crfsuite
CPU times: user 2min 18s, sys: 1.08 s, total: 2min 19s
Wall time: 2min 19s


# Using the model to predict

In [85]:
tagger = pycrfsuite.Tagger()
tagger.open(model_file)
doc = corpus[20]
seq = merge_sequences(doc)

y = seq2labels(seq)
y_ = tagger.tag(seq2features(seq))

c = 0
for a,p, tok in zip(y,y_, seq):
    #if a != 'O' and p != 'O':
    if a != p:
        print(a,p,tok)
        c += 1
print(c)

B-Target O ['Shoemaker', 'Shoemaker', 'NNP', 'B-Target']
1


# Evaluate

In [86]:
def bio_classification_report(y_true, y_pred):
    """
    Classification report for a list of BIO-encoded sequences.
    It computes token-level metrics and discards "O" labels.
    
    Note that it requires scikit-learn 0.15+ (or a version from github master)
    to calculate averages properly!
    """
    lb = LabelBinarizer()
    y_true_combined = lb.fit_transform(list(chain.from_iterable(y_true)))
    y_pred_combined = lb.transform(list(chain.from_iterable(y_pred)))
        
    tagset = set(lb.classes_) - {'O'}
    tagset = sorted(tagset, key=lambda tag: tag.split('-', 1)[::-1])
    #tagset.append('O')
    class_indices = {cls: idx for idx, cls in enumerate(lb.classes_)}
    
    return classification_report(
        y_true_combined,
        y_pred_combined,
        labels = [class_indices[cls] for cls in tagset],
        target_names = tagset,
    )

def evaluate(tagger, corpus_file):    
    corpus = pickle.load(open(corpus_file, 'rb'))
    y_pred = []
    y_true = []
    for doc in corpus:
        seq = merge_sequences(doc)
        y_true.append(seq2labels(seq))
        y_pred.append(tagger.tag(seq2features(seq)))
    return bio_classification_report(y_true, y_pred)

print(evaluate(tagger, dev_corpus_file))
print(evaluate(tagger, test_corpus_file))

             precision    recall  f1-score   support

   B-Target       0.95      0.24      0.39       147
   I-Target       0.88      0.50      0.64        14

avg / total       1.00      1.00      1.00     34970

             precision    recall  f1-score   support

   B-Target       0.95      0.30      0.46       194
   I-Target       1.00      0.35      0.52        20

avg / total       1.00      1.00      1.00     60630



# Learning: State Transitions

In [88]:
from collections import Counter
info = tagger.info()

def print_transitions(trans_features):
    for (label_from, label_to), weight in trans_features:
        print("%-6s -> %-7s %0.6f" % (label_from, label_to, weight))

print("Top likely transitions:")
print_transitions(Counter(info.transitions).most_common(15))

print("\nTop unlikely transitions:")
print_transitions(Counter(info.transitions).most_common()[-15:])

Top likely transitions:
B-Target -> I-Target 2.317607
I-Target -> I-Target 1.922520
O      -> O       1.387535
B-Target -> B-Target 1.191201
I-Target -> B-Target 1.121970
I-Target -> O       -0.366761
O      -> B-Target -0.370077
B-Target -> O       -2.414795
O      -> I-Target -6.240270

Top unlikely transitions:
B-Target -> I-Target 2.317607
I-Target -> I-Target 1.922520
O      -> O       1.387535
B-Target -> B-Target 1.191201
I-Target -> B-Target 1.121970
I-Target -> O       -0.366761
O      -> B-Target -0.370077
B-Target -> O       -2.414795
O      -> I-Target -6.240270


# Learning: State Features

In [87]:
def print_state_features(state_features):
    for (attr, label), weight in state_features:
        print("%0.6f %-6s %s" % (weight, label, attr))    

print("Top positive:")
print_state_features(Counter(info.state_features).most_common(20))

print("\nTop negative:")
print_state_features(Counter(info.state_features).most_common()[-20:])

Top positive:
10.330524 B-Target lemma=bonanza_king
9.577672 B-Target lemma=windjana
8.777970 B-Target lemma=Chinle
8.748215 B-Target lemma=epworth_5
8.467253 B-Target lemma=Winnipesaukee
8.425027 B-Target lemma=stovepipe_wells
8.273857 B-Target lemma=mondooma
7.867740 B-Target lemma=perdido2
7.620713 B-Target lemma=Tappers
7.592996 B-Target lemma=rocknest3
7.560412 B-Target lemma=Epworth
7.324442 B-Target lemma=epworth-5
7.252722 B-Target lemma=blackhead_7
7.207652 B-Target lemma=square_top
7.067435 B-Target lemma=yarrada
7.053535 B-Target lemma=Bathurst
6.935831 B-Target lemma=Kilian
6.654012 B-Target lemma=Methuselah
6.568420 B-Target lemma=Green_Head
6.448084 B-Target lemma=Pinnacle

Top negative:
-3.421099 O      lemma=Stark
-3.428704 O      lemma=Watchtower
-3.441445 O      lemma=rocknest3_3
-3.552088 O      lemma=Harrison
-3.670150 O      lemma=Darwin
-3.726463 O      +1]lemma=class
-4.053420 O      lemma=Comanche
-4.124759 O      lemma=Cumberland
-4.191936 O      lemma=RN
-4.26

False