In [29]:
from itertools import chain
import nltk
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import LabelBinarizer
import sklearn
import pycrfsuite
from pycorenlp import StanfordCoreNLP
from codecs import open as copen

import os
import sys

# Ask for UTF-8 on the standard streams.
# NOTE(review): Python consults PYTHONIOENCODING at interpreter start-up, so
# assigning it here presumably only influences processes launched afterwards,
# not the running kernel -- confirm this is the intent.
os.environ['PYTHONIOENCODING'] = 'utf-8'

# Log the runtime environment next to the results: the interpreter's default
# string encoding and the scikit-learn version in use.
print("Encoding::" + sys.getdefaultencoding())
print(sklearn.__version__)

Encoding::utf-8
0.18.1


In [58]:
# Annotation types that BratToCRFSuitFeaturizer.convert turns into labels;
# every other annotation type is emitted as 'O'.
# The wider selection tried earlier was:
#   'Element', 'Mineral', 'Target', 'Material', 'Locality', 'Site'
accept_labels = {'Target'}


class BratToCRFSuitFeaturizer(object):
    '''
    Converts a brat stand-off annotated document (a .txt/.ann file pair) into
    per-sentence token records of the form (word, label, lemma, pos).

    Tokens, sentence boundaries, lemmas and POS tags come from a CoreNLP
    server; labels come from the brat annotations, restricted to the
    module-level ``accept_labels`` set.
    '''

    def __init__(self, corenlp_url='http://localhost:9000', iob=False):
        '''
        Create Converter for converting brat annotations to Core NLP NER CRF
        classifier training data.
        @param corenlp_url: URL to corenlp server.
                To start the server checkout: http://stanfordnlp.github.io/CoreNLP/corenlp-server.html#getting-started
        @param iob: set 'True' for IOB encoding
        '''
        self.corenlp = StanfordCoreNLP(corenlp_url)
        self.iob = iob

    def convert(self, text_file, ann_file):
        '''
        Generator over the sentences of one document.
        @param text_file: path to the brat .txt file (UTF-8)
        @param ann_file: path to the matching brat .ann file (UTF-8)
        @return: yields one list per sentence; each item is a tuple
                (word, label, lemma, pos). label is 'O' unless the token is
                covered by an annotation whose type is in ``accept_labels``;
                with iob=True the label is prefixed with 'B-' / 'I-'.
        '''
        text, tree = self.parse(text_file, ann_file)
        if not text:
            # Empty document: nothing to tokenize (and text[0] would fail).
            return
        props = {'annotators': 'tokenize,ssplit,lemma,pos', 'outputFormat': 'json'}
        if text[0].isspace():
            text = '.' + text[1:]
            # Reason: some tools trim/strip off the white spaces
            # which will mismatch the character offsets
        output = self.corenlp.annotate(text, properties=props)
        for sentence in output['sentences']:
            sent_features = []
            # Label and end offset of an annotation that spans several tokens
            # and is still open; reset at every sentence boundary.
            continue_ann, continue_ann_end = None, None
            for tok in sentence['tokens']:
                begin, tok_end = tok['characterOffsetBegin'], tok['characterOffsetEnd']
                label = 'O'
                if begin in tree:
                    # One or more annotations start exactly at this token.
                    node = tree[begin]
                    if len(node) > 1:
                        print("WARN: multiple starts at ", begin, node)
                        if tok_end in node:
                            node = {tok_end: node[tok_end]} # picking one
                            print("Chose:", node)

                    ann_end, labels = list(node.items())[0]
                    if not len(labels) == 1:
                        print("WARN: Duplicate labels for token: %s, label:%s. Using the first one!" % (tok['word'], str(labels)))
                    # NOTE: with accept_labels set to None no label is accepted.
                    if accept_labels is not None and labels[0] in accept_labels:
                        label = labels[0]

                    if tok_end == ann_end: # annotation ends where token ends
                        continue_ann = None
                    elif tok_end < ann_end and label != 'O':
                        # annotation covers more tokens; remember it
                        continue_ann = label
                        continue_ann_end = ann_end
                    if label != 'O' and self.iob:
                        label = "B-" + label
                elif continue_ann is not None and tok_end <= continue_ann_end:
                    label = continue_ann            # previous label is this label
                    if continue_ann_end == tok_end: # continuation ends here
                        continue_ann = None
                    if self.iob:
                        label = "I-" + label
                sent_features.append((tok['word'], label, tok['lemma'], tok['pos']))
            yield sent_features

    def parse(self, txt_file, ann_file):
        '''
        Read a brat document and index its annotations by character offset.
        @param txt_file: path to the brat .txt file (UTF-8)
        @param ann_file: path to the brat .ann file (UTF-8)
        @return: (text, tree) where text is the decoded document and tree maps
                begin offset -> {end offset -> [annotation type, ...]}
        '''
        # codecs.open (imported as copen) takes the encoding as its third
        # positional argument and never translates newlines, so offsets into
        # the returned text line up with the offsets stored in the .ann file.
        with copen(txt_file, 'r', 'utf-8') as text_fh:
            texts = text_fh.read()
        with copen(ann_file, 'r', 'utf-8') as ann_fh:
            ann_lines = ann_fh.read().split('\n')

        # building a tree index for easy accessing
        tree = {}
        for line in ann_lines:
            ann = line.strip().split('\t')
            if len(ann) <= 2:
                # FIXME: ignoring the annotations which are complex
                continue
            if ';' in ann[1]:
                # FIXME: some annotations' spans have been split into many,
                # separated by ';' -- ignoring them
                continue
            spec = ann[1].split()
            entity_type = spec[0]
            try:
                markers = [int(x) for x in spec[1:]]
                begin, end = markers[0], markers[1]
            except (ValueError, IndexError):
                # Three-field line without numeric offsets (e.g. an annotator
                # note that references another annotation id): not a span.
                continue
            if texts[begin:end] != ann[2]:
                print("Error: Annotation mis-match, file=%s, ann=%s" % (txt_file, str(ann)))
                continue
            tree.setdefault(begin, {}).setdefault(end, []).append(entity_type)

        # The decoded text is returned as-is: the tree offsets were validated
        # against exactly this string.
        return texts, tree

    def convert_all(self, input_paths, output):
        '''
        Convert every document listed in a paths file and write the result.
        @param input_paths: text file with one "txt_path,ann_path" pair per
                line; blank lines are skipped
        @param output: file to write (UTF-8). Each token becomes one line of
                tab-separated fields (word, label, lemma, pos); a blank line
                follows every sentence and one more follows every document.
        '''
        with open(input_paths) as paths, copen(output, 'w', 'utf-8') as out:
            for p in map(lambda x: x.strip(), paths):
                if not p:
                    continue
                d = p.split(',')
                print(d)
                for sentence in self.convert(d[0], d[1]):
                    for token in sentence:
                        out.write('\t'.join(token))
                        out.write("\n")
                    out.write("\n") # end of sentence
                out.write("\n") # end of document


In [71]:
def word2features(sent, idx):
    """Placeholder, not implemented yet: takes a sentence and a token index and returns None."""
    return None

def sentence2features(sent):
    """Placeholder, not implemented yet: takes a sentence and returns None."""
    return None

# Sample document used to try out the featurizer.
# NOTE(review): absolute, machine-specific path -- point it at a local copy of
# the data before running.
p_dir = "/Users/thammegr/work/mte/data/lpsc15-C-raymond-sol1159"
txt_file = p_dir + "/2855.txt"
ann_file = p_dir + "/2855.ann"

featzr = BratToCRFSuitFeaturizer(iob=True)

# `read_brat_ann` is not defined in any visible cell, so calling it fails on a
# fresh kernel. The featurizer's parse() takes the same two paths and returns
# a (text, tree) pair.
txts, tree = featzr.parse(txt_file, ann_file)

# Materialise all sentences of the document (requires a running CoreNLP server).
train_sents = list(featzr.convert(txt_file, ann_file))

# Show one sentence, one (word, label, lemma, pos) tuple per line.
print("\n".join(map(str, train_sents[10])))

('The', 'O', 'the', 'DT')
('red', 'O', 'red', 'JJ')
('dots', 'O', 'dot', 'NNS')
('represent', 'O', 'represent', 'VBP')
('points', 'O', 'point', 'NNS')
('where', 'O', 'where', 'WRB')
('contact', 'O', 'contact', 'NN')
('science', 'O', 'science', 'NN')
('was', 'O', 'be', 'VBD')
('performed', 'O', 'perform', 'VBN')
('during', 'O', 'during', 'IN')
('loop', 'O', 'loop', 'NN')
('2', 'O', '2', 'CD')
('and', 'O', 'and', 'CC')
('the', 'O', 'the', 'DT')
('blue', 'O', 'blue', 'JJ')
('dot', 'O', 'dot', 'NN')
('represents', 'O', 'represent', 'VBZ')
('the', 'O', 'the', 'DT')
('location', 'O', 'location', 'NN')
('of', 'O', 'of', 'IN')
('a', 'O', 'a', 'DT')
('drill', 'O', 'drill', 'NN')
('campaign', 'O', 'campaign', 'NN')
('.', 'O', '.', '.')
