# Testing the chemical highlighting abilities of tmChem

2015-06-16 Tong Shu Li

Our crowdsourcing approach relies on being able to exhaustively annotate every chemical mention in the original raw text. Here we test how well tmChem can annotate chemicals.

In [1]:
from __future__ import division

In [2]:
import sys

In [3]:
sys.path.append("/home/toby/Code/util")
from file_util import read_file

### We first take the BioCreative V data and strip it down to the original raw text:

Training data:

In [4]:
# Write the raw text of the training set: only the title ("<pmid>|t|...")
# and abstract ("<pmid>|a|...") lines, plus the empty lines between papers.
# The line type is read from the second pipe-separated field instead of
# counting pipes, so a title or abstract that itself contains '|' is kept
# and a tab-separated row that happens to contain two pipes is not
# mistaken for text.
with open("data/tmchem_training.txt", "w") as out:
    for line in read_file("data/training/CDR_TrainingSet.txt"):
        vals = line.split('|', 2)
        is_text_line = len(vals) == 3 and vals[1] in ("t", "a")
        if is_text_line or len(line) == 0:
            out.write("{0}\n".format(line))

Development data:

In [5]:
# Write the raw text of the development set: only the title
# ("<pmid>|t|...") and abstract ("<pmid>|a|...") lines, plus the empty
# lines between papers. The line type is read from the second
# pipe-separated field instead of counting pipes, so a title or abstract
# that itself contains '|' is kept and a tab-separated row that happens
# to contain two pipes is not mistaken for text.
with open("data/tmchem_development.txt", "w") as out:
    for line in read_file("data/development/CDR_DevelopmentSet.txt"):
        vals = line.split('|', 2)
        is_text_line = len(vals) == 3 and vals[1] in ("t", "a")
        if is_text_line or len(line) == 0:
            out.write("{0}\n".format(line))

### Run tmChem:

In [6]:
% mv data/tmchem_*.txt ~/Code/tmChem/tmChem/input

In [7]:
% cd ~/Code/tmChem/tmChem

/home/toby/Code/tmChem/tmChem


In [8]:
% pwd

u'/home/toby/Code/tmChem/tmChem'

In [9]:
! perl tmChem.pl -i input -o output Model/All.Model

Input format: PubTator
Running tmChem on 500 docs in tmchem_training.txt ... Finished in 56 seconds. 
Input format: PubTator
Running tmChem on 500 docs in tmchem_development.txt ... Finished in 57 seconds. 


In [10]:
% mv output/*.tmChem ~/Research/Projects/biocreativeV/data

In [11]:
% cd ~/Research/Projects/biocreativeV

/home/toby/Research/Projects/biocreativeV


In [12]:
% pwd

u'/home/toby/Research/Projects/biocreativeV'

### Our representations of the data:

In [13]:
class Annotation(object):
    """
    A single chemical or disease mention.

    uid   -- concept identifier; a leading "MESH:" prefix is removed
    stype -- semantic type, stored lower-cased; must be "chemical"
             or "disease"
    text  -- the annotated text
    start -- start offset, converted to int
    stop  -- stop offset, converted to int; must be greater than start

    Raises AssertionError if stype is not one of the two known types
    or if start is not smaller than stop.

    Two annotations are equal when all of their attributes are equal.
    Equal annotations also hash equally, so they can be put in sets
    or used as dictionary keys.
    """
    def __init__(self, uid, stype, text, start, stop):
        if uid.startswith("MESH:"):
            uid = uid[5 : ]

        self.uid = uid
        self.stype = stype.lower()
        assert self.stype in ["chemical", "disease"]
        self.text = text
        self.start = int(start)
        self.stop = int(stop)
        assert self.start < self.stop

    def __eq__(self, other):
        if isinstance(other, self.__class__):
            return self.__dict__ == other.__dict__

        return False

    def __ne__(self, other):
        return not self.__eq__(other)

    def __hash__(self):
        # Built only from attributes that __eq__ compares, so two equal
        # annotations always get the same hash.
        return hash((self.uid, self.stype, self.text, self.start, self.stop))

    def __repr__(self):
        return "Annotation({0!r}, {1!r}, {2!r}, {3!r}, {4!r})".format(
            self.uid, self.stype, self.text, self.start, self.stop)

    def output(self):
        """Prints uid, start, stop and text on separate lines, then an empty line."""
        # Single-argument print(...) behaves the same as a statement
        # (Python 2) and as a function (Python 3).
        print(self.uid)
        print(self.start)
        print(self.stop)
        print(self.text)
        print("")

In [14]:
class Relation(object):
    """
    A relation between a drug and a disease, both given by their
    identifiers.

    Raises AssertionError if either identifier is "-1", the
    placeholder used for annotations without a known identifier.
    """
    def __init__(self, drug, disease):
        assert drug != "-1"
        assert disease != "-1"
        self.drug = drug
        self.disease = disease

    def output(self):
        """Prints the drug and disease identifiers separated by a space."""
        # Formatting into one string first keeps the output identical
        # whether print is a statement (Python 2) or a function (Python 3).
        print("{0} {1}".format(self.drug, self.disease))

In [15]:
def make_annotations(annotations):
    """
    Turns raw annotation rows into Annotation objects and returns
    them as a (chemicals, diseases) tuple of lists.

    Rows whose identifier (field 5) is "-1" are skipped: such
    annotations never show up in a relationship, so they are left
    out of any comparison as well.
    """
    chemicals, diseases = [], []

    for fields in annotations:
        identifier = fields[5]
        if identifier == "-1":
            continue

        annot = Annotation(identifier, fields[4], fields[3], fields[1], fields[2])
        bucket = chemicals if annot.stype == "chemical" else diseases
        bucket.append(annot)

    return (chemicals, diseases)

def make_relations(relations):
    """
    Builds one Relation per raw relation row, taking the drug from
    field 2 and the disease from field 3, and returns them as a list.
    """
    return [Relation(fields[2], fields[3]) for fields in relations]
        
class Paper(object):
    """
    One paper: its pmid, title and abstract together with its
    annotations and relations.

    annotations -- raw annotation rows; converted by make_annotations
                   and stored split by type as self.chemicals and
                   self.diseases
    relations   -- raw relation rows; converted by make_relations
                   and stored as self.relations
    """
    def __init__(self, pmid, title, abstract, annotations, relations):
        self.pmid = pmid
        self.title = title
        self.abstract = abstract

        self.chemicals, self.diseases = make_annotations(annotations)
        self.relations = make_relations(relations)

    def output(self):
        """Prints the pmid, then the number of annotations and relations."""
        print(self.pmid)
        # The annotations are only stored split by type, so the total is
        # the sum of both lists (there is no self.annotations attribute).
        num_annotations = len(self.chemicals) + len(self.diseases)
        print("{0} {1}".format(num_annotations, len(self.relations)))

In [16]:
def parse_input(loc, fname):
    """
    Parses the given input file and returns a list
    of Paper objects.

    The lines come from read_file(fname, loc). Each paper is a run
    of non-empty lines:

        <pmid>|t|<title>
        <pmid>|a|<abstract>
        tab-separated rows with 4 fields (relations) or
        5 to 7 fields (annotations)

    An empty line ends the current paper. A paper that is still
    open when the input runs out is kept as well, and leading or
    repeated empty lines do not create extra papers.

    Raises AssertionError if a title/abstract line or an annotation
    row does not have the expected shape.
    """
    papers = []

    counter = 0  # number of lines read so far for the current paper
    pmid = title = abstract = None
    annotations = []
    relations = []
    for line in read_file(fname, loc):
        if len(line) == 0:
            # time to finish up this paper and prepare a new one;
            # counter == 0 means no paper is open (leading or repeated
            # empty line), so there is nothing to store
            if counter > 0:
                papers.append(Paper(pmid, title, abstract, annotations, relations))

            counter = 0
            pmid = title = abstract = None
            annotations = []
            relations = []
            continue

        if counter <= 1:
            # split on the first two pipes only: the title or abstract
            # text itself may contain a '|'
            vals = line.split('|', 2)
            assert len(vals) == 3
        else:
            vals = line.split('\t')

        if counter == 0:
            assert vals[1] == 't'
            pmid = vals[0]
            title = vals[2]
        elif counter == 1:
            assert vals[1] == 'a'
            abstract = vals[2]
        elif len(vals) == 4:
            relations.append(vals)
        else:
            assert 5 <= len(vals) <= 7, pmid
            # 5 fields means it determined that the text span
            # was a chemical, but could not assign an identifier

            # 7 means it was a mistake in the original input (extra tab)
            # 6 is the ideal output

            if len(vals) == 5:
                vals.append("-1")

            annotations.append(vals) # 6 or 7 fields

        counter += 1

    # the last paper is not necessarily followed by an empty line
    if counter > 0:
        papers.append(Paper(pmid, title, abstract, annotations, relations))

    return papers

### Grab tmChem's output:

In [18]:
# Papers parsed from tmChem's output for the training set.
tmchem_training = parse_input("data", "tmchem_training.txt.tmChem")

In [20]:
# Papers parsed from tmChem's output for the development set.
tmchem_development = parse_input("data", "tmchem_development.txt.tmChem")

### Grab the gold standard data:

In [22]:
# Papers parsed from the gold standard training set.
gold_training = parse_input("data/training", "CDR_TrainingSet.txt")

In [23]:
# Papers parsed from the gold standard development set.
gold_development = parse_input("data/development", "CDR_DevelopmentSet.txt")

### Look at the performance of tmChem:

In [24]:
def _ratio(numerator, denominator):
    """Returns numerator / denominator as a float, or nan if the denominator is 0."""
    if denominator == 0:
        return float("nan")

    # float() keeps this a true division even without
    # "from __future__ import division"
    return float(numerator) / denominator

def results(program_output, gold_std_data):
    """
    Prints how well a program's chemical annotations match the gold
    standard.

    program_output and gold_std_data are paired up with zip, so they
    must list the same papers in the same order; every paper needs a
    pmid and a chemicals list. A predicted chemical counts as a true
    positive if an equal annotation is among the gold standard's
    chemicals for the same paper, and as a false positive otherwise.

    Prints the recall (TP / number of gold chemicals), the precision
    (TP / (TP + FP)) and the raw counts. A ratio with a zero
    denominator is printed as nan instead of raising
    ZeroDivisionError.

    Raises AssertionError if the pmids of a pair of papers differ.
    """
    TP = 0
    FP = 0
    sum_chemicals = 0
    for p_data, gold_std in zip(program_output, gold_std_data):
        assert p_data.pmid == gold_std.pmid

        # check the program's output against the gold standard
        tp = sum(1 for annot in p_data.chemicals if annot in gold_std.chemicals)

        TP += tp
        FP += len(p_data.chemicals) - tp
        sum_chemicals += len(gold_std.chemicals)

    print("recall: {0}".format(_ratio(TP, sum_chemicals)))
    print("precision: {0}".format(_ratio(TP, TP + FP)))

    print("TP: {0}".format(TP))
    print("FP: {0}".format(FP))
    print("all gold annotations: {0}".format(sum_chemicals))

In [25]:
# Compare tmChem's chemicals with the gold standard on the training set.
results(tmchem_training, gold_training)

recall: 0.984350850077
precision: 0.993952399532
TP: 5095
FP: 31
all gold annotations: 5176


In [26]:
# Compare tmChem's chemicals with the gold standard on the development set.
results(tmchem_development, gold_development)

recall: 0.812028657617
precision: 0.940803844474
TP: 4307
FP: 271
all gold annotations: 5304


## In conclusion, it looks like tmChem does pretty well at identifying the chemicals in a piece of text. The recall is a lot lower on the development set, but is still high.

If we were really worried about recall, we could always add more concept recognizers to drive up the total recall.

# for the development set, look at the loss of recall