# NER Output Combiner

Tong Shu Li<br>
Created on: Monday 2015-08-17<br>
Last updated: 2015-08-17

This notebook combines the outputs of tmChem and DNorm into one file for processing.

In [1]:
from src.lingpipe.file_util import read_file

In [2]:
# Input files: the tmChem and DNorm NER outputs that this notebook merges.
# Both paths are relative, so they resolve against the kernel's working directory.
tmchem_fname = "data/tmchem/tmchem_training.txt.tmChem"
dnorm_fname = "data/dnorm/dnorm_training_output.txt"

In [3]:
# Destination of the combined file; it is opened in "w" mode further down,
# so any existing file at this path is overwritten.
output_fname = "data/final_eval/training.txt"

In [7]:
def read_output(fname):
    """Parse an NER output file into per-document records.

    The file is read through ``read_file`` and is expected to consist of
    blocks, one per document, each made of:

    * a ``pmid|t|title`` line and a ``pmid|a|abstract`` line (the first two
      lines of the block),
    * zero or more concept lines, kept verbatim,
    * an empty line terminating the block.

    NOTE(review): the empty-line test assumes ``read_file`` yields lines
    without their trailing newline -- confirm against its implementation.

    Args:
        fname: Path of the file to parse; passed to ``read_file`` unchanged.

    Returns:
        dict mapping each integer PMID to a ``(title, abstract, concepts)``
        tuple, where ``concepts`` is the list of raw concept lines in file
        order.

    Raises:
        AssertionError: if one of the first two lines of a block does not
            have the ``pmid|kind|text`` shape.
        ValueError: if the PMID field of such a line is not an integer.
    """
    res = dict()

    counter = 0  # number of non-empty lines consumed in the current block
    pmid = -1
    title = ""
    abstract = ""
    concepts = []
    for line in read_file(fname):
        if len(line) == 0:
            # A blank line closes the current block. Only store when a block
            # is actually open: a leading or repeated blank line would
            # otherwise overwrite the record just stored with an empty
            # concept list (or create a bogus -1 entry).
            if counter > 0:
                res[pmid] = (title, abstract, concepts)

            # Reset everything so nothing leaks into the next block.
            counter = 0
            title = ""
            abstract = ""
            concepts = []
        else:
            if counter <= 1:
                # Split at most twice so "|" inside the text itself survives.
                vals = line.split("|", 2)
                assert len(vals) == 3, "malformed header line: {0!r}".format(line)

                pmid = int(vals[0])

                if vals[1] == "t":
                    title = vals[2]
                elif vals[1] == "a":
                    abstract = vals[2]
            else:
                concepts.append(line)

            counter += 1

    # Flush the final block when the file does not end with a blank line.
    if counter > 0:
        res[pmid] = (title, abstract, concepts)

    return res

In [10]:
# Parse both NER outputs; each result maps a PMID to a
# (title, abstract, concepts) tuple as built by read_output.
chem = read_output(tmchem_fname)  # records parsed from the tmChem file
dise = read_output(dnorm_fname)  # records parsed from the DNorm file

In [14]:
# Both files must cover exactly the same documents; on failure, report the
# PMIDs that appear in only one of the two outputs.
assert set(chem) == set(dise), (
    "PMIDs present in only one NER output: {0}".format(sorted(set(chem) ^ set(dise)))
)

In [15]:
# Iterating a dict yields its keys, so this is the set of all PMIDs.
pmids = set(chem)

In [19]:
# Write one block per document: the title line, the abstract line, the
# concept lines from `chem` followed by those from `dise`, and a blank
# separator line.
with open(output_fname, "w") as fout:
    # Sorted so the output file is identical from run to run instead of
    # following the arbitrary iteration order of the set.
    for pmid in sorted(pmids):
        assert chem[pmid][0] == dise[pmid][0], "title mismatch for PMID {0}".format(pmid)
        assert chem[pmid][1] == dise[pmid][1], "abstract mismatch for PMID {0}".format(pmid)

        title = chem[pmid][0]
        abstract = chem[pmid][1]

        concepts = chem[pmid][2] + dise[pmid][2]

        fout.write("{0}|t|{1}\n".format(pmid, title))
        fout.write("{0}|a|{1}\n".format(pmid, abstract))
        # Emit concept lines one by one: with no concepts nothing is written
        # here, so the block is still terminated by exactly one blank line
        # (joining an empty list used to add a second, stray empty line).
        for concept in concepts:
            fout.write("{0}\n".format(concept))
        fout.write("\n")