# NER processing of BioCreative V Task 3 development dataset

Tong Shu Li<br>
Created on: Monday 2015-08-17<br>
Last updated: 2015-08-25

Preprocessing of a 100-abstract test set randomly sampled from the gold standard development set.

In [1]:
import random

In [2]:
# Fixed seed so that the random.sample call further down picks the same PMIDs on every run.
random.seed(3249601771853041575)

In [3]:
# Number of PMIDs drawn from the development set to form the test set.
TESTSET_SIZE = 100

In [4]:
from src.lingpipe.file_util import read_file

### Grab all PMIDs for random selection

In [5]:
def read_pmids(fname):
    """Collect the PMIDs of all title/abstract lines in a PubTator-style file.

    A line is counted when it has the form ``<pmid>|t|<text>`` or
    ``<pmid>|a|<text>``; every other line is ignored.

    Args:
        fname: path passed on to ``read_file``.

    Returns:
        A set of the PMIDs (as ints) found in the file.

    Raises:
        ValueError: if the first field of a title/abstract line is not an
            integer.
    """
    pmids = set()
    for line in read_file(fname):
        # Split on the first two pipes only, so a title or abstract whose own
        # text contains "|" still yields exactly three fields and is kept.
        vals = line.split("|", 2)
        if len(vals) == 3 and vals[1] in ("t", "a"):
            pmids.add(int(vals[0]))

    return pmids

In [6]:
# PMIDs of every title/abstract line in the gold standard development set file.
devset_pmids = read_pmids("data/gold_standard/CDR_DevelopmentSet.txt")

In [7]:
len(devset_pmids)

500

In [8]:
devset_pmids

{2004,
 28952,
 33969,
 48362,
 85485,
 133615,
 188339,
 326460,
 343678,
 384871,
 430165,
 458486,
 591536,
 625456,
 663266,
 761833,
 783197,
 803783,
 804391,
 839274,
 871943,
 873132,
 921394,
 950631,
 982002,
 1079693,
 1141447,
 1255900,
 1286498,
 1300436,
 1355091,
 1415380,
 1423339,
 1445986,
 1504402,
 1535072,
 1545575,
 1564236,
 1595783,
 1610717,
 1628552,
 1636026,
 1687392,
 1700207,
 1711760,
 1728915,
 1732442,
 1848636,
 1899352,
 1928887,
 1969772,
 2004015,
 2021202,
 2051906,
 2055425,
 2220369,
 2257294,
 2320800,
 2322844,
 2334179,
 2339463,
 2343592,
 2422478,
 2435991,
 2466960,
 2528969,
 2533791,
 2557556,
 2564649,
 2569282,
 2576810,
 2594614,
 2598570,
 2709684,
 2716967,
 2722224,
 2750819,
 2790457,
 2819587,
 2826064,
 2840807,
 2886572,
 2893236,
 2894433,
 2907585,
 2917114,
 2980315,
 3001299,
 3015327,
 3057041,
 3084782,
 3088349,
 3108839,
 3115150,
 3120485,
 3123611,
 3131282,
 3137399,
 3183120,
 3220106,
 3300918,
 3311455,
 3411101,
 

In [9]:
# random.sample expects a sequence (recent Python versions reject sets), so
# convert the set of PMIDs to a list first.
# NOTE(review): the list order is whatever order the set iterates in, and the
# seeded sample depends on that order. Sorting here would be more robust but
# would change which PMIDs get selected -- confirm before changing.
devset_pmids = list(devset_pmids)

### Randomly sample for testing:

In [10]:
# Draw TESTSET_SIZE distinct PMIDs (random.sample samples without replacement).
devset_testset = random.sample(devset_pmids, TESTSET_SIZE)

In [11]:
len(devset_testset)

100

In [12]:
devset_testset

[6111982,
 8492347,
 18483878,
 7542793,
 2917114,
 1535072,
 3973521,
 11704023,
 9636837,
 8111719,
 8424298,
 7880714,
 3088349,
 15565293,
 11868798,
 1610717,
 11860278,
 7650771,
 2598570,
 7803371,
 1628552,
 16192988,
 7437994,
 10225068,
 2322844,
 15096016,
 20042557,
 8800187,
 2894433,
 12452237,
 982002,
 9041081,
 16225977,
 11337188,
 8686832,
 10910842,
 15858223,
 10328196,
 16471092,
 8586822,
 1564236,
 6118280,
 19370593,
 2004015,
 3703509,
 15811908,
 6454943,
 18657397,
 10743694,
 9564988,
 11999899,
 18356633,
 20683499,
 20466178,
 12911170,
 1595783,
 7651879,
 1969772,
 16596970,
 803783,
 15266215,
 8305357,
 18006530,
 10840460,
 6150641,
 9270571,
 2257294,
 3969369,
 326460,
 21418164,
 3708922,
 2826064,
 20533999,
 9098464,
 3300918,
 6892185,
 3311455,
 19135948,
 15893386,
 16174948,
 11282081,
 12448656,
 11063349,
 11524350,
 9875685,
 6496797,
 11077455,
 8480959,
 7710775,
 8267029,
 1423339,
 11705128,
 15266362,
 19761039,
 17682013,
 33969,
 1

---

### Clean data for NER annotation

In [13]:
def strip_file(pmids, fin_loc, fout_loc):
    """Copy only the title and abstract lines of the selected documents.

    Reads ``fin_loc`` line by line through ``read_file`` and writes to
    ``fout_loc`` every ``<pmid>|t|<text>`` / ``<pmid>|a|<text>`` line whose
    PMID is in ``pmids``.  All other lines (for example annotation lines) are
    dropped.  Each kept document is followed by exactly one blank line.

    Args:
        pmids: container of int PMIDs to keep (membership is tested with
            ``in``).
        fin_loc: path of the input file.
        fout_loc: path of the output file; it is overwritten.

    Raises:
        ValueError: if the first field of a title/abstract line is not an
            integer.
    """
    with open(fout_loc, "w") as fout:
        # True once an abstract has been written and its terminating blank
        # line is still owed.
        separator_pending = False

        for line in read_file(fin_loc):
            if not line:
                if separator_pending:
                    fout.write("\n")
                    separator_pending = False
                continue

            if "|" not in line:
                continue

            vals = line.split("|", 2)
            if vals[1] in ("t", "a") and int(vals[0]) in pmids:
                fout.write("{0}\n".format(line))
                separator_pending = (vals[1] == "a")

        # The input may end without a trailing blank line; still terminate
        # the last kept document so every record in the output is closed.
        if separator_pending:
            fout.write("\n")

In [14]:
# Keep only the title/abstract lines of the sampled PMIDs.
# The output directory must already exist: strip_file opens the file with
# open(..., "w"), which does not create missing directories.
strip_file(set(devset_testset), "data/gold_standard/CDR_DevelopmentSet.txt", "data/devset_100_test/stripped_CDR_devset.txt")

### Run file through tmChem to annotate chemicals:

In [15]:
%%bash

# Annotate chemicals with tmChem: copy the stripped abstracts into tmChem's
# input folder, run the tagger, and move the *.tmChem results back into the
# project's data folder.

# Abort on the first failing command or unset variable. Without this, a
# failed `cd` would let the `rm input/*` below run in the wrong directory.
set -euo pipefail

# move things to the correct directory
cur_path=$(pwd)
tmchem_out="$cur_path/data/devset_100_test/tmchem"

# Make sure the destination is a directory; otherwise `mv` of a single file
# would rename that file to "tmchem" instead of moving it into the folder.
mkdir -p "$tmchem_out"

cp data/devset_100_test/stripped_CDR_devset.txt src/tmChem.M2.ver02/input/stripped_CDR_devset.txt
cd src/tmChem.M2.ver02

# run tmChem
perl tmChem.pl -i input -o output Model/All.Model

# move results back
mv output/*.tmChem "$tmchem_out/"
rm input/*
cd "$cur_path"

Input format: PubTator
Running tmChem on 100 docs in stripped_CDR_devset.txt ...Running tmChem on 100 docs in stripped_CDR_devset.txt ... Finished in 11 seconds. 


---

### Run DNorm:

Make an `inloc` and an `outloc` folder inside DNorm's folder to hold our input and output data files.

In [16]:
%%bash

# Annotate diseases with DNorm: copy the stripped abstracts into DNorm's
# inloc folder, run ApplyDNorm.sh on every *_input.txt file found there, and
# move both the inputs and the outputs back into the project's data folder.

# Abort on the first failing command or unset variable so that a failed step
# cannot be followed by moves from the wrong directory.
set -euo pipefail

cur_path=$(pwd)
dnorm_path="$cur_path/src/DNorm-0.0.7"
ab3p_loc="$cur_path/src/Ab3P-v1.5"
dnorm_out="$cur_path/data/devset_100_test/dnorm"

# Create the working and destination folders if they are missing.
mkdir -p "$dnorm_path/inloc" "$dnorm_path/outloc" "$dnorm_out"

cp data/devset_100_test/stripped_CDR_devset.txt "$dnorm_path/inloc/stripped_CDR_devset_input.txt"
cd "$dnorm_path"

for fin in inloc/*_input.txt
do
    fname=$(basename "$fin")

    # Replace only the trailing "_input.txt" so that a file name containing
    # "input" elsewhere still maps to a "*_output.txt" name.
    outpath="outloc/${fname%_input.txt}_output.txt"

    ./ApplyDNorm.sh config/banner_BC5CDR_UMLS2013AA_SAMPLE.xml data/CTD_diseases-2015-06-04.tsv output/simmatrix_BC5CDR_e4_TRAINDEV.bin "$ab3p_loc" TEMP "$fin" "$outpath"
done

# move everything back to the original directory
mv inloc/*_input.txt "$dnorm_out/"
mv outloc/*_output.txt "$dnorm_out/"

Creating index
Not adding alternate name Alpha-1 Antitrypsin Deficiency to concept MESH:C566273 because it is the primary name of a parent
Not adding alternate name Anemia, Hypoplastic Congenital to concept MESH:D029503 because it is the primary name of a parent
Not adding alternate name Anemias, Hypoplastic Congenital to concept MESH:D029503 because it is the primary name of a parent
Not adding alternate name Congenital Anemia, Hypoplastic to concept MESH:D029503 because it is the primary name of a parent
Not adding alternate name Congenital Anemias, Hypoplastic to concept MESH:D029503 because it is the primary name of a parent
Not adding alternate name Hypoplastic Congenital Anemia to concept MESH:D029503 because it is the primary name of a parent
Not adding alternate name Hypoplastic Congenital Anemias to concept MESH:D029503 because it is the primary name of a parent
Not adding alternate name ANIRIDIA to concept MESH:C536372 because it is the primary name of a parent
Not adding alt

### Combine outputs of DNorm and tmChem into one file:

In [17]:
# Locations the tmChem and DNorm bash cells moved their result files to.
tmchem_fname = "data/devset_100_test/tmchem/stripped_CDR_devset.txt.tmChem"
dnorm_fname = "data/devset_100_test/dnorm/stripped_CDR_devset_output.txt"

In [18]:
# Destination of the merged chemical + disease annotation file.
output_fname = "data/devset_100_test/processed_CDR_devset.txt"

In [19]:
def read_output(fname):
    """Parse a PubTator-style annotation file into a per-document dictionary.

    Each record is expected to consist of two header lines
    (``<pmid>|t|<title>`` and ``<pmid>|a|<abstract>``), followed by zero or
    more annotation lines, and to be separated from the next record by one or
    more blank lines.  A final record that is not followed by a blank line is
    still returned.

    Args:
        fname: path passed on to ``read_file``.

    Returns:
        dict mapping each int PMID to a ``(title, abstract, concepts)`` tuple,
        where ``concepts`` is the list of raw annotation lines in file order.

    Raises:
        AssertionError: if one of the first two lines of a record does not
            have three "|"-separated fields.
        ValueError: if the PMID field of a header line is not an integer.
    """
    res = dict()

    counter = 0  # number of non-blank lines read so far in the current record
    pmid = -1
    title = ""
    abstract = ""
    concepts = []

    for line in read_file(fname):
        if len(line) == 0:
            # Only store a record if one was actually read. Repeated or
            # leading blank lines must neither create a bogus entry nor
            # overwrite the previous record's concepts with an empty list.
            if counter > 0:
                res[pmid] = (title, abstract, concepts)

            # Start the next record from a clean slate so that nothing from
            # this one can carry over.
            counter = 0
            title = ""
            abstract = ""
            concepts = []
            continue

        if counter <= 1:
            # Header line: split on the first two pipes only, so that text
            # which itself contains "|" stays in one piece.
            vals = line.split("|", 2)
            assert len(vals) == 3, "malformed header line: {0!r}".format(line)

            pmid = int(vals[0])

            if vals[1] == "t":
                title = vals[2]
            elif vals[1] == "a":
                abstract = vals[2]
        else:
            concepts.append(line)

        counter += 1

    # The file may end without a trailing blank line; keep the last record.
    if counter > 0:
        res[pmid] = (title, abstract, concepts)

    return res

In [20]:
# pmid -> (title, abstract, concept lines), one dictionary per tool output file.
chem = read_output(tmchem_fname)
dise = read_output(dnorm_fname)

In [21]:
# Both output files must cover exactly the same set of PMIDs.
assert set(chem.keys()) == set(dise.keys())

In [22]:
# PMIDs to write to the combined file (taken from the tmChem result keys).
pmids = set(chem.keys())

In [23]:
# Merge the tmChem (chemical) and DNorm (disease) annotations of every
# document into one file: title line, abstract line, all concept lines
# (chemicals first, then diseases), and a single blank line as terminator.
with open(output_fname, "w") as fout:
    # Sorted so the document order in the file does not depend on set
    # iteration order.
    for pmid in sorted(pmids):
        assert chem[pmid][0] == dise[pmid][0] # title same
        assert chem[pmid][1] == dise[pmid][1] # abstract same

        title = chem[pmid][0]
        abstract = chem[pmid][1]

        concepts = chem[pmid][2] + dise[pmid][2]

        fout.write("{0}|t|{1}\n".format(pmid, title))
        fout.write("{0}|a|{1}\n".format(pmid, abstract))

        # Skip the concept block when there are no annotations. Joining an
        # empty list would emit an extra empty line, leaving two blank lines
        # after the abstract instead of the single record terminator.
        if concepts:
            fout.write("{0}\n".format("\n".join(concepts)))

        fout.write("\n")