# BioCreative V Task 3 Evaluation CrowdFlower Work Unit Formatter

Tong Shu Li<br>
Created on: Monday 2015-08-17<br>
Last updated: 2015-08-21

This file formats the data for the final evaluation of BioCreative V for CrowdFlower.

The <code>classify_relations()</code> routines of the <code>Sentence</code> and <code>Paper</code> objects have already separated all possible chemical-disease relation pairs into three disjoint categories:

1. Relations which follow the "[chemical]-induced [disease]" (CID) structure.
2. Relations which co-occur within a sentence but do not follow the CID structure.
3. Relations which do not co-occur within any sentence.

This notebook takes the relation pairs in each category and generates the information needed for the CrowdFlower interface. No decision making about which category each relation belongs to is performed here.

In [1]:
from collections import defaultdict
import os
import pandas as pd
import pickle

In [2]:
from src.data_model import parse_input
from src.make_sections import create_sections

In [3]:
def add_simple_tag(tag_name, tag_class, text):
    """
    Wraps text in an HTML element that carries one class attribute,
    e.g. <span class="chemical">aspirin</span>.

    The text is inserted verbatim; no HTML escaping is performed.
    """
    template = '<{tag} class="{css}">{body}</{tag}>'
    return template.format(tag=tag_name, css=tag_class, body=text)

---

### Read the evaluation test set:

In [4]:
# Directory and file name of the annotated evaluation test set.
loc = "data/final_eval"
fname = "CDR_annotated_testset.txt"

# Parse the test set; the result is later handed to create_work_units().
# NOTE(review): is_gold = False presumably means the file carries no
# gold-standard relations - confirm against parse_input.
testset = parse_input(loc, fname, is_gold = False, return_format = "list")

In [5]:
# Number of entries parsed from the test set file.
len(testset)

500

---

### Highlighting functions:

In [6]:
def highlight_concepts(text, breaks):
    """
    Cuts text at the given break points and wraps every piece whose
    opening break carries a concept type in a <span> of that class.

    breaks holds (index, type) pairs, where type is the class to
    highlight with, or "n" for plain text. Each break governs the
    text up to the next break; anything before the first break or
    after the last break is left out of the result.
    """
    # sorted() is stable, so breaks sharing an index keep their input order
    ordered = sorted(breaks, key = lambda brk: brk[0])

    pieces = []
    for opening, closing in zip(ordered, ordered[1:]):
        piece = text[opening[0] : closing[0]]
        kind = opening[1]
        pieces.append(piece if kind == "n" else add_simple_tag("span", kind, piece))

    return "".join(pieces)

In [7]:
def highlight_text(text, offset, uniq_spans):
    """
    Given a string and the annotations which fall
    within this string, highlights the concepts.

    text: the string to mark up.
    offset: position of text[0] in the coordinates used by the
        annotations; it is subtracted from every span boundary.
    uniq_spans: iterable of annotations exposing start, stop and
        stype (a list or a one-shot iterator such as filter()).

    Returns the marked-up string produced by highlight_concepts().
    Overlapping spans are not supported.
    """
    # index of break, type of break (n = nothing)
    breaks = [(0, "n"), (len(text), "n")]

    # Emit the breaks in text order. highlight_concepts() orders breaks by
    # index with a stable sort, so when one span stops exactly where the
    # next one starts, the closing "n" break has to be listed before the
    # opening break of the following span; otherwise that span would be
    # left unhighlighted and an empty tag would be emitted instead.
    for span in sorted(uniq_spans, key = lambda s: (s.start, s.stop)):
        breaks.append((span.start - offset, span.stype))
        breaks.append((span.stop - offset, "n"))

    return highlight_concepts(text, breaks)

---

### Processors for each of the subtasks:

In [8]:
def flat_repr(id_set):
    """
    Makes a string out of the ID frozenset: every identifier becomes
    "uid_type:uid", and the sorted pieces are joined with "|".
    """
    labels = []
    for ident in id_set:
        labels.append("{0}:{1}".format(ident.uid_type, ident.uid))

    labels.sort()
    return "|".join(labels)

In [9]:
def grab_names(annotations):
    """
    Determines the unique names of the annotations.

    Names are compared case-insensitively within each annotation type;
    the first spelling encountered is the one that is kept. Returns a
    defaultdict mapping annotation type to an (unordered) set of names.
    """
    seen_folded = defaultdict(set) # lower case names already recorded (to avoid repeats)
    verbatim = defaultdict(set) # names exactly as written (to preserve capitalization)
    for mention in annotations:
        kind = mention.stype
        folded = mention.text.lower()
        if folded in seen_folded[kind]:
            continue

        seen_folded[kind].add(folded)
        verbatim[kind].add(mention.text)

    return verbatim

In [10]:
def process_sentence_task(sentence, rel_pairs):
    """
    Builds the sentence-level verification tasks for one Sentence.

    sentence: Sentence object; its annotations, text, start, pmid
        and uid attributes are read.
    rel_pairs: iterable of (chemical_id, disease_id) identifier pairs.

    Returns a DataFrame with one row per pair; it has no rows and no
    columns when rel_pairs is empty.
    """
    columns = defaultdict(list)
    for chemical_id, disease_id in rel_pairs:
        wanted = (chemical_id, disease_id)
        mentions = [ann for ann in sentence.annotations if ann.uid in wanted]
        names = grab_names(mentions)

        chem_key = flat_repr(chemical_id)
        dise_key = flat_repr(disease_id)

        # (column, value) pairs, listed in the order the columns are created
        row = (
            ("pmid", sentence.pmid),
            ("form_sentence", highlight_text(sentence.text, sentence.start, mentions)),
            ("chemical_id", chem_key),
            ("disease_id", dise_key),
            ("chemical_name", add_simple_tag("span", "chemical", "/".join(names["chemical"]))),
            ("disease_name", add_simple_tag("span", "disease", "/".join(names["disease"]))),
            ("relation_pair_id", "{0}_{1}_{2}".format(sentence.pmid, chem_key, dise_key)),
            ("sentence_id", sentence.uid),
        )
        for column, value in row:
            columns[column].append(value)

    return pd.DataFrame(columns)

In [11]:
def process_abstract_task(paper, rel_pairs):
    """
    Makes a set of abstract-level tasks for one paper.

    paper: Paper object; its annotations, title, abstract and pmid
        attributes are read.
    rel_pairs: iterable of (chemical_id, disease_id) identifier pairs.

    Returns a DataFrame with one row per pair; it has no rows and no
    columns when rel_pairs is empty.
    """
    columns = defaultdict(list)
    for chemical_id, disease_id in rel_pairs:
        wanted = (chemical_id, disease_id)
        mentions = [ann for ann in paper.annotations if ann.uid in wanted]
        names = grab_names(mentions)

        # Annotations ending within the title length belong to the title;
        # those starting beyond it belong to the abstract, whose text is
        # shifted by the title length plus one.
        title_len = len(paper.title)
        in_title = [ann for ann in mentions if ann.stop <= title_len]
        in_abstract = [ann for ann in mentions if ann.start > title_len]

        title_html = highlight_text(paper.title, 0, in_title)
        abstract_html = highlight_text(paper.abstract, title_len + 1, in_abstract)
        abstract_html = create_sections(abstract_html)

        # (column, value) pairs, listed in the order the columns are created
        row = (
            ("pmid", paper.pmid),
            ("form_title", title_html),
            ("form_abstract", abstract_html),
            ("chemical_id", flat_repr(chemical_id)),
            ("disease_id", flat_repr(disease_id)),
            ("chemical_name", add_simple_tag("span", "chemical", "/".join(names["chemical"]))),
            ("disease_name", add_simple_tag("span", "disease", "/".join(names["disease"]))),
        )
        for column, value in row:
            columns[column].append(value)

    return pd.DataFrame(columns)

---

### Create the work units for CrowdFlower:

In [12]:
def create_work_units(dataset):
    """
    Given a list of Paper objects representing the abstracts
    we wish to find the CID relations in, this function
    creates the work units for the CrowdFlower tasks.

    CID relations are judged to be always true and no crowd
    worker ever sees that relation.

    Each sentence-bound non-CID relation can create one or
    multiple sentence-level work units, depending on how
    many sentences in that abstract contain the relationship.

    Each non-sentence bound relation creates one abstract-level
    work unit.

    Relation type classification is already done by the Paper
    objects.

    Returns a tuple (cid_relations, easy_units, hard_units):
    a dict mapping each pmid to that paper's CID relations, the
    DataFrame of sentence-level work units and the DataFrame of
    abstract-level work units. Both frames get a "uniq_id" column.
    An empty dataset yields an empty dict and two frames with no rows.
    """
    cid_relations = {}
    easy_units = []
    hard_units = []
    for paper in dataset:
        cid_pairs = paper.poss_relations["CID"]
        cid_relations[paper.pmid] = cid_pairs

        # create the sentence-level tasks (CID pairs are never shown to workers):
        for sentence in paper.sentences:
            work = sentence.poss_relations[False] - cid_pairs
            easy_units.append(process_sentence_task(sentence, work))

        # create the abstract-level tasks:
        hard_units.append(process_abstract_task(paper, paper.poss_relations["not_sentence_bound"]))

    return (
        cid_relations,
        _combine_units(easy_units, "bcv_final_eval_easy_{0}"),
        _combine_units(hard_units, "bcv_final_eval_hard_{0}"),
    )


def _combine_units(frames, id_template):
    """Stacks the per-item task frames into one frame and numbers its rows."""
    # pd.concat() raises ValueError when given an empty list (empty dataset)
    if frames:
        combined = pd.concat(frames).reset_index(drop = True)
    else:
        combined = pd.DataFrame()

    combined["uniq_id"] = [id_template.format(i) for i in range(len(combined))]
    return combined

In [13]:
# Build the per-paper CID relations plus the sentence-level (easy) and
# abstract-level (hard) work units for the whole test set.
cid_relations, easy_units, hard_units = create_work_units(testset)

In [14]:
# Persist the CID relations (keyed by pmid) for later use; these relations
# are never turned into work units.
with open("data/final_eval/cid_relations.pickle", "wb") as pickle_file:
    pickle.dump(cid_relations, pickle_file)

---

In [15]:
# (rows, columns) of the sentence-level work units.
easy_units.shape

(2940, 9)

In [16]:
# Preview the first sentence-level work units.
easy_units.head()

Unnamed: 0,chemical_id,chemical_name,disease_id,disease_name,form_sentence,pmid,relation_pair_id,sentence_id,uniq_id
0,MESH:D015738,"<span class=""chemical"">Famotidine</span>",MESH:D003693,"<span class=""disease"">delirium</span>","<span class=""chemical"">Famotidine</span>-assoc...",0,0_MESH:D015738_MESH:D003693,0_0,bcv_final_eval_easy_0
1,MESH:D006632,"<span class=""chemical"">histamine</span>",MESH:D014456,"<span class=""disease"">ulcers</span>","Famotidine is a <span class=""chemical"">histami...",0,0_MESH:D006632_MESH:D014456,0_1,bcv_final_eval_easy_1
2,MESH:D015738,"<span class=""chemical"">Famotidine</span>",MESH:D014456,"<span class=""disease"">ulcers</span>","<span class=""chemical"">Famotidine</span> is a ...",0,0_MESH:D015738_MESH:D014456,0_1,bcv_final_eval_easy_2
3,MESH:D015738,"<span class=""chemical"">famotidine</span>",MESH:D003693,"<span class=""disease"">delirium</span>",Although all of the currently available H2-rec...,0,0_MESH:D015738_MESH:D003693,0_2,bcv_final_eval_easy_3
4,MESH:D015738,"<span class=""chemical"">famotidine</span>",MESH:D003693,"<span class=""disease"">delirium</span>",The authors report on six cases of <span class...,0,0_MESH:D015738_MESH:D003693,0_3,bcv_final_eval_easy_4


In [17]:
# (rows, columns) of the abstract-level work units.
hard_units.shape

(2760, 8)

In [18]:
# Preview the first abstract-level work units.
hard_units.head()

Unnamed: 0,chemical_id,chemical_name,disease_id,disease_name,form_abstract,form_title,pmid,uniq_id
0,MESH:D006632,"<span class=""chemical"">histamine</span>",MESH:D003693,"<span class=""disease"">delirium</span>","Famotidine is a <span class=""chemical"">histami...","Famotidine-associated <span class=""disease"">de...",0,bcv_final_eval_hard_0
1,MESH:D000809,"<span class=""chemical"">angiotensin</span>",MESH:D007022,"<span class=""disease"">hypotension</span>",After a single oral dose of 4 mg/kg indomethac...,"Indomethacin induced <span class=""disease"">hyp...",1,bcv_final_eval_hard_1
2,MESH:D011239,"<span class=""chemical"">prednisolone</span>",MESH:D012595,"<span class=""disease"">systemic sclerosis/SSc</...",Scleroderma renal crisis (SRC) is a rare compl...,Late-onset scleroderma renal crisis induced by...,2,bcv_final_eval_hard_2
3,MESH:D011239,"<span class=""chemical"">prednisolone</span>",MESH:D057049,"<span class=""disease"">thrombotic microangiopat...",Scleroderma renal crisis (SRC) is a rare compl...,Late-onset scleroderma renal crisis induced by...,2,bcv_final_eval_hard_3
4,MESH:D016572,"<span class=""chemical"">cyclosporine</span>",MESH:D007674,"<span class=""disease"">scleroderma renal crisis...","<span class=""disease"">Scleroderma renal crisis...","Late-onset <span class=""disease"">scleroderma r...",2,bcv_final_eval_hard_4


### Add the test questions:

In [19]:
# Load the sentence-level test questions from a tab-separated file.
sent_test_ques = pd.read_csv("data/crowdflower/test_questions/job_764099_test_questions.tsv", sep = '\t')

In [20]:
# Preview the sentence-level test questions.
sent_test_ques.head()

Unnamed: 0,verify_relationship_gold,verify_relationship_gold_reason,chemical_id,chemical_name,disease_id,disease_name,form_sentence,original_job_id,pmid,relation_pair_id,uniq_id
0,no_relation,The receptors to 5-HT6 are related to psychoti...,D012701,"<span class=""chemical"">5-HT</span>",D011605,"<span class=""disease"">psychotic disorders</span>",These animal models were considered to reflect...,762850,20705401,20705401_D012701_D011605,bcv_easy_68_sent_dev_set_762850
1,no_relation,D-penicillamine was used to treat the patients.,D010396,"<span class=""chemical"">D-penicillamine</span>",D012594,"<span class=""disease"">localized scleroderma</s...","Case reports of 11 patients with severe, exten...",762850,2334179,2334179_D010396_D012594,bcv_easy_2399_sent_dev_set_762850
2,no_relation,The sentence says the affective disorders were...,D015016,"<span class=""chemical"">yohimbine</span>",D019964,"<span class=""disease"">affective disorders</span>",METHOD: Six patients with either obsessive com...,762850,1535072,1535072_D015016_D019964,bcv_easy_642_sent_dev_set_762850
3,no_relation,The sentence says calcium supplementation cann...,D002118,"<span class=""chemical"">calcium</span>",D013035,"<span class=""disease"">muscle spasms</span>",While severe hypokalemia may cause muscle weak...,762850,8492347,8492347_D002118_D013035,bcv_easy_1271_sent_dev_set_762850
4,no_relation,Dexrazoxane is being used to try and make the ...,D064730,"<span class=""chemical"">dexrazoxane</span>",D006402,"<span class=""disease"">hematologic toxicity</span>",Clinical trials in patients with brain metasta...,762850,15897593,15897593_D064730_D006402,bcv_easy_2793_sent_dev_set_762850


In [21]:
# Assign through df[col] rather than df.loc[:, col]: pandas >= 2.0 makes
# df.loc[:, col] = ... write into the existing column in place, which can
# keep the old dtype instead of adopting the dtype of the mapped values.
sent_test_ques["original_job_id"] = sent_test_ques["original_job_id"].map(int)

# Append a newline to every gold answer.
# NOTE(review): presumably the format the CrowdFlower upload expects - confirm.
sent_test_ques["verify_relationship_gold"] = sent_test_ques["verify_relationship_gold"].map(lambda x: "{0}\n".format(x))

# Every row loaded from the test question file gets _golden = "TRUE".
sent_test_ques["_golden"] = "TRUE"

In [22]:
# Load the abstract-level test questions from a tab-separated file.
abs_test_ques = pd.read_csv("data/crowdflower/test_questions/final_eval_abs_test_ques.tsv", sep = '\t')

In [23]:
# Preview the abstract-level test questions.
abs_test_ques.head()

Unnamed: 0,chemical_id,chemical_name,disease_id,disease_name,form_abstract,form_title,old_cf_work_unit_id,original_job_id,pmid,uniq_id,verify_relationship_gold,verify_relationship_gold_reason
0,D011188,"<span class=""chemical"">potassium</span>",D014202,"<span class=""disease"">tremor</span>",<p>BACKGROUND: The septo-hippocampal cholinerg...,The relationship between hippocampal acetylcho...,756391641,754530,12198388,bcv_hard_59_job_754530_testq,no_relation,The scientists investigated how genetics contr...
1,D020849,"<span class=""chemical"">raloxifene/raloxifene h...",D001851,"<span class=""disease"">osteopenia</span>","<p>BACKGROUND: Markers of fibrinolysis, thromb...","The effects of short-term <span class=""chemica...",756398159,754530,16167916,bcv_hard_92_job_754530_testq,no_relation,Women already with osteopenia were given the c...
2,D049971,"<span class=""chemical"">thiazide</span>",D006333,"<span class=""disease"">heart failure</span>",<p>BACKGROUND: A previous randomized controlle...,Spironolactone-induced renal insufficiency and...,756399808,754530,15632880,bcv_hard_46_job_754530_testq,no_relation,The text doesn't say that thiazide has anythin...
3,C016986,"<span class=""chemical"">apraclonidine/aponidine...",D004774,"<span class=""disease"">entropion</span>",We prospectively evaluated the adverse reactio...,Evaluation of adverse reactions of <span class...,756405204,754530,8590259,bcv_hard_54_job_754530_testq,yes_direct,"One subject developed ""mechanical entropion"" a..."
4,C016986,"<span class=""chemical"">apraclonidine/aponidine...",D003316,"<span class=""disease"">corneal abrasion</span>",We prospectively evaluated the adverse reactio...,Evaluation of adverse reactions of <span class...,756408483,754530,8590259,bcv_hard_51_job_754530_testq,yes_direct,One patient developed corneal abrasion 3 hours...


In [24]:
# Assign through df[col] rather than df.loc[:, col]: pandas >= 2.0 makes
# df.loc[:, col] = ... write into the existing column in place, which can
# keep the old dtype instead of adopting the dtype of the mapped values.
abs_test_ques["original_job_id"] = abs_test_ques["original_job_id"].map(int)

# Append a newline to every gold answer.
# NOTE(review): presumably the format the CrowdFlower upload expects - confirm.
abs_test_ques["verify_relationship_gold"] = abs_test_ques["verify_relationship_gold"].map(lambda x: "{0}\n".format(x))

# Every row loaded from the test question file gets _golden = "TRUE".
abs_test_ques["_golden"] = "TRUE"

In [25]:
# Stack the test questions on top of the generated sentence-level work units.
# Columns present in only one of the two frames are kept and filled with NaN
# for the rows of the other frame.
final_sent_task = pd.concat([sent_test_ques, easy_units])

In [26]:
# (rows, columns) of the combined sentence-level job data.
final_sent_task.shape

(3491, 13)

In [27]:
# Write the sentence-level job data as a tab-separated file, without the index.
final_sent_task.to_csv("data/crowdflower/data_for_final_eval_sent_task.tsv", sep = '\t', index = False)

In [28]:
# Stack the test questions on top of the generated abstract-level work units.
# Columns present in only one of the two frames are kept and filled with NaN
# for the rows of the other frame.
final_abs_task = pd.concat([abs_test_ques, hard_units])

In [29]:
# (rows, columns) of the combined abstract-level job data.
final_abs_task.shape

(2994, 13)

In [30]:
# Write the abstract-level job data as a tab-separated file, without the index.
final_abs_task.to_csv("data/crowdflower/data_for_final_eval_abs_task.tsv", sep = '\t', index = False)