# Work unit creator for 100 abstract test

Tong Shu Li<br>
Created on: 2015-08-25

Our 200-abstract run did poorly overall: the abstract-level task (#758438) was hit by spammers, and the sentence-level task (#761593) never completed because of a lack of test questions and worker judgment capping. We therefore want to run one more test on another 100 abstracts so that we can discuss the results and compare them to our official results on the evaluation dataset.

We will keep everything the same as the settings and parameters we used for the 500 evaluation dataset run, so that the results are directly comparable.

In [1]:
from collections import defaultdict
import os
import pandas as pd
import pickle
import random

In [2]:
from src.data_model import parse_input
from src.make_sections import create_sections

In [3]:
def add_simple_tag(tag_name, tag_class, text):
    """
    Wraps `text` in an HTML element named `tag_name` whose class
    attribute is `tag_class`. The text is inserted as-is (no escaping).
    """
    template = "<{tag} class=\"{css}\">{body}</{tag}>"
    return template.format(tag = tag_name, css = tag_class, body = text)

---

### Read the development set 100 abstracts for testing:

In [4]:
# Directory and file name of the processed development-set sample to read.
loc = "data/devset_100_test"
fname = "processed_CDR_devset.txt"

# parse_input is project code (src.data_model). With return_format = "list"
# it presumably returns a list of Paper objects -- TODO confirm.
testset = parse_input(loc, fname, is_gold = False, return_format = "list", fix_acronyms = True)

In [5]:
# Print the pmid of every parsed paper, one per line, as a record of
# exactly which abstracts are in this test set.
for paper in testset:
    print(paper.pmid)

9669632
20466178
18006530
7542793
7880714
6496797
7651879
15858223
2004015
16471092
11063349
3300918
7710775
15266362
12911170
8586822
1564236
17682013
1535072
19370593
2894433
8111719
1969772
18657397
10910842
11868798
10328196
9201797
6892185
11999899
6454943
11282081
18483878
7437994
2598570
10565806
33969
21418164
16225977
9041081
10523326
8480959
8305357
15096016
3703509
11704023
9098464
9875685
20683499
6111982
20533999
8686832
2917114
11524350
8267029
11705128
9270571
11860278
8492347
9564988
20042557
326460
15811908
11077455
2826064
3969369
3311455
16174948
8424298
1595783
6118280
1628552
15893386
10840460
12452237
2257294
10743694
19761039
12448656
3973521
18356633
2322844
15266215
10225068
8800187
803783
19135948
7650771
16192988
1610717
3088349
11337188
9636837
16596970
7803371
1423339
15565293
6150641
982002
3708922


In [35]:
# Sanity check: number of papers that were parsed.
len(testset)

100

### Highlighting functions:

In [6]:
def highlight_concepts(text, breaks):
    """
    Rebuilds `text` with HTML span tags around the regions to highlight.

    `breaks` holds (index, kind) pairs. They are sorted by index (stable
    for ties) and every pair opens a segment which runs up to the index
    of the following pair. Segments whose kind is not "n" are wrapped in
    a span using the kind as its class. Only text lying between the
    first and the last break is emitted.
    """
    ordered = sorted(breaks, key = lambda pair: pair[0])

    pieces = []
    # walk over consecutive pairs of breaks
    for current, following in zip(ordered, ordered[1:]):
        chunk = text[current[0] : following[0]]
        kind = current[1]
        pieces.append(chunk if kind == "n" else add_simple_tag("span", kind, chunk))

    return "".join(pieces)

In [7]:
def highlight_text(text, offset, uniq_spans):
    """
    Highlights the annotated concepts which fall inside `text`.

    `offset` is subtracted from the start/stop of every span so that
    the positions index into `text`. Each span contributes an opening
    break carrying its `stype` and a closing "n" (nothing) break.
    """
    # (index of break, type of break); the two initial breaks cover
    # the whole string with type "n" = nothing highlighted
    breaks = [(0, "n"), (len(text), "n")]

    for span in uniq_spans:
        breaks.extend([
            (span.start - offset, span.stype),
            (span.stop - offset, "n"),
        ])

    return highlight_concepts(text, breaks)

### Subtask processors:

In [8]:
def flat_repr(id_set):
    """
    Flattens a collection of identifiers into a single string: every
    member becomes "uid_type:uid", and the sorted pieces are joined
    with "|".
    """
    return "|".join(sorted("{0}:{1}".format(ident.uid_type, ident.uid) for ident in id_set))

In [9]:
def grab_names(annotations):
    """
    Collects the distinct mention texts of the annotations, grouped by
    annotation `stype`.

    Texts are compared case-insensitively, and the first spelling seen
    for each lower-cased text is the one kept. Returns a defaultdict
    mapping stype -> set of verbatim names.
    """
    seen_lower = defaultdict(set) # lower-cased names already recorded, per stype
    verbatim = defaultdict(set) # names exactly as written (to preserve capitalization)
    for ann in annotations:
        folded = ann.text.lower()
        already_seen = seen_lower[ann.stype]
        if folded in already_seen:
            continue

        already_seen.add(folded)
        verbatim[ann.stype].add(ann.text)

    return verbatim

In [10]:
def process_sentence_task(sentence, rel_pairs):
    """
    Given a Sentence object, and the set of chemical-disease relation
    identifier pairs, creates a set of sentence-level verification tasks.

    Returns a DataFrame with one row per (chemical_id, disease_id) pair
    in `rel_pairs`. Each row holds the pmid, the highlighted sentence,
    the flattened identifiers, the display names, a relation pair id
    and the sentence id.

    The display names are sorted before being joined with "/". They
    come from sets of strings, whose iteration order depends on string
    hash randomisation, so joining them unsorted would produce
    different text from one run to the next.
    """
    data = defaultdict(list)
    for chemical_id, disease_id in rel_pairs:
        # annotations of this sentence which belong to either concept of the pair
        spans = [annotation for annotation in sentence.annotations
                 if annotation.uid in (chemical_id, disease_id)]
        real_name = grab_names(spans)

        data["pmid"].append(sentence.pmid)

        # annotation offsets are shifted by sentence.start to index into sentence.text
        data["form_sentence"].append(highlight_text(sentence.text, sentence.start, spans))

        flat_chem_id = flat_repr(chemical_id)
        flat_dise_id = flat_repr(disease_id)

        data["chemical_id"].append(flat_chem_id)
        data["disease_id"].append(flat_dise_id)

        # sorted() gives a reproducible order for the "/"-separated names
        data["chemical_name"].append(add_simple_tag("span", "chemical", "/".join(sorted(real_name["chemical"]))))
        data["disease_name"].append(add_simple_tag("span", "disease", "/".join(sorted(real_name["disease"]))))

        data["relation_pair_id"].append("{0}_{1}_{2}".format(sentence.pmid, flat_chem_id, flat_dise_id))

        data["sentence_id"].append(sentence.uid)

    return pd.DataFrame(data)

In [11]:
def process_abstract_task(paper, rel_pairs):
    """
    Makes a set of abstract-level tasks for one paper.

    Returns a DataFrame with one row per (chemical_id, disease_id) pair
    in `rel_pairs`. Each row holds the pmid, the highlighted title and
    abstract, the flattened identifiers and the display names.

    The display names are sorted before being joined with "/". They
    come from sets of strings, whose iteration order depends on string
    hash randomisation, so joining them unsorted would produce
    different text from one run to the next.
    """
    data = defaultdict(list)
    title_len = len(paper.title)
    for chemical_id, disease_id in rel_pairs:
        # annotations of this paper which belong to either concept of the pair
        spans = [annotation for annotation in paper.annotations
                 if annotation.uid in (chemical_id, disease_id)]
        real_name = grab_names(spans)

        # annotations which end inside the title are highlighted in the title
        title_spans = [span for span in spans if span.stop <= title_len]
        form_title = highlight_text(paper.title, 0, title_spans)

        # annotations which start after the title belong to the abstract, whose
        # text is taken to begin one position after the title ends
        abstract_spans = [span for span in spans if span.start > title_len]
        form_abstract = highlight_text(paper.abstract, title_len + 1, abstract_spans)

        form_abstract = create_sections(form_abstract)

        data["pmid"].append(paper.pmid)

        data["form_title"].append(form_title)
        data["form_abstract"].append(form_abstract)

        flat_chem_id = flat_repr(chemical_id)
        flat_dise_id = flat_repr(disease_id)

        data["chemical_id"].append(flat_chem_id)
        data["disease_id"].append(flat_dise_id)

        # sorted() gives a reproducible order for the "/"-separated names
        data["chemical_name"].append(add_simple_tag("span", "chemical", "/".join(sorted(real_name["chemical"]))))
        data["disease_name"].append(add_simple_tag("span", "disease", "/".join(sorted(real_name["disease"]))))

    return pd.DataFrame(data)

## Generate CrowdFlower work units:

In [12]:
def create_work_units(dataset):
    """
    Creates the CrowdFlower work units for a list of Paper objects,
    i.e. the abstracts in which we wish to find the CID relations.

    CID relations are judged to be always true, so no crowd worker
    ever sees them; they are only collected per pmid.

    A sentence-bound non-CID relation yields one sentence-level work
    unit for every sentence of the abstract which contains it, so it
    may yield several units.

    A relation which is not sentence bound yields exactly one
    abstract-level work unit.

    The Paper objects have already classified the relation types.

    Returns the tuple (cid_relations, easy_units, hard_units): a dict
    of pmid -> CID relations, the sentence-level DataFrame and the
    abstract-level DataFrame. Both frames carry a `uniq_id` column.
    """
    def _combine(frames, id_template):
        # stack the per-sentence / per-paper frames and number the rows
        units = pd.concat(frames).reset_index(drop = True)
        units["uniq_id"] = pd.Series([id_template.format(i) for i in range(len(units))])
        return units

    cid_relations = {}
    sentence_frames = []
    abstract_frames = []
    for paper in dataset:
        relations = paper.poss_relations
        cid_relations[paper.pmid] = relations["CID"]

        # sentence-level tasks: the sentence's relations minus the CID ones
        sentence_frames.extend(
            process_sentence_task(sentence, sentence.poss_relations[False] - relations["CID"])
            for sentence in paper.sentences
        )

        # abstract-level tasks
        abstract_frames.append(process_abstract_task(paper, relations["not_sentence_bound"]))

    easy_units = _combine(sentence_frames, "bcv_devset_test_100_easy_{0}")
    hard_units = _combine(abstract_frames, "bcv_devset_test_100_hard_{0}")

    return (cid_relations, easy_units, hard_units)

In [13]:
# Build the per-paper CID relations plus the sentence-level (easy) and
# abstract-level (hard) work units for the 100 test abstracts.
cid_relations, easy_units, hard_units = create_work_units(testset)

In [14]:
# Save the CID relations (dict keyed by pmid) to disk; the file is opened
# in binary mode because pickle writes bytes.
with open("data/devset_100_test/devset_100_cid_relations.pickle", "wb") as fout:
    pickle.dump(cid_relations, fout)

---

In [15]:
# (rows, columns) of the sentence-level work units.
easy_units.shape

(649, 9)

In [16]:
# (rows, columns) of the abstract-level work units.
hard_units.shape

(540, 8)

In [17]:
# Preview the first sentence-level work units.
easy_units.head()

Unnamed: 0,chemical_id,chemical_name,disease_id,disease_name,form_sentence,pmid,relation_pair_id,sentence_id,uniq_id
0,MESH:D020117,"<span class=""chemical"">cisapride</span>",MESH:D043183,"<span class=""disease"">irritable bowel syndrome...","Effects of <span class=""chemical"">cisapride</s...",9669632,9669632_MESH:D020117_MESH:D043183,9669632_0,bcv_devset_test_100_easy_0
1,MESH:D020117,"<span class=""chemical"">cisapride</span>",MESH:D043183,"<span class=""disease"">IBS/irritable bowel synd...",Our aim was to assess the effects of long-term...,9669632,9669632_MESH:D020117_MESH:D043183,9669632_2,bcv_devset_test_100_easy_1
2,MESH:D020117,"<span class=""chemical"">cisapride</span>",MESH:D043183,"<span class=""disease"">IBS</span>",METHODS: Thirty-eight patients with <span clas...,9669632,9669632_MESH:D020117_MESH:D043183,9669632_3,bcv_devset_test_100_easy_2
3,MESH:D020117,"<span class=""chemical"">cisapride</span>",MESH:D003967,"<span class=""disease"">diarrhoea</span>",METHODS: Thirty-eight patients with IBS (const...,9669632,9669632_MESH:D020117_MESH:D003967,9669632_3,bcv_devset_test_100_easy_3
4,MESH:D020117,"<span class=""chemical"">cisapride</span>",MESH:D003248,"<span class=""disease"">constipation</span>",METHODS: Thirty-eight patients with IBS (<span...,9669632,9669632_MESH:D020117_MESH:D003248,9669632_3,bcv_devset_test_100_easy_4


In [18]:
# Preview the first abstract-level work units.
hard_units.head()

Unnamed: 0,chemical_id,chemical_name,disease_id,disease_name,form_abstract,form_title,pmid,uniq_id
0,MESH:D020117,"<span class=""chemical"">cisapride</span>",MESH:D015746,"<span class=""disease"">abdominal pain</span>",<p>BACKGROUND: Irritable bowel syndrome is a c...,"Effects of <span class=""chemical"">cisapride</s...",9669632,bcv_devset_test_100_hard_0
1,MESH:D020117,"<span class=""chemical"">cisapride</span>",MESH:D005767,"<span class=""disease"">disordered gastrointesti...",<p>BACKGROUND: Irritable bowel syndrome is a c...,"Effects of <span class=""chemical"">cisapride</s...",9669632,bcv_devset_test_100_hard_1
2,MESH:C117268,"<span class=""chemical"">pimecrolimus</span>",MESH:D003875,"<span class=""disease"">eruptions</span>",We describe herein 3 patients who developed ro...,Rosaceiform dermatitis associated with topical...,20466178,bcv_devset_test_100_hard_2
3,MESH:C117268,"<span class=""chemical"">pimecrolimus</span>",MESH:D012393,"<span class=""disease"">rosacea</span>",We describe herein 3 patients who developed <s...,Rosaceiform dermatitis associated with topical...,20466178,bcv_devset_test_100_hard_3
4,MESH:D016559,"<span class=""chemical"">tacrolimus</span>",MESH:D013684,"<span class=""disease"">telangiectasia</span>",We describe herein 3 patients who developed ro...,Rosaceiform dermatitis associated with topical...,20466178,bcv_devset_test_100_hard_4


---

In [19]:
# Load the gold report of job 767273 (per the file name); its rows are used
# below as the test questions of the abstract-level task.
abs_test_ques = pd.read_csv("data/crowdflower/gold_reports/job_767273_gold_report.csv", sep = ',')

In [20]:
# Size of the gold report before any filtering.
abs_test_ques.shape

(234, 21)

In [21]:
# Keep only the rows whose `_hidden` flag is False.
# NOTE(review): presumably `_hidden` marks test questions which were disabled
# in the earlier job -- confirm.
abs_test_ques = abs_test_ques.query("~_hidden")

In [22]:
# Size after dropping the hidden rows.
abs_test_ques.shape

(228, 21)

In [23]:
# columns to keep
columns = ["verify_relationship_gold",
           "verify_relationship_gold_reason",
           "chemical_id", "chemical_name",
           "disease_id", "disease_name",
           "form_title", "form_abstract", "original_job_id",
           "pmid", "uniq_id", "old_cf_work_unit_id"]

In [24]:
# Keep only the needed columns. The explicit copy makes this an independent
# frame, so the column assignments made on it later do not act on a slice
# of the original report (which raises pandas' SettingWithCopyWarning).
abs_test_ques = abs_test_ques[columns].copy()

In [25]:
# Distribution of the gold answers among the abstract-level test questions.
abs_test_ques["verify_relationship_gold"].value_counts()

no_relation    134
yes_direct      94
dtype: int64

In [26]:
# Load the tab-separated test questions of job 764099 (per the file name);
# they are used below as the test questions of the sentence-level task.
sent_test_ques = pd.read_csv("data/crowdflower/test_questions/job_764099_test_questions.tsv", sep = '\t')

In [27]:
# Store the original job id as an integer. The column is assigned by name so
# that it is replaced together with its dtype; `.loc[:, col] = ...` writes
# into the existing column in place on recent pandas versions and can leave
# a float column as float, undoing the conversion.
sent_test_ques["original_job_id"] = sent_test_ques["original_job_id"].map(int)

# Every gold answer gets a trailing newline.
# NOTE(review): presumably the format CrowdFlower expects for gold values -- confirm.
sent_test_ques["verify_relationship_gold"] = sent_test_ques["verify_relationship_gold"].map(lambda x: "{0}\n".format(x))

# Flag every row as a test (gold) question.
sent_test_ques["_golden"] = "TRUE"

In [28]:
# Store the original job id as an integer. The column is assigned by name so
# that it is replaced together with its dtype; `.loc[:, col] = ...` writes
# into the existing column in place on recent pandas versions and can leave
# a float column as float, undoing the conversion.
abs_test_ques["original_job_id"] = abs_test_ques["original_job_id"].map(int)

# Every gold answer gets a trailing newline.
# NOTE(review): presumably the format CrowdFlower expects for gold values -- confirm.
abs_test_ques["verify_relationship_gold"] = abs_test_ques["verify_relationship_gold"].map(lambda x: "{0}\n".format(x))

# Flag every row as a test (gold) question.
abs_test_ques["_golden"] = "TRUE"

In [29]:
# Sentence-level upload: test questions first, then the new work units.
# Columns present in only one of the two frames are filled with NaN.
final_sent_task = pd.concat([sent_test_ques, easy_units])

In [30]:
# Abstract-level upload: test questions first, then the new work units.
# Columns present in only one of the two frames are filled with NaN.
final_abs_task = pd.concat([abs_test_ques, hard_units])

In [31]:
# Total rows (test questions + work units) of the sentence-level task.
final_sent_task.shape

(1200, 13)

In [32]:
# Total rows (test questions + work units) of the abstract-level task.
final_abs_task.shape

(768, 13)

In [33]:
# Write the sentence-level data as a TSV without the index column.
# NOTE(review): the file name ends in "job_" with no job number -- presumably
# filled in once the CrowdFlower job exists; confirm.
final_sent_task.to_csv("data/crowdflower/data_for_sent_task_job_.tsv", sep = '\t', index = False)

In [34]:
# Write the abstract-level data as a TSV without the index column.
# NOTE(review): the file name ends in "job_" with no job number -- presumably
# filled in once the CrowdFlower job exists; confirm.
final_abs_task.to_csv("data/crowdflower/data_for_abs_task_job_.tsv", sep = '\t', index = False)