# Make test questions for CrowdFlower refinement

Tong Shu Li<br>
Created on: 2015-12-21<br>
Last updated: 2015-12-21

Now we will make test questions for CrowdFlower. Since our task interface has changed, we will generate the test questions from scratch.

In [1]:
from collections import defaultdict
import numpy as np
import os
import pandas as pd
import sys

In [2]:
# Add the parent directory to the import search path so that the `src`
# package imported in the next cell can be found when the notebook is run
# from its own directory.
sys.path.append("..")

In [3]:
from src.data_model import Relation
from src.data_model import parse_file
from src.make_cf_work_units import create_work_units

In [4]:
# Seeded random number generator used by `subsample` below, so that a fresh
# top-to-bottom run draws the same test questions every time.
# The generator is stateful: re-running a sampling cell without re-running
# this cell first continues the random stream and draws a different sample.
rand_state = np.random.RandomState(seed = 4289623645)

---

### Read the 600 abstracts we will use for test questions

In [5]:
# Parse the training-set abstracts that were set aside for test questions.
# NOTE(review): the first argument looks like the location of a pickle cache
# for the parsed result -- confirm against src.data_model.parse_file.
save_loc = os.path.join(
    "..", "data", "refinement", "CDR_train_for_test_ques.pickle"
)
train_tq = parse_file(
    save_loc,
    loc = "../data/refinement",
    fname = "CDR_train_for_test_ques.txt",
    fix_acronyms = False,
)

In [6]:
# Parse the development-set abstracts that were set aside for test questions.
# NOTE(review): the first argument looks like the location of a pickle cache
# for the parsed result -- confirm against src.data_model.parse_file.
save_loc = os.path.join(
    "..", "data", "refinement", "CDR_dev_for_test_ques.pickle"
)
dev_tq = parse_file(
    save_loc,
    loc = "../data/refinement",
    fname = "CDR_dev_for_test_ques.txt",
    fix_acronyms = False,
)

In [7]:
# Combine the train and dev abstracts into a single mapping. The copy keeps
# `train_tq` itself unmodified; `update` then adds every dev entry, with dev
# entries replacing train entries if a key appears in both.
# NOTE(review): assumes parse_file returns a dict-like keyed by PMID and that
# the two files share no PMIDs -- confirm.
tq_data = train_tq.copy()
tq_data.update(dev_tq)

In [8]:
len(tq_data)

600

---

### Generate work units

In [9]:
# Generate the CrowdFlower work units for all abstracts in `tq_data`.
# NOTE(review): the second argument appears to be the prefix used for each
# row's `uniq_id` (see the rows displayed below) -- confirm against
# src.make_cf_work_units.create_work_units.
# `cid` is not used again in this notebook.
cid, work_units = create_work_units(tq_data, "refine_test_ques_all")

In [10]:
len(work_units)

6851

In [11]:
work_units.head(2)

Unnamed: 0,chemical_id,chemical_name,disease_id,disease_name,form_body,form_title,pmid,rel_origin,uniq_id
0,MESH:D020117,"<span class=""chemical"">cisapride/Cisapride</span>",MESH:D043183,"<span class=""disease"">IBS/irritable bowel synd...","<p>BACKGROUND: <span class=""disease"">Irritable...","<span class=""sentence"">Effects of <span class=...",9669632,sent,refine_test_ques_all_0
1,MESH:D020117,"<span class=""chemical"">cisapride/Cisapride</span>",MESH:D003967,"<span class=""disease"">diarrhoea/Diarrhoea</span>",<p>BACKGROUND: Irritable bowel syndrome is a c...,"Effects of <span class=""chemical"">cisapride</s...",9669632,sent,refine_test_ques_all_1


---

### Check against gold standard to see if each possible relation is true

In [12]:
def get_all_gold_rels(dataset):
    """Flatten the gold standard relations of every paper into one DataFrame.

    Args:
        dataset: mapping of PMID -> paper object. Each paper must expose
            `gold_relations`, an iterable of relations that have
            `chem.flat_repr`, `dise.flat_repr` and `origin` attributes.

    Returns:
        A DataFrame with one row per gold relation and the columns `pmid`,
        `chemical_id`, `disease_id`, `rel_origin` and `in_gold` (always 1).
        The key columns are present even when `dataset` holds no gold
        relations, so the result always has the columns needed to merge on.
    """
    key_columns = ["pmid", "chemical_id", "disease_id", "rel_origin"]

    # One record per (paper, gold relation) pair, in iteration order.
    rows = [
        (pmid, relation.chem.flat_repr, relation.dise.flat_repr, relation.origin)
        for pmid, paper in dataset.items()
        for relation in paper.gold_relations
    ]

    # Passing the column names explicitly keeps the schema stable for empty
    # input, where a dict-of-lists would yield a frame without any columns.
    res = pd.DataFrame(rows, columns = key_columns)
    res["in_gold"] = 1
    return res

In [13]:
gold_rels = get_all_gold_rels(tq_data)

In [14]:
gold_rels.head()

Unnamed: 0,chemical_id,disease_id,pmid,rel_origin,in_gold
0,MESH:D020117,MESH:D015746,9669632,abs,1
1,MESH:D011718,MESH:D000857,19674115,sent,1
2,MESH:D014700,MESH:D018376,9100294,sent,1
3,MESH:D009543,MESH:D018376,9100294,sent,1
4,MESH:D020748,MESH:D018376,9100294,CID,1


In [15]:
# Left-join the gold relations onto the work units: every work unit is kept,
# and `in_gold` is missing for work units with no matching gold relation.
# This cell reassigns `work_units`, so it is meant to be run once on the
# freshly generated work units.
merge_keys = ["pmid", "chemical_id", "disease_id", "rel_origin"]
work_units = work_units.merge(gold_rels, how = "left", on = merge_keys)

In [16]:
# Work units without a matching gold relation come out of the left merge with
# a missing `in_gold`; treat them as "not in gold" (0) and store the flag as
# an integer.
# Plain column assignment replaces the column outright, so the integer dtype
# is kept. Assigning through `.loc[:, "in_gold"]` writes into the existing
# float column on newer pandas versions, which leaves the flag as 0.0/1.0.
work_units["in_gold"] = work_units["in_gold"].fillna(0).astype(int)

In [17]:
work_units.shape

(6851, 10)

In [18]:
work_units.head(2)

Unnamed: 0,chemical_id,chemical_name,disease_id,disease_name,form_body,form_title,pmid,rel_origin,uniq_id,in_gold
0,MESH:D020117,"<span class=""chemical"">cisapride/Cisapride</span>",MESH:D043183,"<span class=""disease"">IBS/irritable bowel synd...","<p>BACKGROUND: <span class=""disease"">Irritable...","<span class=""sentence"">Effects of <span class=...",9669632,sent,refine_test_ques_all_0,0
1,MESH:D020117,"<span class=""chemical"">cisapride/Cisapride</span>",MESH:D003967,"<span class=""disease"">diarrhoea/Diarrhoea</span>",<p>BACKGROUND: Irritable bowel syndrome is a c...,"Effects of <span class=""chemical"">cisapride</s...",9669632,sent,refine_test_ques_all_1,0


In [19]:
# Report how many work units fall into each (rel_origin, in_gold) category.
category_groups = work_units.groupby(["rel_origin", "in_gold"])
for category, members in category_groups:
    print("{}: {}".format(category, len(members)))

('abs', 0): 3597
('abs', 1): 381
('sent', 0): 2089
('sent', 1): 784


As previously noted, most of the true gold standard relations co-occur in at least one sentence (784 sentence-level versus 381 abstract-level work units are in the gold standard).

### Selecting work units as test questions

We will take 200 work units from each of the four categories (abs/sent × in/not in gold) as test questions, for a total of 800 test questions.

In [20]:
def subsample(dataset, N, random_state = None):
    """Draw N rows from every (rel_origin, in_gold) category of `dataset`.

    Args:
        dataset: DataFrame with `rel_origin` and `in_gold` columns.
        N: number of rows to sample, without replacement, from each category.
        random_state: random state passed to `DataFrame.sample`. Defaults to
            the notebook-level `rand_state` generator, so successive calls
            continue the same random stream.

    Returns:
        A DataFrame with the sampled rows of all categories concatenated in
        groupby order and re-indexed from 0.

    Raises:
        ValueError: if a category has fewer than N rows, or if `dataset`
            has no categories to sample from.
    """
    if random_state is None:
        random_state = rand_state

    # Sample from the frame that was passed in, not from a notebook global.
    res = [
        group.sample(n = N, random_state = random_state)
        for _, group in dataset.groupby(["rel_origin", "in_gold"])
    ]

    return pd.concat(res).reset_index(drop = True)

In [21]:
test_ques = subsample(work_units, 200)

In [22]:
test_ques.shape

(800, 10)

In [23]:
test_ques.head()

Unnamed: 0,chemical_id,chemical_name,disease_id,disease_name,form_body,form_title,pmid,rel_origin,uniq_id,in_gold
0,MESH:D013726,"<span class=""chemical"">terbutaline/Terbutaline...",MESH:D002658,"<span class=""disease"">neurodevelopmental disor...","Autism is a <span class=""disease"">neurodevelop...",Neuroinflammation and behavioral abnormalities...,17400887,abs,refine_test_ques_all_2404,0
1,MESH:C005618,"<span class=""chemical"">benzoylecgonine</span>",MESH:D012206,"<span class=""disease"">rhabdomyolysis</span>",We describe an outbreak of deaths from cocaine...,Fatal excited delirium following cocaine use: ...,8988571,abs,refine_test_ques_all_6325,0
2,MESH:D011188,"<span class=""chemical"">potassium</span>",MESH:D000435,"<span class=""disease"">acute alcohol intoxicati...",Atrial fibrillation in young patients without ...,Lone atrial fibrillation associated with creat...,15899738,abs,refine_test_ques_all_559,0
3,MESH:D015215,"<span class=""chemical"">zidovudine/3'-azido-2',...",MESH:D009205,"<span class=""disease"">myocarditis</span>",<p>BACKGROUND: Dilated cardiomyopathy (DCM) an...,Myocardial Fas ligand expression increases sus...,17943461,abs,refine_test_ques_all_6381,0
4,MESH:D003404,"<span class=""chemical"">creatinine</span>",MESH:D006947,"<span class=""disease"">hyperkalemia</span>",<p>BACKGROUND: A previous randomized controlle...,Spironolactone-induced renal insufficiency and...,15632880,abs,refine_test_ques_all_6356,0


In [24]:
# Expected answer for each test question: relations found in the gold
# standard get "yes_direct", all others "no_relation".
# NOTE(review): the trailing newline in both answers looks deliberate for the
# CrowdFlower gold answer format -- confirm before changing it.
test_ques["rel_resp_gold"] = [
    "yes_direct\n" if flag == 1 else "no_relation\n"
    for flag in test_ques["in_gold"]
]

# No explanation is attached to the expected answers.
test_ques["rel_resp_gold_reason"] = ""
# NOTE(review): "_golden" presumably flags the rows as test questions for
# CrowdFlower -- confirm against the platform's upload format.
test_ques["_golden"] = "TRUE"

In [25]:
# Write the test questions as a tab-separated file, without the row index,
# into the current working directory.
test_ques.to_csv("refine_800_test_ques.tsv", sep = '\t', index = False)