# Final result aggregation for BioCreative V Task 3 evaluation dataset

Tong Shu Li<br>
Created on: 2015-08-19<br>
Last updated: 2015-08-19

In this notebook we will re-aggregate the results of both the abstract-level and the sentence-level CrowdFlower tasks, along with the CID relations we automatically extracted. The combined results will then be written to file in the PubTator format.

In [1]:
from collections import defaultdict
import pandas as pd
import pickle

In [2]:
from src.data_model import *

In [3]:
def flat_repr(id_set):
    """
    Build one string out of a collection of identifier objects.

    Every member is rendered as "<uid_type>:<uid>"; the rendered pieces are
    sorted and then joined with "|".
    """
    pieces = ["{0}:{1}".format(member.uid_type, member.uid) for member in id_set]
    pieces.sort()
    return "|".join(pieces)

---

### Read the abstract-level task results:

In [4]:
# Load the abstract-level task results (tab-separated file) into a DataFrame.
abs_res = pd.read_csv("data/final_eval/results/abstract_relation_res.tsv", sep = '\t')

### Read the sentence-level task abstract level results:

In [5]:
# Load the sentence-level task results (tab-separated file) into a DataFrame.
sent_res = pd.read_csv("data/final_eval/results/sentence_relation_abs_res.tsv", sep = '\t')

In [6]:
# Rename "percent_agree" to "norm_conf_score", the column name that
# get_data() selects from both result tables.
abs_res = abs_res.rename(columns = {"percent_agree": "norm_conf_score"})

In [7]:
# Rename the sentence-task columns to the shared names "norm_conf_score" and
# "num_votes" so they line up with the abstract-task table.
sent_res = sent_res.rename(columns = {"conf_score": "norm_conf_score",
                                      "score_vote_max": "num_votes"})

In [8]:
# Preview the first rows of the abstract-level results after the rename.
abs_res.head()

Unnamed: 0,uniq_id,verify_relationship,conf_score,num_votes,norm_conf_score,pmid,unit_id,chemical_id,disease_id
0,bcv_final_eval_hard_0,yes_direct,0.8182,1,0.198438,0,773935948,MESH:D006632,MESH:D003693
1,bcv_final_eval_hard_1,yes_direct,0.0,0,0.0,1,773935949,MESH:D000809,MESH:D007022
2,bcv_final_eval_hard_10,yes_direct,0.0,0,0.0,5,773935958,MESH:D013498,MESH:D003556
3,bcv_final_eval_hard_100,yes_direct,0.0,0,0.0,21,773936048,MESH:D000431,MESH:D012893
4,bcv_final_eval_hard_1000,yes_direct,0.0,0,0.0,200,773936948,MESH:D004837,MESH:D009202


In [9]:
# Preview the first rows of the sentence-level results after the rename.
sent_res.head()

Unnamed: 0,chemical_id,norm_conf_score,disease_id,pmid,num_votes,sentence_ids,uniq_ids,unit_ids
0,MESH:D006632,0.0,MESH:D014456,0,0,0_1,bcv_final_eval_easy_1,773931696
1,MESH:D015738,1.0,MESH:D003693,0,5,0_0|0_2|0_3,bcv_final_eval_easy_0|bcv_final_eval_easy_3|bc...,773931695|773931698|773931699
2,MESH:D015738,0.0,MESH:D014456,0,0,0_1,bcv_final_eval_easy_2,773931697
3,MESH:D012964,0.189597,MESH:D007022,1,1,1_0,bcv_final_eval_easy_5,773931700
4,MESH:D000305,0.802824,MESH:D007674,2,4,2_4,bcv_final_eval_easy_10,773931705


### Read the automatically determined CID relations:

In [10]:
# Load the automatically extracted CID relations; get_final_res() later
# indexes this object by pmid.
# NOTE(review): pickle.load can execute arbitrary code while unpickling, so
# this file must come from a trusted source.
with open("data/final_eval/cid_relations.pickle", "rb") as fin:
    cid_rels = pickle.load(fin)

### Read the original annotated file into memory:

We also need to write the annotations to file. Since our data model performed some simple acronym resolution, we need to write these to file as well.

In [11]:
# Directory and file name of the annotated test set.
loc = "data/final_eval"
fname = "CDR_annotated_testset.txt"

# parse_input is not defined in this notebook (presumably it comes from
# src.data_model -- confirm). The result is later indexed as testset[pmid].
testset = parse_input(loc, fname, is_gold = False, return_format = "dict")

### Combine results together and write to file:

Now that we have everything read into memory, we will for each paper:
1. Print the text and annotations to file.
2. Combine the relations together.
3. Split into two sets: all relations, and only those with >= 4 votes.
4. Write to file.

### Combine relations together using conf_score and votes

In [None]:
def get_data(dataframe, pmid):
    """
    Grab the relation rows of one paper from the dataframe.

    Parameters:
        dataframe: DataFrame with at least the columns "pmid", "chemical_id",
            "disease_id", "norm_conf_score" and "num_votes".
        pmid: value matched against the "pmid" column.

    Returns:
        A new DataFrame holding only the five columns above for the matching
        rows, or the string "empty" when no row matches.
    """
    temp = dataframe.query("pmid == {0}".format(pmid))
    if temp.empty:
        # Callers detect the "no rows" case with isinstance(result, str),
        # so the string sentinel is kept as is.
        return "empty"

    # Return an independent copy: callers add a column to the result, and
    # assigning into a slice of the source frame triggers pandas'
    # SettingWithCopyWarning.
    return temp[["pmid", "chemical_id", "disease_id", "norm_conf_score", "num_votes"]].copy()

def get_final_res(num_papers=500):
    """
    Combine the crowd results and the automatic CID relations of every paper.

    For each pmid in range(num_papers), collect the rows of the abstract-level
    table (abs_res), the sentence-level table (sent_res) and the automatically
    extracted relations (cid_rels), tagging each row with its source in a
    "rel_origin" column.

    Parameters:
        num_papers: number of papers to process; pmids 0 .. num_papers - 1
            are used. Defaults to 500.

    Returns:
        One DataFrame with the columns "pmid", "chemical_id", "disease_id",
        "norm_conf_score", "num_votes" and "rel_origin". It is empty when no
        paper has any relation.
    """
    columns = ["pmid", "chemical_id", "disease_id",
               "norm_conf_score", "num_votes", "rel_origin"]
    crowd_sources = ((abs_res, "abstract_task"), (sent_res, "sentence_task"))

    final_ans = []
    for pmid in range(num_papers):
        this_paper_rels = []

        # crowd relations (abstract task first, then sentence task)
        for dataframe, origin in crowd_sources:
            temp = get_data(dataframe, pmid)
            if not isinstance(temp, str):
                # assign() builds a new frame rather than writing into the
                # one returned by get_data
                this_paper_rels.append(temp.assign(rel_origin=origin))

        # automatically extracted relations get a score of 1 and 5 votes
        paper_cid_rels = cid_rels[pmid]
        if paper_cid_rels: # not empty
            things = defaultdict(list)
            for cid_relation in paper_cid_rels:
                things["pmid"].append(pmid)
                things["chemical_id"].append(flat_repr(cid_relation[0]))
                things["disease_id"].append(flat_repr(cid_relation[1]))
                things["norm_conf_score"].append(1)
                things["num_votes"].append(5)
                things["rel_origin"].append("cid_relation")

            this_paper_rels.append(pd.DataFrame(things))

        if this_paper_rels:
            # some papers have no detectable relations (eg, pmid 250)
            final_ans.append(pd.concat(this_paper_rels))

    if not final_ans:
        # pd.concat raises on an empty list; return an empty table instead
        return pd.DataFrame(columns=columns)

    return pd.concat(final_ans)

In [None]:
# Build the combined relation table for all papers.
final_results = get_final_res()

In [None]:
# Display the combined relation table.
final_results

In [None]:
# Count how often each identifier prefix (the text before the first ":")
# occurs among the chemical ids.
final_results["chemical_id"].map(lambda v: v.split(":")[0]).value_counts()

In [None]:
# Count how often each identifier prefix (the text before the first ":")
# occurs among the disease ids.
final_results["disease_id"].map(lambda v: v.split(":")[0]).value_counts()

The chemical and disease identifiers are all single MeSH identifiers rather than complexes of multiple identifiers. This will make printing easier.

### Define how a paper is printed to file:

In [None]:
def print_paper_obj(paper, relations, fout):
    """
    Write one paper, its annotations and its relations to an open file.

    Parameters:
        paper: object exposing pmid, title, abstract and annotations; every
            annotation exposes start, stop, text, stype and flat_repr().
        relations: DataFrame with the columns "chemical_id", "disease_id"
            and "norm_conf_score".
        fout: writable text file object.

    The output is the title line ("pmid|t|title"), the abstract line
    ("pmid|a|abstract"), one tab-separated line per annotation, and one
    tab-separated "CID" line per relation, highest confidence score first.
    """
    fout.write("{0}|t|{1}\n".format(paper.pmid, paper.title))
    fout.write("{0}|a|{1}\n".format(paper.pmid, paper.abstract))

    # write the annotations to file
    # since we did not generate the annotations,
    # we will just write a conf score of 1.0 for all of them
    for annotation in paper.annotations:
        fout.write("{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\n".format(paper.pmid, annotation.start,
                                                   annotation.stop, annotation.text,
                                                   annotation.stype.capitalize(),
                                                   annotation.flat_repr(), "1.0"))

    # write the relations to file
    # DataFrame.sort() was removed from pandas; sort_values() replaces it
    relations = relations.sort_values("norm_conf_score", ascending = False)
    for _, row in relations.iterrows():
        fout.write("{0}\tCID\t{1}\t{2}\t{3}\n".format(paper.pmid, row["chemical_id"],
                                                     row["disease_id"], row["norm_conf_score"]))

### Print everything to file:

In [None]:
# all possible relations, including ones we are confident are false
with open("data/final_eval/results/crowd_all_relations.txt", "w") as fout:
    for pmid in range(500):
        # every combined relation belonging to this paper
        rels = final_results.query("pmid == {0}".format(pmid))
        print_paper_obj(testset[pmid], rels, fout)
        # blank line after each paper's block
        fout.write("\n")

In [None]:
# only the relations which we are confident are true (>= 4 positive votes)
with open("data/final_eval/results/crowd_good_relations.txt", "w") as fout:
    for pmid in range(500):
        # this paper's relations, restricted to those with num_votes >= 4
        rels = final_results.query("pmid == {0} and num_votes >= 4".format(pmid))
        print_paper_obj(testset[pmid], rels, fout)
        # blank line after each paper's block
        fout.write("\n")