# Confusion matrix for BioCreative Database paper

Tong Shu Li<br>
Created on: 2016-01-21<br>
Last updated: 2016-01-21

We need to generate the confusion matrix at the request of the reviewers.

In [1]:
from collections import defaultdict
import os
import pandas as pd
import sys

In [2]:
sys.path.append("..")

In [3]:
from src.data_model import *
from src.eval_perf import official_F_score

In [4]:
TRIPLE = ["pmid", "chemical_id", "disease_id"]

---

In [5]:
def get_triples(dataframe):
    return set(
        dataframe[TRIPLE].apply(
            lambda row: (int(row["pmid"]), row["chemical_id"], row["disease_id"]),
            axis = 1
        )
    )

## Read the original crowd results

In [6]:
paper_mapping = save_file("testset_mapping.pickle")

In [7]:
def remap(papers):
    res = dict()
    for uid, paper in papers.items():
        paper.pmid = paper_mapping[uid]
        res[paper_mapping[uid]] = paper
        
    return res

In [8]:
def read_orig():
    save_loc = os.path.join("..", "data", "final_eval", "results", "new_crowd_all_rels.pickle")
    res = save_file(save_loc)
    
    if res is not None:
        return res
    
    loc = os.path.join("..", "data", "final_eval", "results")    
    res = parse_input(loc, "crowd_all_relations.txt", fix_acronyms = True)
    
    res = remap(res)
    
    save_file(save_loc, res)
    return res

In [9]:
crowd_full = read_orig()

---

## Read the gold standard

In [10]:
save_loc = os.path.join("..", "data", "gold_standard", "new_test_gold.pickle")
loc = os.path.join("..", "data", "gold_standard")
eval_gold = parse_file(save_loc, loc = loc, fname = "CDR_TestSet.txt", fix_acronyms = False)

---

## Read the crowdsourcing responses

In [11]:
loc = os.path.abspath(os.path.join("..", "data", "final_eval", "results", "abstract_relation_res.tsv"))

abs_res = pd.read_csv(loc, sep = '\t')
abs_res = abs_res.rename(columns = {"percent_agree": "norm_conf_score"})

In [12]:
loc = os.path.abspath(os.path.join("..", "data", "final_eval", "results", "sentence_relation_abs_res.tsv"))
sent_res = pd.read_csv(loc, sep = '\t')

sent_res = sent_res.rename(columns = {"conf_score": "norm_conf_score",
                                      "score_vote_max": "num_votes"})

In [13]:
abs_res.loc[:, "pmid"] = abs_res.loc[:, "pmid"].map(lambda val: paper_mapping[val])

In [14]:
sent_res.loc[:, "pmid"] = sent_res.loc[:, "pmid"].map(lambda val: paper_mapping[val])

In [15]:
cid_rels = {pmid: paper.poss_relations["CID"] for pmid, paper in crowd_full.items()}

In [16]:
def merge_results():
    temp = []    
    cols = ["pmid", "chemical_id", "disease_id", "num_votes"]
    
    sub = abs_res.loc[:, cols]
    sub.loc[:, "rel_origin"] = "abstract_task"
    temp.append(sub)

    sub = sent_res.loc[:, cols]
    sub.loc[:, "rel_origin"] = "sentence_task"
    temp.append(sub)
    
    things = defaultdict(list)
    for pmid, rels in cid_rels.items():
        for cid_relation in rels:
            things["pmid"].append(pmid)
            things["chemical_id"].append(cid_relation[0].flat_repr)
            things["disease_id"].append(cid_relation[1].flat_repr)
            
    things = pd.DataFrame(things)
    things.loc[:, "num_votes"] = 5
    things.loc[:, "rel_origin"] = "cid_relation"
    temp.append(things)
    
    ans = pd.concat(temp).sort(["pmid", "num_votes", "rel_origin",
                                "chemical_id", "disease_id"]).reset_index(drop = True)
    
    ans.loc[:, "num_votes"] = ans.num_votes.map(
        lambda v: 5 if v > 5 else v
    )
    
    return ans

In [17]:
crowd_res = merge_results()



In [18]:
crowd_res.head()

Unnamed: 0,chemical_id,disease_id,num_votes,pmid,rel_origin
0,MESH:C009695,MESH:D000699,0,35781,sentence_task
1,MESH:C009695,MESH:D002375,0,35781,sentence_task
2,MESH:D003000,MESH:D000699,0,35781,sentence_task
3,MESH:D009278,MESH:D002375,0,35781,sentence_task
4,MESH:D009638,MESH:D000699,1,35781,abstract_task


---

In [19]:
def get_gold_rels(gold_std):
    res = set()
    for pmid, paper in gold_std.items():
        for rel in paper.gold_relations:
            res.add((int(pmid), rel.chem.flat_repr, rel.dise.flat_repr))
            
    return res

In [20]:
gold_relations = get_gold_rels(eval_gold)

In [21]:
len(gold_relations)

1066

In [22]:
official_F_score("num_votes", gold_relations, crowd_res)

  return pd.DataFrame(res).sort("threshold").reset_index(drop = True)


Unnamed: 0,F_score,precision,recall,threshold
0,0.267624,0.162805,0.751407,0
1,0.356317,0.236486,0.722326,1
2,0.444779,0.327426,0.693246,2
3,0.49648,0.410288,0.628518,3
4,0.505929,0.47564,0.540338,4
5,0.465066,0.556136,0.399625,5


Results are reproducible, so let's continue..

---

In [23]:
def make_df(triples):
    """Converts a given set of (pmid, chemical_id, disease_id)
    triples into a three column dataframe."""
    return pd.DataFrame(list(triples), columns = TRIPLE)

In [24]:
gold_std = make_df(gold_relations)

In [25]:
gold_std.head()

Unnamed: 0,pmid,chemical_id,disease_id
0,24100257,MESH:C089750,MESH:D054549
1,17255138,MESH:D000894,MESH:D006471
2,20495512,MESH:D006493,MESH:D013921
3,12119460,MESH:D016685,MESH:D007970
4,7479194,MESH:D015662,MESH:D013262


In [26]:
def extract_ids(annotations):
    concepts = defaultdict(set)
    for annot in annotations:
        concepts[annot.stype].add(annot.uid.uid)

    return concepts

def expand_set(vals):
    res = set()
    for v in vals:
        res |= v
        
    return res

In [27]:
def find_perfect_concepts(predict, gold_std):
    """Finds the concepts having the exact same annotations
    in both data sets.
    """
    res = dict()
    for pmid, gold_paper in gold_std.items():
        paper = predict[pmid]

        # annotations
        predict_annot = set(paper.annotations)
        gold_annot = set(gold_paper.annotations)
        
        shared_concepts = extract_ids(gold_annot & predict_annot)
        missed_concepts = extract_ids(gold_annot ^ predict_annot)
        
        perf_chem = shared_concepts["chemical"] - missed_concepts["chemical"]
        perf_dise = shared_concepts["disease"] - missed_concepts["disease"]
        
        chems = expand_set(perf_chem)
        dises = expand_set(perf_dise)

        res[pmid] = (chems, dises)

    return res

def get_perf_subset_triples(perf_concepts):
    res = set()
    for pmid, (chem, dise) in perf_concepts.items():
        res |= set([(pmid, c.flat_repr, d.flat_repr) for c in chem for d in dise])
        
    return res

In [28]:
def ner_filter(predict_full, predict_df, gold_full, gold_df):
    """Given a dataframe of a solution's predicted CID relations,
    applies a NER filter with the gold standard to return only those
    relations which were generated using perfectly annotated concepts.
    """
    # what were the concepts perfectly annotated by the predictions and gold std?
    common_concepts = find_perfect_concepts(predict_full, gold_full)
    
    # what are all possible triples using the perfect concepts?
    poss_good_trips = get_perf_subset_triples(common_concepts)
    
    print(len(poss_good_trips))
    
    poss_df = make_df(poss_good_trips)
    
    # filter predictions and gold standard using the set of possible triples
    predict_sub = pd.merge(predict_df, poss_df, how = "inner", on = TRIPLE)
    gold_sub = pd.merge(gold_df, poss_df, how = "inner", on = TRIPLE)
    
    return (predict_sub, gold_sub)

In [29]:
crowd_no_ner, crowd_good_gold = ner_filter(crowd_full, crowd_res, eval_gold, gold_std)

2569


In [30]:
len(crowd_good_gold["pmid"].unique())

290

In [31]:
crowd_no_ner_perf = official_F_score("num_votes", get_triples(crowd_good_gold), crowd_no_ner)
crowd_no_ner_perf

  return pd.DataFrame(res).sort("threshold").reset_index(drop = True)


Unnamed: 0,F_score,precision,recall,threshold
0,0.318346,0.189305,1.0,0
1,0.434339,0.28024,0.964948,1
2,0.540541,0.381356,0.927835,2
3,0.615958,0.482477,0.851546,3
4,0.645447,0.565015,0.752577,4
5,0.603732,0.64554,0.56701,5


In [32]:
len(crowd_good_gold)

485

---

In [33]:
gold_std.head()

Unnamed: 0,pmid,chemical_id,disease_id
0,24100257,MESH:C089750,MESH:D054549
1,17255138,MESH:D000894,MESH:D006471
2,20495512,MESH:D006493,MESH:D013921
3,12119460,MESH:D016685,MESH:D007970
4,7479194,MESH:D015662,MESH:D013262


In [34]:
temp = pd.merge(crowd_res, gold_std, how = "inner", on = TRIPLE)

In [35]:
temp.shape

(801, 5)

In [36]:
temp.head()

Unnamed: 0,chemical_id,disease_id,num_votes,pmid,rel_origin
0,MESH:C009695,MESH:D002375,0,35781,sentence_task
1,MESH:D009278,MESH:D002375,0,35781,sentence_task
2,MESH:D003061,MESH:D002375,2,35781,sentence_task
3,MESH:D009020,MESH:D002375,3,35781,sentence_task
4,MESH:D003000,MESH:D002375,5,35781,sentence_task


In [37]:
temp.query("num_votes >= 4")["rel_origin"].value_counts()

sentence_task    301
cid_relation     143
abstract_task    132
dtype: int64

In [38]:
temp.query("num_votes >= 4").shape

(576, 5)

In [39]:
temp = pd.merge(crowd_res, gold_std, how = "right", on = TRIPLE)

In [40]:
temp.shape

(1066, 5)

In [41]:
temp.query("num_votes <= 3").shape

(225, 5)

In [42]:
temp.query("num_votes <= 3")["rel_origin"].value_counts()

sentence_task    123
abstract_task    102
dtype: int64

In [43]:
crowd_res["in_gold"] = crowd_res.loc[:, TRIPLE].apply(
    lambda r: (int(r["pmid"]), r["chemical_id"], r["disease_id"]) in gold_relations,
    axis = 1
)

In [44]:
crowd_res["in_gold"].value_counts()

False    4119
True      801
dtype: int64

In [45]:
crowd_res.query("in_gold & num_votes <= 3").shape

(225, 6)

In [46]:
crowd_res.query("~in_gold & num_votes >= 4").shape

(635, 6)

In [47]:
crowd_res.query("~in_gold & num_votes >= 4")["rel_origin"].value_counts()

sentence_task    339
abstract_task    245
cid_relation      51
dtype: int64

In [48]:
crowd_res.query("~in_gold & num_votes < 4").shape

(3484, 6)

In [49]:
crowd_res.query("~in_gold & num_votes < 4")["rel_origin"].value_counts()

abstract_task    2281
sentence_task    1203
dtype: int64

---

In [50]:
crowd_no_ner.head()

Unnamed: 0,chemical_id,disease_id,num_votes,pmid,rel_origin
0,MESH:D000157,MESH:D001145,2,44072,sentence_task
1,MESH:D004837,MESH:D001145,3,44072,sentence_task
2,MESH:C024986,MESH:D001145,4,44072,sentence_task
3,MESH:D002725,MESH:D001145,4,44072,sentence_task
4,MESH:D011433,MESH:D006973,2,48835,abstract_task


In [51]:
crowd_good_gold.head()

Unnamed: 0,pmid,chemical_id,disease_id
0,3925479,MESH:D006220,MESH:D002375
1,11745287,MESH:D004317,MESH:D009325
2,24739405,MESH:D006220,MESH:D002375
3,19108278,MESH:D004837,MESH:D001145
4,1158089,MESH:D013752,MESH:D005234


In [52]:
crowd_good_gold.shape

(485, 3)

In [53]:
temp = pd.merge(crowd_no_ner, crowd_good_gold, on = TRIPLE, how = "inner")

In [54]:
temp.shape

(485, 5)

In [55]:
temp.head()

Unnamed: 0,chemical_id,disease_id,num_votes,pmid,rel_origin
0,MESH:D000157,MESH:D001145,2,44072,sentence_task
1,MESH:D004837,MESH:D001145,3,44072,sentence_task
2,MESH:C024986,MESH:D001145,4,44072,sentence_task
3,MESH:D002725,MESH:D001145,4,44072,sentence_task
4,MESH:D011433,MESH:D006973,2,48835,abstract_task


In [56]:
temp.query("num_votes >= 4").shape

(365, 5)

In [57]:
temp.query("num_votes >= 4")["rel_origin"].value_counts()

sentence_task    188
cid_relation     102
abstract_task     75
dtype: int64

In [58]:
temp.query("num_votes < 4").shape

(120, 5)

In [59]:
temp.query("num_votes < 4")["rel_origin"].value_counts()

abstract_task    65
sentence_task    55
dtype: int64

In [61]:
crowd_no_ner["in_gold"] = crowd_no_ner.loc[:, TRIPLE].apply(
    lambda r: (int(r["pmid"]), r["chemical_id"], r["disease_id"]) in gold_relations,
    axis = 1
)

In [62]:
crowd_no_ner.shape

(2562, 6)

In [63]:
crowd_no_ner.query("~in_gold & num_votes >= 4").shape

(281, 6)

In [64]:
crowd_no_ner.query("~in_gold & num_votes >= 4")["rel_origin"].value_counts()

sentence_task    156
abstract_task    100
cid_relation      25
dtype: int64

In [65]:
crowd_no_ner.query("~in_gold & num_votes < 4").shape

(1796, 6)

In [66]:
crowd_no_ner.query("~in_gold & num_votes < 4")["rel_origin"].value_counts()

abstract_task    1221
sentence_task     575
dtype: int64