# Results for 200 development set abstracts

Tong Shu Li<br>
Created on: 2015-08-25<br>
Last updated: 2015-08-25

In this notebook we will determine how many papers were completely finished by our crowdsourcing technique.

In [1]:
from collections import defaultdict
from IPython.display import Image
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import pickle

In [2]:
matplotlib inline

In [3]:
from src.filter_data import filter_data
from src.data_model import parse_input
from src.data_model import Relation
from src.data_model import is_MeSH_id
from src.get_AUC_value import get_AUC_value
from src.F_score import *
from src.aggregate_results import *
from src.lingpipe.file_util import read_file

---

In [4]:
# The 200 abstract identifiers analysed in this notebook (presumably PubMed IDs,
# judging by the name). They are used below as keys into the development-set gold
# standard and to select the matching rows of the CrowdFlower work-unit tables.
testset_pmids = {48362,
 85485,
 188339,
 326460,
 384871,
 430165,
 458486,
 625456,
 663266,
 761833,
 871943,
 921394,
 982002,
 1255900,
 1355091,
 1504402,
 1687392,
 1899352,
 1928887,
 1969772,
 2004015,
 2021202,
 2051906,
 2320800,
 2339463,
 2435991,
 2564649,
 2576810,
 2716967,
 2722224,
 2790457,
 2840807,
 2886572,
 2893236,
 2980315,
 3084782,
 3108839,
 3115150,
 3123611,
 3131282,
 3137399,
 3183120,
 3411101,
 3413271,
 3670965,
 3686155,
 3703509,
 3732088,
 3762968,
 3769769,
 3780697,
 3780814,
 3934126,
 3950060,
 3952818,
 3961813,
 3962737,
 3973521,
 4038130,
 4090988,
 6127992,
 6150641,
 6293644,
 6323692,
 6454943,
 6540303,
 6615052,
 6637851,
 6699841,
 6732043,
 6769133,
 6892185,
 7007443,
 7053705,
 7072798,
 7282516,
 7604176,
 7619765,
 7647582,
 7650771,
 7710775,
 7752389,
 7803371,
 7843916,
 8012887,
 8106150,
 8111719,
 8251368,
 8267029,
 8268147,
 8494478,
 8586822,
 8617710,
 8766220,
 8888541,
 8911359,
 8958188,
 9098464,
 9209318,
 9214597,
 9226773,
 9305828,
 9545159,
 9549528,
 9564988,
 9672273,
 9867728,
 9875685,
 9952311,
 10074612,
 10193809,
 10225068,
 10354657,
 10411803,
 10414674,
 10524660,
 10539815,
 10565806,
 10692744,
 10840460,
 11009181,
 11063349,
 11079278,
 11185967,
 11195262,
 11208990,
 11230490,
 11302406,
 11337188,
 11694026,
 11705128,
 11847945,
 11860495,
 12013711,
 12091028,
 12093990,
 12231232,
 12448656,
 12600698,
 12691807,
 12734532,
 12739036,
 12851669,
 12907924,
 14736955,
 14975762,
 14982270,
 15042318,
 15075188,
 15096016,
 15579441,
 15673851,
 15814210,
 15867025,
 15893386,
 15897593,
 15974569,
 15985056,
 16006300,
 16047871,
 16157917,
 16418614,
 16428827,
 16574713,
 16586083,
 16710500,
 16820346,
 17285209,
 17366349,
 17490790,
 17490864,
 17574447,
 17682013,
 17786501,
 17943461,
 17965424,
 18006530,
 18086064,
 18201582,
 18356633,
 18441470,
 18483878,
 18589141,
 18657397,
 18674790,
 18997632,
 19105845,
 19234905,
 19274460,
 19674115,
 19707748,
 19721134,
 19893084,
 19940105,
 19944736,
 20164825,
 20533999,
 20552622,
 20727411,
 20735774}

In [5]:
def read_gold_standard(dataset, file_format = "list"):
    """Load the parsed gold standard for one dataset, caching it as a pickle.

    On the first call for a given (dataset, file_format) pair the raw
    ``CDR_<Dataset>Set.txt`` file is parsed with ``parse_input`` and the
    result is pickled next to it; later calls read the pickle instead.

    Args:
        dataset: Either "training" or "development".
        file_format: Either "list" or "dict". It is passed to ``parse_input``
            as ``return_format`` and is part of the cache file name.

    Returns:
        The object produced by ``parse_input`` (freshly parsed or unpickled).

    Raises:
        AssertionError: If ``dataset`` or ``file_format`` is not one of the
            allowed values.
    """
    assert dataset in ["training", "development"]
    assert file_format in ["list", "dict"]

    # Single source of truth for the directory holding both the raw file
    # and the pickle cache.
    gold_dir = "data/gold_standard"
    fname = "{0}/parsed_{1}_set_{2}.pickle".format(gold_dir, dataset, file_format)

    if os.path.exists(fname):
        print("Reading cached version of {0} set ({1})".format(dataset, file_format))

        # NOTE: unpickling executes arbitrary code; only load cache files
        # that this function wrote itself.
        with open(fname, "rb") as fin:
            return pickle.load(fin)

    print("Parsing raw {0} file".format(dataset))
    data = parse_input(gold_dir,
                       "CDR_{0}Set.txt".format(dataset.capitalize()),
                       is_gold = True,
                       return_format = file_format,
                       fix_acronyms = False)

    # Cache the parsed result so the next call skips parsing.
    with open(fname, "wb") as fout:
        pickle.dump(data, fout)

    return data

In [6]:
# Development-set gold standard in its "dict" form (indexed by PMID in the next cell).
dev_gold = read_gold_standard("development", file_format = "dict")

Reading cached version of development set (dict)


In [7]:
# Restrict the gold standard to the PMIDs in testset_pmids
# (raises KeyError if any of them is missing from dev_gold).
gold_testset = dict((pmid, dev_gold[pmid]) for pmid in testset_pmids)

---

In [8]:
# Work units of the "easy" job 761593, i.e. the job whose results are loaded
# in the sentence-level section below.
easy_units = pd.read_csv(
    filepath_or_buffer = "data/crowdflower/data_for_easy_job_761593.tsv",
    sep = "\t",
)

In [9]:
# Work units of the "hard" job 758438, i.e. the job whose results are loaded
# in the abstract-level section below.
hard_units = pd.read_csv(
    filepath_or_buffer = "data/crowdflower/data_for_hard_job_758438.tsv",
    sep = "\t",
)

In [10]:
easy_units.shape

(1333, 15)

In [11]:
hard_units.shape

(1115, 13)

---

### Sentence-level task

In [12]:
# Options handed to filter_data for the sentence-level job's results file.
settings = dict(
    loc = "data/crowdflower/results",
    fname = "job_761593_full_with_untrusted.csv",
    data_subset = "normal",
    min_accuracy = 0.7,
    max_accuracy = 1.0,
)

sent_raw_data = filter_data(settings)

In [13]:
# Keep only the rows whose `_tainted` column is false.
# NOTE(review): this rebinds sent_raw_data, so the unfiltered frame is no longer
# reachable after this cell; the abstract-level data below is not filtered on
# `_tainted` -- confirm that asymmetry is intended.
sent_raw_data = sent_raw_data.query("~_tainted")

In [14]:
sent_raw_data.shape

(5500, 31)

### Abstract-level task

In [15]:
# Options handed to filter_data for the abstract-level job's results file.
settings = dict(
    loc = "data/crowdflower/results",
    fname = "job_758438_full_with_untrusted.csv",
    data_subset = "normal",
    min_accuracy = 0.7,
    max_accuracy = 1.0,
)

abs_raw_data = filter_data(settings)

In [16]:
abs_raw_data.shape

(5315, 29)

In [17]:
# Worker IDs whose judgments are removed from the abstract-level data in the
# next cell (how these workers were identified is not shown in this notebook).
bad_workers = {
    31501233, 31720388, 31720815, 32025293, 33081102,
    33081299, 33081469, 33081531, 33085305, 33085428,
    33238902, 33301062, 33301138, 33596095,
}

In [18]:
# Drop every judgment made by a worker listed in bad_workers.
from_bad_worker = abs_raw_data["_worker_id"].isin(list(bad_workers))
clean_abs_data = abs_raw_data[~from_bad_worker]

In [19]:
clean_abs_data.shape

(3709, 29)

---

### Count number of judgments for each work unit

In [20]:
# Map each work unit's uniq_id to the number of judgments (rows) it received.
# The sentence-level frame is processed first, so an id present in both frames
# ends up with its abstract-level count.
num_judgments = {}
for judgments in (sent_raw_data, clean_abs_data):
    for uniq_id, group in judgments.groupby("uniq_id"):
        num_judgments[uniq_id] = len(group)

---

In [21]:
def all_ok(id_set, min_judgments = 5):
    """Return True if every work unit in ``id_set`` has enough judgments.

    Counts are looked up in the module-level ``num_judgments`` dict.

    Args:
        id_set: Iterable of work-unit ids (``uniq_id`` values).
        min_judgments: Minimum number of judgments a unit needs to count as
            done. Defaults to 5, the threshold previously hard-coded here.

    Returns:
        False as soon as one id is missing from ``num_judgments`` or has
        fewer than ``min_judgments`` judgments; True otherwise. An empty
        ``id_set`` therefore yields True.
    """
    return all(
        uniq_id in num_judgments and num_judgments[uniq_id] >= min_judgments
        for uniq_id in id_set
    )

# PMIDs whose sentence-level ("easy") work units all pass all_ok.
# NOTE(review): only the easy units are checked. A stricter variant that also
# required all_ok over the abstract-level ("hard") units of the same PMID was
# previously left here commented out. The saved outputs below disagree with each
# other (a length of 82 versus a 26-element set from a later execution), so
# presumably one of them came from that variant -- re-run top to bottom to confirm.
good_dataset = set()
for pmid in testset_pmids:
    easy_ids = set(easy_units.query("pmid == {0}".format(pmid))["uniq_id"])

    # A PMID with no easy units gives an empty set, which all_ok accepts.
    if all_ok(easy_ids):
        good_dataset.add(pmid)
    
    
    
    
    
    

In [22]:
len(good_dataset)

82

In [25]:
good_dataset

{85485,
 871943,
 1504402,
 1969772,
 2051906,
 2980315,
 3137399,
 3670965,
 3686155,
 3762968,
 3769769,
 6127992,
 7282516,
 7619765,
 8012887,
 8617710,
 10411803,
 11705128,
 14975762,
 15867025,
 16418614,
 18441470,
 18483878,
 19105845,
 19707748,
 20552622}