# Qualitative Inspection for Missing Relationship types

This notebook covers the sampling of non-specific relationship annotations for qualitative inspection

## Import modules and load data

In [2]:
import pandas
import m2c_rel_basic
import relationship_dictionaries
import random
import matplotlib
from matplotlib import pyplot as mplot
from pandas import read_csv

Import the relationship annotations data for only completed concept pairs

In [3]:
# Path prefixes used by this notebook.
savepath = 'data/'
exppath = 'results/'

# Read the tab-delimited annotations file (first row is the header) and
# discard the "Unnamed: 0" column (presumably a saved index from an
# earlier export -- confirm).
all_completed_anns = read_csv(
    exppath + 'all_completed_anns.txt',
    delimiter='\t',
    header=0,
).drop(columns="Unnamed: 0")

Import the dictionaries for translating hashed responses

In [4]:
# Unpack the six objects returned by load_RE_dictionaries(), one name per line.
# Only redundant_response_dict is used by the cells shown in this notebook.
(
    rel_hash_dict,
    redundant_response_dict,
    abbreviated_rels_dict,
    abbreviated_rels_dict_4_hash,
    concept_broken_dict,
    concept_not_broken_dict,
) = relationship_dictionaries.load_RE_dictionaries()

## Sample the user annotations for qualitative inspection
Based on our k vs accuracy evaluations, we know that diminishing returns on accuracy are reached at a k of 6.  Therefore, we will evaluate all per-pmid concept pairs that were marked as having an unspecified relationship by at least 6 users. We will also segment out per-pmid concept pairs that were marked as having no relationship for a different qualitative analysis.

Note that treating redundant responses such as 'other relation, or relation unclear' the same as 'relates to' does not increase the number of per-pmid concept pairs that were marked by at least six users; hence, they are treated distinctly below.

In [10]:
### Slice user annotation dataframe based on the response.

# Minimum number of matching annotations a (concept_pair, pmid, reltype)
# group needs in order to be kept (the original filter was `counts > 5`).
MIN_AGREEMENT = 6


def _slice_by_response(anns, phrase):
    """Return the rows of ``anns`` whose ``evtype`` contains ``phrase``.

    ``evtype`` is cast to ``str`` before matching, and ``phrase`` is matched
    as a literal substring (``regex=False``) so punctuation in a response
    label can never be interpreted as a regular expression.
    """
    is_match = anns['evtype'].astype(str).str.contains(phrase, regex=False)
    return anns.loc[is_match]


def _count_per_cpmid(anns):
    """Count rows per (concept_pair, pmid, reltype), largest count first.

    Returns a new frame with the three grouping columns plus ``counts``.
    """
    return (
        anns.groupby(['concept_pair', 'pmid', 'reltype'])
        .size()
        .reset_index(name='counts')
        .sort_values('counts', ascending=False)
    )


def _with_min_agreement(counts_per_cpmid, min_count=MIN_AGREEMENT):
    """Keep only the groups whose ``counts`` is at least ``min_count``."""
    return counts_per_cpmid.loc[counts_per_cpmid['counts'] >= min_count]


relation_unclear = _slice_by_response(all_completed_anns, 'other relation, or relation unclear')
no_relations = _slice_by_response(all_completed_anns, 'has no relation')
has_relation = _slice_by_response(all_completed_anns, 'relates to')
cannot_determine = _slice_by_response(all_completed_anns, 'cannot be determined')

### Obtain counts for each concept pair per pmid (cpmid).
### Pull results in which at least 6 users agreed

unclear_per_cpmid = _count_per_cpmid(relation_unclear)
at_least_6_unclear = _with_min_agreement(unclear_per_cpmid)

relates_per_cpmid = _count_per_cpmid(has_relation)
at_least_6_relates = _with_min_agreement(relates_per_cpmid)

norel_per_cpmid = _count_per_cpmid(no_relations)
at_least_6_norel = _with_min_agreement(norel_per_cpmid)

cannot_deter_per_cpmid = _count_per_cpmid(cannot_determine)
at_least_6_cannot_tell = _with_min_agreement(cannot_deter_per_cpmid)


### Dealing with per-pmid concept pairs which were annotated as having an unspecified relationship by fewer than six users.

In [21]:
## Translate/map and standardize responses
# NOTE: this rebinds all_completed_anns to the standardized frame, so any cell
# re-run after this one (including the slicing cell above) sees the mapped
# responses rather than the raw ones.
all_completed_anns = all_completed_anns.replace({'evtype': redundant_response_dict})

no_rel_unclear = all_completed_anns.loc[all_completed_anns['evtype'].astype(str).str.contains('no relation')]
relates_to = all_completed_anns.loc[all_completed_anns['evtype'].astype(str).str.contains('relates to')]

# Count annotations per (concept_pair, pmid, reltype, cpmid), largest first.
cpmid_keys = ['concept_pair', 'pmid', 'reltype', 'cpmid']
no_rel_unclear_cpmid = (
    no_rel_unclear.groupby(cpmid_keys)
    .size()
    .reset_index(name='counts')
    .sort_values('counts', ascending=False)
)
relates_cpmid = (
    relates_to.groupby(cpmid_keys)
    .size()
    .reset_index(name='counts')
    .sort_values('counts', ascending=False)
)

## Subset the relationship annotations by the count and response.
# Largest annotation count that is still sampled here (i.e. fewer than six).
max_k = 5

under_6 = relates_cpmid.loc[relates_cpmid['counts'] <= max_k]
# To sample the 'no relation' responses instead, use this line in place of the one above:
#under_6 = no_rel_unclear_cpmid.loc[no_rel_unclear_cpmid['counts']<=max_k]

# Fixed seed so that Restart & Run All draws the same sample every time;
# change the value to draw a different sample.
SAMPLE_SEED = 42
sample_rng = random.Random(SAMPLE_SEED)


def _pick_sample_size(population_size, preferred_sizes=(25, 20)):
    """Return the first preferred size that fits in the population.

    Falls back to the whole population when it is smaller than every
    preferred size, so sampling never asks for more items than exist
    (random.sample raises ValueError in that case).
    """
    for size in preferred_sizes:
        if population_size >= size:
            return size
    return population_size


# Collect the sampled rows first and concatenate once at the end; growing a
# DataFrame with concat inside the loop is quadratic and, when started from an
# empty frame, degrades the column dtypes.
sampled_frames = []
for k in range(1, max_k + 1):
    sampling_list = under_6['cpmid'].loc[under_6['counts'] == k].tolist()
    samples_per_iteration = _pick_sample_size(len(sampling_list))
    sampling_set = sample_rng.sample(sampling_list, samples_per_iteration)
    sampled_frames.extend(
        under_6.loc[under_6['cpmid'] == eachcpmid] for eachcpmid in sampling_set
    )

if sampled_frames:
    sample_table = pandas.concat(sampled_frames)
else:
    # Nothing to sample: keep the expected columns so later cells still work.
    sample_table = pandas.DataFrame(columns=cpmid_keys + ['counts'])

sample_table = sample_table.astype({'pmid': int, 'counts': int})

print(sample_table.head(n=2))

          concept_pair      pmid reltype                       cpmid  counts
251     8086_x_D010523  18628786     g_d     18628786_8086_x_D010523       1
365  C404789_x_D004403  10858229     c_d  10858229_C404789_x_D004403       1


## Export out the samples for qualitative inspection

In [None]:
# The export calls below are left commented out (presumably so that running
# the notebook does not overwrite existing result files -- confirm);
# uncomment the line(s) needed.
# NOTE(review): the first export originally referenced `at_least_6`, which is
# not defined in the cells shown here; `at_least_6_unclear` is the frame built
# from the 'other relation, or relation unclear' responses.
#at_least_6_unclear.to_csv(exppath+'relation_unclear_min_6.txt', sep='\t', header=True)
#at_least_6_norel.to_csv(exppath+'no_rel_min_6.txt', sep='\t', header=True)
#at_least_6_relates.to_csv(exppath+'relates_min_6.txt', sep='\t', header=True)
# sample_table holds whichever response set `under_6` was built from, so
# export it only under the file name that matches that choice.
#sample_table.to_csv(exppath+'relates_under_6.txt', sep='\t', header=True)
#sample_table.to_csv(exppath+'unrelated_under_6.txt', sep='\t', header=True)