# User response on dropped concepts

This notebook compares the concept annotations that users marked as broken/incorrect with concept annotations that are missing from updated Pubtator files (ie- were dropped by Pubtator).  This book focuses only on concept annotations WITH an associated identifier or concepts that come from abstracts with one or less identifier-free concepts, since there are issues mapping concepts without identifiers in the Mark2Cure database.

## Import modules and data

In [2]:
import pandas
import numpy
import random
import matplotlib
from matplotlib import pyplot as mplot
import m2c_rel_basic
import relationship_dictionaries
from pandas import read_csv

Import the relationship annotations data for only completed concept pairs. This data should have already been filtered to remove annotations from test accounts

In [7]:
savepath = 'data/'
exppath = 'results/'
all_completed_anns = read_csv(exppath+'all_completed_anns.txt', delimiter='\t', header=0)
all_completed_anns.drop("Unnamed: 0",axis=1,inplace=True)
print(all_completed_anns.head(n=2))

  kind             ann_date  user_id                           evtype reltype  \
0   re  2016-05-24 06:42:40      364  gene has no relation to disease     g_d   
1   re  2016-05-24 06:44:12      364                       c_1_broken     g_d   

      pmid      concept_created      concept_updated refid1   refid2  \
0  9621534  2015-05-21 17:54:51  2015-05-21 17:54:51  59330  D001750   
1  9621534  2015-05-21 17:54:51  2015-05-21 17:54:51  59330  D005764   

      concept_pair refid1_type refid2_type  user_count  relation_count  \
0  59330_x_D001750           g           d        49.0            28.0   
1  59330_x_D005764           g           d        33.0            17.0   

   test_completions  true_responses  response_ratio                    cpmid  
0               0.0            49.0        0.571429  9621534_59330_x_D001750  
1               1.0            32.0        0.531250  9621534_59330_x_D005764  


Import data from Supp Fig 1. Investigating Concept Annotations.ipynb

In [8]:
no_species = read_csv(exppath+'concept_anns_from_updated_pub_files.txt',delimiter='\t',header=0)
no_species.drop("Unnamed: 0",axis=1,inplace=True)
all_concept_imported = read_csv(exppath+'concepts_anns_from_db.txt',delimiter='\t',header=0)
all_concept_imported.drop("Unnamed: 0",axis=1,inplace=True)
dropped_anns_with_text = read_csv(exppath+'dropped_by_pubtator.txt',delimiter='\t',header=0)
dropped_anns_with_text.drop("Unnamed: 0",axis=1,inplace=True)

#### Separate out pubtator annotations that lack identifiers from both sources. These will be treated differently.

In [22]:
###############################################################################
#### Deal with concepts for which there are no identifiers
###############################################################################

## Pull the annotations for which there is no identifier
no_ideas = no_species.loc[no_species['identifier']=='no identifier']

## Remove duplicated conceptual entries
no_ideas_no_dups = no_ideas.groupby(['pmid','text','identifier','type','length']).size().reset_index(name='appearance_count')

## Get counts of identifier-free annotations for each pmid
no_ideas_per_pmid = no_ideas_no_dups.groupby(['pmid','identifier']).size().reset_index(name='id_free_per_pmid')

## Merge back to original unique conceptual entries table 
no_ideas_no_dup_key = no_ideas_no_dups.merge(no_ideas_per_pmid, on=('pmid','identifier'), how='left')

## Get table of pmids which only have ONE concept lacking an identifier.  This table will be used as a key for the annotations without identifiers
pmid_no_id_key = no_ideas_no_dup_key.loc[no_ideas_no_dup_key['id_free_per_pmid']==1]
#print(pmid_no_id_key)

### Learn about the pmids for which id-less concept absolutely cannot be matched
## Get pmids for which there are multiple concepts with no identifier
pmids_too_many_no_ids = no_ideas_per_pmid.loc[no_ideas_per_pmid['id_free_per_pmid']>1]
## Get concepts for which there are multiple concepts with no identifier
concepts_too_many_no_ids = no_ideas_no_dup_key.loc[no_ideas_no_dup_key['id_free_per_pmid']>1]

print(len(pmids_too_many_no_ids))
print(len(concepts_too_many_no_ids))

196
471


In [23]:
completed_anns = all_completed_anns.drop(['kind','ann_date','user_id','concept_created','concept_updated'], axis=1)
annresults = completed_anns.drop_duplicates(subset=['evtype','cpmid'],keep='first')

#### Map the 'none' concepts to annotation text if it comes from a pmid that is mappable
missing_concept1 = annresults.loc[annresults['refid1']=='None']
tmp_table1 = missing_concept1.merge(pmid_no_id_key, on='pmid', how='left').fillna('None')
tmp_table1['refid1'] = tmp_table1['text']
tmp_table1.drop(['text','identifier','length','appearance_count','id_free_per_pmid','type'],axis=1, inplace=True)

missing_concept2 = annresults.loc[annresults['refid2']=='None']
tmp_table2 = missing_concept2.merge(pmid_no_id_key, on='pmid', how='left').fillna('None')
tmp_table2['refid2'] = tmp_table2['text']
tmp_table2.drop(['text','identifier','length','appearance_count','id_free_per_pmid','type'],axis=1, inplace=True)

print(missing_concept1.head(n=2))

                       evtype reltype      pmid refid1   refid2  \
7153  drug relates to disease     c_d  19322026   None  D030342   
7232               c_1_broken     c_d  25478884   None  D017772   

        concept_pair refid1_type refid2_type  user_count  relation_count  \
7153  None_x_D030342           c           d        15.0             1.0   
7232  None_x_D017772           c           d        15.0            14.0   

      test_completions  true_responses  response_ratio  \
7153               0.0            15.0        0.066667   
7232               0.0            15.0        0.933333   

                        cpmid  
7153  19322026_None_x_D030342  
7232  25478884_None_x_D017772  


In [24]:
## Create a fresh table with the mapped results
old_anns_no_nones = annresults.loc[(annresults['refid1']!='None')&(annresults['refid2']!='None')]
new_annresults = pandas.concat((old_anns_no_nones,tmp_table1,tmp_table2))

## Compare number of 'no id' annotations before and after mapping
unmapped_before = len(annresults.loc[(annresults['refid1']=='None')|(annresults['refid2']=='None')])
unmapped_after = len(new_annresults.loc[(new_annresults['refid1']=='None')|(new_annresults['refid2']=='None')])

print('unmapped_before: ',unmapped_before,' vs unmapped after: ',unmapped_after)

unmapped_before:  104  vs unmapped after:  67


In [27]:
## Get the concept annotations
no_dups = no_species.groupby(['pmid','text','identifier','type']).size().reset_index(name='no_of_mentions_pub')
no_dups['in_pubtator']='yes'

no_dups_concepts = all_concept_imported.groupby(['pmid','text','identifier','type']).size().reset_index(name='no_of_mentions_db')
no_dups_concepts['in_db']='yes'

concept_annotations = no_dups_concepts.merge(no_dups, on=(['pmid','identifier','text','type']), how='outer').fillna('no')
concept_annotations['pmid']=concept_annotations['pmid'].astype(int)
print(concept_annotations.head(n=2))

     pmid                                             text identifier type  \
0  501285  Cataractous lenses of diabetic and galactosemic    D003920    d   
1  501285                                        Galactose    D005690    c   

  no_of_mentions_db in_db no_of_mentions_pub in_pubtator  
0                 1   yes                 no          no  
1                 1   yes                 no          no  


In [31]:
###############################################################################
#### Pull the annotations that users marked as broken and see if they are ones that were thrown out by pubtator
###############################################################################
broken_c1 = new_annresults.loc[new_annresults['evtype']=='c_1_broken']
broken_c2 = new_annresults.loc[new_annresults['evtype']=='c_2_broken'] 
broken_c1['pmid']=broken_c1['pmid'].astype(int)

concept_annotations['hash']=concept_annotations['pmid'].astype(str).str.cat(concept_annotations['identifier'].astype(str).str.cat(concept_annotations['type'],sep="_x_"),sep="_x_")
unique_cps = concept_annotations.drop_duplicates(['hash'], keep='last')
unique_cps['pmid']=unique_cps['pmid'].astype(int)

broken_c1.rename(columns={'refid1':'identifier'},inplace=True)
check_broken_c1 = broken_c1.merge(unique_cps,on=(['pmid','identifier']),how='left')
print(check_broken_c1.head(n=2))

### Now do the same for broken c2s
broken_c2.rename(columns={'refid2':'identifier'},inplace=True)
check_broken_c2 = broken_c2.merge(unique_cps,on=(['pmid','identifier']),how='left')
print(check_broken_c2.head(n=2))

#check_broken_c1.to_csv(exppath+'concept_1_broken.txt', sep='\t', header=True)
#check_broken_c2.to_csv(exppath+'concept_2_broken.txt', sep='\t', header=True)

##### Inspect annotations marked as broken
broken_c1_data = check_broken_c1.copy()
broken_c1_data.drop(['refid2','reltype','concept_pair','user_count','evtype',
                             'test_completions','relation_count','true_responses', 
                             'no_of_mentions_db','no_of_mentions_pub','hash'],axis=1, inplace=True)

broken_c1_data.sort_values(['type','response_ratio'], ascending=(False,False), inplace=True)
broken_c1_data['dropped'] = broken_c1_data['in_pubtator'].replace({'yes':1,'no':-1})
broken_c1_data['degree'] = broken_c1_data['response_ratio'].multiply(broken_c1_data['dropped'])

print(broken_c1_data.head(n=2))
#broken_c1_data.to_csv(exppath+'concept_1_broken_for_plot.txt', sep='\t', header=True)
#check_broken_c1.to_csv(exppath+'concept_1_broken.txt', sep='\t', header=True)

broken_c2_data = check_broken_c2.copy()
broken_c2_data.drop(['refid1','reltype','concept_pair','user_count','evtype',
                             'test_completions','relation_count','true_responses', 
                             'no_of_mentions_db','no_of_mentions_pub','hash'],axis=1, inplace=True)

broken_c2_data.sort_values(['type','response_ratio'], ascending=(False,False), inplace=True)
broken_c2_data['dropped'] = broken_c2_data['in_pubtator'].replace({'yes':1,'no':-1})
broken_c2_data['degree'] = broken_c2_data['response_ratio'].multiply(broken_c1_data['dropped'])

print(broken_c2_data.head(n=2))
#broken_c2_data.to_csv(exppath+'concept_2_broken_for_plot.txt', sep='\t', header=True)
#check_broken_c2.to_csv(exppath+'concept_2_broken.txt', sep='\t', header=True)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  **kwargs)


       evtype reltype      pmid identifier   refid2     concept_pair  \
0  c_1_broken     g_d   9621534      59330  D005764  59330_x_D005764   
1  c_1_broken     g_d  11353896       5443   202200    5443_x_202200   

  refid1_type refid2_type  user_count  relation_count         ...           \
0           g           d        33.0            17.0         ...            
1           g           d        34.0             3.0         ...            

   true_responses  response_ratio                    cpmid  \
0            32.0        0.531250  9621534_59330_x_D005764   
1            34.0        0.088235   11353896_5443_x_202200   

                              text type no_of_mentions_db in_db  \
0  gastroesophageal reflux ( GER )    g                no    no   
1                             ACTH    g                 1   yes   

  no_of_mentions_pub in_pubtator                 hash  
0                  1         yes  9621534_x_59330_x_g  
1                 no          no  11353896_x_54