# Locating Pubtator-dropped Concepts

## Import modules and data

### Import modules

In [2]:
import pandas
import m2c_rel_basic
from pandas import read_csv
from nltk import tokenize

#### Import table of pmids, titles, and abstracts derived from stored xml files

In [6]:
def get_abstracts (pubtext):
    pubtext['length'] = pubtext['text'].str.len()
    pmidslist = pubtext['pmid'].unique().tolist()
    abslist = []   
    for eachpmid in pmidslist:
        absdict = {}
        tmpdf = pubtext.loc[pubtext['pmid']==eachpmid]
        abstitle = tmpdf['text'].loc[tmpdf['kind']=='t'].iloc[0]
        absabs = tmpdf['text'].loc[tmpdf['kind']=='a'].iloc[0]
        absdict['pmid']=eachpmid
        absdict['title']=abstitle
        absdict['abstract']=absabs
        absdict['title_end']=len(abstitle)
        abslist.append(absdict)
    pubtextdf = pandas.DataFrame(abslist)
    return(pubtextdf)

### Import data

In [7]:
dropped_anns_src = 'dropped_by_pubtator.txt'
savepath = 'data/'
exppath = 'exports/'
dropped_anns = read_csv(savepath+dropped_anns_src, delimiter='\t', header=0)
dropped_anns.drop("Unnamed: 0",axis=1,inplace=True)

drp_pmidlist = dropped_anns['pmid'].unique().tolist()
#drp_pmidlist = [27103203, 27389247, 26419375]

In [8]:
pubsource = '2017.11.22 pubmed abstracts from db export.txt'   
pubtext = read_csv(savepath+pubsource, delimiter='\t', header=0)
pubtextdf = get_abstracts(pubtext)
#print(pubtextdf.head(n=2))

In [9]:
## pad with spaces and escape characters to avoid breaking up embedded text (eg- 'galactose' from 'galactosemic') 
a=(" ","","-",",") ## a concept may be the first word in a sentence, so pad with "" as well
b=(" ","-",",") ## a concept may be the last word in a sentence, but pad with "." doesn't work
escapeset = pandas.DataFrame([(x,y) for x in a for y in a])
#print(escapeset)

In [10]:
print(dropped_anns.head(n=2))

     pmid identifier type  no_of_mentions_db in_db no_of_mentions_pub  \
0  501285    C039522    c                1.0   yes                 no   
1  501285    D003920    d                3.0   yes                 no   

  in_pubtator                                             text  
0          no                              glycosylated lysine  
1          no  Cataractous lenses of diabetic and galactosemic  


In [14]:
foundlist = []

for eachdrppmid in drp_pmidlist:
    anns_to_lookup = dropped_anns.loc[dropped_anns['pmid']==eachdrppmid].reset_index(drop=True)
    pub_to_inspect = pubtextdf.loc[pubtextdf['pmid']==eachdrppmid]
    n_of_anns_to_lookup = len(anns_to_lookup)
    i=0
    ## first, check if merge character is in the text, if so, replace it with something else
    tmptitle = pub_to_inspect['title'].str.replace('~','/').iloc[0].lower() #get rid of case issues
    tmpabstract = pub_to_inspect['abstract'].str.replace('~','/').iloc[0].lower() #get rid of case issues
    ## Merge the title and the abstract
    tmptotal = tmptitle+'~'+tmpabstract
    ## Check if the split character is in the text, if so, replace it with something else
    tmpall = tmptotal.lower().replace('|','/')
    while i < n_of_anns_to_lookup:    
        annfound = 'no'
        text_to_inspect = anns_to_lookup['text'].iloc[i].lower()  
        annlength=len(text_to_inspect)
        escape_df = escapeset.copy()
        ## Create the list of annotations (with padding spaces and escape characters to look for.)
        escape_df['text_to_match']=escape_df[0].astype(str)+text_to_inspect+escape_df[1].astype(str)
        tmp_chk_list = escape_df['text_to_match'].tolist()
        ## Search using majority of the cases (combination of " "text" ")
        escape_count = 0
        ## Check for the first annotation in the list(padded with just spaces), and replace it with the delimiter
        text_to_split = tmpall.replace(tmp_chk_list[escape_count],'|')        
        splitted_txt = text_to_split.split('|')
        n_of_partitions = len(splitted_txt)
        ## split the text by the delimiter and get the positional data based on the length of the parts
        if n_of_partitions >=2:
            k=0
            placeholder=0
            while k<n_of_partitions:
                anndict = {}
                anndict['exact'] = tmp_chk_list[escape_count]
                anndict['pmid']=eachdrppmid
                anndict['cptext']=text_to_inspect
                anndict['identifier']= anns_to_lookup['identifier'].iloc[i]
                anndict['type']= anns_to_lookup['type'].iloc[i]
                anndict['length'] = annlength                
                offset = len(splitted_txt[k]) + placeholder
                endset = offset + annlength + placeholder
                anndict['appear_no'] = k
                anndict['offset'] = offset
                anndict['endset'] = endset
                foundlist.append(anndict)     
                placeholder = endset
                k=k+2   
        else:
            ## check for other cases
            escape_count = 1
            while escape_count < len(escape_df):
                text_to_split = tmpall.replace(tmp_chk_list[escape_count],'|')        
                splitted_txt = text_to_split.split('|')
                n_of_partitions = len(splitted_txt)
                if n_of_partitions >=2:
                    k=0
                    placeholder=0
                    while k<n_of_partitions:
                        anndict = {}
                        anndict['exact'] = tmp_chk_list[escape_count]
                        anndict['pmid']=eachdrppmid
                        anndict['cptext']=text_to_inspect
                        anndict['identifier']= anns_to_lookup['identifier'].iloc[i]
                        anndict['type']= anns_to_lookup['type'].iloc[i]
                        anndict['length'] = annlength                
                        offset = len(splitted_txt[k]) + placeholder
                        endset = offset + annlength + placeholder
                        anndict['appear_no'] = k
                        anndict['offset'] = offset
                        anndict['endset'] = endset
                        foundlist.append(anndict)     
                        placeholder = endset
                        k=k+2 
                else: 
                    ## save the failed attempt at matching
                    k=0
                    placeholder=0
                    anndict = {}
                    anndict['exact'] = tmp_chk_list[escape_count]
                    anndict['pmid']=eachdrppmid
                    anndict['cptext']=text_to_inspect
                    anndict['identifier']= anns_to_lookup['identifier'].iloc[i]
                    anndict['type']= anns_to_lookup['type'].iloc[i]
                    anndict['length'] = annlength                
                    offset = -1
                    endset = -1
                    anndict['appear_no'] = -1
                    anndict['offset'] = offset
                    anndict['endset'] = endset
                    foundlist.append(anndict)     
                    placeholder = endset
                    k=k+2 
                escape_count = escape_count+1
        i=i+1

founddf = pandas.DataFrame(foundlist)

dropped_anns_found = founddf.loc[founddf['offset']!=-1]
dropped_anns_missing = founddf.loc[founddf['offset']==-1]

print(len(dropped_anns_found))
print(len(dropped_anns_missing))
print(founddf.head(n=2))

4195
1701
   appear_no               cptext  endset                  exact identifier  \
0          0  glycosylated lysine     365   glycosylated lysine     C039522   
1          2  glycosylated lysine    1932   glycosylated lysine     C039522   

   length  offset    pmid type  
0      19     346  501285    c  
1      19    1548  501285    c  


In [15]:
## Store the results for concept distance analysis
founddf.to_csv(savepath+'dropped_anns_offsets.txt',sep='\t',header=True)

In [11]:
## Limit the tokenization of abstracts to just the abstracts where annotations were completed
all_completed_anns = read_csv(savepath+'all_completed_anns.txt', delimiter='\t', header=0)
all_completed_anns.drop("Unnamed: 0",axis=1,inplace=True)
relevant_pubtextdf = pubtextdf.loc[pubtextdf['pmid'].isin(set(all_completed_anns['pmid'].tolist()))].copy()
print('number of pmids with pubtator files in db: ',len(pubtextdf))
print('number of relevant pmids: ',len(relevant_pubtextdf))

number of pmids with pubtator files in db:  4241
number of relevant pmids:  234


In [12]:
relevant_pubtextdf['abstract_length']=relevant_pubtextdf['abstract'].str.len()
relevant_pubtextdf['char_count'] = relevant_pubtextdf['title_end']+relevant_pubtextdf['abstract_length']
relevant_pubtextdf['text'] = relevant_pubtextdf['title']+"\n"+relevant_pubtextdf['abstract']
relevant_pubtextdf['sentence_count'] = 0
relevant_pubtextdf['word_count'] = 0

i=0
while i < len(relevant_pubtextdf):
    relevant_pubtextdf['sentence_count'].iloc[i] = len(tokenize.sent_tokenize(relevant_pubtextdf['text'].iloc[i],language='english'))
    relevant_pubtextdf['word_count'].iloc[i] = len(tokenize.word_tokenize(relevant_pubtextdf['text'].iloc[i],language='english'))
    i=i+1 

print(relevant_pubtextdf.head(n=2))

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


                                              abstract      pmid  \
809  The conserved oligomeric Golgi ( COG ) complex...  23430875   
810  Complex carbohydrates are macromolecules biosy...  18269265   

                                                 title  title_end  \
809   COG5-CDG with a Mild Neurohepatic Presentation .         48   
810  Ion mobility mass spectrometry analysis of hum...         63   

     abstract_length  char_count  \
809              853         901   
810             1360        1423   

                                                  text  sentence_count  \
809  COG5-CDG with a Mild Neurohepatic Presentation...               7   
810  Ion mobility mass spectrometry analysis of hum...               6   

     word_count  
809         155  
810         206  


In [13]:
tokenized_pmids = relevant_pubtextdf[['pmid','sentence_count','word_count','title_end','abstract_length','char_count']].copy()
tokenized_pmids['char_p_sent'] = tokenized_pmids['char_count'].div(tokenized_pmids['sentence_count'])
tokenized_pmids['char_p_word'] = tokenized_pmids['char_count'].div(tokenized_pmids['word_count'])
print(tokenized_pmids.head(n=2))

         pmid  sentence_count  word_count  title_end  abstract_length  \
809  23430875               7         155         48              853   
810  18269265               6         206         63             1360   

     char_count  char_p_sent  char_p_word  
809         901   128.714286     5.812903  
810        1423   237.166667     6.907767  


In [14]:
## export the data from the tokenized pmids
tokenized_pmids.to_csv(exppath+'tokenized_pmids.txt',sep='\t', header=True)