# **Extracting article IDs from PubMed for a list of common names, for statistical analysis**

The code performs the following task:


Extract the list of article IDs for articles that use a common name without referring to the corresponding scientific name, for a list of common names. The code for a single occurrence is reused for this purpose. It is also used for statistical analysis of the number of relevant articles retrieved from PubMed.

Required Installations

In [None]:
# Install the third-party packages used by this notebook.
# openpyxl and pandas are pinned to specific versions.
!pip install biopython
!pip install beautifulsoup4
!pip install --upgrade openpyxl==3.0.5
!pip install --upgrade pandas==1.1.2
# NOTE(review): this asks pip for a distribution called "python"; the
# interpreter version is normally not changed through pip — confirm that
# this line does what was intended.
!pip install --upgrade python==3.7
!pip install XlsxWriter


import dependent libraries

In [None]:
import pandas as pd
import re
import Bio
import requests
from bs4 import BeautifulSoup
from Bio import Entrez
Entrez.email =  "radhu.palliyana@gmail.com" # provide mail id after creating api key
from openpyxl import load_workbook
import xlsxwriter

In [None]:
# Mount Google Drive in the Colab runtime at /content/drive.
from google.colab import drive
drive.mount('/content/drive') 

Load MPNS version 11 datasets:


1.   mpns_v11_non_sci_names_1.csv containing non scientific names or common or pharmaceutical names of medicinal plants
2.   mpns_v11_plants_1.csv containing scientific names of medicinal plants
3.   mpns_v11_synonyms_1.csv containing synonyms or old scientific names of medicinal plants

In [None]:
# Load the MPNS v11 tables from the mounted Drive folder.
# mpns_non_sci: non-scientific (common / pharmaceutical) names.
mpns_non_sci = pd.read_csv("/content/drive/MyDrive/Dissertation/mpns_v11_non_sci_names_1.csv")
# mpns_plant: scientific names.
mpns_plant = pd.read_csv("/content/drive/MyDrive/Dissertation/mpns_v11_plants_1.csv")
# The synonyms table is not loaded; the line is kept for reference only.
#mpns_synon = pd.read_csv("/content/drive/MyDrive/Dissertation/mpns_v11_synonyms_1.csv")

**Extracting all the article IDs for articles that use a common name without reference to the corresponding scientific name**

Table 1 (the MPNS plant dataset) and Table 3 (the MPNS non-scientific names dataset) are merged on name_id (from Table 1) and plant_id (from Table 3).

As per MPNS Data dictionary name_id field value links each and every row in TABLE 3 Non Scientific Names to ONE data row in either TABLE 1 PLANTS OR TABLE 2 SYNONYMS


**Data Cleaning and feature engineering**

In [None]:
# Rename name_id to name_id_non_sci in the non-scientific names table
# (table 3), so it stays distinguishable from the name_id column of
# mpns_plant that the merge uses as its left key.
mpns_non_sci.rename(columns ={'name_id':'name_id_non_sci'},inplace= True)

In [None]:
# Left-join table 1 (plants) with table 3 (non-scientific names): every
# plant row is kept, matched on plants.name_id == non_sci.plant_id.
art1 = mpns_plant.merge(
    mpns_non_sci,
    how="left",
    left_on="name_id",
    right_on="plant_id",
)
# List the column names of the merged frame, one per line.
for column_name in art1.columns:
    print(column_name)

In [None]:
# Number of missing values in each column of the merged frame.
print(art1.isnull().sum())

In [None]:
# Keep only merged rows that carry both a non-scientific name and a
# scientific name. Rows with a null scientific name are removed because
# they later trigger "TypeError: decoding to str: need a bytes-like object,
# float found".
has_non_sci_name = art1['name'].notnull()
has_scientific_name = art1['full_scientific_name'].notnull()
art1_non_sci = art1[has_non_sci_name & has_scientific_name]
# Remaining missing values per column.
print(len(art1_non_sci) - art1_non_sci.count())

In [None]:
# Feature selection and removal of low-quality rows.
# Silence pandas' chained-assignment warning (the default setting is
# 'warn'); the in-place drops below would otherwise emit it.
pd.options.mode.chained_assignment = None
# Columns that are not needed for the common-name / scientific-name mapping.
unused_columns = ['genus_hybrid', 'species_hybrid', 'infra_species', 'parent_author']
art1_non_sci.drop(columns=unused_columns, inplace=True)
# Low-quality matches (quality_rating 'L') are ignored, as confirmed with
# the Kew Garden team.
is_low_quality = art1_non_sci['quality_rating'] == 'L'
art1_non_sci.drop(index=art1_non_sci[is_low_quality].index, inplace=True)
print(len(art1_non_sci) - art1_non_sci.count())
print(art1_non_sci.head())

In [None]:
# Subset of the cleaned frame whose non-scientific name is of type
# 'common', still paired with the scientific name from the merge.
is_common_name = art1_non_sci['name_type'] == 'common'
art1_common = art1_non_sci[is_common_name]
print(art1_common.head())

In [None]:
# Number of missing values in each column of the common-name subset.
print(art1_common.isnull().sum())

Checking for multiple occurrences of common names

In [None]:
# Rows whose common name already appeared in an earlier row, i.e. names
# that occur more than once (the author observed 36070 such rows).
is_repeated_name = art1_common.duplicated(subset='name')
art1_common_duplicate = art1_common.loc[is_repeated_name]
print('Duplicated rows are ', art1_common_duplicate)

Get the first 100 rows of common names

In [None]:
art1_common_head = art1_common.head(100) # first 100 rows of the common-name frame

In [None]:
# Select a positional range instead. iloc excludes the stop value, so
# 512:513 keeps only the single row at position 512. Running this cell
# replaces the art1_common_head assigned by the head(100) cell.
art1_common_head = art1_common.iloc[512:513]

Looking for articles having common names without reference to a scientific name

In [None]:
# For every selected common name: search PMC, download each returned
# article, and count how many articles mention the common name and how many
# mention the matching scientific name (body and abstract counted
# separately). One summary row per common name is appended to the
# "Common_Output" sheet of Common_result.xlsx.
import time

# Names to process. To run on a hand-picked list instead, assign a plain
# list of names here, e.g. ['epilobii herba', 'equiseti hiemalis herba'].
common_name = art1_common_head['name']


def _fetch_pmc_xml(article_id):
    """Return the raw efetch XML for one PMC id, always closing the handle."""
    handle = Entrez.efetch(db='pmc', id=article_id, retmode='xml')
    try:
        return handle.read()
    finally:
        handle.close()


for j in common_name:
    term_common = f"{j}"
    print(term_common)

    # Scientific name paired with this common name. A common name can occur
    # in several rows; only the first match is used.
    scientific_name_common = art1_common.loc[
        art1_common['name'] == term_common, 'full_scientific_name'
    ].iloc[0]
    # Remove any leading/trailing '.' and space characters.
    scientific_name_common = scientific_name_common.strip('. ')
    # Article text is lowercased before the common name is looked up, so the
    # needle is lowercased too; otherwise a name containing capitals could
    # never match.
    term_lower = term_common.lower()

    # Short pause before each search request.
    time.sleep(0.5)
    # Retrieve at most 500 article ids per common name.
    handle = Entrez.esearch(db="pmc", term=term_common, retmax="500")
    try:
        rec_list = Entrez.read(handle)
    finally:
        handle.close()
    Total_article = rec_list['Count']  # total hits reported by the search
    total_id = rec_list['IdList']
    Ret_max_val = len(total_id)  # ids actually returned (capped by retmax)

    no_common_1 = 0  # articles with the common name in the body
    no_common_2 = 0  # articles with the common name in the abstract
    no_sci_1 = 0     # articles with the scientific name in the body
    no_sci_2 = 0     # articles with the scientific name in the abstract
    no_body = 0      # articles skipped: <body> or <abstract> tag missing
    Tot_extrac = 0   # articles actually analysed

    for article_id in total_id:
        soup = BeautifulSoup(_fetch_pmc_xml(article_id), "html.parser")
        abstracts = soup.find('abstract')
        body = soup.find('body')

        # Only articles that have both tags can be analysed; the others are
        # counted in no_body and contribute to no other counter.
        if body is None or abstracts is None:
            no_body += 1
            continue

        body_text = body.get_text()
        abstract_text = abstracts.get_text()

        # Common name: case-insensitive substring match.
        if term_lower in body_text.lower():
            no_common_1 += 1
        if term_lower in abstract_text.lower():
            no_common_2 += 1

        # Scientific name: exact, case-sensitive substring match.
        if scientific_name_common in body_text:
            no_sci_1 += 1
        if scientific_name_common in abstract_text:
            no_sci_2 += 1

        Tot_extrac += 1

    df_ex_common = pd.DataFrame({
        'common_name': [term_common],
        'Scientific_name': [scientific_name_common],
        'common_body': [no_common_1],
        'common_abstract': [no_common_2],
        'sci_body': [no_sci_1],
        'sci_abstract': [no_sci_2],
        "Tot_No_PubMed_article": [Total_article],
        "Retrive_max_value": [Ret_max_val],
        "No_article_No_body": [no_body],
        "Actual_article_retreived": [Tot_extrac],
    })

    # Append the row below the existing content of Common_result.xlsx.
    # The workbook is loaded from disk, so the file must already exist.
    # NOTE(review): assigning writer.book / writer.sheets and calling
    # writer.save() relies on the pandas version pinned in the install cell;
    # newer pandas releases offer mode="a" with if_sheet_exists="overlay"
    # instead — confirm before upgrading.
    writer = pd.ExcelWriter('Common_result.xlsx', engine='openpyxl')
    writer.book = load_workbook('Common_result.xlsx')
    writer.sheets = {ws.title: ws for ws in writer.book.worksheets}
    # read_excel is called without sheet_name, so the row count comes from
    # the first sheet — NOTE(review): confirm that is "Common_Output".
    reader = pd.read_excel(r'Common_result.xlsx')
    df_ex_common.to_excel(
        writer,
        index=True,
        index_label="No.",
        header=False,
        sheet_name="Common_Output",
        startrow=len(reader) + 1,
    )
    writer.save()
    writer.close()
        

#print("No of times pharmaceutical name "+term_common+ " appeared in article is : ",no_common_1+no_common_2)  
#print("No of times Scientific name "+scientific_name_common+" appeared ",no_sci_1+no_sci_2)
#print("No of times articles without body ",no_body)

# Ignore all code from this section onwards
It contains trial versions of the code


When trying to retrieve the maximum number of articles for the top 100 common names, the term in the 8th position raised an AttributeError on the 3rd article id (9499808) in the list of ids retrieved for that term, because the body tag was not available.
https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pmc&id=9499808

This is due to a limitation of the code: it only handles articles that have both a body tag and an abstract tag.

Hence the single-occurrence common name search is run separately for each of the following selected common names, to obtain statistics on the number of articles that give the correct scientific name:
chhoti elachi, chichira, aanapa-kai, abbe, abete balsamifero, abong-abong, abrus, ginori, giant chickweed, giant cactus

In [None]:
# Trial run for a single common name: search PMC and list the matching
# article ids. Run once per name in: chhoti elachi, chichira, aanapa-kai,
# abbe, abete balsamifero, abong-abong, abrus, ginori, giant chickweed,
# giant cactus.

term = 'giant cactus'   # common name to search for; change it for each run
print(term)
# retmax is adjusted by hand for each name: if processing an article later
# raises an error, retmax is lowered to the number of articles processed
# before that article so the run completes without it.
handle = Entrez.esearch(db ="pmc", term= term,retmax= "9")
rec_list = Entrez.read(handle)
handle.close()
print(rec_list['Count']) # total number of articles the search reports for this common name
print(len(rec_list['IdList'])) # number of ids actually returned; never more than retmax
total_id = rec_list['IdList']
print('The article ids corresponding to the given common name are :' ,total_id) # ids returned for this common name

In [None]:
# Scientific name recorded for the chosen common name; only the first
# matching row is used.
scientific_name_common =(art1_common.loc[art1_common['name']== term,'full_scientific_name']).iloc[0]
# Remove any leading/trailing '.' and space characters.
scientific_name_common = scientific_name_common.strip('. ')
len_sci_common= len(scientific_name_common ) # length of the scientific name
len_term_common= len(term ) # length of the common name

In [None]:
# Show the scientific name that will be searched for, and its length.
print(scientific_name_common) # scientific name
print(len_sci_common) # length of the scientific name

In [None]:
# For each article id returned for `term`, report whether the common name
# and the matching scientific name appear in the article body and abstract.
for article_id in total_id:
    handle = Entrez.efetch(db='pmc', id=article_id, retmode='xml')
    try:
        total_content = handle.read()
    finally:
        # Release the connection even if reading fails.
        handle.close()
    print("Entire text in the article id", article_id)

    soup = BeautifulSoup(total_content, "html.parser")
    abstracts = soup.find('abstract')  # first <abstract> tag, or None
    body = soup.find('body')           # first <body> tag, or None

    # Some PMC records have no <body> or <abstract>; calling get_text() on
    # the missing tag would raise AttributeError and abort the whole run,
    # so such articles are reported and skipped instead.
    if body is None or abstracts is None:
        print("Skipping article without body or abstract tag :", article_id)
        continue

    body_text = body.get_text()
    abstract_text = abstracts.get_text()

    # Common name: looked up in the lowercased text.
    common_in_body = term in body_text.lower()
    print("common name", term if common_in_body else "")
    if common_in_body:
        print("common name is present in body of the article :", article_id)
    else:
        print("common name is not present in body of the article :", article_id)

    common_in_abstract = term in abstract_text.lower()
    print("common name", term if common_in_abstract else "")
    if common_in_abstract:
        print("common name is present in abstract of the article :", article_id)
    else:
        print("common name is not present in abstract of the article :", article_id)

    # Scientific name: exact, case-sensitive match against the original text.
    sci_in_body = scientific_name_common in body_text
    print("scientific name", scientific_name_common if sci_in_body else "")
    if sci_in_body:
        print("Scientific name is present in body of the article :", article_id)
    else:
        print("Scientific name is not present in body of the article :", article_id)

    sci_in_abstract = scientific_name_common in abstract_text
    print("scientific name", scientific_name_common if sci_in_abstract else "")
    if sci_in_abstract:
        print("Scientific name is present in abstract of the article :", article_id)
    else:
        print("Scientific name is not present in  abstract of the article :", article_id)


In [None]:
# Disabled draft, kept as a string literal so it is never executed: it loops
# over every common name and runs an esearch with the "[name]" field tag and
# retmax 50, printing the hit count and the returned ids.
""" 
#code for checking through all pharmaceutical name and retreive the corresponding article id
common_name = art1_common['name']
term_common = ' '# input the common name of medicinal plant

for j in common_name:
    
    term_common = f"{j}[name]"
    print(term_common)
    handle = Entrez.esearch(db ="pmc", term= term_common,retmax= "50")# search and retrieve max 50 article id for each pharmaceutical name
    rec_list = Entrez.read(handle)
    handle.close()
    print(rec_list['Count'])
    print(len(rec_list['IdList']))
    print(rec_list['IdList'])
    total_id = rec_list['IdList']
"""

In [None]:
# Disabled draft, kept as a string literal so it is never executed: it builds
# a two-part query "<genus>[genus] AND <species>[species]" for every row and
# runs an esearch with retmax 50, printing the hit count and the returned ids.
"""
#Code to retreive if search criteria has two or more parameters like genus and species.
common_gen= art1_common['genus'].tolist() #convert the genus column to list
common_spe= art1_common['species'].tolist() #convert the species column to list
term = ' '# input the common name of medicinal plant

for (i,j) in zip(common_gen,common_spe):
    
    term = f"{i}[genus] AND {j}[species]"
    print(term)
    handle = Entrez.esearch(db ="pmc", term= term,retmax= "50")
    rec_list = Entrez.read(handle)
    handle.close()
    print(rec_list['Count'])
    print(len(rec_list['IdList']))
    print(rec_list['IdList'])
    total_id = rec_list['IdList']

"""

**Reference:**
Entrez is a molecular biology database system that provides integrated access to nucleotide and protein sequences. The system is produced by the National Center for Biotechnology Information (NCBI).

Entrez Programming Utilities user guide is available at : https://www.ncbi.nlm.nih.gov/books/NBK25501/