# **Extracting articles ID with common name from PubMed**

The code consists of the following task:

Task 5: Extracting the IDs of all articles that use a common name without reference to the corresponding scientific name

Required Installations

In [None]:
!pip install biopython
!pip install beautifulsoup4


import dependent libraries

In [None]:
import os
import re

import Bio
import pandas as pd
import requests
from Bio import Entrez
from bs4 import BeautifulSoup
# NCBI E-utilities requests are sent with a contact e-mail address.
# Prefer the ENTREZ_EMAIL environment variable so each user can supply their own
# address without editing the notebook; the original address remains the fallback
# so existing runs behave exactly as before.
Entrez.email = os.environ.get("ENTREZ_EMAIL") or "radhu.palliyana@gmail.com"

In [None]:
# Mount Google Drive at /content/drive (Google Colab only); the MPNS CSV files
# loaded below are read from paths under this mount point.
from google.colab import drive
drive.mount('/content/drive') 

Load MPNS version 11 datasets:


1.   mpns_v11_non_sci_names_1.csv containing non scientific names or common or pharmaceutical names of medicinal plants
2.   mpns_v11_plants_1.csv containing scientific names of medicinal plants
3.   mpns_v11_synonyms_1.csv containing synonyms or old scientific names of medicinal plants

In [None]:
# Folder that holds the MPNS v11 CSV exports. Change this single constant to run
# the notebook against a different Drive layout or a local copy of the data.
MPNS_DATA_DIR = "/content/drive/MyDrive/Dissertation"

# Table 3: non-scientific (common / pharmaceutical) names of medicinal plants.
mpns_non_sci = pd.read_csv(f"{MPNS_DATA_DIR}/mpns_v11_non_sci_names_1.csv")
# Table 1: scientific names of medicinal plants.
mpns_plant = pd.read_csv(f"{MPNS_DATA_DIR}/mpns_v11_plants_1.csv")
# Table 2 (synonyms) is not used by this notebook, so it is left unloaded.
#mpns_synon = pd.read_csv(f"{MPNS_DATA_DIR}/mpns_v11_synonyms_1.csv")

**Extracting all the article IDs for articles that use a common name without reference to the corresponding scientific name**

Merge Table 1 (the MPNS plants dataset) with Table 3 (the MPNS non-scientific names dataset) by matching `name_id` from Table 1 against `plant_id` from Table 3.

As per MPNS Data dictionary name_id field value links each and every row in TABLE 3 Non Scientific Names to ONE data row in either TABLE 1 PLANTS OR TABLE 2 SYNONYMS


**Data Cleaning and feature engineering**

In [None]:
# Table 3 has its own 'name_id' column; rename it to 'name_id_non_sci' so it stays
# distinguishable from Table 1's 'name_id' once the two tables are merged.
non_sci_column_renames = {'name_id': 'name_id_non_sci'}
mpns_non_sci.rename(columns=non_sci_column_renames, inplace=True)

In [None]:
# Left-join Table 1 (plants) with Table 3 (non-scientific names): each plant row's
# 'name_id' is matched against Table 3's 'plant_id'.
art1 = mpns_plant.merge(
    mpns_non_sci,
    how="left",
    left_on="name_id",
    right_on="plant_id",
)
#print(art1.head())
# List the column headers of the merged frame, one per line.
for column_name in art1.columns:
    print(column_name)

In [None]:
# Missing values per column: total row count minus the non-null count of each column.
row_total = art1.shape[0]
non_null_per_column = art1.count()
print(row_total - non_null_per_column)

In [None]:
# Keep only merged rows that carry both a non-scientific name and a scientific name.
# Rows with a null scientific name are removed because they previously caused
# "TypeError: decoding to str: need a bytes-like object, float found".
has_non_sci_name = art1['name'].notnull()
art1_non_sci = art1[has_non_sci_name]
has_scientific_name = art1_non_sci['full_scientific_name'].notnull()
art1_non_sci = art1_non_sci[has_scientific_name]
print(art1_non_sci.shape[0] - art1_non_sci.count())  # remaining nulls per column

In [None]:
# Feature selection
pd.options.mode.chained_assignment = None  # silence the chained-assignment warning (default is 'warn')

# Columns that are not required for the mapping. errors='ignore' lets this cell be
# re-run after the columns are already gone instead of failing with KeyError, and
# the non-in-place drop avoids mutating a filtered slice of the merged frame.
unused_columns = ['genus_hybrid', 'species_hybrid', 'infra_species', 'parent_author']
art1_non_sci = art1_non_sci.drop(columns=unused_columns, errors='ignore')

# After confirming with the Kew Gardens team, low-quality matches
# (quality_rating == 'L') are ignored. Rows with a missing rating are kept,
# as they were when the 'L' rows were dropped by index.
art1_non_sci = art1_non_sci[art1_non_sci['quality_rating'] != 'L']

print(art1_non_sci.shape[0] - art1_non_sci.count())  # nulls per column after cleaning
print(art1_non_sci.head())

In [None]:
# Restrict the cleaned frame to rows whose non-scientific name is of type 'common'.
is_common_name = art1_non_sci['name_type'] == 'common'
art1_common = art1_non_sci.loc[is_common_name]
print(art1_common.head())

In [None]:
# Missing values per column in the common-name subset.
common_row_total = art1_common.shape[0]
print(common_row_total - art1_common.count())

Checking for multiple occurrences of common names

In [None]:
# Rows whose common name has already appeared earlier in the frame
# (every occurrence of a name except its first).
repeated_name_mask = art1_common.duplicated('name')
art1_common_duplicate = art1_common[repeated_name_mask]
# The original run reported 36070 such rows, i.e. many common names occur more than once.
print('Duplicated rows are ', art1_common_duplicate)

# Single common name search

In [None]:
# Search PubMed Central (db="pmc") for a single common name and collect the
# ids of the matching articles.

term = 'chhoti elachi'   # scientific name: Elettaria cardamomum (L.) Maton
print(term)
# retmax caps the number of article ids that are returned
handle = Entrez.esearch(db="pmc", term=term, retmax="25")
rec_list = Entrez.read(handle)
handle.close()
total_id = rec_list['IdList']
print(rec_list['Count'])  # total number of articles matching the common name
print(len(total_id))  # number of ids actually returned; never more than retmax
print('The article ids corresponding to the given common name are :', total_id)

'5884010' and '2876931' are the article IDs corresponding to the given common name.

Both articles have scientific names tagged to this common name, but they are not an exact match to the scientific name.

To check an article manually, navigate to the URL below, replacing the PMC ID with the article ID; e.g. for ID 9340222: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC9340222/

Please note: scientific names that appear only in references, tables or images are not detected.

In [None]:
# Look up the full scientific name recorded for the searched common name;
# when several rows match, only the first one is used.
matching_sci_names = art1_common.loc[art1_common['name'] == term, 'full_scientific_name']
scientific_name_common = matching_sci_names.iloc[0].strip('. ')  # trim leading/trailing '.' and spaces
len_sci_common = len(scientific_name_common)  # character length of the scientific name
len_term_common = len(term)  # character length of the common name

In [None]:
# Show the scientific name that will be searched for, followed by its length.
for shown_value in (scientific_name_common, len_sci_common):
    print(shown_value)

Checking for common name and scientific name in both body and abstract of article

In [None]:
# For every article id returned for `term`, fetch the article from PMC and report
# whether the common name and the exact scientific name occur in the article body
# and in the abstract.
#
# The common name is matched case-insensitively (both sides are lower-cased); the
# scientific name is matched exactly as stored in `scientific_name_common`.
# The loop variable keeps its original name `id` (it shadows the builtin) because
# it is a notebook-level global that other cells may read.
term_lower = term.lower()
for id in total_id:
    handle = Entrez.efetch(db='pmc', id=id, retmode='xml')
    try:
        total_content = handle.read()
    finally:
        handle.close()  # release the connection even if reading fails
    print("Entire text in the article id", id)
    #print(total_content) #print the entire article content as html output
    soup = BeautifulSoup(total_content, "html.parser")

    # soup.find returns None when the element is absent (e.g. articles supplied
    # only as PDF); treat a missing section as empty text so the checks below
    # report "not present" instead of raising AttributeError.
    body = soup.find('body')
    abstracts = soup.find('abstract')
    body_text = body.get_text() if body is not None else ""
    abstract_text = abstracts.get_text() if abstracts is not None else ""

    common_in_body = term_lower in body_text.lower()
    common_in_abstract = term_lower in abstract_text.lower()
    sci_in_body = scientific_name_common in body_text
    sci_in_abstract = scientific_name_common in abstract_text

    # Common name in the body of the article
    print("common name", term_lower if common_in_body else "")
    if common_in_body:
        print("common name is present in body of the article :", id)
    else:
        print("common name is not present in body of the article :", id)

    # Common name in the abstract of the article
    print("common name", term_lower if common_in_abstract else "")
    if common_in_abstract:
        print("common name is present in abstract of the article :", id)
    else:
        print("common name is not present in abstract of the article :", id)

    # Scientific name in the body of the article
    print("scientific name", scientific_name_common if sci_in_body else "")
    if sci_in_body:
        print("Scientific name is present in body of the article :", id)
    else:
        print("Scientific name is not present in body of the article :", id)

    # Scientific name in the abstract of the article
    print("scientific name", scientific_name_common if sci_in_abstract else "")
    if sci_in_abstract:
        print("Scientific name is present in abstract of the article :", id)
    else:
        print("Scientific name is not present in  abstract of the article :", id)
      
               



# Multiple common name search

In [None]:
# Search PubMed Central (db="pmc") for the common name used in the
# multiple-name check and collect the ids of the matching articles.

common_term_list = 'chichira'   # scientific name: Hebanthe erianthos (Poir.) Pedersen
print(common_term_list)
# retrieve at most 50 article ids for the name
handle_list = Entrez.esearch(db="pmc", term=common_term_list, retmax="50")
rec_list = Entrez.read(handle_list)
handle_list.close()
total_id_common_list = rec_list['IdList']
print(rec_list['Count'])  # total number of articles matching the common name
print(len(total_id_common_list))  # number of ids actually returned; never more than retmax
print(total_id_common_list)

In [None]:
# Every full scientific name recorded for the common name, as strings with
# leading/trailing '.' and spaces trimmed.
rows_for_common_name = art1_common['name'] == common_term_list
raw_scientific_names = art1_common.loc[rows_for_common_name, 'full_scientific_name'].astype(str).to_list()
scientific_name_common_list = [raw_name.strip('. ') for raw_name in raw_scientific_names]
len_sci_common_list = len(scientific_name_common_list)  # number of scientific names found (not a string length)
len_multi_term_common = len(common_term_list)  # character length of the common name

In [None]:
# Show the candidate scientific names that the article text will be searched for.
print(scientific_name_common_list)

Some articles provide their content only as a PDF. These are ignored for now; an example is shown below. Such articles will throw an error when the checks are executed. This is a limitation of the current code and will be addressed as part of future work.

In [None]:
# Example of an article whose content is only available as a PDF: fetch its PMC
# record and print the raw response for inspection.
pdf_article_id = 3336400
handle_pdf = Entrez.efetch(db='pmc', id=pdf_article_id, retmode='xml')
try:
    total_content_common_pdf = handle_pdf.read()
finally:
    handle_pdf.close()  # release the connection even if reading fails
# Print the id that was actually fetched; the bare name `id` here would be a
# leftover loop variable from another cell or the builtin function.
print("Entire text in the article id", pdf_article_id)
print(total_content_common_pdf)

Checking for common name and scientific name in both body and abstract of article

In [None]:
# For every article id returned for the common name, fetch the article from PMC
# and report whether the common name occurs in the article body and in the abstract.
#
# Each section is checked once per article. Iterating over the children of the
# <body>/<abstract> element would only repeat the identical check and its output
# once per child node.
common_term_lower = common_term_list.lower()  # section text is lower-cased before matching
for ids in total_id_common_list:
    handle_multi = Entrez.efetch(db='pmc', id=ids, retmode='xml')
    try:
        total_content_common_multi_1 = handle_multi.read()
    finally:
        handle_multi.close()  # release the connection even if reading fails
    print("Entire text in the article id", ids)
    #print(total_content_common_multi_1) #print the entire article content in html

    soup_common_multi_1 = BeautifulSoup(total_content_common_multi_1, "html.parser")

    for section in ('body', 'abstract'):
        section_tag = soup_common_multi_1.find(section)
        if section_tag is None:
            # soup.find returns None when the element is absent (e.g. articles
            # supplied only as PDF); report it instead of raising AttributeError.
            print(f"No {section} section available for the article :", ids)
            print(f"common name is not present in {section} of the article :", ids)
            continue

        print("Entire content of the article id :", ids)
        section_text_lower = section_tag.get_text().lower()
        common_name_found = common_term_lower in section_text_lower
        print("common name", common_term_lower if common_name_found else "")
        if common_name_found:
            print(f"common name is present in {section} of the article :", ids)
        else:
            print(f"common name is not present in {section} of the article :", ids)


# For every article id returned for the common name, fetch the article from PMC
# and report whether any of the candidate scientific names occurs in the article
# body and in the abstract.
#
# A section counts as containing the scientific name when at least one candidate
# from `scientific_name_common_list` appears in its text. Comparing the list of
# regex matches for equality with the candidate list would wrongly report
# "not present" whenever a name occurs more than once or only some candidates occur.
for ids in total_id_common_list:
    handle_multi = Entrez.efetch(db='pmc', id=ids, retmode='xml')
    try:
        total_content_common_multi = handle_multi.read()
    finally:
        handle_multi.close()  # release the connection even if reading fails
    print("Entire text in the article id", ids)
    #print(total_content_common_multi) #Print the entire article content as html output
    soup_common_multi = BeautifulSoup(total_content_common_multi, "html.parser")

    for section in ('body', 'abstract'):
        section_tag = soup_common_multi.find(section)
        if section_tag is None:
            # soup.find returns None when the element is absent (e.g. articles
            # supplied only as PDF); report it instead of raising AttributeError.
            print(f"No {section} section available for the article :", ids)
            print(f"Scientific name is not present in {section} of the article ", ids)
            continue

        print("Entire content of the article id :", ids)
        section_text = section_tag.get_text()
        # Exact, case-sensitive substring match; an empty candidate list yields False.
        scientific_name_found = any(
            candidate in section_text for candidate in scientific_name_common_list
        )
        if scientific_name_found:
            print(f"Scientific name is present in {section} of the article ", ids)
        else:
            print(f"Scientific name is not present in {section} of the article ", ids)


In [None]:
# Disabled draft, kept inside a string literal so that it does not execute:
# it would loop over every common name in art1_common and run a PMC esearch
# (retmax 50) for each one.
# NOTE(review): confirm that `[name]` is a valid PMC search field before enabling.
""" 
#code for checking through all pharmaceutical name and retreive the corresponding article id
common_name = art1_common['name']
term_common = ' '# input the common name of medicinal plant

for j in common_name:
    
    term_common = f"{j}[name]"
    print(term_common)
    handle = Entrez.esearch(db ="pmc", term= term_common,retmax= "50")# search and retrieve max 50 article id for each pharmaceutical name
    rec_list = Entrez.read(handle)
    handle.close()
    print(rec_list['Count'])
    print(len(rec_list['IdList']))
    print(rec_list['IdList'])
    total_id = rec_list['IdList']
"""

In [None]:
# Disabled draft, kept inside a string literal so that it does not execute:
# it would pair the 'genus' and 'species' columns of art1_common and run a PMC
# esearch (retmax 50) for each "<genus>[genus] AND <species>[species]" query.
# NOTE(review): confirm that `[genus]` / `[species]` are valid PMC search fields before enabling.
"""
#Code to retreive if search criteria has two or more parameters like genus and species.
common_gen= art1_common['genus'].tolist() #convert the genus column to list
common_spe= art1_common['species'].tolist() #convert the species column to list
term = ' '# input the common name of medicinal plant

for (i,j) in zip(common_gen,common_spe):
    
    term = f"{i}[genus] AND {j}[species]"
    print(term)
    handle = Entrez.esearch(db ="pmc", term= term,retmax= "50")
    rec_list = Entrez.read(handle)
    handle.close()
    print(rec_list['Count'])
    print(len(rec_list['IdList']))
    print(rec_list['IdList'])
    total_id = rec_list['IdList']

"""

**Reference:**
Entrez is a molecular biology database system that provides integrated access to nucleotide and protein sequence data. The system is produced by the National Center for Biotechnology Information (NCBI).

Entrez Programming Utilities user guide is available at : https://www.ncbi.nlm.nih.gov/books/NBK25501/