# **Extracting articles ID with pharmaceutical name from PubMed**

The code consists of the following task:

Task 3: Extracting the article IDs of all articles that use a pharmaceutical name without reference to the corresponding scientific name

Required Installations

In [None]:
!pip install biopython
!pip install beautifulsoup4


Import the required libraries

In [None]:
import pandas as pd
import re
import Bio
import requests
from bs4 import BeautifulSoup
from Bio import Entrez
Entrez.email =  "radhu.palliyana@gmail.com" # contact e-mail attached to the Entrez requests made below; replace with your own address (original note: provide it after creating an API key)

In [None]:
# Mount Google Drive at /content/drive (Colab only); the MPNS CSV files are read from this mount below.
from google.colab import drive
drive.mount('/content/drive') 

Load MPNS version 11 datasets:


1.   mpns_v11_non_sci_names_1.csv containing non scientific names or common or pharmaceutical names of medicinal plants
2.   mpns_v11_plants_1.csv containing scientific names of medicinal plants
3.   mpns_v11_synonyms_1.csv containing synonyms or old scientific names of medicinal plants

In [None]:
# Table 1 (plants): accepted scientific names of medicinal plants.
mpns_plant = pd.read_csv(
    filepath_or_buffer="/content/drive/MyDrive/Dissertation/mpns_v11_plants_1.csv"
)
# Table 3 (non-scientific names): common and pharmaceutical names.
mpns_non_sci = pd.read_csv(
    filepath_or_buffer="/content/drive/MyDrive/Dissertation/mpns_v11_non_sci_names_1.csv"
)
# Table 2 (synonyms) is currently not loaded.
#mpns_synon = pd.read_csv("/content/drive/MyDrive/Dissertation/mpns_v11_synonyms_1.csv")

**Extracting the article IDs of all articles that use a pharmaceutical name without reference to the corresponding scientific name**

Merge Table 1 (the MPNS plants dataset) and Table 3 (the MPNS non-scientific names dataset) on the corresponding name ID.

As per MPNS Data dictionary name_id field value links each and every row in TABLE 3 Non Scientific Names to ONE data row in either TABLE 1 PLANTS OR TABLE 2 SYNONYMS


**Data Cleaning and feature engineering**

In [None]:
# Give the non-scientific table's name_id a distinct label; the plant table is
# merged on its own 'name_id' column further down (presumably to avoid a clash
# between the two identically named columns).
mpns_non_sci = mpns_non_sci.rename(columns={'name_id': 'name_id_non_sci'})

In [None]:
# Left-join every plant row with the non-scientific names whose plant_id
# points at it; plants without such names are kept with nulls.
art1 = mpns_plant.merge(
    mpns_non_sci,
    how="left",
    left_on="name_id",
    right_on="plant_id",
)
# List the column headers of the merged frame, one per line.
for column_name in art1.columns:
    print(column_name)

In [None]:
# Number of missing values in each column of the merged frame.
print(art1.isna().sum())

In [None]:
# Keep only merged rows that carry both a non-scientific name ('name') and a
# scientific name ('full_scientific_name'). Rows with a null scientific name
# previously caused "TypeError: decoding to str: need a bytes-like object,
# float found", so they are removed here.
art1_non_sci = art1.dropna(subset=['name', 'full_scientific_name'])
# Remaining missing values per column.
print(art1_non_sci.isna().sum())

In [None]:
# Feature selection.
# The frame is rebuilt with non-inplace operations, so the cell can be re-run
# safely and no pandas warning has to be silenced globally.

# Columns that are not required for mapping names to articles.
unused_columns = ['genus_hybrid', 'species_hybrid', 'infra_species', 'parent_author']
# errors='ignore' keeps the cell idempotent: on a second run the columns are
# already gone and are simply skipped instead of raising KeyError.
art1_non_sci = art1_non_sci.drop(columns=unused_columns, errors='ignore')

# After confirming with the Kew Gardens team, low quality matches
# (quality_rating == 'L') are ignored; rows with a missing rating are kept.
art1_non_sci = art1_non_sci[art1_non_sci['quality_rating'] != 'L']

print(art1_non_sci.shape[0] - art1_non_sci.count())  # missing values per column
print(art1_non_sci.head())

In [None]:
# Subset of the merged frame whose non-scientific name is a pharmaceutical name.
is_pharmaceutical = art1_non_sci['name_type'].eq('pharmaceutical')
art1_pharm = art1_non_sci[is_pharmaceutical]
print(art1_pharm.head())

In [None]:
# Number of missing values in each column of the pharmaceutical-name subset.
print(art1_pharm.isna().sum())

Checking for multiple occurrences of a pharmaceutical name

In [None]:
# Rows whose pharmaceutical name already occurred in an earlier row, i.e. names
# that map to more than one row (the author observed 920 such rows).
repeated_name_mask = art1_pharm.duplicated(subset='name')
art1_pharm_duplicate = art1_pharm[repeated_name_mask]
print('Duplicated rows are ', art1_pharm_duplicate)

# Single pharmaceutical name search

In [None]:
# Search PMC for a single pharmaceutical name and collect the matching article ids.

term = 'abelmoschi corolla'   #Abelmoschus manihot (L.) Medik. is the scientific name
print(term)
# Retrieve at most 50 article ids for the pharmaceutical name.
handle = Entrez.esearch(db ="pmc", term= term,retmax= "50")
try:
    rec_list = Entrez.read(handle)
finally:
    # Close the connection even when parsing the response fails.
    handle.close()
print(rec_list['Count'])  # total number of articles matching the pharmaceutical name
# Number of ids actually returned; capped at retmax when more articles match.
print(len(rec_list['IdList']))
total_id = rec_list['IdList']
print('The article ids corresponding to the given pharmaceutical name are :' ,total_id)

For example, '9340222', '9403506', '8069016', '8037085', '7535141', '7482509', '6501976', '6500631', '6425497', '6222764', '6155732', '4926151' are the article IDs returned for the pharmaceutical name: abelmoschi corolla

Among these, the following articles have the scientific name tagged to this pharmaceutical name: 9403506 and 7535141; the rest do not.

To check an article manually, open its PMC page, replacing the PMC ID in the URL with the article ID number, e.g. for ID 9340222: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC9340222/

Please note: scientific names displayed in references, tables or images are not detected.

In [None]:
# Scientific names recorded for the searched pharmaceutical name; the first one is used.
matching_scientific_names = art1_pharm.loc[art1_pharm['name'] == term, 'full_scientific_name']
# Remove any '.' and space characters from both ends of the name.
scientific_name_pharm = matching_scientific_names.iloc[0].strip('. ')
# Character lengths of the scientific name and of the pharmaceutical name.
len_sci_pharm = len(scientific_name_pharm)
len_term_pharm = len(term)
print(len_sci_pharm, len_term_pharm, sep="\n")

In [None]:
# Show the scientific name that will be searched for in the articles.
print(scientific_name_pharm)

Checking for scientific name in both body and abstract of article

In [None]:

# For every article returned by the search, report whether the pharmaceutical
# name (`term`) and its scientific name (`scientific_name_pharm`) occur in the
# article body and in the abstract.
for article_id in total_id:
    handle = Entrez.efetch(db='pmc', id=article_id, retmode='xml')
    try:
        total_content = handle.read()
    finally:
        handle.close()  # release the connection even if reading fails
    print("Entire text in the article id", article_id)
    #print(total_content) # print the entire content of the article in html
    soup = BeautifulSoup(total_content, "html.parser")

    # find() returns None when the record has no such tag; treat a missing
    # section as empty text instead of crashing on None.get_text().
    body = soup.find('body')
    abstracts = soup.find('abstract')
    body_text = body.get_text() if body is not None else ""
    abstract_text = abstracts.get_text() if abstracts is not None else ""

    # The pharmaceutical name is compared against lower-cased text; the
    # scientific name is compared case-sensitively against the original text.
    term_in_body = term in body_text.lower()
    term_in_abstract = term in abstract_text.lower()
    sci_in_body = scientific_name_pharm in body_text
    sci_in_abstract = scientific_name_pharm in abstract_text

    #Checking for pharmaceutical name in the body of article
    print("pharmaceutical name", term if term_in_body else "")
    if term_in_body:
        print("pharmaceutical name is present in body of the article :", article_id)
    else:
        print("pharmaceutical name is not present in body of the article :", article_id)

    #Checking for pharmaceutical name in the abstract of article
    print("pharmaceutical name", term if term_in_abstract else "")
    if term_in_abstract:
        print("pharmaceutical name is present in abstract of the article :", article_id)
    else:
        print("pharmaceutical name is not present in abstract of the article :", article_id)

    #Checking for scientific name in the body of article
    print("scientific name", scientific_name_pharm if sci_in_body else "")
    if sci_in_body:
        print("Scientific name is present in body of the article :", article_id)
    else:
        print("Scientific name is not present in body of the article :", article_id)

    #Checking for scientific name in the abstract of the article
    print("scientific name", scientific_name_pharm if sci_in_abstract else "")
    if sci_in_abstract:
        print("Scientific name is present in abstract of the article :", article_id)
    else:
        print("Scientific name is not present in  abstract of the article :", article_id)



# Multiple pharmaceutical name search

In [None]:
# Search PMC for a pharmaceutical name that maps to several scientific names
# and collect the matching article ids.

multi_term = 'angelicae radix pulverata'   #Angelica acutiloba var. acutiloba and Angelica acutiloba (Siebold & Zucc.) Kitag. is the scientific name
print(multi_term)
# Retrieve at most 50 article ids for the pharmaceutical name.
handle1 = Entrez.esearch(db ="pmc", term= multi_term,retmax= "50")
try:
    rec_list1 = Entrez.read(handle1)
finally:
    # Close the connection even when parsing the response fails.
    handle1.close()
print(rec_list1['Count'])  # total number of articles matching the pharmaceutical name
# Number of ids actually returned; capped at retmax when more articles match.
print(len(rec_list1['IdList']))
total_id_multi = rec_list1['IdList']
print('The article ids corresponding to the given pharmaceutical name are :' ,total_id_multi)

'9230602' and '6107777' are the article IDs returned for the given pharmaceutical name.

None of these articles has a scientific name tagged to this pharmaceutical name.

To check an article manually, open its PMC page, replacing the PMC ID in the URL with the article ID number, e.g. for ID 9340222: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC9340222/

Please note: scientific names displayed in references, tables or images are not detected.

In [None]:
# All scientific names recorded for the searched pharmaceutical name, as strings.
multi_matches = art1_pharm.loc[art1_pharm['name'] == multi_term, 'full_scientific_name']
scientific_name_pharm_list = multi_matches.astype(str).to_list()
# Remove any '.' and space characters from both ends of every name.
scientific_name_pharm_multi = [name.strip('. ') for name in scientific_name_pharm_list]
# Number of scientific names linked to the pharmaceutical name (a count of
# list entries, not a string length).
len_sci_pharm_multi = len(scientific_name_pharm_multi)
# Character length of the pharmaceutical name itself.
len_multi_term_pharm = len(multi_term)

In [None]:
# List of scientific names that share the same pharmaceutical name (multi_term).
print(scientific_name_pharm_multi)

Checking for scientific name in both body and abstract of article

In [None]:

# For every article returned by the multi-name search, report whether the
# pharmaceutical name (`multi_term`) and any of its scientific names
# (`scientific_name_pharm_multi`) occur in the article body and abstract.
#
# Each article is fetched once and each section is checked once; the result
# does not depend on how many child nodes the <body>/<abstract> tag has.
for article_id in total_id_multi:
    handle_multi = Entrez.efetch(db='pmc', id=article_id, retmode='xml')
    try:
        total_content_pharm_multi = handle_multi.read()
    finally:
        handle_multi.close()  # release the connection even if reading fails
    print("Entire text in the article id", article_id)
    #print(total_content_pharm_multi) #print the entire article content in html

    soup_pharm_multi = BeautifulSoup(total_content_pharm_multi, "html.parser")

    # find() returns None when the record has no such tag; treat a missing
    # section as empty text instead of crashing on None.get_text().
    body_pharm_multi = soup_pharm_multi.find('body')
    abstract_pharm_multi = soup_pharm_multi.find('abstract')
    body_text_pharm_multi = body_pharm_multi.get_text() if body_pharm_multi is not None else ""
    abstract_text_pharm_multi = abstract_pharm_multi.get_text() if abstract_pharm_multi is not None else ""

    # The pharmaceutical name is compared against lower-cased text.
    term_in_body = multi_term in body_text_pharm_multi.lower()
    term_in_abstract = multi_term in abstract_text_pharm_multi.lower()

    # A scientific name counts as present when at least one of the candidate
    # names occurs in the section (case-sensitive). Empty names are skipped
    # because the empty string is contained in every text.
    sci_in_body = any(
        name and name in body_text_pharm_multi for name in scientific_name_pharm_multi
    )
    sci_in_abstract = any(
        name and name in abstract_text_pharm_multi for name in scientific_name_pharm_multi
    )

    #Checking for pharmaceutical name in the body of article
    print("pharmaceutical name", multi_term if term_in_body else "")
    if term_in_body:
        print("pharmaceutical name is present in body of the article :", article_id)
    else:
        print("pharmaceutical name is not present in body of the article :", article_id)

    #Checking for pharmaceutical name in the abstract of article
    print("pharmaceutical name", multi_term if term_in_abstract else "")
    if term_in_abstract:
        print("pharmaceutical name is present in abstract of the article :", article_id)
    else:
        print("pharmaceutical name is not present in abstract of the article :", article_id)

    #Checking for scientific name in the body of article
    if sci_in_body:
        print("Scientific name is present in body of the article ", article_id)
    else:
        print("Scientific name is not present in body of the article ", article_id)

    #Checking for scientific name in the abstract of article
    if sci_in_abstract:
        print("Scientific name is present in abstract of the article ", article_id)
    else:
        print("Scientific name is not present in abstract of the article ", article_id)

In [None]:
""" 
#code for checking through all pharmaceutical name and retreive the corresponding article id
phram_name = art1_pharm['name']
term_pharm = '' # input the common name of medicinal plant

for j in phram_name:
    
    term_pharm = f"{j}[name]"
    print(term_pharm)
    handle = Entrez.esearch(db ="pmc", term= term_pharm,retmax= "50")# search and retrieve max 50 article id for each pharmaceutical name
    rec_list = Entrez.read(handle)
    handle.close()
    print(rec_list['Count'])
    print(len(rec_list['IdList']))
    print(rec_list['IdList'])
    total_id = rec_list['IdList']
"""

In [None]:
"""
#Code to retreive if search criteria has two or more parameters like genus and species.
pharm_gen= art1_pharm['genus'].tolist() #convert the genus column to list
pharm_spe= art1_pharm['species'].tolist() #convert the species column to list
term = '' # input the common name of medicinal plant
for (i,j) in zip(pharm_gen,pharm_spe):
    
    term = f"{i}[genus] AND {j}[species]"
    print(term)
    handle = Entrez.esearch(db ="pmc", term= term,retmax= "50")
    rec_list = Entrez.read(handle)
    handle.close()
    print(rec_list['Count'])
    print(len(rec_list['IdList']))
    print(rec_list['IdList'])
    total_id = rec_list['IdList']

"""

**Reference:**
Entrez is a molecular biology database system that provides integrated access to nucleotide and protein sequences. The system is produced by the National Center for Biotechnology Information (NCBI).

Entrez Programming Utilities user guide is available at : https://www.ncbi.nlm.nih.gov/books/NBK25501/