# **Extracting articles ID with synonyms from PubMed**

This notebook covers the following task:

Task 4: Find articles still using an old scientific name (a.k.a. synonyms of current scientific name).

Required Installations

In [None]:
!pip install biopython
!pip install beautifulsoup4


import dependent libraries

In [None]:
import pandas as pd
import re
import Bio
import requests
from bs4 import BeautifulSoup
from Bio import Entrez
# Address that Biopython's Entrez module attaches to requests made from this notebook.
# NOTE(review): a personal address is hard-coded here; consider reading it from configuration before sharing the notebook.
Entrez.email =  "radhu.palliyana@gmail.com" # provide your own mail id (after creating an NCBI API key)

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; the CSV paths used below are located under this mount point.
drive.mount('/content/drive') 

Load MPNS version 11 datasets:


1.   mpns_v11_non_sci_names_1.csv containing non scientific names or common or pharmaceutical names of medicinal plants
2.   mpns_v11_plants_1.csv containing scientific names of medicinal plants
3.   mpns_v11_synonyms_1.csv containing synonyms or old scientific names of medicinal plants

In [None]:
# Read the three MPNS v11 tables (non-scientific names, plants, synonyms)
# from Google Drive, in that order.
mpns_non_sci, mpns_plant, mpns_synon = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/content/drive/MyDrive/Dissertation/mpns_v11_non_sci_names_1.csv",
        "/content/drive/MyDrive/Dissertation/mpns_v11_plants_1.csv",
        "/content/drive/MyDrive/Dissertation/mpns_v11_synonyms_1.csv",
    )
)

**Find articles still using an old scientific name (a.k.a. synonyms of current scientific name)**

Merge Table 1 (the MPNS plants dataset) with Table 2 (the MPNS synonyms dataset) by matching `name_id` (Table 1) to `acc_name_id` (Table 2).

As per MPNS Data dictionary acc_name_id field value links each and every row in TABLE 2 SYNONYMS to ONE data row in TABLE 1 PLANTS


**Data Cleaning and feature engineering**

In [None]:
# Give the synonym table's name column a distinct label so it does not clash
# with the plants table's 'full_scientific_name' column after the merge.
synonym_column_names = {'full_scientific_name': 'full_scientific_name_synonym'}
mpns_synon.rename(columns=synonym_column_names, inplace=True)

In [None]:
# Misapplied names are ignored (confirmed with the Kew Gardens team).
misapplied_rows = mpns_synon[mpns_synon['taxon_status'] == 'Misapplied'].index
mpns_synon.drop(index=misapplied_rows, inplace=True)

# Rows whose quality rating is 'L' are removed as well.
low_quality_rows = mpns_synon[mpns_synon['quality_rating'] == 'L'].index
mpns_synon.drop(index=low_quality_rows, inplace=True)

In [None]:
# Columns that are not needed for the plant/synonym mapping; the same set is
# removed from the synonyms table and from the scientific-names (plants) table.
unused_columns = ['genus_hybrid', 'species_hybrid', 'infra_species', 'parent_author', 'primary_author']
mpns_synon.drop(columns=unused_columns, inplace=True)
mpns_plant.drop(columns=unused_columns, inplace=True)

In [None]:
# Left-join every plant row to its synonym rows: plants.name_id == synonyms.acc_name_id.
# Columns present in both tables receive pandas' default _x / _y suffixes.
art1 = mpns_plant.merge(mpns_synon, how="left", left_on="name_id", right_on="acc_name_id")

# Print the merged column names, one per line, to check the header values.
for column_name in art1.columns:
    print(column_name)

In [None]:
# Number of missing values in each column of the merged frame.
print(art1.isna().sum())

In [None]:
# Keep only the merged rows that actually carry a synonym; plants without any
# synonym (null 'full_scientific_name_synonym') are left out.
has_synonym = art1['full_scientific_name_synonym'].notna()
art1_synon = art1[has_synonym]

In [None]:
# Number of missing values per column after removing rows without a synonym.
print(art1_synon.isna().sum())

In [None]:
# Restrict the frame to rows whose synonym-side taxon status is exactly 'Synonym'.
is_synonym_status = art1_synon['taxon_status_y'] == 'Synonym'
art1_synon = art1_synon[is_synonym_status]

In [None]:
# Number of missing values per column after keeping only 'Synonym' rows.
print(art1_synon.isna().sum())

In [None]:
# Remove rows whose accepted scientific name is missing. Author's note: a null
# name later fails with "TypeError: decoding to str: need a bytes-like object,
# float found", and only a small number of rows (1975 in total) are affected.
has_scientific_name = art1_synon['full_scientific_name'].notna()
art1_synon = art1_synon[has_scientific_name]

# Re-check the number of missing values per column.
print(art1_synon.isna().sum())

Checking for multiple occurrences of the same synonym

In [None]:
# Rows whose synonym has already appeared earlier in the frame, i.e. the same
# synonym occurs more than once (the author observed 315 such rows).
repeated_synonym = art1_synon.duplicated(subset='full_scientific_name_synonym')
art1_synon_duplicate = art1_synon[repeated_synonym]
print('Duplicated rows are ', art1_synon_duplicate)

# Single synonym name occurrence search

In [None]:
# Pick one synonym and search PMC for articles that mention it.
# Its accepted scientific name is: Hyperacanthus amoenus (Sims) Bridson
term = 'Gardenia neuberia Eckl. & Zeyh.'
print(term)

# Query PMC, asking for at most 50 article ids for this synonym.
handle = Entrez.esearch(db="pmc", term=term, retmax="50")
rec_list = Entrez.read(handle)
handle.close()

total_id = rec_list['IdList']
# Total number of articles matching the synonym.
print(rec_list['Count'])
# Number of ids actually returned; this is capped at retmax when more articles match.
print(len(total_id))
# The article ids themselves.
print('The article ids corresponding to the given synonym name are :', total_id)

'9033093', '8904431', '8537377', '8479245', '8390284', '8417044', '7428464', '7273665', '7237506', '7165193', '6483958', '4495512', '3016261' are the list of article corresponding to the given synonym name

Of these, only article 8537377 also mentions the accepted scientific name for this synonym, and it does so in a table; the remaining articles do not mention it.

To check an article manually, open its PMC page by placing the article id after "PMC" in the URL, e.g. for id 9340222: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC9340222/

Please note: scientific names that appear only in references, tables or images are not detected.

In [None]:
# Look up the accepted scientific name of the first row whose synonym equals `term`.
rows_for_term = art1_synon['full_scientific_name_synonym'] == term
scientific_name_synonym = art1_synon.loc[rows_for_term, 'full_scientific_name'].iloc[0]
# Trim leading/trailing '.' and space characters from the name.
scientific_name_synonym = scientific_name_synonym.strip('. ')

# Character lengths of the accepted scientific name and of the synonym.
len_sci_synonym = len(scientific_name_synonym)
len_synonym = len(term)

In [None]:
# Show the accepted scientific name that was looked up for the synonym `term`.
print(scientific_name_synonym)

In [None]:

# For every article id returned by the synonym search, fetch the PMC record and
# report whether the synonym (`term`) and its accepted scientific name
# (`scientific_name_synonym`) occur in the article body and in the abstract.
for article_id in total_id:  # named article_id so the built-in `id` is not shadowed
    handle = Entrez.efetch(db='pmc', id=article_id, retmode='xml')
    try:
        total_content = handle.read()
    finally:
        handle.close()  # always release the connection, even if reading fails
    print("Entire text in the article id", article_id)

    soup = BeautifulSoup(total_content, "html.parser")
    body = soup.find('body')          # first 'body' tag, or None when absent
    abstracts = soup.find('abstract')  # first 'abstract' tag, or None when absent
    # find() returns None for a missing tag; use empty text for that section
    # instead of failing on None.get_text().
    body_text = body.get_text() if body is not None else ""
    abstract_text = abstracts.get_text() if abstracts is not None else ""

    # Each entry: (label, name searched for, section text,
    #              message when found, message when missing).
    checks = (
        ("synonym", term, body_text,
         "Synonym is present in body of the article :",
         "Synonym is not present in body of the article :"),
        ("synonym", term, abstract_text,
         "Synonym is present in abstract of the article :",
         "Synonym is not present in  abstract of the article :"),
        ("scientific name", scientific_name_synonym, body_text,
         "Scientific name is present in body of the article :",
         "Scientific name is not present in body of the article :"),
        ("scientific name", scientific_name_synonym, abstract_text,
         "Scientific name is present in abstract of the article :",
         "Scientific name is not present in  abstract of the article :"),
    )
    for label, name, section_text, found_message, missing_message in checks:
        # A plain substring test replaces slicing from a find() result that
        # may be -1 when the name does not occur.
        is_present = name in section_text
        print(label, name if is_present else "")
        print(found_message if is_present else missing_message, article_id)



# Multiple synonym name occurrence search

In [None]:
# Pick a synonym that occurs more than once in the table and search PMC for it.
# Its accepted scientific name is: Vitis vinifera L.
multi_term = 'Vitis vinifera var. minuta Risso'
print(multi_term)

# Query PMC, asking for at most 50 article ids for this synonym.
handle1 = Entrez.esearch(db="pmc", term=multi_term, retmax="50")
rec_list1 = Entrez.read(handle1)
handle1.close()

total_id_multi = rec_list1['IdList']
# Total number of articles matching the synonym.
print(rec_list1['Count'])
# Number of ids actually returned; this is capped at retmax when more articles match.
print(len(total_id_multi))
# The article ids themselves.
print('The article ids corresponding to the given synonym name are :', total_id_multi)

'7070704', '5435909', '4868922' are the list of article corresponding to the given synonym name

Of these, only article 7070704 mentions the accepted scientific name for this synonym, and only in its reference list; the remaining articles do not mention it.

To check an article manually, open its PMC page by placing the article id after "PMC" in the URL, e.g. for id 9340222: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC9340222/

Please note: scientific names that appear only in references, tables or images are not detected.

In [None]:
# Collect, as strings, every accepted scientific name recorded for `multi_term`.
rows_for_multi_term = art1_synon['full_scientific_name_synonym'] == multi_term
scientific_name_synon_list = (
    art1_synon.loc[rows_for_multi_term, 'full_scientific_name'].astype(str).to_list()
)
# Trim leading/trailing '.' and space characters from each name.
scientific_name_synon_multi = [name.strip('. ') for name in scientific_name_synon_list]

# Number of scientific names found (length of the list, not of a single name).
len_sci_synon_multi = len(scientific_name_synon_multi)
# Character length of the synonym being searched.
len_synon_multi = len(multi_term)

In [None]:
# Show the list of accepted scientific names that share the synonym `multi_term`.
print(scientific_name_synon_multi)

Checking for scientific name in both body and abstract of article

In [None]:

# For every article id returned by the search, fetch the PMC record and report
# whether the synonym (`multi_term`) and any of its accepted scientific names
# (`scientific_name_synon_multi`) occur in the article body and in the abstract.
for ids in total_id_multi:
    handle_multi = Entrez.efetch(db='pmc', id=ids, retmode='xml')
    try:
        total_content_synon_multi = handle_multi.read()
    finally:
        handle_multi.close()  # always release the connection, even if reading fails
    print("Entire text in the article id", ids)

    soup_synon_multi = BeautifulSoup(total_content_synon_multi, "html.parser")
    body_synon_multi = soup_synon_multi.find('body')          # None when the tag is absent
    abstract_synon_multi = soup_synon_multi.find('abstract')  # None when the tag is absent
    # find() returns None for a missing tag; use empty text for that section
    # instead of failing on None.get_text().
    body_text_synon_multi = (
        body_synon_multi.get_text() if body_synon_multi is not None else ""
    )
    abstract_text_synon_multi = (
        abstract_synon_multi.get_text() if abstract_synon_multi is not None else ""
    )

    article_sections = (
        ("body", body_text_synon_multi),
        ("abstract", abstract_text_synon_multi),
    )

    # Synonym check: body first, then abstract.
    for section_name, section_text in article_sections:
        synonym_found = multi_term in section_text
        print("synonym", multi_term if synonym_found else "")
        if synonym_found:
            print("Synonym is present in " + section_name + " of the article :", ids)
        else:
            print("Synonym name is not present in " + section_name + " of the article :", ids)

    # Scientific-name check: body first, then abstract. Each section is checked
    # once (not once per child node of the tag), and a name counts as present
    # as soon as any accepted name occurs at least once; comparing the list of
    # regex matches to the list of names failed whenever a name was repeated.
    for section_name, section_text in article_sections:
        scientific_name_found = any(
            name and name in section_text for name in scientific_name_synon_multi
        )
        if scientific_name_found:
            print("Scientific name is present in " + section_name + " of the article ", ids)
        else:
            print("Scientific name is not present in " + section_name + " of the article ", ids)

In [None]:
""" 
#code for checking through all term and retreive the corresponding article id
synon_name = art1_synon['full_scientific_name_synonym']

for j in synon_name:
    
    term_synon = f"{j}[name]"
    print(term_synon)
    handle = Entrez.esearch(db ="pmc", term= term_synon,retmax= "50")# search and retrieve max 50 article id for each pharmaceutical name
    rec_list = Entrez.read(handle)
    handle.close()
    print(rec_list['Count'])
    print(len(rec_list['IdList']))
    print(rec_list['IdList'])
    total_id = rec_list['IdList']
"""

In [None]:
"""
#Code to retreive if search criteria has two or more parameters like genus and species.
synon_gen= art1_synon['genus_y'].tolist() #convert the genus column to list
synon_spe= art1_synon['species_y'].tolist() #convert the species column to list
for (i,j) in zip(synon_gen,synon_spe):
    
    term = f"{i}[genus] AND {j}[species]"
    print(term)
    handle = Entrez.esearch(db ="pmc", term= term,retmax= "50")
    rec_list = Entrez.read(handle)
    handle.close()
    print(rec_list['Count'])
    print(len(rec_list['IdList']))
    print(rec_list['IdList'])
    total_id = rec_list['IdList']

"""

**Reference:**
Entrez is a molecular biology database system that provides integrated access to nucleotide and protein sequence data. The system is produced by the National Center for Biotechnology Information (NCBI).

Entrez Programming Utilities user guide is available at : https://www.ncbi.nlm.nih.gov/books/NBK25501/