# **Extracting article IDs from PubMed with a list of synonyms for statistical analysis**

The code consists of the following task:

Find articles still using an old scientific name (a.k.a. a synonym of the current scientific name) from a list of synonyms.
The code for a single occurrence is reused for this purpose. It is also used for statistical analysis of the number of relevant articles retrieved from PubMed.

Required Installations

In [None]:
!pip install biopython
!pip install beautifulsoup4
!pip install --upgrade openpyxl==3.0.5
!pip install --upgrade pandas==1.1.2
!pip install --upgrade python==3.7
!pip install XlsxWriter


import dependent libraries

In [None]:
import pandas as pd
import re
import Bio
import requests
from bs4 import BeautifulSoup
from Bio import Entrez
Entrez.email =  "radhu.palliyana@gmail.com" # provide mail id after creating api key
from openpyxl import load_workbook
import xlsxwriter

In [None]:
# Mount Google Drive (Colab only) so that files under /content/drive become readable.
from google.colab import drive
drive.mount('/content/drive')

Load MPNS version 11 datasets:


1.   mpns_v11_non_sci_names_1.csv containing non scientific names or common or pharmaceutical names of medicinal plants
2.   mpns_v11_plants_1.csv containing scientific names of medicinal plants
3.   mpns_v11_synonyms_1.csv containing synonyms or old scientific names of medicinal plants

In [None]:
# Read the three MPNS v11 tables from the mounted Drive folder.
# Non-scientific (common / pharmaceutical) names:
mpns_non_sci = pd.read_csv(
    filepath_or_buffer="/content/drive/MyDrive/Dissertation/mpns_v11_non_sci_names_1.csv"
)
# Accepted scientific names of the plants:
mpns_plant = pd.read_csv(
    filepath_or_buffer="/content/drive/MyDrive/Dissertation/mpns_v11_plants_1.csv"
)
# Synonyms (old scientific names):
mpns_synon = pd.read_csv(
    filepath_or_buffer="/content/drive/MyDrive/Dissertation/mpns_v11_synonyms_1.csv"
)

**Find articles still using an old scientific name (a.k.a. synonyms of current scientific name)**

Merging Table 1 (the MPNS plant dataset) and Table 2 (the MPNS synonyms dataset) on the corresponding name_id (Table 1) and acc_name_id (Table 2).

As per MPNS Data dictionary acc_name_id field value links each and every row in TABLE 2 SYNONYMS to ONE data row in TABLE 1 PLANTS


**Data Cleaning and feature engineering**

In [None]:
# Give the synonym table's name column its own label so it stays
# distinguishable from 'full_scientific_name' once the tables are merged.
mpns_synon.rename(
    {'full_scientific_name': 'full_scientific_name_synonym'},
    axis='columns',
    inplace=True,
)

In [None]:
# Misapplied names are ignored (confirmed with the Kew Gardens team).
is_misapplied = mpns_synon['taxon_status'] == 'Misapplied'
mpns_synon.drop(index=mpns_synon[is_misapplied].index, inplace=True)
# Rows whose quality_rating is 'L' are removed as well.
is_rated_l = mpns_synon['quality_rating'] == 'L'
mpns_synon.drop(index=mpns_synon[is_rated_l].index, inplace=True)

In [None]:
# These columns are not required for the synonym -> scientific-name mapping.
unneeded_columns = ['genus_hybrid', 'species_hybrid', 'infra_species', 'parent_author', 'primary_author']
for table in (mpns_synon, mpns_plant):  # synonyms dataset first, then plants dataset
    table.drop(columns=unneeded_columns, inplace=True)

In [None]:
# Left join: every plant row is kept and paired with each synonym row whose
# acc_name_id equals the plant's name_id.
art1 = mpns_plant.merge(mpns_synon, how="left", left_on="name_id", right_on="acc_name_id")
# List the header names of the merged table, one per line.
for column_name in art1.columns:
    print(column_name)

In [None]:
# Missing values per column: total rows minus non-null entries.
missing_per_column = len(art1) - art1.count()
print(missing_per_column)

In [None]:
# From the merged plants + synonyms table keep only the rows that actually
# carry a synonym; plants whose synonym is null are removed.
has_synonym = art1['full_scientific_name_synonym'].notnull()
art1_synon = art1[has_synonym]

In [None]:
# Missing values per column after removing rows without a synonym.
missing_per_column = len(art1_synon) - art1_synon.count()
print(missing_per_column)

In [None]:
# Keep only the rows whose taxon_status_y is exactly 'Synonym'.
is_synonym_status = art1_synon['taxon_status_y'].eq('Synonym')
art1_synon = art1_synon[is_synonym_status]

In [None]:
# Missing values per column after restricting to taxon status 'Synonym'.
missing_per_column = len(art1_synon) - art1_synon.count()
print(missing_per_column)

In [None]:
# Rows without a scientific name are removed: a null name led to
# "TypeError: decoding to str: need a bytes-like object, float found".
# Few rows are affected (1975 null full_scientific_name values in total).
has_scientific_name = art1_synon['full_scientific_name'].notnull()
art1_synon = art1_synon[has_scientific_name]
# Re-check the missing values per column.
print(len(art1_synon) - art1_synon.count())

Checking for multiple occurrences of synonyms

In [None]:
# Rows whose synonym already appeared in an earlier row (the first occurrence
# is not flagged). 315 rows were reported, i.e. some synonyms occur more than once.
repeated_synonym = art1_synon.duplicated(subset='full_scientific_name_synonym')
art1_synon_duplicate = art1_synon[repeated_synonym]
print('Duplicated rows are ', art1_synon_duplicate)

Get the first 100 rows of synonyms

In [None]:
# Work on the first 100 synonym rows only.
art1_synon_head = art1_synon.iloc[:100]

In [None]:
# Select a positional range of rows: from position 1003 up to, but not
# including, position 1100.
first_row, stop_row = 1003, 1100
art1_synon_head = art1_synon.iloc[first_row:stop_row]

Looking for articles having synonyms

In [None]:
import time

# For every synonym in the selected rows:
#   1. search PMC for the synonym (at most 500 article ids are returned),
#   2. fetch each article and check whether the synonym and the accepted
#      scientific name occur in the article <body> and <abstract>,
#   3. append one summary row with the counts to Synonym_result.xlsx.
synonym_name = art1_synon_head['full_scientific_name_synonym']
for j in synonym_name:

    term_synon = f"{j}"
    print(term_synon)
    # Accepted scientific name mapped to this synonym; .iloc[0] takes the
    # first matching row when the synonym occurs in several rows.
    scientific_name_synonym = art1_synon.loc[
        art1_synon['full_scientific_name_synonym'] == term_synon,
        'full_scientific_name',
    ].iloc[0]
    # Remove leading/trailing '.' and ' ' characters before text matching.
    scientific_name_synonym = scientific_name_synonym.strip('. ')

    time.sleep(0.5)  # short pause before each search request
    handle = Entrez.esearch(db="pmc", term=term_synon, retmax="500")
    try:
        rec_list = Entrez.read(handle)
    finally:
        handle.close()  # always release the connection, even if parsing fails

    Total_article = rec_list['Count']   # total hits reported by the search
    total_id = rec_list['IdList']       # ids actually returned (capped by retmax)
    Ret_max_val = len(total_id)

    no_synon_1 = 0  # articles with the synonym in the body
    no_synon_2 = 0  # articles with the synonym in the abstract
    no_sci_1 = 0    # articles with the scientific name in the body
    no_sci_2 = 0    # articles with the scientific name in the abstract
    no_body = 0     # articles skipped because <body> or <abstract> is missing
    Tot_extrac = 0  # articles actually analysed

    for article_id in total_id:
        handle = Entrez.efetch(db='pmc', id=article_id, retmode='xml')
        try:
            total_content = handle.read()
        finally:
            handle.close()  # one connection per article would otherwise stay open

        soup = BeautifulSoup(total_content, "html.parser")
        abstracts = soup.find('abstract')  # None when the tag is absent
        body = soup.find('body')           # None when the tag is absent

        # Only articles having both tags are analysed; the others are counted
        # and skipped without touching any of the match counters.
        if body is None or abstracts is None:
            no_body += 1
            continue

        body_text = body.get_text()
        abstract_text = abstracts.get_text()

        # Exact, case-sensitive substring checks.
        if term_synon in body_text:
            no_synon_1 += 1
        else:
            print("Synonym is not present in body of the article :", article_id)

        if term_synon in abstract_text:
            no_synon_2 += 1
        else:
            print("Synonym is not present in  abstract of the article :", article_id)

        if scientific_name_synonym in body_text:
            no_sci_1 += 1
        else:
            print("Scientific name is not present in body of the article :", article_id)

        if scientific_name_synonym in abstract_text:
            no_sci_2 += 1
        else:
            print("Scientific name is not present in  abstract of the article :", article_id)

        Tot_extrac += 1

    df_ex_synon = pd.DataFrame({
        'synonym_name': [term_synon],
        'Scientific_name': [scientific_name_synonym],
        'synonym_body': [no_synon_1],
        'synonym_abstract': [no_synon_2],
        'sci_body': [no_sci_1],
        'sci_abstract': [no_sci_2],
        "Tot_No_PubMed_article": [Total_article],
        "Retrive_max_value": [Ret_max_val],
        "No_article_No_body": [no_body],
        "Actual_article_retreived": [Tot_extrac],
    })

    # Append the row below the existing content of the workbook.
    # Synonym_result.xlsx must already exist, because it is loaded here.
    # NOTE(review): the row offset comes from pd.read_excel's default sheet
    # while the row is written to "Synonym_Output" - confirm they are the same sheet.
    writer = pd.ExcelWriter('Synonym_result.xlsx', engine='openpyxl')
    writer.book = load_workbook('Synonym_result.xlsx')
    writer.sheets = dict((ws.title, ws) for ws in writer.book.worksheets)
    reader = pd.read_excel(r'Synonym_result.xlsx')
    df_ex_synon.to_excel(writer, index=True, index_label="No.", header=False, sheet_name="Synonym_Output", startrow=len(reader) + 1)
    writer.save()
    writer.close()



# Ignore entire code from this section onwards
Contains trial versions of the code

When trying to extract 50 articles per synonym from the dataset, the article id 8739336 didn't contain the body tag. It was retrieved as the 2nd article for one of the terms. The article which threw the error:
https://www.ncbi.nlm.nih.gov/pmc/articles/PMC8739336/

This is due to the limitations in the code as it considers only articles having a body and abstract tag.

Hence the single-occurrence synonym search is implemented for each of the following selected synonyms separately, to obtain statistics on the number of articles having the correct scientific name:
Gardenia neuberia Eckl. & Zeyh., Vitis vinifera var. minuta Risso,Abelmoschus officinalis (DC.) Endl., Linnaea macrotera Graebn. & Buchw., Abelia myrtilloides Rehder, Abelia parvifolia Hemsl., Linnaea parvifolia (Hemsl.) Graebn., Abelia engleriana (Graebn.) Rehder, Abelia deutziifolia (H.Lév.) H.Lév., Linnaea chinensis (R.Br.) A.Braun & Vatke, Feuilleea jupunba (Willd.) Kuntze, Zornia setifera Mohlenbr.

In [None]:
# Synonym to look up. Run this cell once per term of the list:
# Gardenia neuberia Eckl. & Zeyh., Vitis vinifera var. minuta Risso,
# Abelmoschus officinalis (DC.) Endl., Linnaea macrotera Graebn. & Buchw.,
# Abelia myrtilloides Rehder, Abelia parvifolia Hemsl.,
# Linnaea parvifolia (Hemsl.) Graebn., Abelia engleriana (Graebn.) Rehder,
# Abelia deutziifolia (H.Lév.) H.Lév., Linnaea chinensis (R.Br.) A.Braun & Vatke,
# Feuilleea jupunba (Willd.) Kuntze, Zornia setifera Mohlenbr.
term = 'Gardenia neuberia Eckl. & Zeyh.'
print(term)

# retmax is adjusted per synonym: if a later step fails on some article, that
# id is left out and the number of articles processed up to that point is used
# as the retmax value for a successful re-run.
search_handle = Entrez.esearch(db="pmc", term=term, retmax="50")
rec_list = Entrez.read(search_handle)
search_handle.close()

total_id = rec_list['IdList']
# Total number of article ids that contain the given synonym name.
print(rec_list['Count'])
# Number of ids actually retrieved; never more than the retmax value.
print(len(total_id))
# The retrieved article ids themselves.
print('The article ids corresponding to the given synonym name are :', total_id)

In [None]:
# Full scientific name recorded for the synonym in `term` (first matching row).
matches_term = art1_synon['full_scientific_name_synonym'] == term
scientific_name_synonym = art1_synon.loc[matches_term, 'full_scientific_name'].iloc[0]
# Trim '.' and ' ' characters from both ends of the name.
scientific_name_synonym = scientific_name_synonym.strip('. ')
# Lengths used later to slice the matched text out of an article.
len_synonym = len(term)
len_sci_synonym = len(scientific_name_synonym)

In [None]:
# Show the scientific name currently held in scientific_name_synonym.
print(scientific_name_synonym)

In [None]:
# For each article id retrieved for `term`, report whether the synonym and the
# scientific name occur in the article <body> and in its <abstract>.
for article_id in total_id:
    handle = Entrez.efetch(db='pmc', id=article_id, retmode='xml')
    try:
        total_content = handle.read()
    finally:
        handle.close()  # release the connection even when the read fails
    print("Entire text in the article id", article_id)

    soup = BeautifulSoup(total_content, "html.parser")
    abstracts = soup.find('abstract')  # None when the tag is absent
    body = soup.find('body')           # None when the tag is absent
    if body is None or abstracts is None:
        # Some articles (e.g. id 8739336) have no <body>; skip them instead of
        # failing with AttributeError on None.get_text().
        print("Article has no body or abstract tag, skipped :", article_id)
        continue

    body_text = body.get_text()
    abstract_text = abstracts.get_text()

    # (label, name searched, text searched, message if found, message if not)
    checks = (
        ("synonym", term, body_text,
         "Synonym is present in body of the article :",
         "Synonym is not present in body of the article :"),
        ("synonym", term, abstract_text,
         "Synonym is present in abstract of the article :",
         "Synonym is not present in  abstract of the article :"),
        ("scientific name", scientific_name_synonym, body_text,
         "Scientific name is present in body of the article :",
         "Scientific name is not present in body of the article :"),
        ("scientific name", scientific_name_synonym, abstract_text,
         "Scientific name is present in abstract of the article :",
         "Scientific name is not present in  abstract of the article :"),
    )
    for label, name, text, found_message, missing_message in checks:
        # Slice the text at the first match position; the slice equals the
        # searched name only when the name really occurs in the text.
        start = text.find(name)
        extracted = text[start:start + len(name)]
        print(label, extracted)
        if extracted == name:
            print(found_message, article_id)
        else:
            print(missing_message, article_id)


In [None]:
""" 
#code for checking through all term and retreive the corresponding article id
synon_name = art1_synon['full_scientific_name_synonym']

for j in synon_name:
    
    term_synon = f"{j}[name]"
    print(term_synon)
    handle = Entrez.esearch(db ="pmc", term= term_synon,retmax= "50")# search and retrieve max 50 article id for each pharmaceutical name
    rec_list = Entrez.read(handle)
    handle.close()
    print(rec_list['Count'])
    print(len(rec_list['IdList']))
    print(rec_list['IdList'])
    total_id = rec_list['IdList']
"""

In [None]:
"""
#Code to retreive if search criteria has two or more parameters like genus and species.
synon_gen= art1_synon['genus_y'].tolist() #convert the genus column to list
synon_spe= art1_synon['species_y'].tolist() #convert the species column to list
for (i,j) in zip(synon_gen,synon_spe):
    
    term = f"{i}[genus] AND {j}[species]"
    print(term)
    handle = Entrez.esearch(db ="pmc", term= term,retmax= "50")
    rec_list = Entrez.read(handle)
    handle.close()
    print(rec_list['Count'])
    print(len(rec_list['IdList']))
    print(rec_list['IdList'])
    total_id = rec_list['IdList']

"""

**Reference:**
Entrez is a molecular biology database system that provides integrated access to nucleotide and protein sequence data. The system is produced by the National Center for Biotechnology Information (NCBI).

Entrez Programming Utilities user guide is available at : https://www.ncbi.nlm.nih.gov/books/NBK25501/