In [5]:
import csv
import re
import urllib
import urllib.parse
import urllib.request
import xml.etree.ElementTree as ET
from datetime import datetime
from time import sleep

# Build a PubMed query that requires both terms and restricts results to
# publications dated from ten years ago through the current year.
current_year = datetime.now().year
start_year = current_year - 10

query = (
    '"social determinants of health"[All Fields] AND "liver"[All Fields] '
    'AND ("{start}"[Date - Publication] : "{end}"[Date - Publication])'
).format(start=start_year, end=current_year)

# Common settings between esearch and efetch.
# NCBI documents the E-utilities base URL with the https scheme.
base_url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/'
db = 'db=pubmed'

# esearch settings
search_eutil = 'esearch.fcgi?'
search_term = '&term=' + urllib.parse.quote(query)
# usehistory=y asks the server to keep the result set so that efetch can
# page through it using the returned WebEnv / QueryKey values.
search_usehistory = '&usehistory=y'
# NOTE(review): the response is parsed as XML below; esearch appears to
# select JSON output via "retmode", not "rettype", so this parameter likely
# has no effect -- confirm against the E-utilities docs before removing it.
search_rettype = '&rettype=json'

# Call the esearch command for the query and read the web result
search_url = base_url + search_eutil + db + search_term + search_usehistory + search_rettype
print("This is the esearch command:\n" + search_url + "\n")
# The context manager closes the HTTP response even if reading fails.
with urllib.request.urlopen(search_url) as response:
    search_data = response.read().decode('utf-8')

# Extract the total abstract count from the top-level <Count> element.
# Parsing the XML (rather than regex-matching it) ignores any nested <Count>
# elements and gives a clear error when the element is missing.
search_root = ET.fromstring(search_data)
count_text = search_root.findtext('Count')
if count_text is None:
    raise RuntimeError(
        'esearch response did not contain a <Count> element:\n' + search_data[:500]
    )
total_abstract_count = int(count_text)

# efetch settings
fetch_eutil = 'efetch.fcgi?'
retmax = 1000  # Max records per request
retstart = 0   # Start point for fetching
fetch_retmode = "&retmode=xml"  # XML so the records can be parsed structurally
fetch_rettype = "&rettype=abstract"

# Obtain WebEnv and QueryKey settings from the esearch results.
# These identify the server-side result set created by usehistory=y.
# They are read from the parsed XML instead of regexes so that a response
# lacking them produces an explicit error rather than an IndexError.
history_root = ET.fromstring(search_data)
web_env = history_root.findtext('WebEnv')
query_key = history_root.findtext('QueryKey')
if not web_env or not query_key:
    raise RuntimeError(
        'esearch response did not contain WebEnv/QueryKey:\n' + search_data[:500]
    )
fetch_webenv = "&WebEnv=" + web_env.strip()
fetch_querykey = "&query_key=" + query_key.strip()

def _abstract_text(article):
    """Return the complete abstract of one <PubmedArticle> element.

    All <AbstractText> children of the article's abstract are joined with a
    single space, so structured abstracts (several sections) are kept whole.
    Text nested inside inline markup within a section is included as well.
    Returns an empty string when the article has no abstract text.
    """
    # Prefer the main <Abstract>; fall back to <OtherAbstract> so an article
    # that only carries the latter still yields text.
    container = article.find('.//Abstract')
    if container is None:
        container = article.find('.//OtherAbstract')
    if container is None:
        return ''

    sections = []
    for node in container.findall('AbstractText'):
        # itertext() also collects text that follows child elements, which
        # the plain .text attribute would silently drop.
        text = ''.join(node.itertext()).strip()
        if text:
            sections.append(text)
    return ' '.join(sections)


# Call efetch commands using a loop until all abstracts are obtained.
all_abstracts = []
loop_counter = 1

# Looping on retstart < total skips the request entirely when the search
# matched nothing and avoids an extra, empty request when the total is an
# exact multiple of retmax.
while retstart < total_abstract_count:
    print("This is efetch run number " + str(loop_counter))
    loop_counter += 1
    fetch_retstart = "&retstart=" + str(retstart)
    fetch_retmax = "&retmax=" + str(retmax)

    # Create the efetch URL
    fetch_url = (
        base_url + fetch_eutil + db + fetch_querykey + fetch_webenv
        + fetch_retstart + fetch_retmax + fetch_retmode + fetch_rettype
    )
    print(fetch_url)

    # Open the efetch URL; the context manager closes the response.
    with urllib.request.urlopen(fetch_url) as response:
        fetch_data = response.read().decode('utf-8')

    # Print the start of the raw XML response for debugging
    print("Raw XML Response:")
    print(fetch_data[:1000])

    # Parse the XML response and keep the non-empty abstracts of this batch
    root = ET.fromstring(fetch_data)
    abstracts = [
        text
        for text in map(_abstract_text, root.findall('.//PubmedArticle'))
        if text
    ]

    # Append to the list all_abstracts
    all_abstracts += abstracts
    print("A total of " + str(len(all_abstracts)) + " abstracts have been downloaded.\n")

    # Update retstart to download the next chunk of abstracts
    retstart = retstart + retmax

    # Wait 2 seconds between requests to avoid being blocked by PubMed;
    # no wait is needed after the final batch.
    if retstart < total_abstract_count:
        sleep(2)

# Write the collected abstracts to a one-column CSV file: a header row
# followed by one row per abstract.
output_path = '/content/drive/MyDrive/Raajitha_NLP_CourseCapstone/pubmed_abstracts.csv'
with open(output_path, 'w', newline='', encoding='utf-8') as csv_file:
    csv_out = csv.writer(csv_file)
    csv_out.writerow(['Abstract'])  # Column header
    # Each abstract becomes a single-cell row.
    csv_out.writerows([text] for text in all_abstracts)

print("Abstract extraction completed and saved to 'pubmed_abstracts.csv'.")


This is the esearch command:
http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&term=%22social%20determinants%20of%20health%22%5BAll%20Fields%5D%20AND%20%22liver%22%5BAll%20Fields%5D%20AND%20%28%222014%22%5BDate%20-%20Publication%5D%20%3A%20%222024%22%5BDate%20-%20Publication%5D%29&usehistory=y&rettype=json

This is efetch run number 1
http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&query_key=1&WebEnv=MCID_675757a5cc068b1ffb0bfd73&retstart=0&retmax=1000&retmode=xml&rettype=abstract
Raw XML Response:
<?xml version="1.0" ?>
<!DOCTYPE PubmedArticleSet PUBLIC "-//NLM//DTD PubMedArticle, 1st January 2024//EN" "https://dtd.nlm.nih.gov/ncbi/pubmed/out/pubmed_240101.dtd">
<PubmedArticleSet>
<PubmedArticle><MedlineCitation Status="MEDLINE" Owner="NLM" IndexingMethod="Automated"><PMID Version="1">39630170</PMID><DateCompleted><Year>2024</Year><Month>12</Month><Day>04</Day></DateCompleted><DateRevised><Year>2024</Year><Month>12</Month><Day>04</Day></DateRevised