In [31]:
# Installing required libraries
# requests -> for API calls
# xmltodict -> to convert XML response into Python dictionary

!pip install requests xmltodict



In [32]:
# Importing necessary libraries

import requests       # to make HTTP API calls
import xmltodict      # to parse XML data
import json           # to save output in JSON format

In [25]:
# Step 1: Using ESearch API to get 50 PubMed IDs (PMIDs)
# Keyword: oncology biomarkers

search_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"

# Parameters required for the API
params = {
    "db": "pubmed",                 # database name
    "term": "oncology biomarkers",  # search keyword
    "retmax": 50,                   # number of articles to fetch
    "retmode": "json"               # response format
}

# Sending GET request to PubMed API
response = requests.get(search_url, params=params)

# Converting response to JSON
search_data = response.json()

# Extracting list of PMIDs
pmid_list = search_data["esearchresult"]["idlist"]

print("Total PMIDs fetched:", len(pmid_list))
print("Sample PMIDs:", pmid_list[:5])

Total PMIDs fetched: 50
Sample PMIDs: ['41758068', '41757682', '41757268', '41756885', '41756321']


In [33]:
# Step 2: Using EFetch API to get full article details
# We pass all PMIDs separated by commas

fetch_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"

fetch_params = {
    "db": "pubmed",
    "id": ",".join(pmid_list),  # joining PMIDs
    "retmode": "xml"            # data comes in XML format
}

# Sending request to fetch article details
fetch_response = requests.get(fetch_url, params=fetch_params)

# Parsing XML into Python dictionary
articles_data = xmltodict.parse(fetch_response.text)

print("Data fetched successfully.")

Data fetched successfully.


In [34]:
# Step 3: Extracting required fields from parsed XML
# Fields required:
# PMID, Title, Authors, Abstract, Publication Date, Journal

articles = articles_data["PubmedArticleSet"]["PubmedArticle"]

formatted_articles = []  # This will store cleaned article data

for article in articles:
    try:
        medline = article["MedlineCitation"]
        article_info = medline["Article"]

        # Extract PMID
        pmid = medline["PMID"]
        if isinstance(pmid, dict):
            pmid = pmid["#text"]

        # Extract Title
        title = article_info.get("ArticleTitle", "")

        # Extract Abstract
        abstract_text = ""
        if "Abstract" in article_info:
            abstract = article_info["Abstract"].get("AbstractText", "")

            # Sometimes abstract is list
            if isinstance(abstract, list):
                abstract_text = " ".join(abstract)
            else:
                abstract_text = abstract

        # Extract Journal Name
        journal = article_info["Journal"]["Title"]

        # Extract Publication Year
        pub_date = article_info["Journal"]["JournalIssue"]["PubDate"]
        publication_year = pub_date.get("Year", "")

        # Extract Authors
        authors_list = []
        if "AuthorList" in article_info:
            authors = article_info["AuthorList"]["Author"]

            if not isinstance(authors, list):
                authors = [authors]

            for author in authors:
                if "LastName" in author:
                    authors_list.append(author["LastName"])

        # Creating structured dictionary
        formatted_articles.append({
            "pmid": pmid,
            "title": title,
            "authors": authors_list,
            "abstract": abstract_text,
            "publication_year": publication_year,
            "journal": journal
        })

    except Exception as e:
        # If any article has missing fields, skip it
        continue

print("Total formatted articles:", len(formatted_articles))

Total formatted articles: 44


In [35]:
# Step 4: Saving extracted data into JSON file

with open("pubmed_articles.json", "w", encoding="utf-8") as file:
    json.dump(formatted_articles, file, indent=4)

print("pubmed_articles.json file saved successfully!")

pubmed_articles.json file saved successfully!


In [36]:
# Print first article to verify output

formatted_articles[23]

{'pmid': '41752040',
 'title': 'Advances in Breast Cancer Research: Immunological, Pathological, and Pharmacological Perspectives for Improving Patient Outcomes.',
 'authors': ['Crocamo', 'Dos Santos', 'Abdelhay'],
 'abstract': 'Breast cancer remains the most frequently diagnosed malignancy worldwide. Over the past decade, advances in molecular biology have expanded beyond tumor-intrinsic features to encompass the immune microenvironment and patient-specific pharmacogenomic profiles, profoundly reshaping diagnostic, prognostic, and therapeutic paradigms in breast oncology. Owing to rapid technological progress and an expanding therapeutic armamentarium, periodic synthesis of both foundational principles and emerging evidence remains essential for the critical interpretation of ongoing advances. This review provides a comprehensive overview of the contemporary global landscape of breast cancer, integrating developments in diagnosis, risk stratification, and therapeutic innovation. We ex