
# PubMed Research Paper Fetcher

This notebook demonstrates how to interact with the PubMed API to:
- Fetch research paper IDs based on a search query.
- Retrieve details for each paper: title, publication year, authors with company (non-academic) affiliations, and the corresponding author's email when it is listed.
- Save the results in a CSV file.

The project uses Python libraries such as `requests` for API calls, `pandas` for data handling, and `xml.etree.ElementTree` for XML parsing.


In [12]:

import requests
import pandas as pd
import xml.etree.ElementTree as ET
from typing import List, Dict, Optional


In [13]:

# Base URLs for PubMed API
PUBMED_SEARCH_URL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"

def fetch_paper_ids(query: str, max_results: int = 10, timeout: float = 30.0) -> List[str]:
    """
    Fetch paper IDs from PubMed based on a query.

    Any failure (network error, HTTP error status, non-JSON body, or a JSON
    body without an ID list) is reported with ``print`` and results in an
    empty list rather than an exception.

    Args:
        query (str): The search query.
        max_results (int): Maximum number of results to fetch.
        timeout (float): Seconds to wait for the server before giving up.

    Returns:
        List[str]: A list of paper IDs (empty on error or when nothing matches).
    """
    # An empty search term cannot match anything; skip the network round trip.
    if not query or not query.strip():
        print("No search query provided.")
        return []

    params = {
        "db": "pubmed",
        "term": query,
        "retmax": max_results,
        "retmode": "json"
    }

    try:
        # Without a timeout, requests waits forever on an unresponsive server.
        response = requests.get(PUBMED_SEARCH_URL, params=params, timeout=timeout)
        response.raise_for_status()
        payload = response.json()
    except (requests.exceptions.RequestException, ValueError) as e:
        # ValueError covers JSON decoding failures on requests versions whose
        # decode error is not a RequestException subclass.
        print(f"Error fetching paper IDs: {e}")
        return []

    # The service may answer with an error payload that has no "idlist";
    # walk the structure defensively instead of indexing straight into it.
    search_result = payload.get("esearchresult") if isinstance(payload, dict) else None
    id_list = search_result.get("idlist") if isinstance(search_result, dict) else None
    if not isinstance(id_list, list):
        print("Error fetching paper IDs: unexpected response format from PubMed.")
        return []

    return [str(paper_id) for paper_id in id_list]


In [14]:

PUBMED_FETCH_URL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"

# Lower-case substrings that mark an affiliation as belonging to a company.
# NOTE(review): this is a plain substring heuristic, so academic units whose
# name contains one of these words (e.g. a university "Laboratory of
# Pharmaceutical Biotechnology") are also flagged as non-academic.
_COMPANY_KEYWORDS = ("pharmaceutical", "biotech", "company", "corporation")


def _element_text(element: Optional[ET.Element], default: str = "N/A") -> str:
    """Return all text inside ``element`` (nested markup included), or ``default``."""
    if element is None:
        return default
    # itertext() also collects text inside child tags such as <i> or <sup>,
    # which a plain ``.text`` would truncate or report as None.
    text = "".join(element.itertext()).strip()
    return text or default


def _extract_corresponding_email(affiliation_text: str) -> Optional[str]:
    """Return the email from a corresponding-author affiliation, or None."""
    lowered = affiliation_text.lower()
    if "corresponding" not in lowered or "email" not in lowered:
        return None

    # Prefer a token that actually looks like an address; this keeps the
    # original casing and drops trailing punctuation or prose.
    for token in affiliation_text.split():
        if "@" in token:
            candidate = token.split(":")[-1].strip(".,;:()<>[]")
            if "@" in candidate:
                return candidate

    # Fall back to whatever follows a literal "email:" marker, if there is one.
    _, marker, tail = lowered.partition("email:")
    tail = tail.strip()
    return tail if marker and tail else None


def _parse_article(article: ET.Element) -> Dict[str, Optional[str]]:
    """Build the result dictionary for a single <PubmedArticle> element."""
    pubmed_id = _element_text(article.find(".//PMID"))
    title = _element_text(article.find(".//ArticleTitle"))

    # Use MedlineDate as the date when the PubDate element has no Year child.
    date_element = article.find(".//PubDate/Year")
    if date_element is None:
        date_element = article.find(".//PubDate/MedlineDate")
    pub_date = _element_text(date_element)

    non_academic_authors = []
    company_affiliations = []
    corresponding_email = "N/A"

    for author in article.findall(".//Author"):
        has_company_affiliation = False

        # An author may list several affiliations; inspect every one of them.
        for affiliation in author.findall(".//AffiliationInfo/Affiliation"):
            affiliation_text = "".join(affiliation.itertext()).strip()
            if not affiliation_text:
                # Empty <Affiliation/> elements carry no information.
                continue

            lowered = affiliation_text.lower()
            if any(keyword in lowered for keyword in _COMPANY_KEYWORDS):
                has_company_affiliation = True
                company_affiliations.append(affiliation_text)

            email = _extract_corresponding_email(affiliation_text)
            if email:
                corresponding_email = email

        if has_company_affiliation:
            # Record each author once, and only when a usable last name exists.
            last_name = author.findtext("LastName")
            if last_name and last_name.strip():
                non_academic_authors.append(last_name.strip())

    return {
        "PubmedID": pubmed_id,
        "Title": title,
        "Publication Date": pub_date,
        "Non-academic Author(s)": ", ".join(non_academic_authors) if non_academic_authors else "N/A",
        "Company Affiliation(s)": ", ".join(company_affiliations) if company_affiliations else "N/A",
        "Corresponding Author Email": corresponding_email
    }


def fetch_paper_details(paper_ids: List[str], timeout: float = 30.0) -> List[Dict[str, Optional[str]]]:
    """
    Fetch detailed paper information from PubMed.

    Each returned dictionary has the keys "PubmedID", "Title",
    "Publication Date", "Non-academic Author(s)", "Company Affiliation(s)"
    and "Corresponding Author Email"; fields that cannot be determined are
    set to "N/A". Network, HTTP and XML parsing failures are reported with
    ``print`` and result in an empty list rather than an exception.

    Args:
        paper_ids (List[str]): A list of PubMed paper IDs.
        timeout (float): Seconds to wait for the server before giving up.

    Returns:
        List[Dict[str, Optional[str]]]: A list of dictionaries containing paper details.
    """
    if not paper_ids:
        print("No paper IDs provided.")
        return []

    params = {
        "db": "pubmed",
        # str() lets callers pass numeric IDs as well as strings.
        "id": ",".join(str(paper_id) for paper_id in paper_ids),
        "retmode": "xml"
    }

    try:
        # Without a timeout, requests waits forever on an unresponsive server.
        response = requests.get(PUBMED_FETCH_URL, params=params, timeout=timeout)
        response.raise_for_status()
        # Parse the raw bytes so the XML declaration decides the encoding
        # instead of the charset requests guesses from the HTTP headers.
        root = ET.fromstring(response.content)
    except requests.exceptions.RequestException as e:
        print(f"Error fetching paper details: {e}")
        return []
    except ET.ParseError as e:
        print(f"Error parsing paper details: {e}")
        return []

    return [_parse_article(article) for article in root.findall(".//PubmedArticle")]


In [15]:

def save_to_csv(papers: List[Dict[str, Optional[str]]], filename: str) -> None:
    """
    Write paper records to a CSV file, one row per paper.

    When ``papers`` is empty no file is written and a notice is printed
    instead.

    Args:
        papers (List[Dict[str, Optional[str]]]): Paper details to write.
        filename (str): Destination path of the CSV file.

    Returns:
        None
    """
    # Guard clause: nothing to write, so leave the filesystem untouched.
    if not papers:
        print("No papers to save.")
        return

    # index=False keeps the DataFrame's row index out of the file.
    pd.DataFrame(papers).to_csv(filename, index=False)
    print(f"Results saved to {filename}")


In [16]:

# Example: run the full pipeline (search -> fetch details -> save) for one query.
# This cell performs live network requests, so its output changes over time.
query = "diabetes"
print(f"Fetching papers for query: '{query}'")

# Step 1: search PubMed; max_results is left at its default of 10.
paper_ids = fetch_paper_ids(query)
print("Paper IDs:", paper_ids)

# Step 2: fetch details for the IDs found above (an empty ID list yields []).
papers = fetch_paper_details(paper_ids)
print("Papers:", papers)

# Step 3: write the records to a CSV file; nothing is written if `papers` is empty.
save_to_csv(papers, "diabetes.csv")


Fetching papers for query: 'diabetes'
Paper IDs: ['39810263', '39810247', '39810242', '39810231', '39810230', '39810164', '39810122', '39810115', '39810091', '39810079']
Papers: [{'PubmedID': '39810263', 'Title': 'Discovery of robust and highly specific microbiome signatures of non-alcoholic fatty liver disease.', 'Publication Date': '2025', 'Non-academic Author(s)': 'Xu', 'Company Affiliation(s)': 'The State Key Laboratory of Pharmaceutical Biotechnology, The University of Hong Kong, Hong Kong SAR, China.', 'Corresponding Author Email': 'N/A'}, {'PubmedID': '39810247', 'Title': 'The status of care for youth with type 1 diabetes within and coming from humanitarian crises settings: a narrative review.', 'Publication Date': '2025', 'Non-academic Author(s)': 'N/A', 'Company Affiliation(s)': 'N/A', 'Corresponding Author Email': 'N/A'}, {'PubmedID': '39810242', 'Title': 'The rise of weekly insulins: addressing the challenges of type 2 diabetes care in Brazil.', 'Publication Date': '2025', '