# PubMed Research Paper Fetcher

This notebook is designed to interact with the PubMed API to perform the following tasks:
1. Fetch unique identifiers (PubMed IDs) for research papers based on a query.
2. Retrieve details for each paper: title, publication year, authors with company (non-academic) affiliations, those affiliations, and a corresponding-author email when one can be found.
3. Save the fetched data into a CSV file for easy sharing and analysis.

## Workflow Overview:
- **Fetch Paper IDs**: Use the `fetch_paper_ids` function to query the PubMed API and get IDs.
- **Fetch Paper Details**: Use the `fetch_paper_details` function to get detailed metadata for these papers.
- **Save to CSV**: Store the results in a tabular format using the `save_to_csv` function.

## Libraries Used:
- `requests`: For making HTTP requests to interact with the PubMed API.
- `pandas`: For handling data and saving it as a CSV file.
- `xml.etree.ElementTree`: For parsing XML responses returned by the PubMed API.



In [1]:

import re
import xml.etree.ElementTree as ET
from typing import List, Dict, Optional

import pandas as pd
import requests


In [2]:

# Base URLs for PubMed API
PUBMED_SEARCH_URL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"

def fetch_paper_ids(query: str, max_results: int = 10, timeout: float = 30.0) -> List[str]:
    """
    Fetch paper IDs from PubMed based on a query.

    Any failure (network error, HTTP error status, malformed or unexpected
    response body, blank query) is reported with ``print`` and results in an
    empty list, so callers never have to handle an exception.

    Args:
        query (str): The search query.
        max_results (int): Maximum number of results to fetch.
        timeout (float): Seconds to wait for the server before giving up.

    Returns:
        List[str]: A list of paper IDs (empty on any failure).
    """
    # A blank term cannot produce an ID list; skip the round trip entirely.
    if not query or not query.strip():
        print("Error fetching paper IDs: empty query.")
        return []

    # Parameters for the API request
    params = {
        "db": "pubmed",         # Database to search in
        "term": query,          # Query term or keywords
        "retmax": max_results,  # Limit the number of results
        "retmode": "json"       # Response format
    }

    try:
        # Without a timeout a stalled connection would block the kernel forever.
        response = requests.get(PUBMED_SEARCH_URL, params=params, timeout=timeout)
        response.raise_for_status()  # Raise an exception for HTTP errors

        # Extract and return the list of paper IDs
        payload = response.json()
        return list(payload["esearchresult"]["idlist"])
    except requests.exceptions.RequestException as e:
        print(f"Error fetching paper IDs: {e}")
    except (ValueError, KeyError, TypeError) as e:
        # ValueError: body was not JSON; KeyError/TypeError: JSON without the
        # expected esearchresult -> idlist structure (e.g. an error payload).
        print(f"Error fetching paper IDs: unexpected response format ({e!r})")
    return []


In [3]:

PUBMED_FETCH_URL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"

# Lower-case substrings that mark an affiliation as belonging to a company.
_COMPANY_KEYWORDS = ("pharmaceutical", "biotech", "company", "corporation")

# Pragmatic e-mail matcher. The domain must end in a label, so the full stop
# that terminates an affiliation sentence is never captured.
_EMAIL_PATTERN = re.compile(r"[\w.+-]+@[\w-]+(?:\.[\w-]+)+")


def _element_text(element: Optional[ET.Element], default: str = "N/A") -> str:
    """Return all text inside ``element`` (nested markup included), or ``default``."""
    if element is None:
        return default
    # itertext() also yields text inside children such as <i> or <sub>;
    # .text alone stops at the first child element.
    text = "".join(element.itertext()).strip()
    return text or default


def _parse_article(article: ET.Element) -> Dict[str, Optional[str]]:
    """Flatten one <PubmedArticle> element into the record written to the CSV."""
    # PubDate holds either a Year or a free-form MedlineDate.
    pub_date = _element_text(article.find(".//PubDate/Year"), default="")
    if not pub_date:
        pub_date = _element_text(article.find(".//PubDate/MedlineDate"))

    non_academic_authors = []
    company_affiliations = []
    corresponding_email = "N/A"

    for author in article.findall(".//Author"):
        has_company_affiliation = False

        # An author may carry several AffiliationInfo entries; check all of them.
        for affiliation in author.findall(".//AffiliationInfo/Affiliation"):
            affiliation_text = "".join(affiliation.itertext()).strip()
            if not affiliation_text:
                continue  # empty <Affiliation/> element

            lowered = affiliation_text.lower()
            if any(keyword in lowered for keyword in _COMPANY_KEYWORDS):
                has_company_affiliation = True
                company_affiliations.append(affiliation_text)

            # Heuristic: the XML has no corresponding-author flag, so the first
            # address found in any affiliation is reported (original casing kept).
            if corresponding_email == "N/A":
                match = _EMAIL_PATTERN.search(affiliation_text)
                if match:
                    corresponding_email = match.group(0)

        if has_company_affiliation:
            # Group authors have a CollectiveName instead of a LastName.
            name = (_element_text(author.find("LastName"), default="")
                    or _element_text(author.find("CollectiveName"), default=""))
            if name:
                non_academic_authors.append(name)

    return {
        "PubmedID": _element_text(article.find(".//PMID")),
        "Title": _element_text(article.find(".//ArticleTitle")),
        "Publication Date": pub_date,
        "Non-academic Author(s)": ", ".join(non_academic_authors) if non_academic_authors else "N/A",
        "Company Affiliation(s)": ", ".join(company_affiliations) if company_affiliations else "N/A",
        "Corresponding Author Email": corresponding_email
    }


def fetch_paper_details(paper_ids: List[str], timeout: float = 30.0) -> List[Dict[str, Optional[str]]]:
    """
    Fetch detailed paper information from PubMed.

    Network errors, HTTP error statuses and unparsable XML are reported with
    ``print`` and result in an empty list. Fields missing from an article are
    filled with "N/A".

    Args:
        paper_ids (List[str]): A list of PubMed paper IDs.
        timeout (float): Seconds to wait for the server before giving up.

    Returns:
        List[Dict[str, Optional[str]]]: One dictionary per article with the keys
        "PubmedID", "Title", "Publication Date", "Non-academic Author(s)",
        "Company Affiliation(s)" and "Corresponding Author Email".
    """
    if not paper_ids:
        print("No paper IDs provided.")
        return []

    params = {
        "db": "pubmed",
        "id": ",".join(str(paper_id) for paper_id in paper_ids),
        "retmode": "xml"
    }

    try:
        response = requests.get(PUBMED_FETCH_URL, params=params, timeout=timeout)
        response.raise_for_status()
        # Parse the raw bytes so the XML declaration decides the encoding.
        root = ET.fromstring(response.content)
    except requests.exceptions.RequestException as e:
        print(f"Error fetching paper details: {e}")
        return []
    except ET.ParseError as e:
        print(f"Error parsing paper details: {e}")
        return []

    return [_parse_article(article) for article in root.findall(".//PubmedArticle")]


In [4]:

def save_to_csv(papers: List[Dict[str, Optional[str]]], filename: str) -> None:
    """
    Write paper records to a CSV file, one row per paper.

    Args:
        papers (List[Dict[str, Optional[str]]]): Paper records; the dictionary
            keys become the column headers.
        filename (str): Destination path of the CSV file.

    Returns:
        None
    """
    # Nothing to write: report it and leave the filesystem untouched.
    if not papers:
        print("No papers to save.")
        return

    # index=False keeps the DataFrame's row index out of the file.
    pd.DataFrame(papers).to_csv(filename, index=False)
    print(f"Results saved to {filename}")


In [5]:
# Example run: search PubMed for one term, fetch metadata for the hits, and export them to CSV.
query = "diabetes"  # The search term
print(f"Fetching papers for query: '{query}'")

# Step 1: Fetch paper IDs (max_results is left at its default of 10)
paper_ids = fetch_paper_ids(query)
print("Paper IDs:", paper_ids)

# Step 2: Fetch detailed paper information (an empty list comes back if there are no IDs or the request fails)
papers = fetch_paper_details(paper_ids)
# NOTE: this prints the complete list of record dictionaries, not a preview.
print("Papers:", papers)

# Step 3: Save the results to a CSV file (nothing is written when `papers` is empty)
# NOTE: the output filename is hard-coded and does not follow `query`.
save_to_csv(papers, "diabetes.csv")


Fetching papers for query: 'diabetes'
Paper IDs: ['39818466', '39818438', '39818418', '39818408', '39818342', '39818319', '39818302', '39818298', '39818297', '39818296']
Papers: [{'PubmedID': '39818466', 'Title': 'Validating psychometric properties of generic quality-of-life instruments (WHOQOL-BREF (TW) and EQ-5D) among non-dialysis chronic kidney disease: Rasch and confirmatory factor analyses.', 'Publication Date': '2025', 'Non-academic Author(s)': 'N/A', 'Company Affiliation(s)': 'N/A', 'Corresponding Author Email': 'N/A'}, {'PubmedID': '39818438', 'Title': 'Visceral fat distribution: Interracial studies.', 'Publication Date': '2025', 'Non-academic Author(s)': 'N/A', 'Company Affiliation(s)': 'N/A', 'Corresponding Author Email': 'N/A'}, {'PubmedID': '39818418', 'Title': 'Environmental and Clinical Factors Concerning Gastrointestinal Bleeding: An Umbrella Review of Meta-Analyses.', 'Publication Date': '2025', 'Non-academic Author(s)': 'N/A', 'Company Affiliation(s)': 'N/A', 'Corresp