In [35]:
import requests
import xml.etree.ElementTree as ET
import re
from typing import List, Dict, Any, Optional
import logging

In [36]:
# Setting up logging
logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s')

In [37]:
# # Function to fetch PubMed IDs based on a search query
# def fetch_paper_ids_from_pubmed(query: str, max_results: int = 10) -> List[str]:
#     base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
#     params = {
#         'db': 'pubmed',
#         'term': query,
#         'retmax': max_results,  # Limit the number of results
#         'usehistory': 'y',      # Use history for retrieving large result sets
#         'retmode': 'xml'
#     }

#     # Send the request to fetch paper IDs
#     response = requests.get(base_url, params=params)
    
#     if response.status_code == 200:
#         # Parse the XML response to extract paper IDs
#         root = ET.fromstring(response.content)
        
#         # Extract the list of paper IDs from the XML response
#         id_list = [id_tag.text for id_tag in root.findall(".//Id")]
        
#         return id_list
#     else:
#         print(f"Error fetching paper IDs: {response.status_code}")
#         return []

# Function to fetch PubMed IDs based on a search query
def fetch_paper_ids_from_pubmed(query: str, max_results: int = 10) -> List[str]:
    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
    params = {
        'db': 'pubmed',
        'term': query,
        'retmax': max_results,  # Limit the number of results
        'usehistory': 'y',      # Use history for retrieving large result sets
        'retmode': 'xml'
    }

    # Log the query and parameters
    logging.debug(f"Fetching PubMed IDs for query: {query} with parameters: {params}")

    try:
        # Send the request to fetch paper IDs
        response = requests.get(base_url, params=params)

        # Check if the response status is successful
        if response.status_code == 200:
            logging.info(f"Successfully fetched paper IDs for query: {query}")
            
            # Parse the XML response to extract paper IDs
            root = ET.fromstring(response.content)
            
            # Extract the list of paper IDs from the XML response
            id_list = [id_tag.text for id_tag in root.findall(".//Id")]

            logging.debug(f"Fetched {len(id_list)} paper IDs")
            return id_list
        else:
            logging.error(f"Error fetching paper IDs: {response.status_code}")
            return []
    except requests.exceptions.RequestException as e:
        # Log the error if the request fails
        logging.error(f"Request failed: {e}")
        return []

In [38]:
# # Function to send the request and get the paper's XML data
# def fetch_xml_data(pubmed_id: str) -> Optional[ET.Element]:
#     base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
#     params = {
#         'db': 'pubmed',
#         'id': pubmed_id,
#         'retmode': 'xml'
#     }

#     try:
#         # Send the request to fetch detailed paper information
#         response = requests.get(base_url, params=params)
#         response.raise_for_status()  # Raise an exception for HTTP errors
        
#         if response.status_code == 200:
#             return ET.fromstring(response.content)  # Return the XML root element
#         else:
#             print(f"Error fetching paper details for PubMed ID {pubmed_id}: {response.status_code}")
#             return None
#     except requests.exceptions.RequestException as e:
#         print(f"Request failed: {e}")
#         return None

# Function to send the request and get the paper's XML data
def fetch_xml_data(pubmed_id: str) -> Optional[ET.Element]:
    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
    params = {
        'db': 'pubmed',
        'id': pubmed_id,
        'retmode': 'xml'
    }

    # Log the request details
    logging.debug(f"Fetching XML data for PubMed ID: {pubmed_id} with parameters: {params}")

    try:
        # Send the request to fetch detailed paper information
        response = requests.get(base_url, params=params)
        response.raise_for_status()  # Raise an exception for HTTP errors
        
        # Check if the response status is successful
        if response.status_code == 200:
            logging.info(f"Successfully fetched XML data for PubMed ID: {pubmed_id}")
            return ET.fromstring(response.content)  # Return the XML root element
        else:
            logging.error(f"Error fetching paper details for PubMed ID {pubmed_id}: {response.status_code}")
            return None
    except requests.exceptions.RequestException as e:
        # Log the error if the request fails
        logging.error(f"Request failed for PubMed ID {pubmed_id}: {e}")
        return None

In [39]:
# List of academic keywords to exclude
academic_keywords = [
    "University", "College", "Institute", "Academy", "School", 
    "Faculty", "Academician", "PhD", "Professor",
]

# Function to check if an affiliation is academic
def is_academic(affiliation: str, academic_keywords = List[str]) -> bool:
    logging.debug(f"Checking affiliation: {affiliation}")
    for keyword in academic_keywords:
        if keyword.lower() in affiliation.lower():
            return True
        
    logging.info(f"Affiliation '{affiliation}' is not academic.")
    return False

In [40]:
# List of pharmaceutical and biotech-related keywords to identify relevant companies
pharma_biotech_keywords = [
    "Pharmaceutical", "Biotech", "Biotechnology", "Pharma", "Biopharma", 
    "Med", "Healthcare", "Bio", "Genetics", "Drug", "Therapeutics", "Vaccine",
    "Diagnostics", "Clinical", "Development", "Manufacturing"
]

# Function to check if an affiliation is related to pharmaceutical or biotech companies
def is_pharma_biotech(affiliation: str, pharma_biotech_keywords = List[str]) -> bool:
    logging.debug(f"Checking affiliation: {affiliation}")

    for keyword in pharma_biotech_keywords:
        if keyword.lower() in affiliation.lower():
            return True
        
    logging.info(f"Affiliation is not related to pharma/biotech: {affiliation}")
    return False

In [41]:
# Function to extract PubMed ID
def get_pubmed_id(root: ET.Element, pubmed_id: str) -> str:
    logging.debug(f"Extracting PubMed ID: {pubmed_id}")
    return pubmed_id


# Function to extract the title of the paper
def get_title(root: ET.Element) -> str:
    title_tag = root.find(".//ArticleTitle")
    if title_tag is not None:
        logging.info(f"Title extracted: {title_tag.text}")
        return title_tag.text
    else:
        logging.warning("Title not found in the XML.")
        return "N/A"


# Function to extract the publication date
def get_publication_date(root: ET.Element) -> str:
    pub_date_tag = root.find(".//PubDate")
    if pub_date_tag is not None:
        year = pub_date_tag.find("Year")
        month = pub_date_tag.find("Month")
        day = pub_date_tag.find("Day")
        if year is not None and month is not None and day is not None:
            date = f"{year.text}-{month.text}-{day.text}"
            logging.info(f"Publication date extracted: {date}")
            return date
        else:
            logging.warning("Incomplete publication date found (missing year, month, or day).")
            return "N/A"
    else:
        logging.warning("Publication date not found in the XML.")
        return "N/A"


In [None]:
# Function to extract authors
def get_authors(root: ET.Element, academic_keywords: List[str]) -> List[str]:
    logging.debug("Extracting authors from the XML data.")

    authors_tag = root.findall(".//AuthorList/Author")
    authors = []
    for author in authors_tag:
        last_name = author.find("LastName")
        fore_name = author.find("ForeName")
        affiliation_info = author.findall(".//AffiliationInfo/Affiliation")
        
        # Log the author information for debugging
        logging.debug(f"Processing author: {fore_name.text if fore_name is not None else 'N/A'} {last_name.text if last_name is not None else 'N/A'}")

        # Get the affiliations and check if they are non-academic
        if affiliation_info:
            for aff in affiliation_info:
                logging.debug(f"Checking affiliation: {aff.text if aff is not None else 'N/A'}")
                if aff is not None and not is_academic(aff.text, academic_keywords):
                    if last_name is not None and fore_name is not None:
                        author_name = f"{fore_name.text} {last_name.text}"
                        logging.info(f"Adding author: {author_name}")
                        authors.append(author_name)
                    break
        else:
            # If no affiliation is found, consider it as non-academic (fallback case)
            if last_name is not None and fore_name is not None:
                author_name = f"{fore_name.text} {last_name.text}"
                logging.info(f"Adding author: {author_name}")
                authors.append(author_name)
    
    if authors:
        logging.debug(f"Authors extracted: {authors}")
    else:
        logging.debug("No authors found.")

    return authors if authors else ["N/A"]


# Function to extract company affiliations
# def get_company_affiliations(root):
#     affiliations = []
#     authors_tag = root.findall(".//AuthorList/Author")
    
#     # Regex pattern to match and remove email and address
#     email_pattern = r"[\w\.-]+@[\w\.-]+"
#     address_pattern = r"\d{5},? \w+,\s?[A-Za-z\s]+"

#     for author in authors_tag:
#         aff_info = author.findall(".//AffiliationInfo/Affiliation")
#         for aff in aff_info:
#             if aff is not None:
#                 affiliation = aff.text
                
#                 # Remove email and address from affiliation string
#                 affiliation = re.sub(email_pattern, '', affiliation)  # Remove emails
#                 affiliation = re.sub(address_pattern, '', affiliation)  # Remove address
                
#                 # If there's still a valid affiliation left, add it
#                 if affiliation.strip():
#                     affiliations.append(affiliation.strip())
    
#     return affiliations if affiliations else ["N/A"]

# Regex pattern to match and remove email addresses
email_pattern = r"[\w\.-]+@[\w\.-]+"

def get_pharma_biotech_affiliations(root: ET.Element, pharma_biotech_keywords: List[str], academic_keywords: List[str]) -> List[str]:
    logging.debug("Extracting pharma/biotech affiliations from the XML data.")
    pharma_biotech_affiliations = []
    authors_tag = root.findall(".//AuthorList/Author")
    
    for author in authors_tag:
        aff_info = author.findall(".//AffiliationInfo/Affiliation")
        for aff in aff_info:
            if aff is not None:
                logging.debug(f"Checking affiliation: {aff.text}")
                
                if is_pharma_biotech(aff.text, pharma_biotech_keywords) and not is_academic(aff.text, academic_keywords):
                    # Remove email from affiliation text
                    clean_affiliation = re.sub(email_pattern, '', aff.text).strip()
                    
                    if clean_affiliation:  # Ensure there's something left after removing emails
                        logging.info(f"Adding pharma/biotech affiliation: {clean_affiliation}")
                        pharma_biotech_affiliations.append(clean_affiliation)
    
    if pharma_biotech_affiliations:
        logging.debug(f"Pharma/Biotech affiliations extracted: {pharma_biotech_affiliations}")
    else:
        logging.debug("No pharma/biotech affiliations found.")

    return set(pharma_biotech_affiliations) if pharma_biotech_affiliations else ["N/A"]



# Function to extract corresponding author email
def get_corresponding_email(root: ET.Element, academic_keywords: List[str]) -> List[str]:
    corresponding_email = None
    
    # Find authors
    authors_tag = root.findall(".//AuthorList/Author")
    for author in authors_tag:
        # Extract the affiliation info
        aff_info = author.findall(".//AffiliationInfo/Affiliation")
        for aff in aff_info:
            if aff is not None:
                # Split the affiliation text and get the last part (email)
                email = aff.text.split()[-1] if aff.text else "N/A"
                if "@" in email:  # Check if it's a valid email
                    # Check if the affiliation belongs to a non-academic institution
                    if not is_academic(aff.text, academic_keywords):
                        corresponding_email = email
                        break  # Exit after finding the first non-academic email
    
    return corresponding_email if corresponding_email else "N/A"

In [29]:
# Main function to orchestrate the extraction
def fetch_paper_details(pubmed_id: str) -> Dict[str, Any]:
    # Get XML data from PubMed
    root = fetch_xml_data(pubmed_id)
    
    if root is None:
        return {}

    paper_details = {}
    
    paper_details['PubMedID'] = get_pubmed_id(root, pubmed_id)
    paper_details['Title'] = get_title(root)
    paper_details['PublicationDate'] = get_publication_date(root)
    paper_details['Authors'] = get_authors(root, academic_keywords)
    # paper_details['CompanyAffiliations'] = get_company_affiliations(root)
    paper_details['CompanyAffiliations'] = get_pharma_biotech_affiliations(root, pharma_biotech_keywords, academic_keywords)
    paper_details['CorrespondingAuthorEmail'] = get_corresponding_email(root, academic_keywords)

    return paper_details

In [30]:
# Function to fetch all paper details based on a query
def fetch_all_papers(query: str, max_results: int = 10) -> List[Dict[str, Any]]:
    # Fetch the paper IDs
    paper_ids = fetch_paper_ids_from_pubmed(query, max_results)

    # Fetch the details for each paper using the paper IDs
    all_paper_details = []
    for pubmed_id in paper_ids:
        paper_details = fetch_paper_details(pubmed_id)
        all_paper_details.append(paper_details)

    return all_paper_details

In [31]:
query = "healthcare burnout"
papers = fetch_all_papers(query, max_results=10)

papers

2025-04-10 02:51:57,895 - DEBUG - Starting new HTTPS connection (1): eutils.ncbi.nlm.nih.gov:443


2025-04-10 02:51:59,489 - DEBUG - https://eutils.ncbi.nlm.nih.gov:443 "GET /entrez/eutils/esearch.fcgi?db=pubmed&term=healthcare+burnout&retmax=10&usehistory=y&retmode=xml HTTP/11" 200 None
2025-04-10 02:51:59,505 - DEBUG - Starting new HTTPS connection (1): eutils.ncbi.nlm.nih.gov:443
2025-04-10 02:52:00,319 - DEBUG - https://eutils.ncbi.nlm.nih.gov:443 "GET /entrez/eutils/efetch.fcgi?db=pubmed&id=40202386&retmode=xml HTTP/11" 200 None
2025-04-10 02:52:00,513 - DEBUG - Starting new HTTPS connection (1): eutils.ncbi.nlm.nih.gov:443
2025-04-10 02:52:01,299 - DEBUG - https://eutils.ncbi.nlm.nih.gov:443 "GET /entrez/eutils/efetch.fcgi?db=pubmed&id=40200377&retmode=xml HTTP/11" 200 None
2025-04-10 02:52:01,529 - DEBUG - Starting new HTTPS connection (1): eutils.ncbi.nlm.nih.gov:443
2025-04-10 02:52:02,310 - DEBUG - https://eutils.ncbi.nlm.nih.gov:443 "GET /entrez/eutils/efetch.fcgi?db=pubmed&id=40198009&retmode=xml HTTP/11" 200 None
2025-04-10 02:52:02,562 - DEBUG - Starting new HTTPS conn

[{'PubMedID': '40202386',
  'Title': 'Antecedents and Outcomes of Physician Coworker Conflict: A Differential Occupational Model for Health Care Managers.',
  'PublicationDate': '2025-Apr-10',
  'Authors': ['N/A'],
  'CompanyAffiliations': ['N/A'],
  'CorrespondingAuthorEmail': 'N/A'},
 {'PubMedID': '40200377',
  'Title': 'Compassion fatigue in helping professions: a scoping literature review.',
  'PublicationDate': '2025-Apr-08',
  'Authors': ['Amelia Mohd Noor',
   'Dodi Suryana',
   'Engku Mardiah Engku Kamarudin',
   'Noor Banu Mahadir Naidu',
   'Priyalatha Govindasamy'],
  'CompanyAffiliations': {'Department of Moral, Civic Studies and Character Development, Universiti Pendidikan Sultan Idris, Tanjung Salim, Perak, Malaysia.'},
  'CorrespondingAuthorEmail': 'amelia@fpm.upsi.edu.my.'},
 {'PubMedID': '40198009',
  'Title': 'Factors Contributing to Well-Being Among Hospital-Based Nurses.',
  'PublicationDate': 'N/A',
  'Authors': ['Christine Griffin'],
  'CompanyAffiliations': {"The

In [32]:
import pandas as pd

def save_paper_details_to_csv(papers: List[Dict[str, Any]], query: str, filename: Optional[str] = None) -> None:
    # Sanitize the query to make it a valid filename
    sanitized_query = re.sub(r'[^\w\s-]', '', query)  # Remove any special characters
    sanitized_query = re.sub(r'[-\s]+', '_', sanitized_query).strip()  # Replace spaces and hyphens with underscores
    
    # Set the filename to the query if not provided
    if not filename:
        filename = f"{sanitized_query}_results.csv"
    
    # Create a DataFrame from the list of paper details
    df = pd.DataFrame(papers)
    
    # Save the DataFrame to a CSV file
    df.to_csv(filename, index=False, encoding='utf-8')
    print(f"Results saved to {filename}")


In [33]:
# Save to CSV with dynamic filename based on the query
save_paper_details_to_csv(papers, query=query)

Results saved to healthcare_burnout_results.csv


In [34]:
import sys
import argparse
import logging

# Define your other functions here (fetch_paper_ids_from_pubmed, fetch_all_papers, etc.)

def setup_logging(debug: bool):
    logging.basicConfig(
        level=logging.DEBUG if debug else logging.INFO,
        format='%(asctime)s - %(levelname)s - %(message)s'
    )

def main():
    # Simulate command-line arguments for Jupyter Notebook
    sys.argv = ['your_script.py', 'healthcare burnout', '--debug', '--file', 'results.csv']

    # Argument parsing setup
    parser = argparse.ArgumentParser(description="Fetch PubMed papers based on a query")
    parser.add_argument("query", type=str, help="Search query to fetch papers from PubMed")
    parser.add_argument("-d", "--debug", action="store_true", help="Enable debug logging")
    parser.add_argument("-f", "--file", type=str, help="Specify the filename to save the results")

    # Parse the arguments
    args = parser.parse_args()

    # Set up logging based on the debug flag
    setup_logging(debug=args.debug)

    # Fetch the papers based on the query
    papers = fetch_all_papers(args.query, max_results=10)

    # Save the results to a file or print them to the console
    if args.file:
        save_paper_details_to_csv(papers, query=args.query, filename=args.file)
    else:
        print(papers)

# Run the main function
main()


2025-04-10 02:52:09,197 - DEBUG - Starting new HTTPS connection (1): eutils.ncbi.nlm.nih.gov:443
2025-04-10 02:52:10,025 - DEBUG - https://eutils.ncbi.nlm.nih.gov:443 "GET /entrez/eutils/esearch.fcgi?db=pubmed&term=healthcare+burnout&retmax=10&usehistory=y&retmode=xml HTTP/11" 200 None
2025-04-10 02:52:10,025 - DEBUG - Starting new HTTPS connection (1): eutils.ncbi.nlm.nih.gov:443
2025-04-10 02:52:10,807 - DEBUG - https://eutils.ncbi.nlm.nih.gov:443 "GET /entrez/eutils/efetch.fcgi?db=pubmed&id=40202386&retmode=xml HTTP/11" 200 None
2025-04-10 02:52:11,000 - DEBUG - Starting new HTTPS connection (1): eutils.ncbi.nlm.nih.gov:443
2025-04-10 02:52:11,776 - DEBUG - https://eutils.ncbi.nlm.nih.gov:443 "GET /entrez/eutils/efetch.fcgi?db=pubmed&id=40200377&retmode=xml HTTP/11" 200 None
2025-04-10 02:52:12,022 - DEBUG - Starting new HTTPS connection (1): eutils.ncbi.nlm.nih.gov:443
2025-04-10 02:52:12,792 - DEBUG - https://eutils.ncbi.nlm.nih.gov:443 "GET /entrez/eutils/efetch.fcgi?db=pubmed&id

Results saved to results.csv
