In [41]:
import requests
import xml.etree.ElementTree as ET
import re

In [42]:
# Function to fetch PubMed IDs based on a search query
def fetch_paper_ids_from_pubmed(query, max_results=10):
    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
    params = {
        'db': 'pubmed',
        'term': query,
        'retmax': max_results,  # Limit the number of results
        'usehistory': 'y',      # Use history for retrieving large result sets
        'retmode': 'xml'
    }

    # Send the request to fetch paper IDs
    response = requests.get(base_url, params=params)
    
    if response.status_code == 200:
        # Parse the XML response to extract paper IDs
        root = ET.fromstring(response.content)
        
        # Extract the list of paper IDs from the XML response
        id_list = [id_tag.text for id_tag in root.findall(".//Id")]
        
        return id_list
    else:
        print(f"Error fetching paper IDs: {response.status_code}")
        return []

In [43]:
# # Function to fetch detailed information about a paper using its PubMed ID
# def fetch_paper_details(pubmed_id):
#     base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
#     params = {
#         'db': 'pubmed',
#         'id': pubmed_id,
#         'retmode': 'xml'
#     }

#     try:
#         # Send the request to fetch detailed paper information
#         response = requests.get(base_url, params=params)
#         response.raise_for_status()  # Raise an exception for HTTP errors

#         if response.status_code == 200:
#             # Parse the XML response to extract paper details
#             root = ET.fromstring(response.content)
#             # print(response.content.decode())  # Print XML content for debugging
#             paper_details = {}
#             paper_details['PubMedID'] = pubmed_id

#             # Extract Title
#             title_tag = root.find(".//ArticleTitle")
#             paper_details['Title'] = title_tag.text if title_tag is not None else "N/A"

#             # Extract Publication Date
#             pub_date_tag = root.find(".//PubDate")
#             if pub_date_tag is not None:
#                 year = pub_date_tag.find("Year")
#                 month = pub_date_tag.find("Month")
#                 day = pub_date_tag.find("Day")
#                 if year is not None and month is not None and day is not None:
#                     paper_details['PublicationDate'] = f"{year.text}-{month.text}-{day.text}"
#                 else:
#                     paper_details['PublicationDate'] = "N/A"
#             else:
#                 paper_details['PublicationDate'] = "N/A"

#             # Extract Authors
#             authors_tag = root.findall(".//AuthorList/Author")
#             authors = []
#             for author in authors_tag:
#                 last_name = author.find("LastName")
#                 fore_name = author.find("ForeName")
#                 if last_name is not None and fore_name is not None:
#                     authors.append(f"{fore_name.text} {last_name.text}")
#                 else:
#                     authors.append("N/A")
#             paper_details['Authors'] = authors if authors else ["N/A"]

#             # Extract Company Affiliations
#             affiliations = []
#             for author in authors_tag:
#                 aff_info = author.findall(".//AffiliationInfo/Affiliation")
#                 for aff in aff_info:
#                     if aff is not None:
#                         affiliations.append(aff.text)
#             paper_details['CompanyAffiliations'] = affiliations if affiliations else ["N/A"]

#             # Extract Corresponding Author Email
#             corresponding_email_tag = root.findall(".//AuthorList/Author/AffiliationInfo/Affiliation")
#             corresponding_email = []
#             for aff in corresponding_email_tag:
#                 email = aff.text.split()[-1] if aff.text else "N/A"
#                 if "@" in email:
#                     corresponding_email.append(email)
            
#             paper_details['CorrespondingAuthorEmail'] = corresponding_email[0] if corresponding_email else "N/A"

#             return paper_details
#         else:
#             print(f"Error fetching paper details for PubMed ID {pubmed_id}: {response.status_code}")
#             return {}

#     except requests.exceptions.RequestException as e:
#         print(f"Request failed: {e}")
#         return {}

In [44]:
# # Function to fetch all paper details based on a query
# def fetch_all_papers(query, max_results=10):
#     # Fetch the paper IDs
#     paper_ids = fetch_paper_ids_from_pubmed(query, max_results)

#     # Fetch the details for each paper using the paper IDs
#     all_paper_details = []
#     for pubmed_id in paper_ids:
#         paper_details = fetch_paper_details(pubmed_id)
#         all_paper_details.append(paper_details)

#     return all_paper_details

In [45]:
# # Example usage:
# # Fetch all papers based on the query "machine learning"
# query = "healthcare burnout"
# papers = fetch_all_papers(query, max_results=1)

# # # Print the fetched paper's details
# # print(f"PubMedID: {paper_details['PubMedID']}")
# # print(f"Title: {paper_details['Title']}")
# # print(f"Publication Date: {paper_details['PublicationDate']}")
# # print(f"Authors: {', '.join(paper_details['Authors'])}")
# # print(f"Company Affiliations: {', '.join(paper_details['CompanyAffiliations'])}")
# # print(f"Corresponding Author Email: {paper_details['CorrespondingAuthorEmail']}")
# papers

In [46]:
# Function to send the request and get the paper's XML data
def fetch_xml_data(pubmed_id):
    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
    params = {
        'db': 'pubmed',
        'id': pubmed_id,
        'retmode': 'xml'
    }

    try:
        # Send the request to fetch detailed paper information
        response = requests.get(base_url, params=params)
        response.raise_for_status()  # Raise an exception for HTTP errors
        
        if response.status_code == 200:
            return ET.fromstring(response.content)  # Return the XML root element
        else:
            print(f"Error fetching paper details for PubMed ID {pubmed_id}: {response.status_code}")
            return None
    except requests.exceptions.RequestException as e:
        print(f"Request failed: {e}")
        return None

In [47]:
# List of academic keywords to exclude
academic_keywords = [
    "University", "College", "Institute", "Academy", "School", 
    "Faculty", "Academician", "PhD", "Professor",
]

# Function to check if an affiliation is academic
def is_academic(affiliation):
    for keyword in academic_keywords:
        if keyword.lower() in affiliation.lower():
            return True
    return False

In [48]:
# List of pharmaceutical and biotech-related keywords to identify relevant companies
pharma_biotech_keywords = [
    "Pharmaceutical", "Biotech", "Biotechnology", "Pharma", "Biopharma", 
    "Med", "Healthcare", "Bio", "Genetics", "Drug", "Therapeutics", "Vaccine",
    "Diagnostics", "Clinical", "Development", "Manufacturing"
]

# Function to check if an affiliation is related to pharmaceutical or biotech companies
def is_pharma_biotech(affiliation):
    for keyword in pharma_biotech_keywords:
        if keyword.lower() in affiliation.lower():
            return True
    return False

In [49]:
# Function to extract PubMed ID
def get_pubmed_id(root, pubmed_id):
    return pubmed_id


# Function to extract the title of the paper
def get_title(root):
    title_tag = root.find(".//ArticleTitle")
    return title_tag.text if title_tag is not None else "N/A"


# Function to extract the publication date
def get_publication_date(root):
    pub_date_tag = root.find(".//PubDate")
    if pub_date_tag is not None:
        year = pub_date_tag.find("Year")
        month = pub_date_tag.find("Month")
        day = pub_date_tag.find("Day")
        if year is not None and month is not None and day is not None:
            return f"{year.text}-{month.text}-{day.text}"
    return "N/A"


In [None]:
# Function to extract authors
def get_authors(root):
    authors_tag = root.findall(".//AuthorList/Author")
    authors = []
    for author in authors_tag:
        last_name = author.find("LastName")
        fore_name = author.find("ForeName")
        if last_name is not None and fore_name is not None:
            authors.append(f"{fore_name.text} {last_name.text}")
        else:
            authors.append("N/A")
    return authors if authors else ["N/A"]


# Function to extract company affiliations
# def get_company_affiliations(root):
#     affiliations = []
#     authors_tag = root.findall(".//AuthorList/Author")
#     for author in authors_tag:
#         aff_info = author.findall(".//AffiliationInfo/Affiliation")
#         for aff in aff_info:
#             if aff is not None:
#                 affiliations.append(aff.text)
#     return affiliations if affiliations else ["N/A"]

# Function to extract pharmaceutical or biotech affiliations
def get_pharma_biotech_affiliations(root):
    pharma_biotech_affiliations = []
    authors_tag = root.findall(".//AuthorList/Author")
    for author in authors_tag:
        aff_info = author.findall(".//AffiliationInfo/Affiliation")
        for aff in aff_info:
            if aff is not None and is_pharma_biotech(aff.text) and not is_academic(aff.text):
                pharma_biotech_affiliations.append(aff.text)
    return set(pharma_biotech_affiliations) if pharma_biotech_affiliations else ["N/A"]


# Function to extract corresponding author email
def get_corresponding_email(root):
    corresponding_email_tag = root.findall(".//AuthorList/Author/AffiliationInfo/Affiliation")
    corresponding_email = []
    for aff in corresponding_email_tag:
        email = aff.text.split()[-1] if aff.text else "N/A"
        if "@" in email:
            corresponding_email.append(email)
    return set(corresponding_email) if corresponding_email else "N/A"

In [56]:
# Main function to orchestrate the extraction
def fetch_paper_details(pubmed_id):
    # Get XML data from PubMed
    root = fetch_xml_data(pubmed_id)
    
    if root is None:
        return {}

    paper_details = {}
    
    paper_details['PubMedID'] = get_pubmed_id(root, pubmed_id)
    paper_details['Title'] = get_title(root)
    paper_details['PublicationDate'] = get_publication_date(root)
    paper_details['Authors'] = get_authors(root)
    # paper_details['CompanyAffiliations'] = get_company_affiliations(root)
    paper_details['CompanyAffiliations'] = get_pharma_biotech_affiliations(root)
    paper_details['CorrespondingAuthorEmail'] = get_corresponding_email(root)

    return paper_details


In [57]:
# Function to fetch all paper details based on a query
def fetch_all_papers(query, max_results=10):
    # Fetch the paper IDs
    paper_ids = fetch_paper_ids_from_pubmed(query, max_results)

    # Fetch the details for each paper using the paper IDs
    all_paper_details = []
    for pubmed_id in paper_ids:
        paper_details = fetch_paper_details(pubmed_id)
        all_paper_details.append(paper_details)

    return all_paper_details

In [58]:
query = " pfizer"
papers = fetch_all_papers(query, max_results=10)

papers

[{'PubMedID': '40202700',
  'Title': 'Management of Late-Onset Rheumatoid Arthritis with Treat-to-Target Strategy.',
  'PublicationDate': '2025-Apr-09',
  'Authors': ['Masayoshi Harigai', 'Takahiko Sugihara'],
  'CompanyAffiliations': ['Department of Rheumatology, Sanno Medical Center, 8-5-35 Akasaka, Minato-ku, Tokyo, 107-0052, Japan. mharigai@iuhw.ac.jp.'],
  'CorrespondingAuthorEmail': ['mharigai@iuhw.ac.jp.',
   'mharigai@iuhw.ac.jp.']},
 {'PubMedID': '40202585',
  'Title': '[Clinical pharmacology of opioid analgesics].',
  'PublicationDate': '2025-Apr-09',
  'Authors': ['Constanze Rémi',
   'Jennifer Berner',
   'Aleksandra Dukic-Ott',
   'Christina Hepperle'],
  'CompanyAffiliations': ['Klinik und Poliklinik für Palliativmedizin, Campus Großhadern, LMU Klinikum, Marchioninistr.\xa015, 81377, München, Deutschland. constanze.remi@med.uni-muenchen.de.',
   'Klinikapotheke, LMU Klinikum, Marchioninistr.\xa015, 81377, München, Deutschland. constanze.remi@med.uni-muenchen.de.',
   'Kli