In [1]:
import requests
import xml.etree.ElementTree as ET
import re
from typing import List, Dict, Any, Optional
import logging

In [2]:
# Setting up logging
logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s')

In [3]:
# Function to fetch PubMed IDs based on a search query
def fetch_paper_ids_from_pubmed(query: str, max_results: int = 10) -> List[str]:
    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
    params = {
        'db': 'pubmed',
        'term': query,
        'retmax': max_results,  # Limit the number of results
        'usehistory': 'y',      # Use history for retrieving large result sets
        'retmode': 'xml'
    }

    # Log the query and parameters
    logging.debug(f"Fetching PubMed IDs for query: {query} with parameters: {params}")

    try:
        # Send the request to fetch paper IDs
        response = requests.get(base_url, params=params)

        # Check if the response status is successful
        if response.status_code == 200:
            logging.info(f"Successfully fetched paper IDs for query: {query}")
            
            # Parse the XML response to extract paper IDs
            root = ET.fromstring(response.content)
            
            # Extract the list of paper IDs from the XML response
            id_list = [id_tag.text for id_tag in root.findall(".//Id")]

            logging.debug(f"Fetched {len(id_list)} paper IDs")
            return id_list
        else:
            logging.error(f"Error fetching paper IDs: {response.status_code}")
            return []
    except requests.exceptions.RequestException as e:
        # Log the error if the request fails
        logging.error(f"Request failed: {e}")
        return []

In [4]:
# Function to send the request and get the paper's XML data
def fetch_xml_data(pubmed_id: str) -> Optional[ET.Element]:
    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
    params = {
        'db': 'pubmed',
        'id': pubmed_id,
        'retmode': 'xml'
    }

    # Log the request details
    logging.debug(f"Fetching XML data for PubMed ID: {pubmed_id} with parameters: {params}")

    try:
        # Send the request to fetch detailed paper information
        response = requests.get(base_url, params=params)
        response.raise_for_status()  # Raise an exception for HTTP errors
        
        # Check if the response status is successful
        if response.status_code == 200:
            logging.info(f"Successfully fetched XML data for PubMed ID: {pubmed_id}")
            return ET.fromstring(response.content)  # Return the XML root element
        else:
            logging.error(f"Error fetching paper details for PubMed ID {pubmed_id}: {response.status_code}")
            return None
    except requests.exceptions.RequestException as e:
        # Log the error if the request fails
        logging.error(f"Request failed for PubMed ID {pubmed_id}: {e}")
        return None

In [5]:
# # List of academic keywords to exclude
# academic_keywords = [
#     "University", "College", "Institute", "Academy", "School", 
#     "Faculty", "Academician", "PhD", "Professor"
# ]

# # Function to check if an affiliation is academic
# def is_academic(affiliation: str, academic_keywords = List[str]) -> bool:
#     logging.debug(f"Checking affiliation: {affiliation}")
#     for keyword in academic_keywords:
#         if keyword.lower() in affiliation.lower():
#             return True
        
#     logging.info(f"Affiliation '{affiliation}' is not academic.")
#     return False

import ollama
import logging

# Function to check if an affiliation is academic using Ollama, keeping the format the same
# Function to check if an affiliation is academic using Ollama
def is_academic_using_ollama(affiliation: str, model: str = "llama3:8b") -> bool:
    logging.debug(f"Checking affiliation: {affiliation}")

    # Prepare the prompt for Ollama, asking it to classify whether the affiliation is academic
    prompt = f"Is the following affiliation academic? {affiliation}. If academic, answer 'Yes'. Otherwise, answer 'No'."

    try:
        # Chat with Ollama to classify if affiliation is academic
        response = ollama.chat(model=model, messages=[{'role': 'user', 'content': prompt}])
        
        # Get the response content and process it
        result = response['message']['content'].strip().lower()
        
        # Check if Ollama's response indicates 'Yes' (academic) or 'No' (not academic)
        if 'yes' in result:
            logging.info(f"Affiliation '{affiliation}' is academic.")
            return True
        elif 'no' in result:
            logging.info(f"Affiliation '{affiliation}' is not academic.")
            return False
        else:
            logging.info(f"Unclear response from Ollama for '{affiliation}'")
            return False
    except Exception as e:
        logging.error(f"Error occurred while checking affiliation: {e}")
        return False

2025-04-12 22:39:24,887 - DEBUG - load_ssl_context verify=True cert=None trust_env=True http2=False
2025-04-12 22:39:24,895 - DEBUG - load_verify_locations cafile='c:\\Users\\khand\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\certifi\\cacert.pem'


In [6]:
# List of pharmaceutical and biotech-related keywords to identify relevant companies
pharma_biotech_keywords = [
    "Pharmaceutical", "Biotech", "Biotechnology", "Pharma", "Biopharma", 
    "Med", "Healthcare", "Bio", "Genetics", "Drug", "Therapeutics", "Vaccine",
    "Diagnostics", "Clinical", "Development", "Manufacturing"
]

# Function to check if an affiliation is related to pharmaceutical or biotech companies
def is_pharma_biotech(affiliation: str, pharma_biotech_keywords = List[str]) -> bool:
    logging.debug(f"Checking affiliation: {affiliation}")

    for keyword in pharma_biotech_keywords:
        if keyword.lower() in affiliation.lower():
            return True
        
    logging.info(f"Affiliation is not related to pharma/biotech: {affiliation}")
    return False

In [7]:
# Function to extract PubMed ID
def get_pubmed_id(root: ET.Element, pubmed_id: str) -> str:
    logging.debug(f"Extracting PubMed ID: {pubmed_id}")
    return pubmed_id


# Function to extract the title of the paper
def get_title(root: ET.Element) -> str:
    title_tag = root.find(".//ArticleTitle")
    if title_tag is not None:
        logging.info(f"Title extracted: {title_tag.text}")
        return title_tag.text
    else:
        logging.warning("Title not found in the XML.")
        return "N/A"


# Function to extract the publication date
# def get_publication_date(root: ET.Element) -> str:
#     pub_date_tag = root.find(".//PubDate")
#     if pub_date_tag is not None:
#         year = pub_date_tag.find("Year")
#         month = pub_date_tag.find("Month")
#         day = pub_date_tag.find("Day")
#         if year is not None and month is not None and day is not None:
#             date = f"{year.text}-{month.text}-{day.text}"
#             logging.info(f"Publication date extracted: {date}")
#             return date
#         else:
#             logging.warning("Incomplete publication date found (missing year, month, or day).")
#             return "N/A"
#     else:
#         logging.warning("Publication date not found in the XML.")
#         return "N/A"


def get_publication_date(root: ET.Element) -> str:
    """Extracts the publication date from PubMed XML data and handles various date formats.

    Args:
        root (ET.Element): The XML root element containing the paper's metadata.

    Returns:
        str: The publication date in a consistent format ("YYYY-MM-DD"), or "N/A" if not found.
    """
    # Function to normalize the date format (if needed)
    def normalize_date(date: str) -> str:
        # Match different date formats and handle accordingly
        # Format: Year (e.g., "2020"), Year-Month (e.g., "2020 Aug"), Year-Month-Day (e.g., "2020 May 31")
        month_map = {
            "Jan": "01", "Feb": "02", "Mar": "03", "Apr": "04", "May": "05", "Jun": "06", 
            "Jul": "07", "Aug": "08", "Sep": "09", "Oct": "10", "Nov": "11", "Dec": "12"
        }
        
        # If the date has a month abbreviation (e.g., "2020 Aug"), replace it with month number
        match = re.match(r"(\d{4})\s*(\w{3})\s*(\d{1,2})?$", date)
        if match:
            year = match.group(1)
            month_str = match.group(2)
            day = match.group(3) if match.group(3) else "01"  # Default to 01 if day is missing
            month = month_map.get(month_str.capitalize(), "01")  # Default to January if month not found
            return f"{year}-{month}-{day}"

        # If the date is year-month (e.g., "2020 Aug"), default the day to 01
        match = re.match(r"(\d{4})\s*(\w{3})$", date)
        if match:
            year = match.group(1)
            month_str = match.group(2)
            month = month_map.get(month_str.capitalize(), "01")
            return f"{year}-{month}-01"

        # If only year (e.g., "2020"), return as "YYYY-01-01"
        match = re.match(r"(\d{4})$", date)
        if match:
            return f"{match.group(1)}-01-01"

        return date  # Return as is if no match

    # First, check for EPubDate (complete date)
    epub_date_tag = root.find(".//Item[@Name='EPubDate']")
    if epub_date_tag is not None and epub_date_tag.text:
        logging.info(f"EPubDate found: {epub_date_tag.text}")
        return normalize_date(epub_date_tag.text)  # Normalize and return date

    # Next, check for PubDate (year and possibly month)
    pub_date_tag = root.find(".//Item[@Name='PubDate']")
    if pub_date_tag is not None and pub_date_tag.text:
        # Normalize and return the PubDate
        logging.info(f"PubDate found: {pub_date_tag.text}")
        return normalize_date(pub_date_tag.text)

    # If neither date is found, return "N/A"
    logging.warning("Publication date not found.")
    return "N/A"


In [8]:
import requests
import xml.etree.ElementTree as ET

def fetch_pubmed_data(pubmed_id: str) -> ET.Element:
    """Fetches PubMed article metadata by PubMed ID and returns the XML root."""
    url = f"https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db=pubmed&id={pubmed_id}&retmode=xml"
    
    response = requests.get(url)
    if response.status_code == 200:
        print(f"Successfully fetched PubMed data for ID: {pubmed_id}")
        # Parse the XML response
        return ET.fromstring(response.text)
    else:
        print(f"Failed to fetch PubMed data for ID: {pubmed_id}")
        return None

# Example PubMed ID (replace with a valid PubMed ID)
pubmed_id = "31168464"  # Example PubMed ID (replace with any valid ID)
root = fetch_xml_data(pubmed_id)

if root:
    # Now test the get_publication_date function on the fetched XML
    publication_date = get_publication_date(root)
    print("Extracted Publication Date:", publication_date)

2025-04-12 22:39:25,245 - DEBUG - Fetching XML data for PubMed ID: 31168464 with parameters: {'db': 'pubmed', 'id': '31168464', 'retmode': 'xml'}
2025-04-12 22:39:25,258 - DEBUG - Starting new HTTPS connection (1): eutils.ncbi.nlm.nih.gov:443
2025-04-12 22:39:26,041 - DEBUG - https://eutils.ncbi.nlm.nih.gov:443 "GET /entrez/eutils/efetch.fcgi?db=pubmed&id=31168464&retmode=xml HTTP/11" 200 None
2025-04-12 22:39:26,210 - INFO - Successfully fetched XML data for PubMed ID: 31168464
  if root:


Extracted Publication Date: N/A


In [9]:
# import requests

# # Replace with a valid PubMed ID (PMID)
# pmid = "31158464"  # Example PMID
# url = f"https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db=pubmed&id={pmid}&retmode=xml"

# response = requests.get(url)

# # Check the status code to ensure it's a successful response
# if response.status_code == 200:
#     xml_content = response.text
#     print(xml_content)
# else:
#     print(f"Failed to retrieve data: {response.status_code}")

In [10]:
# Function to extract authors
def get_authors(root: ET.Element, model: str = "llama3:8b") -> List[str]:
    logging.debug("Extracting authors from the XML data.")
    authors_tag = root.findall(".//AuthorList/Author")
    authors = []
    for author in authors_tag:
        last_name = author.find("LastName")
        fore_name = author.find("ForeName")
        affiliation_info = author.findall(".//AffiliationInfo/Affiliation")
        
        # Log the author information for debugging
        logging.debug(f"Processing author: {fore_name.text if fore_name is not None else 'N/A'} {last_name.text if last_name is not None else 'N/A'}")

        # Get the affiliations and check if they are non-academic using Ollama
        if affiliation_info:
            for aff in affiliation_info:
                logging.debug(f"Checking affiliation: {aff.text if aff is not None else 'N/A'}")
                if aff is not None and not is_academic_using_ollama(aff.text, model):
                    if last_name is not None and fore_name is not None:
                        author_name = f"{fore_name.text} {last_name.text}"
                        logging.info(f"Adding author: {author_name}")
                        authors.append(author_name)
                    break
        else:
            # If no affiliation is found, consider it as non-academic (fallback case)
            if last_name is not None and fore_name is not None:
                author_name = f"{fore_name.text} {last_name.text}"
                logging.info(f"Adding author: {author_name}")
                authors.append(author_name)
    
    if authors:
        logging.debug(f"Authors extracted: {authors}")
    else:
        logging.debug("No authors found.")

    return authors if authors else ["N/A"]


# Function to extract company affiliations
# def get_company_affiliations(root):
#     affiliations = []
#     authors_tag = root.findall(".//AuthorList/Author")
    
#     # Regex pattern to match and remove email and address
#     email_pattern = r"[\w\.-]+@[\w\.-]+"
#     address_pattern = r"\d{5},? \w+,\s?[A-Za-z\s]+"

#     for author in authors_tag:
#         aff_info = author.findall(".//AffiliationInfo/Affiliation")
#         for aff in aff_info:
#             if aff is not None:
#                 affiliation = aff.text
                
#                 # Remove email and address from affiliation string
#                 affiliation = re.sub(email_pattern, '', affiliation)  # Remove emails
#                 affiliation = re.sub(address_pattern, '', affiliation)  # Remove address
                
#                 # If there's still a valid affiliation left, add it
#                 if affiliation.strip():
#                     affiliations.append(affiliation.strip())
    
#     return affiliations if affiliations else ["N/A"]

# Regex pattern to match and remove email addresses
email_pattern = r"[\w\.-]+@[\w\.-]+"


def get_pharma_biotech_affiliations(root: ET.Element, pharma_biotech_keywords: List[str], model: str = "llama3:8b") -> List[str]:
    logging.debug("Extracting pharma/biotech affiliations from the XML data.")
    pharma_biotech_affiliations = []
    authors_tag = root.findall(".//AuthorList/Author")
    
    for author in authors_tag:
        aff_info = author.findall(".//AffiliationInfo/Affiliation")
        for aff in aff_info:
            if aff is not None:
                logging.debug(f"Checking affiliation: {aff.text}")
                
                # Check if the affiliation matches pharma/biotech and is non-academic using Ollama
                if is_pharma_biotech(aff.text, pharma_biotech_keywords) and not is_academic_using_ollama(aff.text, model):
                    # Remove email from affiliation text
                    clean_affiliation = re.sub(email_pattern, '', aff.text).strip()
                    
                    if clean_affiliation:  # Ensure there's something left after removing emails
                        logging.info(f"Adding pharma/biotech affiliation: {clean_affiliation}")
                        pharma_biotech_affiliations.append(clean_affiliation)
    
    if pharma_biotech_affiliations:
        logging.debug(f"Pharma/Biotech affiliations extracted: {pharma_biotech_affiliations}")
    else:
        logging.debug("No pharma/biotech affiliations found.")

    return set(pharma_biotech_affiliations) if pharma_biotech_affiliations else ["N/A"]


# Function to extract corresponding author email
def get_corresponding_email(
    root: ET.Element,
    model: str = "llama3:8b"
) -> List[str]:
    """Extracts non-academic corresponding author emails from the XML, using Ollama for academic classification."""
    logging.debug("Extracting corresponding email from the XML data.")
    emails = []

    authors_tag = root.findall(".//AuthorList/Author")
    for author in authors_tag:
        aff_info = author.findall(".//AffiliationInfo/Affiliation")
        for aff in aff_info:
            if aff is not None:
                logging.debug(f"Checking affiliation: {aff.text}")
                
                # Check if the affiliation is non-academic using Ollama
                if not is_academic_using_ollama(aff.text, model):
                    parts = aff.text.split() if aff.text else []
                    
                    for part in parts:
                        if "@" in part:
                            email = part.strip().strip(";,.()[]")
                            if email not in emails:
                                logging.info(f"Adding email: {email}")
                                emails.append(email)
    
    return emails if emails else ["N/A"]


In [11]:
# Main function to orchestrate the extraction
def fetch_paper_details(pubmed_id: str) -> Dict[str, Any]:
    logging.debug(f"Starting to fetch paper details for PubMed ID: {pubmed_id}")

    # Get XML data from PubMed
    root = fetch_xml_data(pubmed_id)
    
    if root is None:
        logging.error(f"Failed to fetch XML data for PubMed ID: {pubmed_id}")
        return {}

    paper_details = {}
    
    try:
        # Extracting paper details
        paper_details['PubMedID'] = get_pubmed_id(root, pubmed_id)
        logging.debug(f"PubMed ID: {paper_details['PubMedID']} extracted.")
        
        paper_details['Title'] = get_title(root)
        logging.debug(f"Title: {paper_details['Title']} extracted.")
        
        paper_details['PublicationDate'] = get_publication_date(root)
        logging.debug(f"Publication Date: {paper_details['PublicationDate']} extracted.")
        
        paper_details['Authors'] = get_authors(root, "llama3:8b")
        logging.debug(f"Authors: {paper_details['Authors']} extracted.")
        
        paper_details['CompanyAffiliations'] = get_pharma_biotech_affiliations(root, pharma_biotech_keywords, "llama3:8b")
        logging.debug(f"Company Affiliations: {paper_details['CompanyAffiliations']} extracted.")
        
        paper_details['CorrespondingAuthorEmail'] = get_corresponding_email(root, "llama3:8b")
        logging.debug(f"Corresponding Author Email: {paper_details['CorrespondingAuthorEmail']} extracted.")
        
    except Exception as e:
        logging.error(f"Error occurred while fetching paper details for PubMed ID {pubmed_id}: {e}")
        return {}

    logging.info(f"Successfully fetched paper details for PubMed ID: {pubmed_id}")
    return paper_details

In [12]:
# Function to fetch all paper details based on a query

def fetch_all_papers(query: str, max_results: int = 10) -> List[Dict[str, Any]]:
    logging.debug(f"Starting to fetch papers for query: {query}")
    
    # Fetch the paper IDs
    paper_ids = fetch_paper_ids_from_pubmed(query, max_results)
    
    if not paper_ids:
        logging.warning(f"No paper IDs found for query: {query}")
        return []
    
    logging.debug(f"Fetched {len(paper_ids)} paper IDs for query: {query}")

    # Fetch the details for each paper using the paper IDs
    all_paper_details = []
    for pubmed_id in paper_ids:
        logging.debug(f"Fetching details for PubMed ID: {pubmed_id}")
        paper_details = fetch_paper_details(pubmed_id)
        
        if paper_details:
            all_paper_details.append(paper_details)
            logging.debug(f"Successfully fetched details for PubMed ID: {pubmed_id}")
        else:
            logging.error(f"Failed to fetch details for PubMed ID: {pubmed_id}")
    
    logging.info(f"Successfully fetched details for {len(all_paper_details)} papers out of {len(paper_ids)} paper IDs.")
    
    return all_paper_details

In [13]:
query = "31158464"
papers = fetch_all_papers(query, max_results=20)

papers

2025-04-12 22:39:26,313 - DEBUG - Starting to fetch papers for query: 31158464
2025-04-12 22:39:26,313 - DEBUG - Fetching PubMed IDs for query: 31158464 with parameters: {'db': 'pubmed', 'term': '31158464', 'retmax': 20, 'usehistory': 'y', 'retmode': 'xml'}
2025-04-12 22:39:26,317 - DEBUG - Starting new HTTPS connection (1): eutils.ncbi.nlm.nih.gov:443
2025-04-12 22:39:27,156 - DEBUG - https://eutils.ncbi.nlm.nih.gov:443 "GET /entrez/eutils/esearch.fcgi?db=pubmed&term=31158464&retmax=20&usehistory=y&retmode=xml HTTP/11" 200 None
2025-04-12 22:39:27,156 - INFO - Successfully fetched paper IDs for query: 31158464
2025-04-12 22:39:27,156 - DEBUG - Fetched 1 paper IDs
2025-04-12 22:39:27,156 - DEBUG - Fetched 1 paper IDs for query: 31158464
2025-04-12 22:39:27,156 - DEBUG - Fetching details for PubMed ID: 31158464
2025-04-12 22:39:27,156 - DEBUG - Starting to fetch paper details for PubMed ID: 31158464
2025-04-12 22:39:27,156 - DEBUG - Fetching XML data for PubMed ID: 31158464 with paramet

[{'PubMedID': '31158464',
  'Title': 'Thyroid autoimmune disorders and cancer.',
  'PublicationDate': 'N/A',
  'Authors': ['Giusy Elia'],
  'CompanyAffiliations': {'Department of Clinical and Experimental Medicine, University of Pisa, Pisa, Italy. Electronic address:'},
  'CorrespondingAuthorEmail': ['e.giusy_87@hotmail.it']}]

In [14]:
import pandas as pd

# Function to save paper details to CSV
def save_paper_details_to_csv(papers: List[Dict[str, Any]], query: str, filename: Optional[str] = None) -> None:
    logging.debug(f"Starting to save paper details to CSV for query: {query}")

    # Sanitize the query to make it a valid filename
    sanitized_query = re.sub(r'[^\w\s-]', '', query)  # Remove any special characters
    sanitized_query = re.sub(r'[-\s]+', '_', sanitized_query).strip()  # Replace spaces and hyphens with underscores

    # Set the filename to the query if not provided
    if not filename:
        filename = f"{sanitized_query}_results.csv"
        logging.debug(f"Generated filename from query: {filename}")
    else:
        logging.debug(f"Using provided filename: {filename}")

    # Create a DataFrame from the list of paper details
    try:
        df = pd.DataFrame(papers)
        logging.debug(f"DataFrame created with {len(df)} rows.")

        # Save the DataFrame to a CSV file
        df.to_csv(filename, index=False, encoding='utf-8')
        logging.info(f"Results successfully saved to {filename}")
    except Exception as e:
        logging.error(f"Error saving results to {filename}: {e}")

In [15]:
# Save to CSV with dynamic filename based on the query
save_paper_details_to_csv(papers, query=query)

2025-04-12 22:40:22,962 - DEBUG - Starting to save paper details to CSV for query: 31158464
2025-04-12 22:40:22,969 - DEBUG - Generated filename from query: 31158464_results.csv
2025-04-12 22:40:22,980 - DEBUG - DataFrame created with 1 rows.
2025-04-12 22:40:22,982 - INFO - Results successfully saved to 31158464_results.csv


In [None]:
# import sys
# import argparse
# import logging


# def setup_logging(debug: bool):
#     logging.basicConfig(
#         level=logging.DEBUG if debug else logging.INFO,
#         format='%(asctime)s - %(levelname)s - %(message)s'
#     )

# def main():
#     # Simulate command-line arguments for Jupyter Notebook
#     sys.argv = ['your_script.py', 'healthcare burnout', '--debug', '--file', 'results.csv']

#     # Argument parsing setup
#     parser = argparse.ArgumentParser(description="Fetch PubMed papers based on a query")
#     parser.add_argument("query", type=str, help="Search query to fetch papers from PubMed")
#     parser.add_argument("-d", "--debug", action="store_true", help="Enable debug logging")
#     parser.add_argument("-f", "--file", type=str, help="Specify the filename to save the results")

#     # Parse the arguments
#     args = parser.parse_args()

#     # Set up logging based on the debug flag
#     setup_logging(debug=args.debug)

#     # Fetch the papers based on the query
#     papers = fetch_all_papers(args.query, max_results=10)

#     # Save the results to a file or print them to the console
#     if args.file:
#         save_paper_details_to_csv(papers, query=args.query, filename=args.file)
#     else:
#         print(papers)

# # Run the main function
# main()

2025-04-12 22:40:23,003 - DEBUG - Starting to fetch papers for query: healthcare burnout
2025-04-12 22:40:23,003 - DEBUG - Fetching PubMed IDs for query: healthcare burnout with parameters: {'db': 'pubmed', 'term': 'healthcare burnout', 'retmax': 10, 'usehistory': 'y', 'retmode': 'xml'}
2025-04-12 22:40:23,003 - DEBUG - Starting new HTTPS connection (1): eutils.ncbi.nlm.nih.gov:443
2025-04-12 22:40:23,927 - DEBUG - https://eutils.ncbi.nlm.nih.gov:443 "GET /entrez/eutils/esearch.fcgi?db=pubmed&term=healthcare+burnout&retmax=10&usehistory=y&retmode=xml HTTP/11" 200 None
2025-04-12 22:40:23,933 - INFO - Successfully fetched paper IDs for query: healthcare burnout
2025-04-12 22:40:23,938 - DEBUG - Fetched 10 paper IDs
2025-04-12 22:40:23,943 - DEBUG - Fetched 10 paper IDs for query: healthcare burnout
2025-04-12 22:40:23,944 - DEBUG - Fetching details for PubMed ID: 40217487
2025-04-12 22:40:23,946 - DEBUG - Starting to fetch paper details for PubMed ID: 40217487
2025-04-12 22:40:23,948 - 

KeyboardInterrupt: 

In [17]:
# import openai
# from dotenv import load_dotenv
# import os

# # Load the API key from the .env file
# load_dotenv()

# # Set the OpenAI API key
# openai.api_key = os.getenv("OPENAI_API_KEY")

# # Check if the API key is loaded correctly
# if openai.api_key:
#     print("API Key loaded successfully!")
# else:
#     print("Error: OpenAI API Key not found.")

In [18]:
# # Define the function for classifying affiliations
# def classify_affiliation_with_llm(affiliation: str) -> str:
#     prompt = f"""
# Classify the following affiliation into one of these categories: Academic, Pharma/Biotech, or Other.
# Affiliation: "{affiliation}"
# Only return one of: Academic, Pharma/Biotech, Other.
# """
#     client = openai.OpenAI()
#     completion = client.completions.create(
#         model="curie",
#         prompt=prompt,
#         temperature=0,
#         n=1
#     )
    
#     # Accessing the response using the new model's methods
#     classification = completion['choices'][0]['message']['content'].strip()
    
#     return classification

In [19]:
import ollama

model = "llama3:8b"
# response = ollama.chat(model=model, messages=[{'role': 'user', 'content': 'What is Ollama?'}])
# print(response['message']['content'])

In [20]:
# # Test with a few sample affiliations
# sample_affiliations = [
#     "Harvard University, Department of Biology",
#     "Pfizer Pharmaceuticals, Inc.",
#     "Acme Corp. - Research and Development",
#     "University of California, Berkeley",
#     "Johnson & Johnson"
# ]

# # Run the classification function on each sample
# for affiliation in sample_affiliations:
#     classification = classify_affiliation_with_llm(affiliation)
#     print(f"Affiliation: {affiliation}\nClassification: {classification}\n")

In [21]:
def is_academic(affiliation: str, model_name: str = "llama3:8b") -> bool:
    """
    Check if an affiliation is academic using an LLM.

    Args:
        affiliation (str): The affiliation string to check.
        model_name (str): The model to use for classification.

    Returns:
        bool: True if the affiliation is considered academic, False otherwise.
    """
    prompt = f"""
    Is the following affiliation academic or not? Reply with only 'Yes' or 'No'.

    Affiliation: "{affiliation}"
    """

    response = ollama.chat(model=model_name, messages=[{"role": "user", "content": prompt}])
    answer = response['message']['content'].strip().lower()

    return answer.startswith("yes")

In [30]:
def is_pharma_biotech(affiliation: str, model_name: str = "llama3:8b") -> bool:
    """
    Check if an affiliation is related to the pharmaceutical or biotech industry using an LLM.

    Args:
        affiliation (str): The affiliation string to check.
        model_name (str): The Ollama model to use. Defaults to "llama3:8b".

    Returns:
        bool: True if the affiliation is considered pharmaceutical or biotech, False otherwise.
    """
    prompt = (
        f"Is the following affiliation related to pharmaceutical or biotech industry? "
        f"Reply with only 'Yes' or 'No'.\n\nAffiliation: \"{affiliation}\""
    )

    response = ollama.chat(
        model=model_name,
        messages=[{"role": "user", "content": prompt}]
    )

    answer = response["message"]["content"].strip().lower()
    return answer.startswith("yes")

In [31]:
def test_is_pharma_biotech():
    # Test cases for pharma/biotech and non-pharma/biotech affiliations
    test_cases = [
        ("Pfizer Inc, Research and Development", True),  # Pharma/biotech affiliation
        ("Johnson & Johnson, Pharmaceuticals", True),  # Pharma/biotech affiliation
        ("Harvard University, Department of Computer Science", False),  # Non-pharma
        ("Biotech Labs, Inc.", True),  # Pharma/biotech affiliation
        ("Massachusetts Institute of Technology (MIT)", False),  # Non-pharma
        ("AstraZeneca Pharmaceuticals", True),  # Pharma/biotech affiliation
    ]

    for affiliation, expected_result in test_cases:
        result = is_pharma_biotech(affiliation)
        assert result == expected_result, f"Test failed for: {affiliation}. Expected {expected_result}, got {result}"

    print("All tests passed!")

if __name__ == "__main__":
    test_is_pharma_biotech()


2025-04-12 22:49:10,570 - DEBUG - close.started
2025-04-12 22:49:10,579 - DEBUG - close.complete
2025-04-12 22:49:10,582 - DEBUG - connect_tcp.started host='127.0.0.1' port=11434 local_address=None timeout=None socket_options=None
2025-04-12 22:49:10,592 - DEBUG - connect_tcp.complete return_value=<httpcore._backends.sync.SyncStream object at 0x0000018E7F1642C0>
2025-04-12 22:49:10,599 - DEBUG - send_request_headers.started request=<Request [b'POST']>
2025-04-12 22:49:10,614 - DEBUG - send_request_headers.complete
2025-04-12 22:49:10,616 - DEBUG - send_request_body.started request=<Request [b'POST']>
2025-04-12 22:49:10,619 - DEBUG - send_request_body.complete
2025-04-12 22:49:10,621 - DEBUG - receive_response_headers.started request=<Request [b'POST']>
2025-04-12 22:49:12,838 - DEBUG - receive_response_headers.complete return_value=(b'HTTP/1.1', 200, b'OK', [(b'Content-Type', b'application/json; charset=utf-8'), (b'Date', b'Sat, 12 Apr 2025 17:19:12 GMT'), (b'Content-Length', b'297')]

All tests passed!
