In [1]:
!pip install requests pandas


Defaulting to user installation because normal site-packages is not writeable


In [2]:
import requests
import pandas as pd

# Constants
# Common URL prefix for the E-utilities endpoints queried below. The fetch
# helpers append the script name (esearch.fcgi / esummary.fcgi) directly to
# this string, so the trailing slash must be kept.
BASE_URL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"

def fetch_paper_ids(query, timeout=30):
    """Fetch the PubMed IDs that match a search query.

    Args:
        query: Search term sent to the esearch endpoint as the ``term``
            parameter.
        timeout: Seconds to wait for the server before ``requests`` gives up.
            Without a timeout a stalled connection would block forever.

    Returns:
        The list stored under ``esearchresult.idlist`` in the JSON reply, or
        an empty list when either key is missing.

    Raises:
        Exception: If the response status code is not 200.
    """
    # Hand the parameters to requests instead of formatting them into the URL:
    # characters such as "&", "#" or "+" inside the search term are then
    # percent-encoded rather than being parsed as part of the query string.
    response = requests.get(
        f"{BASE_URL}esearch.fcgi",
        params={"db": "pubmed", "term": query, "retmode": "json"},
        timeout=timeout,
    )

    if response.status_code == 200:
        data = response.json()
        return data.get("esearchresult", {}).get("idlist", [])

    raise Exception("Failed to fetch paper IDs")

def fetch_paper_details(paper_id, timeout=30):
    """Fetch the summary record of one paper by its PubMed ID.

    Args:
        paper_id: PubMed ID, as a string or an integer.
        timeout: Seconds to wait for the server before ``requests`` gives up.

    Returns:
        The dict stored under ``result[<paper_id>]`` in the JSON reply. An
        empty dict is returned when the status code is not 200 or when the
        reply has no entry for this ID.
    """
    response = requests.get(
        f"{BASE_URL}esummary.fcgi",
        params={"db": "pubmed", "id": paper_id, "retmode": "json"},
        timeout=timeout,
    )

    # Best effort: a failed lookup yields an empty record instead of raising.
    if response.status_code != 200:
        return {}

    data = response.json()
    # JSON object keys are always strings, so normalise the ID before the
    # lookup; an integer ID would otherwise never match and return {}.
    return data.get("result", {}).get(str(paper_id), {})

def fetch_papers(query):
    """Search PubMed for ``query`` and build one record per matching paper.

    Each record is a dict with the keys ``PubmedID``, ``Title``,
    ``Publication Date`` and ``Authors``. Fields missing from the paper's
    details fall back to ``"N/A"`` (title, date) or an empty list (authors).
    """

    def _to_record(pubmed_id):
        # One details lookup per ID, in the order the search returned them.
        summary = fetch_paper_details(pubmed_id)
        return {
            "PubmedID": pubmed_id,
            "Title": summary.get("title", "N/A"),
            "Publication Date": summary.get("pubdate", "N/A"),
            "Authors": summary.get("authors", []),
        }

    return [_to_record(pubmed_id) for pubmed_id in fetch_paper_ids(query)]

def identify_non_academic_authors(authors):
    """Pick out the authors whose affiliation looks like a company.

    An author counts as non-academic when the affiliation string contains one
    of the company keywords below. The match is a case-sensitive substring
    test.

    Args:
        authors: Iterable of author dicts read through the ``name`` and
            ``affiliation`` keys. ``None`` is treated as an empty list.

    Returns:
        A ``(names, affiliations)`` tuple of two parallel lists: the matched
        authors' names and the affiliation string that matched for each.

    NOTE(review): the output captured in this notebook shows both derived
    columns empty for every row, so the author records passed in may not
    carry an ``affiliation`` key at all -- confirm against the API response.
    """
    company_keywords = ("Pharma", "Biotech", "Inc.", "Ltd.", "Corporation", "Company")
    non_academic_authors = []
    company_affiliations = []

    # "or []" tolerates a missing/None author list instead of raising TypeError.
    for author in authors or []:
        # A key that is present but None would break the substring test below.
        affiliation = author.get("affiliation") or ""

        if any(keyword in affiliation for keyword in company_keywords):
            # Callers join the names with ", ", so a None name must not leak out.
            non_academic_authors.append(author.get("name") or "Unknown")
            company_affiliations.append(affiliation)

    return non_academic_authors, company_affiliations

def save_to_csv(papers, filename="output.csv"):
    """Write the paper records to ``filename`` as CSV and print the path.

    The records are loaded into a DataFrame and written without the index
    column.
    """
    pd.DataFrame(papers).to_csv(filename, index=False)
    print(f"Saved results to {filename}")

def run_query(query, save_to_file=False, filename="output.csv"):
    """Fetch papers for ``query``, flag company-affiliated authors, return a table.

    Each paper record gains two comma-joined text fields,
    ``Non-academic Authors`` and ``Company Affiliations``, and loses its raw
    ``Authors`` list. When ``save_to_file`` is true the records are also
    written to ``filename`` through ``save_to_csv``.

    Returns:
        A DataFrame with one row per paper.
    """
    print(f"Fetching papers for query: {query}")

    records = fetch_papers(query)
    for record in records:
        industry_names, industry_affiliations = identify_non_academic_authors(record["Authors"])
        record.update({
            "Non-academic Authors": ", ".join(industry_names),
            "Company Affiliations": ", ".join(industry_affiliations),
        })
        # The raw author list was only needed for the classification above.
        record.pop("Authors", None)

    result = pd.DataFrame(records)
    if save_to_file:
        save_to_csv(records, filename)

    # Returned (not printed) so a notebook cell can render it as a table.
    return result



In [3]:
# Run the search once; save_to_file=True also writes the table to papers.csv.
df = run_query("cancer therapy", save_to_file=True, filename="papers.csv")
df  # Bare last expression: Jupyter renders the returned DataFrame as the cell output


Fetching papers for query: cancer therapy
Saved results to papers.csv


Unnamed: 0,PubmedID,Title,Publication Date,Non-academic Authors,Company Affiliations
0,40064605,The Lives of Older People With Advanced Cancer...,2025 Mar,,
1,40064578,Advances of PET/CT in Target Delineation of Lu...,2025 Mar 9,,
2,40064574,Racial Diversity and Co-Mutational Analysis of...,2025 Feb 14,,
3,40064573,"The PESGA Trial: A Prospective, Open-Label, Si...",2025 Feb 13,,
4,40064481,Neoadjuvant treatment for incidental gallbladd...,2025 Mar 11,,
5,40064476,Highlights of 2024. Broadening anti-cancer imm...,2025 Mar 10,,
6,40064384,Deciphering a crosstalk between biological cue...,2025 Mar 8,,
7,40064365,miR-124-mediated temozolomide sensitivity and ...,2025 Mar 8,,
8,40064297,"Real-world evidence regarding cancer, mortalit...",2025 Mar 8,,
9,40064294,Quality of life and emotional state of patient...,2025 Mar 8,,


In [4]:
from IPython.display import FileLink
# Show a link to papers.csv, the file name passed to run_query in the previous cell.
FileLink("papers.csv")
