Installing the Required Libraries

In [1]:
!pip install requests lxml pandas



Importing Modules & Defining Global Constants

In [8]:
import requests
import pandas as pd
import xml.etree.ElementTree as ET
import re
from typing import List, Dict, Optional

# Keywords used to detect non-academic affiliations
NON_ACADEMIC_KEYWORDS = [
    'pharma', 'biotech', 'inc', 'ltd', 'gmbh', 'corp',
    'therapeutics', 'labs', 'llc', 'co.', 'plc'
]

PubMed Search Function

In [9]:
def search_pubmed(query: str, retmax: int = 20) -> List[str]:
    """Search PubMed using a query and return list of PubMed IDs."""
    search_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
    params = {
        "db": "pubmed",
        "term": query,
        "retmode": "json",
        "retmax": retmax
    }
    response = requests.get(search_url, params=params)
    response.raise_for_status()
    return response.json()["esearchresult"]["idlist"]

Fetching Articles by IDs

In [10]:
def fetch_articles(pubmed_ids: List[str]) -> List[ET.Element]:
    """Fetch article metadata from PubMed using ID list."""
    fetch_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
    params = {
        "db": "pubmed",
        "id": ",".join(pubmed_ids),
        "retmode": "xml"
    }
    response = requests.get(fetch_url, params=params)
    response.raise_for_status()
    root = ET.fromstring(response.text)
    return root.findall(".//PubmedArticle")

Helper Functions

In [11]:
def is_non_academic(affiliation: str) -> bool:
    """Check if an affiliation is non-academic using keyword heuristics."""
    return any(keyword in affiliation.lower() for keyword in NON_ACADEMIC_KEYWORDS)

def extract_email(text: str) -> Optional[str]:
    """Extract email from text using regex."""
    match = re.search(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}", text)
    return match.group(0) if match else None

Parsing Articles' Metadata

In [12]:
def parse_article(article: ET.Element, debug: bool = False) -> Optional[Dict[str, str]]:
    """Parse one article, extract fields only if it has non-academic authors."""
    pmid = article.findtext(".//PMID")
    title = article.findtext(".//ArticleTitle")
    pub_date = article.findtext(".//PubDate/Year") or "Unknown"

    non_acad_authors = []
    affiliations = []
    email = ""

    for author in article.findall(".//Author"):
        name = f"{author.findtext('ForeName') or ''} {author.findtext('LastName') or ''}".strip()
        affiliation_node = author.find(".//AffiliationInfo")

        if affiliation_node is not None:
            affiliation = affiliation_node.findtext("Affiliation")
            if affiliation:
                if is_non_academic(affiliation):
                    non_acad_authors.append(name)
                    affiliations.append(affiliation)
                if not email:
                    email = extract_email(affiliation) or email

    if not non_acad_authors:
        if debug:
            print(f"🔍 Excluded PMID {pmid} — Academic only.")
        return None

    return {
        "PubmedID": pmid,
        "Title": title,
        "Publication Date": pub_date,
        "Non-academic Author(s)": "; ".join(non_acad_authors),
        "Company Affiliation(s)": "; ".join(affiliations),
        "Corresponding Author Email": email
    }

Main Pipeline Runner

In [13]:
def run_pipeline(query: str, retmax: int = 20, debug: bool = False) -> pd.DataFrame:
    """End-to-end process: search, fetch, parse, filter, return DataFrame."""
    pubmed_ids = search_pubmed(query, retmax)
    if debug:
        print(f"🔎 Found {len(pubmed_ids)} articles for query: '{query}'")

    articles = fetch_articles(pubmed_ids)
    results = []

    for article in articles:
        data = parse_article(article, debug=debug)
        if data:
            results.append(data)

    df = pd.DataFrame(results)
    return df

Using the Tool

In [14]:
query = "cancer AND therapy"
df = run_pipeline(query, retmax=25, debug=True)
df.to_csv("filtered_pubmed_papers.csv", index=False)
df.head()

🔎 Found 25 articles for query: 'cancer AND therapy'
🔍 Excluded PMID 40742624 — Academic only.
🔍 Excluded PMID 40742611 — Academic only.
🔍 Excluded PMID 40742586 — Academic only.
🔍 Excluded PMID 40742583 — Academic only.
🔍 Excluded PMID 40742572 — Academic only.
🔍 Excluded PMID 40742566 — Academic only.
🔍 Excluded PMID 40742565 — Academic only.
🔍 Excluded PMID 40742542 — Academic only.
🔍 Excluded PMID 40742486 — Academic only.
🔍 Excluded PMID 40742462 — Academic only.
🔍 Excluded PMID 40742460 — Academic only.
🔍 Excluded PMID 40742448 — Academic only.
🔍 Excluded PMID 40742388 — Academic only.
🔍 Excluded PMID 40742376 — Academic only.
🔍 Excluded PMID 40742352 — Academic only.
🔍 Excluded PMID 40742351 — Academic only.
🔍 Excluded PMID 40742326 — Academic only.
🔍 Excluded PMID 40742316 — Academic only.
🔍 Excluded PMID 40742313 — Academic only.


Unnamed: 0,PubmedID,Title,Publication Date,Non-academic Author(s),Company Affiliation(s),Corresponding Author Email
0,40742567,Anti-tumor efficacy of RAF/MEK inhibitor VS676...,2025,Haixin Zhu; Gezi Yan; Junjie Ma; Bo Zhang; You...,"College of Pharmaceutical Sciences, Hangzhou F...",dongrongdrdr@zju.edu.cn
1,40742536,ASXL1 deficiency causes epigenetic dysfunction...,2025,Maggie P Fu; Allison Matthews; Maja Tarailo-Gr...,"British Columbia Children's Hospital, Centre f...",
2,40742481,The effect of Fe,2025,Atefeh Mansuryar; Hossein Ali Ebrahimi; Sergio...,"Department of Pharmaceutics, School of Pharmac...",haliebrahimi@gmail.com
3,40742459,"Multi-institutional, randomized, controlled tr...",2025,Gina Chung; Frank Jacobucci,"The Christ Hospital, Cincinnati, OH, USA.; Che...",ssonis@pesclinical.com
4,40742385,PI3K/Akt signaling pathway regulates CD155 exp...,2025,Tatsuya Nishi,Okayama University Graduate School of Medicine...,
