In [1]:
!pip install requests pandas



In [None]:
# ============================================================
# 3-LAYER PUBMED CORPUS BUILDER (MEDICAL AFFAIRS / NLP READY)
# ============================================================

import time
import requests
import pandas as pd
import xml.etree.ElementTree as ET

# ----------------------------
# CONFIG
# ----------------------------
# Contact address sent with every E-utilities request (as the `email`
# parameter) and embedded in the session's User-Agent header.
CONTACT_EMAIL = "samitkumardas321@gmail.com"
# Number of records requested per EFetch call.
BATCH_SIZE = 1000
# Upper bound on records fetched for each query layer. With three layers this
# pulls ~30k records; the author's estimate is ~25k unique PMIDs after dedupe.
PER_LAYER_TARGET = 10000
# Pause (seconds) between consecutive EFetch batches.
SLEEP_SECONDS = 0.35

# PubMed E-utilities endpoints: ESearch runs the query, EFetch downloads the
# matching records.
ESEARCH_URL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
EFETCH_URL  = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"

# Broad, Medical Affairs–style filters. Each one is AND-ed into every layer
# query in QUERIES: a topical scope, a publication-date range, and a
# publication-type restriction.
THERAPY_FILTER = '(medicine OR clinical OR therapeutic OR treatment OR disease)'
DATE_FILTER    = '("2012"[Date - Publication] : "2025"[Date - Publication])'
TYPE_FILTER    = '(review[Publication Type] OR clinical trial[Publication Type] OR observational study[Publication Type])'

# ----------------------------
# QUERIES (3 LAYERS)
# ----------------------------
# Maps a layer label to its PubMed search expression. The label is later
# written to the `layer` column of every row fetched for that query.
# Each expression is a group of Title/Abstract terms OR-ed together, then
# AND-ed with the shared THERAPY_FILTER, TYPE_FILTER and DATE_FILTER.
# The multi-line layout is for readability only: esearch_get_history collapses
# all whitespace runs to single spaces before sending the term.
QUERIES = {
    "L1_scientific_engagement": f"""
        (
            "mechanism of action"[Title/Abstract] OR
            "real-world evidence"[Title/Abstract] OR
            "clinical benefit"[Title/Abstract] OR
            efficacy[Title/Abstract] OR
            "benefit-risk"[Title/Abstract] OR
            "treatment response"[Title/Abstract] OR
            biomarker[Title/Abstract]
        )
        AND {THERAPY_FILTER}
        AND {TYPE_FILTER}
        AND {DATE_FILTER}
    """,

    "L2_safety_risk": f"""
        (
            safety[Title/Abstract] OR
            tolerability[Title/Abstract] OR
            "adverse event"[Title/Abstract] OR
            "adverse events"[Title/Abstract] OR
            toxicity[Title/Abstract] OR
            "risk profile"[Title/Abstract]
        )
        AND {THERAPY_FILTER}
        AND {TYPE_FILTER}
        AND {DATE_FILTER}
    """,

    "L3_comparative_critical": f"""
        (
            versus[Title/Abstract] OR
            "head-to-head"[Title/Abstract] OR
            comparison[Title/Abstract] OR
            "unmet need"[Title/Abstract] OR
            limitations[Title/Abstract] OR
            challenges[Title/Abstract]
        )
        AND {THERAPY_FILTER}
        AND {TYPE_FILTER}
        AND {DATE_FILTER}
    """
}

# ----------------------------
# SAFE ESEARCH
# ----------------------------
def esearch_get_history(term, retmax):
    """Run a PubMed ESearch and return handles to the stored result set.

    The query is submitted with ``usehistory=y`` so that the matching records
    can be downloaded later through EFetch using the returned
    ``WebEnv``/``QueryKey`` pair.

    Args:
        term: PubMed search expression. Runs of whitespace (including
            newlines) are collapsed to single spaces before sending.
        retmax: Value forwarded as the ESearch ``retmax`` parameter.

    Returns:
        A ``(count, webenv, query_key)`` tuple: the total number of matches
        as an int, and the two history handles as strings. The handles may be
        ``None`` only when ``count`` is 0.

    Raises:
        requests.HTTPError: If the HTTP status indicates failure.
        RuntimeError: If the response carries an ``ERROR`` element, or reports
            matches without returning usable history handles.
    """
    params = {
        "db": "pubmed",
        "term": " ".join(term.split()),
        "retmax": retmax,
        "usehistory": "y",
        "email": CONTACT_EMAIL
    }
    r = requests.get(ESEARCH_URL, params=params, timeout=60)
    r.raise_for_status()
    # Parse the raw bytes so the XML parser applies the encoding declared in
    # the document, rather than whatever charset requests guessed for r.text.
    root = ET.fromstring(r.content)

    # Fail loudly instead of reporting "0 matches" for a rejected request.
    # NOTE(review): assumes errors surface as a top-level <ERROR> element.
    error_text = root.findtext("ERROR")
    if error_text:
        raise RuntimeError(f"ESearch returned an error: {error_text}")

    count = int(root.findtext("Count", "0"))
    webenv = root.findtext("WebEnv")
    query_key = root.findtext("QueryKey")

    # Without both handles every later EFetch call would fail; stop here with
    # a clear message rather than after a full retry cycle downstream.
    if count > 0 and not (webenv and query_key):
        raise RuntimeError(
            "ESearch response is missing WebEnv/QueryKey; cannot fetch results"
        )
    return count, webenv, query_key

# ----------------------------
# SAFE EFETCH (RETRIES + BACKOFF)
# ----------------------------
def _element_text(element):
    """Return all text inside *element*, including text nested in child tags."""
    if element is None:
        return ""
    # itertext() also yields text inside/after inline markup such as <i> or
    # <sub>; Element.text alone stops at the first child element.
    return "".join(element.itertext()).strip()


def _parse_pubmed_articles(xml_payload):
    """Convert an EFetch XML payload into row dicts, skipping empty abstracts."""
    root = ET.fromstring(xml_payload)

    rows = []
    for article in root.findall(".//PubmedArticle"):
        # An abstract may be split over several AbstractText elements; join
        # the non-empty ones with single spaces.
        abstract_parts = [
            _element_text(node) for node in article.findall(".//AbstractText")
        ]
        abstract = " ".join(part for part in abstract_parts if part)
        if not abstract:
            continue

        year = (
            article.findtext(".//PubDate/Year") or
            article.findtext(".//ArticleDate/Year") or
            ""
        )

        rows.append({
            "pmid": article.findtext(".//PMID") or "",
            "title": _element_text(article.find(".//ArticleTitle")),
            "abstract": abstract,
            "year": year,
            "journal": article.findtext(".//Journal/Title") or ""
        })

    return rows


def efetch_batch_safe(session, webenv, query_key, retstart, retmax,
                      max_retries=5):
    """Download one batch of PubMed records from a stored ESearch result.

    Args:
        session: Object exposing ``get(url, params=..., timeout=...)`` whose
            response provides ``raise_for_status()`` and ``content``
            (a ``requests.Session`` in this file).
        webenv: ``WebEnv`` handle returned by ESearch.
        query_key: ``QueryKey`` handle returned by ESearch.
        retstart: Zero-based offset of the first record to fetch.
        retmax: Maximum number of records to fetch in this call.
        max_retries: Total number of attempts before giving up.

    Returns:
        A list of dicts with keys ``pmid``, ``title``, ``abstract``, ``year``
        and ``journal``. Records without abstract text are omitted, so the
        list can be shorter than ``retmax``.

    Raises:
        RuntimeError: If every attempt fails; the last underlying exception
            is attached as ``__cause__``.
    """
    params = {
        "db": "pubmed",
        "query_key": query_key,
        "WebEnv": webenv,
        "retstart": retstart,
        "retmax": retmax,
        "retmode": "xml",
        "rettype": "abstract",
        "email": CONTACT_EMAIL
    }

    last_error = None
    for attempt in range(1, max_retries + 1):
        try:
            r = session.get(EFETCH_URL, params=params, timeout=(20, 180))
            r.raise_for_status()
            # Hand bytes to the parser so the encoding declared in the XML is
            # honoured instead of the charset requests guessed for r.text.
            return _parse_pubmed_articles(r.content)

        # Deliberately broad: network failures and malformed XML are both
        # treated as transient and retried.
        except Exception as e:
            last_error = e
            print(f"EFetch error (start={retstart}) attempt {attempt}: {e}")
            # Exponential backoff (1s, 2s, 4s, ...); no point sleeping once
            # the attempts are exhausted.
            if attempt < max_retries:
                wait = 2 ** (attempt - 1)
                print(f"Retrying in {wait}s...")
                time.sleep(wait)

    raise RuntimeError("EFetch failed after retries") from last_error

# ----------------------------
# MAIN PIPELINE
# ----------------------------
def build_pubmed_corpus():
    """Fetch every query layer from PubMed, dedupe by PMID, and save to CSV.

    For each entry in ``QUERIES`` the function runs ESearch, then downloads up
    to ``PER_LAYER_TARGET`` records in ``BATCH_SIZE`` chunks, tagging each row
    with its layer label and pausing ``SLEEP_SECONDS`` between batches. Rows
    from all layers are combined and de-duplicated on ``pmid``; when a PMID
    appears in several layers the first one fetched is kept.

    Side effects:
        Prints progress to stdout and writes
        ``pubmed_medical_affairs_3layer_corpus.csv`` (without the index) to
        the current working directory.

    Returns:
        The de-duplicated ``pandas.DataFrame`` with columns ``pmid``,
        ``title``, ``abstract``, ``year``, ``journal`` and ``layer``.

    Raises:
        RuntimeError: Propagated from ``efetch_batch_safe`` when a batch
            cannot be downloaded.
    """
    all_rows = []

    # The context manager closes pooled connections even if a fetch raises.
    with requests.Session() as session:
        session.headers.update({
            "User-Agent": f"LJMU-MSc-NLP/1.0 ({CONTACT_EMAIL})"
        })

        for layer, query in QUERIES.items():
            print(f"\n=== {layer} ===")
            count, webenv, query_key = esearch_get_history(query, PER_LAYER_TARGET)

            fetch_n = min(PER_LAYER_TARGET, count)
            print(f"Matched: {count} | Fetching: {fetch_n}")

            fetched = 0
            for start in range(0, fetch_n, BATCH_SIZE):
                # The final batch may be smaller than BATCH_SIZE.
                batch_size = min(BATCH_SIZE, fetch_n - start)
                print(f"  Fetching records {start}–{start + batch_size - 1}")

                batch = efetch_batch_safe(
                    session,
                    webenv,
                    query_key,
                    retstart=start,
                    retmax=batch_size
                )

                for row in batch:
                    row["layer"] = layer
                all_rows.extend(batch)

                fetched += len(batch)
                time.sleep(SLEEP_SECONDS)

            # Lower than fetch_n whenever records lacked an abstract.
            print(f"Fetched (with abstracts): {fetched}")

    # Combine + dedupe. Naming the columns keeps the frame well-formed (and
    # drop_duplicates from raising KeyError) when nothing was fetched.
    df = pd.DataFrame(
        all_rows,
        columns=["pmid", "title", "abstract", "year", "journal", "layer"],
    )
    before = len(df)
    df = df.drop_duplicates(subset=["pmid"], keep="first")
    after = len(df)

    print("\n====================")
    print(f"Total rows before dedupe: {before}")
    print(f"Unique PMIDs after dedupe: {after}")

    df.to_csv("pubmed_medical_affairs_3layer_corpus.csv", index=False)
    print("Saved: pubmed_medical_affairs_3layer_corpus.csv")

    return df

# ----------------------------
# RUN
# ----------------------------
# Build the corpus (performs network requests and writes the CSV), keeping the
# resulting DataFrame for further use.
df_final = build_pubmed_corpus()
# Bare trailing expression: shown as the cell output when run in a notebook.
df_final.head()



=== L1_scientific_engagement ===
Matched: 369865 | Fetching: 10000
  Fetching records 0–999
  Fetching records 1000–1999
  Fetching records 2000–2999
  Fetching records 3000–3999
EFetch error (start=3000) attempt 1: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Retrying in 1s...
  Fetching records 4000–4999
  Fetching records 5000–5999
  Fetching records 6000–6999
  Fetching records 7000–7999
  Fetching records 8000–8999
  Fetching records 9000–9999
Fetched (with abstracts): 9599

=== L2_safety_risk ===
Matched: 313112 | Fetching: 10000
  Fetching records 0–999
  Fetching records 1000–1999
  Fetching records 2000–2999
  Fetching records 3000–3999
  Fetching records 4000–4999
  Fetching records 5000–5999
  Fetching records 6000–6999
  Fetching records 7000–7999
  Fetching records 8000–8999
  Fetching records 9000–9999
Fetched (with abstracts): 9564

=== L3_comparative_critical ===
Matched: 435741 | Fetching: 10000
  Fetching records 0–999


Unnamed: 0,pmid,title,abstract,year,journal,layer
0,41483736,Intracranial activity of antibody-drug conjuga...,Antibody-drug conjugates (ADCs) represent a ra...,2025,Cancer treatment reviews,L1_scientific_engagement
1,41483728,Effect of quetiapine versus haloperidol on del...,"Delirium is a neuropsychiatric condition, char...",2025,Asian journal of psychiatry,L1_scientific_engagement
2,41483727,Vision-language models in diagnostic imaging: ...,Radiology faces an unprecedented workload cris...,2025,International journal of medical informatics,L1_scientific_engagement
3,41483666,The complexity of dystrophin transcription and...,Duchenne muscular dystrophy (DMD) is a rare ge...,2025,Neuromuscular disorders : NMD,L1_scientific_engagement
4,41483580,EEG biomarkers for a precision-medicine approa...,Major depressive disorder (MDD) is a prevalent...,2025,Psychiatry research. Neuroimaging,L1_scientific_engagement
