In [1]:
from Bio import Entrez, Medline
import time
import json
import pandas as pd
from typing import List, Dict

Entrez.email = "sraole3@gatech.edu"

In [12]:
def search_pubmed(query: str, retmax: int = 200) -> List[str]:
    """Search PubMed for a given query and return a list of PMIDs."""
    handle = Entrez.esearch(db="pubmed", term=query, retmax=retmax, sort="relevance")
    record = Entrez.read(handle)
    handle.close()
    return record["IdList"]

queries = [
    '(cancer OR oncology) AND (genomics OR transcriptomics OR biomarker)',
    '(cancer OR oncology) AND (drug OR therapy OR therapeutic OR treatment)',
    '(cancer OR oncology) AND (resistance OR "drug resistance" OR relapse)',
    '(cancer OR oncology) AND (target OR "drug target" OR interaction OR pathway)'
]

all_pmids = set()
for q in queries:
    ids = search_pubmed(q, retmax=300)
    all_pmids.update(ids)

pmids = list(all_pmids)
print(f"Unique PMIDs: {len(pmids)}")

Unique PMIDs: 1184


In [13]:
def fetch_pubmed_details(pmids: List[str]) -> List[Dict]:
    """
    Fetch PubMed records in batches and return simplified dicts.
    """
    records_out = []
    batch_size = 100

    for i in range(0, len(pmids), batch_size):
        batch = pmids[i:i+batch_size]
        ids = ",".join(batch)

        handle = Entrez.efetch(
            db="pubmed",
            id=ids,
            rettype="abstract",
            retmode="xml"
        )
        data = Entrez.read(handle)
        handle.close()

        articles = data["PubmedArticle"]

        for article in articles:
            try:
                medline = article["MedlineCitation"]
                pmid = str(medline["PMID"])

                article_info = medline["Article"]
                title = str(article_info.get("ArticleTitle", "")).strip()

                # Abstract text split into sections
                abstract_text = ""
                if "Abstract" in article_info and "AbstractText" in article_info["Abstract"]:
                    parts = article_info["Abstract"]["AbstractText"]
                    abstract_text = " ".join([str(p) for p in parts]).strip()

                # Journal
                journal = ""
                if "Journal" in article_info and "Title" in article_info["Journal"]:
                    journal = str(article_info["Journal"]["Title"])

                # Year
                year = None
                try:
                    pubdate = article_info["Journal"]["JournalIssue"]["PubDate"]
                    if "Year" in pubdate:
                        year = str(pubdate["Year"])
                except Exception:
                    pass

                # Authors
                authors = []
                if "AuthorList" in article_info:
                    for a in article_info["AuthorList"]:
                        if "LastName" in a and "Initials" in a:
                            authors.append(f"{a['LastName']} {a['Initials']}")

                # MeSH terms
                mesh_terms = []
                if "MeshHeadingList" in medline:
                    for mh in medline["MeshHeadingList"]:
                        if "DescriptorName" in mh:
                            mesh_terms.append(str(mh["DescriptorName"]))

                records_out.append({
                    "pmid": pmid,
                    "title": title,
                    "abstract": abstract_text,
                    "journal": journal,
                    "year": year,
                    "authors": authors,
                    "mesh_terms": mesh_terms
                })

            except Exception as e:
                # Skipping malformed records
                continue

        time.sleep(0.34)  # be nice to NCBI

    return records_out

pubmed_raw = fetch_pubmed_details(pmids)
print(f"Fetched {len(pubmed_raw)} records")
pubmed_raw[0]

Fetched 1184 records


{'pmid': '33717076',
 'title': 'Integrative Genomic and Transcriptomic Analyses of Tumor Suppressor Genes and Their Role on Tumor Microenvironment and Immunity in Lung Squamous Cell Carcinoma.',
 'abstract': "Non-small-cell lung cancers (NSCLCs) are largely classified into lung adenocarcinoma (LUAD) and lung squamous cell carcinoma (LUSC), which have different therapeutic options according to its molecular profiles and immune checkpoint expression, especially PD-L1, which is a suppressive factor in the tumor microenvironment. The tumor microenvironment can be altered by the genomic mutations on specific innate immune genes as well as tumor suppressor genes, so it is essential to comprehend the association between tumor microenvironment and tumor suppressor genes to discover the promising immunotherapeutic strategy to overcome the resistance of immune check point blockade. In this study, we aimed to analyze how the somatic mutations in tumor suppressor genes affect the tumor immune micr

In [15]:
def build_pubmed_record(p):
    # Build searchable text (This is what I will be embedding)
    authors_short = ", ".join(p["authors"][:5]) if p.get("authors") else ""
    mesh_short = "; ".join(p["mesh_terms"][:15]) if p.get("mesh_terms") else ""

    search_text_parts = [
        f"PubMed paper: {p.get('title', '')}.",
        f"Abstract: {p.get('abstract', '')}" if p.get("abstract") else "",
        f"Journal: {p.get('journal', '')}." if p.get("journal") else "",
        f"Year: {p.get('year', '')}." if p.get("year") else "",
        f"Authors: {authors_short}." if authors_short else "",
        f"MeSH terms: {mesh_short}." if mesh_short else "",
    ]
    search_text = " ".join([x for x in search_text_parts if x]).strip()

    return {
        "id": f"PubMed::{p['pmid']}",
        "entity_type": "PubMedArticle",
        "identifier": p["pmid"],
        "name": p.get("title", ""),
        "search_text": search_text,
        "metadata": {
            "source": "PubMed",
            "pmid": p["pmid"],
            "journal": p.get("journal"),
            "year": p.get("year"),
            "authors": p.get("authors", []),
            "mesh_terms": p.get("mesh_terms", [])
        }
    }

pubmed_records = [build_pubmed_record(p) for p in pubmed_raw if p.get("title") and p.get("abstract")]
print(len(pubmed_records))
print(pubmed_records[0]["id"])
print(pubmed_records[0]["search_text"][:500])

1177
PubMed::33717076
PubMed paper: Integrative Genomic and Transcriptomic Analyses of Tumor Suppressor Genes and Their Role on Tumor Microenvironment and Immunity in Lung Squamous Cell Carcinoma.. Abstract: Non-small-cell lung cancers (NSCLCs) are largely classified into lung adenocarcinoma (LUAD) and lung squamous cell carcinoma (LUSC), which have different therapeutic options according to its molecular profiles and immune checkpoint expression, especially PD-L1, which is a suppressive factor in the tumor microenvi


In [None]:
# Saving as JSON

with open("data/records/pubmed_records_cancer_mvp.json", "w") as f:
    json.dump(pubmed_records, f, indent=2)

print("Saved pubmed_records_cancer_mvp.json")

Saved pubmed_records_cancer_mvp.json
