In [2]:
# Notebook 01: Fetch PubMed abstracts for multiple categories

import pickle
import time
import os
from Bio import Entrez

# ----------------------------
# 1️⃣ Setup
# ----------------------------
# Contact e-mail registered on the Bio.Entrez module before any request is made.
# NOTE(review): a personal address is hard-coded here; consider reading it from
# an environment variable or config before sharing this notebook.
Entrez.email = "pranav4official@gmail.com"  # Required by NCBI

# Search vocabulary: category name -> list of PubMed query terms.
# Every category currently holds exactly 25 terms; insertion order is the
# order in which categories and terms are processed.
search_categories = {
    # Named conditions and illnesses.
    "diseases": [
        "diabetes", "hypertension", "cancer", "asthma", "tuberculosis", "HIV",
        "malaria", "arthritis", "osteoporosis", "obesity", "stroke",
        "Alzheimer's disease", "Parkinson's disease", "hepatitis", "migraine",
        "anemia", "COVID-19", "influenza", "cholera", "dengue", "epilepsy",
        "schizophrenia", "depression", "thyroid disorder", "kidney disease",
    ],
    # Patient-reported or observed complaints.
    "symptoms": [
        "fever", "cough", "headache", "fatigue", "chest pain",
        "shortness of breath", "dizziness", "nausea", "vomiting", "diarrhea",
        "constipation", "joint pain", "back pain", "sore throat", "weight loss",
        "weight gain", "loss of appetite", "night sweats", "itching",
        "blurred vision", "memory loss", "insomnia", "anxiety", "rash",
        "swelling",
    ],
    # Drug names and drug classes.
    "medicines": [
        "aspirin", "paracetamol", "ibuprofen", "amoxicillin", "metformin",
        "statins", "insulin", "omeprazole", "azithromycin", "chloroquine",
        "hydroxychloroquine", "dexamethasone", "morphine", "codeine", "heparin",
        "warfarin", "atenolol", "losartan", "amlodipine", "levothyroxine",
        "diazepam", "sertraline", "fluoxetine", "acetaminophen", "naproxen",
    ],
    # Diagnostic and therapeutic procedures.
    "procedures": [
        "MRI", "CT scan", "X-ray", "ultrasound", "colonoscopy", "endoscopy",
        "angioplasty", "dialysis", "chemotherapy", "radiotherapy",
        "immunotherapy", "surgery", "biopsy", "laparoscopy", "echocardiography",
        "blood transfusion", "bone marrow transplant", "pacemaker implantation",
        "stent placement", "cesarean section", "vaccination",
        "organ transplantation", "plastic surgery", "cataract surgery",
        "hip replacement",
    ],
    # Organ systems of the human body.
    "human_systems": [
        "nervous system", "circulatory system", "respiratory system",
        "digestive system", "urinary system", "reproductive system",
        "endocrine system", "skeletal system", "muscular system", "immune system",
        "integumentary system", "lymphatic system", "central nervous system",
        "autonomic nervous system", "vascular system", "renal system",
        "gastrointestinal system", "cardiovascular system", "pulmonary system",
        "hepatic system", "pancreatic system", "ocular system", "auditory system",
        "olfactory system", "connective tissues",
    ],
    # Cross-cutting research and policy topics.
    "miscellaneous": [
        "placebo", "genome", "epigenetics", "stem cells", "CRISPR",
        "nanomedicine", "microbiome", "biomarkers", "artificial intelligence",
        "machine learning", "precision medicine", "telemedicine",
        "bioinformatics", "metabolomics", "proteomics", "genomics",
        "nutrigenomics", "clinical trials", "public health", "epidemiology",
        "medical ethics", "healthcare policy", "drug resistance",
        "vaccine hesitancy", "personalized therapy",
    ],
}

# Upper bound on PubMed IDs requested per search term (kept small for testing).
max_results_per_term = 10
# Output directory for the pickle files written by this notebook.
save_folder = "./abstracts_by_category/"
# exist_ok=True creates the folder when missing and is a no-op otherwise,
# avoiding the check-then-create race of a separate os.path.exists() test.
os.makedirs(save_folder, exist_ok=True)

# ----------------------------
# 2️⃣ Function to fetch abstracts
# ----------------------------
def fetch_pubmed_abstracts(term, max_results=10):
    """Fetch PubMed records for ``term`` as plain-text abstracts (best effort).

    Runs one ``Entrez.esearch`` against the ``pubmed`` database, then one
    ``Entrez.efetch`` (``rettype="abstract"``, ``retmode="text"``) per ID
    returned by the search.

    Errors never propagate to the caller: a failed search prints a message and
    yields an empty list; a failed fetch prints a message and that single ID is
    skipped.

    Args:
        term: Query string passed to ``esearch`` as ``term``.
        max_results: Passed to ``esearch`` as ``retmax`` (maximum number of IDs).

    Returns:
        A list with one whitespace-stripped text blob per successfully fetched
        ID, in the order the IDs were returned by the search.
    """
    abstracts = []

    try:
        handle = Entrez.esearch(db="pubmed", term=term, retmax=max_results)
        try:
            record = Entrez.read(handle)
        finally:
            # Close the handle even when parsing the response raises.
            handle.close()
        ids = record["IdList"]
    except Exception as e_outer:
        # Deliberately broad: this is a best-effort scraper, not a library.
        print(f"Search failed for {term}: {e_outer}")
        return abstracts

    for pubmed_id in ids:
        try:
            fetch_handle = Entrez.efetch(
                db="pubmed", id=pubmed_id, rettype="abstract", retmode="text"
            )
            try:
                abstract_text = fetch_handle.read()
            finally:
                # Close the handle even when reading the body raises.
                fetch_handle.close()
            abstracts.append(abstract_text.strip())
        except Exception as e_inner:
            print(f"Failed to fetch {pubmed_id}: {e_inner}")
        finally:
            # Polite delay after every attempt, including failed ones, so a
            # run of errors does not turn into a burst of back-to-back requests.
            time.sleep(0.1)

    return abstracts

# ----------------------------
# 3️⃣ Loop over categories & terms
# ----------------------------
def _dump_pickle_atomically(obj, path):
    """Pickle ``obj`` to ``path`` without ever leaving a half-written file there.

    The data is written to ``<path>.tmp`` first and then moved over ``path``
    with ``os.replace``. If the process dies mid-write, only the temp file is
    affected, so an existing-file check on ``path`` never sees a truncated pickle.
    """
    tmp_path = f"{path}.tmp"
    with open(tmp_path, "wb") as f:
        pickle.dump(obj, f)
    os.replace(tmp_path, path)


# category name -> {search term -> list of fetched abstract texts}
all_abstracts = {}

for category, terms in search_categories.items():
    print(f"\n📂 Category: {category}")
    category_file = os.path.join(save_folder, f"{category}_abstracts.pkl")
    category_abstracts = {}

    # The per-category pickle doubles as a cache keyed only by file existence:
    # it is reused as-is even if the term list or max_results_per_term changed.
    # Delete the file to force a re-fetch.
    if os.path.exists(category_file):
        print(f"✅ Already saved for {category}, loading...")
        # NOTE: pickle.load can execute arbitrary code; only load files that
        # this notebook wrote itself.
        with open(category_file, "rb") as f:
            category_abstracts = pickle.load(f)
    else:
        for term in terms:
            print(f"   🔎 Fetching abstracts for: {term}")
            abstracts = fetch_pubmed_abstracts(term, max_results=max_results_per_term)
            category_abstracts[term] = abstracts
            print(f"   ✅ {len(abstracts)} abstracts saved for {term}")

        # Atomic write: an interrupted dump must not leave a corrupt cache file
        # that the existence check above would trust on the next run.
        _dump_pickle_atomically(category_abstracts, category_file)

    all_abstracts[category] = category_abstracts

# ----------------------------
# 4️⃣ Save everything together (optional)
# ----------------------------
all_file = os.path.join(save_folder, "all_pubmed_abstracts.pkl")
_dump_pickle_atomically(all_abstracts, all_file)

print("\n🎉 All abstracts fetched and saved by category.")



📂 Category: diseases
   🔎 Fetching abstracts for: diabetes
   ✅ 10 abstracts saved for diabetes
   🔎 Fetching abstracts for: hypertension
   ✅ 10 abstracts saved for hypertension
   🔎 Fetching abstracts for: cancer
   ✅ 10 abstracts saved for cancer
   🔎 Fetching abstracts for: asthma
   ✅ 10 abstracts saved for asthma
   🔎 Fetching abstracts for: tuberculosis
   ✅ 10 abstracts saved for tuberculosis
   🔎 Fetching abstracts for: HIV
   ✅ 10 abstracts saved for HIV
   🔎 Fetching abstracts for: malaria
   ✅ 10 abstracts saved for malaria
   🔎 Fetching abstracts for: arthritis
   ✅ 10 abstracts saved for arthritis
   🔎 Fetching abstracts for: osteoporosis
   ✅ 10 abstracts saved for osteoporosis
   🔎 Fetching abstracts for: obesity
   ✅ 10 abstracts saved for obesity
   🔎 Fetching abstracts for: stroke
   ✅ 10 abstracts saved for stroke
   🔎 Fetching abstracts for: Alzheimer's disease
   ✅ 10 abstracts saved for Alzheimer's disease
   🔎 Fetching abstracts for: Parkinson's disease
   ✅ 10