In [1]:
# Notebook 02: Generate embeddings from PubMed abstracts

import pickle
import os
import numpy as np
from sentence_transformers import SentenceTransformer

# ----------------------------
# 1️⃣ Setup
# ----------------------------
# Working folder: the combined abstracts pickle is read from here, and the
# embeddings produced later in this notebook are written back to it.
save_folder = "./abstracts_by_category/"
all_file = os.path.join(save_folder, "all_pubmed_abstracts.pkl")

if os.path.exists(all_file):
    # Load abstracts
    print("🔄 Loading abstracts...")
    # NOTE: unpickling can execute arbitrary code; only load files that were
    # produced locally (the error below points at Notebook 01 as the source).
    with open(all_file, "rb") as pkl_in:
        all_abstracts = pickle.load(pkl_in)
else:
    # Fail fast with an actionable message instead of a bare open() error.
    raise FileNotFoundError("❌ Could not find abstracts file. Run Notebook 01 first.")

# The top-level keys of the loaded mapping are the category names.
print(f"✅ Categories loaded: {list(all_abstracts)}")

# ----------------------------
# 2️⃣ Flatten abstracts
# ----------------------------
# Walk category -> term -> list of abstract texts and collect every
# non-blank abstract together with the term it was stored under.
# The category key itself is not recorded.
labelled_abstracts = [
    (text, term)
    for term_dict in all_abstracts.values()
    for term, abs_list in term_dict.items()
    for text in abs_list
    if text.strip()  # skip empty / whitespace-only entries
]

# Parallel lists: terms[i] is the term that abstracts[i] belongs to.
abstracts = [text for text, _ in labelled_abstracts]
terms = [term for _, term in labelled_abstracts]

print(f"📄 Total abstracts collected: {len(abstracts)}")

# ----------------------------
# 3️⃣ Generate embeddings
# ----------------------------
print("🔄 Loading embedding model...")
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

print("🔄 Generating embeddings...")
# Encode all abstracts in a single call. The result is requested as a NumPy
# array so that `.shape` can be reported below; the recorded run shows one
# row per abstract, i.e. (1500, 384) for 1500 abstracts.
embeddings = model.encode(
    abstracts,
    convert_to_numpy=True,
    show_progress_bar=True,
)

print(f"✅ Embeddings shape: {embeddings.shape}")

# ----------------------------
# 4️⃣ Save embeddings & abstracts
# ----------------------------
# Everything goes into one pickle as a 3-tuple, in this exact order:
# (abstracts, embeddings, terms). Readers must unpack it the same way.
embedding_file = os.path.join(save_folder, "embeddings.pkl")
with open(embedding_file, "wb") as pkl_out:
    pickle.Pickler(pkl_out).dump((abstracts, embeddings, terms))

print(f"🎉 Saved embeddings to {embedding_file}")


🔄 Loading abstracts...
✅ Categories loaded: ['diseases', 'symptoms', 'medicines', 'procedures', 'human_systems', 'miscellaneous']
📄 Total abstracts collected: 1500
🔄 Loading embedding model...
🔄 Generating embeddings...


Batches:   0%|          | 0/47 [00:00<?, ?it/s]

✅ Embeddings shape: (1500, 384)
🎉 Saved embeddings to ./abstracts_by_category/embeddings.pkl
