In [5]:
# ===============================
# Notebook 03: Build embeddings and FAISS index (CPU-friendly PubMed/BioBERT)
# ===============================

import os
import time
import pickle
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer

# ----------------------------
# 1️⃣ Paths
# ----------------------------
PICKLE_FILE = "./abstracts_by_category/all_pubmed_abstracts.pkl"  # Output from Notebook 01
EMB_PKL = "all_pubmed_embeddings.pkl"
EMB_NPY = "embeddings.npy"
TEXTS_NPY = "texts.npy"
FAISS_INDEX = "pubmed_faiss.index"

# ----------------------------
# 2️⃣ Load abstracts
# ----------------------------
if not os.path.exists(PICKLE_FILE):
    raise FileNotFoundError(f"{PICKLE_FILE} not found. Run Notebook 01 first.")

with open(PICKLE_FILE, "rb") as f:
    all_abstracts = pickle.load(f)

# Flatten to a single list of texts
texts = []
for category, term_dict in all_abstracts.items():
    for term, abs_list in term_dict.items():
        texts.extend(abs_list)

texts = [t.strip() for t in texts if t.strip()]
print("✅ Number of texts:", len(texts))

if len(texts) == 0:
    raise ValueError("No abstracts found in pickle file.")

# ----------------------------
# 3️⃣ Load PubMed-specific embedding model
# ----------------------------
print("Loading PubMed/BioBERT embeddings model...")
# Public BioBERT (cased, trained on PubMed)
# ⚠️ CPU-only: will be slower. For faster, can use 'all-MiniLM-L6-v2'
model = SentenceTransformer("dmis-lab/biobert-base-cased-v1.1")  
time.sleep(0.2)

# ----------------------------
# 4️⃣ Compute embeddings in batches
# ----------------------------
batch_size = 64  # CPU-friendly batch size
embeddings = []

for i in range(0, len(texts), batch_size):
    batch_texts = texts[i:i+batch_size]
    emb = model.encode(batch_texts, convert_to_numpy=True, show_progress_bar=True)
    embeddings.append(emb)
    print(f"Encoded {i+len(batch_texts)} / {len(texts)}")

embeddings = np.vstack(embeddings).astype("float32")
print("✅ Final embeddings shape:", embeddings.shape)

# ----------------------------
# 5️⃣ Save embeddings & texts
# ----------------------------
with open(EMB_PKL, "wb") as f:
    pickle.dump((texts, embeddings), f)

np.save(EMB_NPY, embeddings)
np.save(TEXTS_NPY, np.array(texts, dtype=object))

print("✅ Saved embeddings files:", EMB_PKL, EMB_NPY, TEXTS_NPY)

# ----------------------------
# 6️⃣ Build FAISS index
# ----------------------------
dim = embeddings.shape[1]
print("Building FAISS index, dim =", dim)
index = faiss.IndexFlatL2(dim)
index.add(embeddings)
print("FAISS ntotal:", index.ntotal)
faiss.write_index(index, FAISS_INDEX)
print("✅ Saved FAISS index to", FAISS_INDEX)


✅ Number of texts: 1500
Loading PubMed/BioBERT embeddings model...


No sentence-transformers model found with name dmis-lab/biobert-base-cased-v1.1. Creating a new one with mean pooling.


config.json:   0%|          | 0.00/313 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


pytorch_model.bin:   0%|          | 0.00/436M [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Encoded 64 / 1500


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Encoded 128 / 1500


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Encoded 192 / 1500


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Encoded 256 / 1500


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Encoded 320 / 1500


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Encoded 384 / 1500


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Encoded 448 / 1500


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Encoded 512 / 1500


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Encoded 576 / 1500


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Encoded 640 / 1500


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Encoded 704 / 1500


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Encoded 768 / 1500


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Encoded 832 / 1500


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Encoded 896 / 1500


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Encoded 960 / 1500


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Encoded 1024 / 1500


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Encoded 1088 / 1500


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Encoded 1152 / 1500


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Encoded 1216 / 1500


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Encoded 1280 / 1500


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Encoded 1344 / 1500


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Encoded 1408 / 1500


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Encoded 1472 / 1500


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Encoded 1500 / 1500
✅ Final embeddings shape: (1500, 768)
✅ Saved embeddings files: all_pubmed_embeddings.pkl embeddings.npy texts.npy
Building FAISS index, dim = 768
FAISS ntotal: 1500
✅ Saved FAISS index to pubmed_faiss.index
