In [12]:
# Notebook 05 (corrected) — PubMedBERT extractive QA with PubMed/BioBERT retrieval (CPU-optimized)
# Assumes Notebook 03 produced: all_pubmed_embeddings.pkl OR texts.npy + embeddings.npy and pubmed_faiss.index

import os
import re
import pickle
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, pipeline

# -------------------------
# Config / paths (edit if needed)
# -------------------------
EMB_PKL = "all_pubmed_embeddings.pkl"   # preferred (tuple: texts, embeddings)
TEXTS_NPY = "texts.npy"                 # alternate
EMB_NPY = "embeddings.npy"              # alternate
FAISS_INDEX = "pubmed_faiss.index"

# QA model (extractive)
QA_MODEL = "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract"  # PubMedBERT (base)
# Retrieval embedding model: same family used in Notebook 03
RETRIEVER_MODEL = "dmis-lab/biobert-base-cased-v1.1"  # SentenceTransformer will wrap it with mean pooling

# CPU settings
TOP_K = 5                # number of abstracts to retrieve
CHUNK_WORDS = 500        # chunk size in words (for long abstracts)
MIN_CONFIDENCE = 0.05    # minimum score to accept an answer (tweakable)

# -------------------------
# 1) Load texts + embeddings (support both formats)
# -------------------------
if os.path.exists(EMB_PKL):
    with open(EMB_PKL, "rb") as f:
        texts, embeddings = pickle.load(f)
    embeddings = np.array(embeddings, dtype=np.float32)
elif os.path.exists(TEXTS_NPY) and os.path.exists(EMB_NPY):
    texts = np.load(TEXTS_NPY, allow_pickle=True)
    embeddings = np.load(EMB_NPY).astype(np.float32)
else:
    raise FileNotFoundError("Could not find embeddings. Provide all_pubmed_embeddings.pkl OR texts.npy + embeddings.npy (run Notebook 03).")

print(f"✅ Loaded {len(texts)} texts, embeddings shape = {embeddings.shape}")

# -------------------------
# 2) Load FAISS index
# -------------------------
if not os.path.exists(FAISS_INDEX):
    raise FileNotFoundError(f"{FAISS_INDEX} not found. Run Notebook 03 to build the FAISS index.")

index = faiss.read_index(FAISS_INDEX)
print("✅ FAISS index loaded, ntotal =", index.ntotal)

# -------------------------
# 3) Load retriever & QA models (CPU)
# -------------------------
print("Loading retriever embedding model (PubMed/BioBERT) -- this may take a moment...")
retriever = SentenceTransformer(RETRIEVER_MODEL)  # wraps HF model and mean-pools tokens

print("Loading PubMedBERT QA model (extractive) -- CPU")
qa_tokenizer = AutoTokenizer.from_pretrained(QA_MODEL)
qa_model = AutoModelForQuestionAnswering.from_pretrained(QA_MODEL)
qa_pipe = pipeline("question-answering", model=qa_model, tokenizer=qa_tokenizer, device=-1)  # CPU

print("✅ Models loaded.")

# -------------------------
# Helpers: cleaning & chunking
# -------------------------
def clean_abstract(text):
    """Remove common metadata lines and tidy whitespace."""
    # Remove DOI, PMID, Author information blocks, copyright lines, common headings
    text = re.sub(r"DOI:\s*\S+", " ", text, flags=re.IGNORECASE)
    text = re.sub(r"PMID:\s*\d+", " ", text, flags=re.IGNORECASE)
    text = re.sub(r"Author information:.*", " ", text, flags=re.IGNORECASE)
    text = re.sub(r"©.*", " ", text)
    text = re.sub(r"\bDOI\b.*", " ", text, flags=re.IGNORECASE)
    # collapse whitespace
    text = " ".join(text.split())
    return text.strip()

def chunk_text_by_words(text, max_words=CHUNK_WORDS):
    """Split a long text into word-count chunks, returning list of chunk strings."""
    words = text.split()
    if len(words) <= max_words:
        return [" ".join(words)]
    chunks = []
    for i in range(0, len(words), max_words):
        chunks.append(" ".join(words[i:i+max_words]))
    return chunks

# -------------------------
# Retrieval + QA pipeline
# -------------------------
def retrieve_top_k_texts(query, k=TOP_K):
    """Embed query with same retriever and search FAISS."""
    q_emb = retriever.encode([query], convert_to_numpy=True).astype(np.float32)
    D, I = index.search(q_emb, k)
    indices = I[0].tolist()
    distances = D[0].tolist()
    retrieved = [(idx, distances[i], texts[idx]) for i, idx in enumerate(indices)]
    return retrieved

def extract_best_answer(query, retrieved_texts):
    """
    Run extractive QA on chunked contexts and return best answer and source info.
    retrieved_texts: list of (idx, distance, text)
    """
    best_answer = None
    best_score = 0.0
    best_source = None

    for idx, dist, text in retrieved_texts:
        clean = clean_abstract(text)
        chunks = chunk_text_by_words(clean, max_words=CHUNK_WORDS)
        for chunk in chunks:
            try:
                res = qa_pipe(question=query, context=chunk)
            except Exception:
                continue
            score = float(res.get("score", 0.0))
            ans = res.get("answer", "").strip()
            # prefer higher score and non-empty answer
            if ans and score > best_score:
                best_score = score
                best_answer = ans
                best_source = {"doc_index": int(idx), "distance": float(dist)}
    return best_answer, best_score, best_source

# -------------------------
# Interactive loop
# -------------------------
print("\n🎯 PubMed extractive QA ready (CPU). Type a question or 'exit' to quit.\n")

while True:
    query = input("🔎 Question: ").strip()
    if not query:
        continue
    if query.lower() in ["exit", "quit"]:
        print("👋 Bye.")
        break

    retrieved = retrieve_top_k_texts(query, k=TOP_K)
    # show which docs were retrieved (optional)
    print("\nRetrieved (doc_index, distance):", [(r[0], round(r[1], 4)) for r in retrieved])

    answer, score, src = extract_best_answer(query, retrieved)
    if answer and score >= MIN_CONFIDENCE:
        print(f"\n💡 Answer (score={score:.3f}): {answer}")
        print("Source doc index:", src)
    else:
        print("\n⚠️ No confident extractive answer found. Showing top retrieved snippets for manual inspection:")
        for i, (idx, dist, txt) in enumerate(retrieved):
            snippet = " ".join(clean_abstract(txt).split()[:80])
            print(f"\n--- Doc {idx} (dist={dist:.3f}) ---\n{snippet}...\n")
    print("\n" + "-"*90 + "\n")


✅ Loaded 1500 texts, embeddings shape = (1500, 768)
✅ FAISS index loaded, ntotal = 1500
Loading retriever embedding model (PubMed/BioBERT) -- this may take a moment...


No sentence-transformers model found with name dmis-lab/biobert-base-cased-v1.1. Creating a new one with mean pooling.


Loading PubMedBERT QA model (extractive) -- CPU


tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Device set to use cpu


✅ Models loaded.

🎯 PubMed extractive QA ready (CPU). Type a question or 'exit' to quit.



🔎 Question:  diabetes symptoms?



Retrieved (doc_index, distance): [(437, 64.7103), (1134, 64.836), (1219, 64.836), (281, 64.9399), (910, 64.9399)]

⚠️ No confident extractive answer found. Showing top retrieved snippets for manual inspection:

--- Doc 437 (dist=64.710) ---
1. Int J Mol Sci. 2025 Sep 18;26(18):9116. Exploring the Diagnostic Utility of Tear IgE and Lid Wiper Epitheliopathy in Ocular Allergy Among Individuals with Hay Fever. Thomas R(1)(2)(3), Azizoglu S(1)(2)(3), Suphioglu C(2)(3)(4), Mikhail E(1)(2)(3), Gokhale M(1)(2)(3). (1)School of Medicine (Optometry), Faculty of Health, Deakin University, Waurn Ponds, Geelong, VIC 3216, Australia. (2)NeuroAllergy Research Laboratory (NARL), School of Life and Environmental Sciences, Faculty of Science, Engineering and Built Environment, Deakin University, Waurn Ponds, Geelong, VIC 3216, Australia. (3)Institute for Mental...


--- Doc 1134 (dist=64.836) ---
1. Medicina (Kaunas). 2025 Sep 18;61(9):1693. Ocular Surface Changes Associated with Neurological Diseases.

🔎 Question:  what is aspirin used for?



Retrieved (doc_index, distance): [(549, 37.2499), (558, 37.2499), (697, 37.2499), (663, 38.0182), (356, 38.2871)]

⚠️ No confident extractive answer found. Showing top retrieved snippets for manual inspection:

--- Doc 549 (dist=37.250) ---
1. Pharmaceuticals (Basel). 2025 Sep 13;18(9):1372. Drugs Versus Microbiota: How Pharmacotherapy Affects Gut and Probiotic Bacteria. Zawistowska-Rojek A(1), Tyski S(1). (1)Department of Pharmaceutical Microbiology and Laboratory Diagnostic, National Medicines Institute, 00-725 Warsaw, Poland. The gut microbiota plays a key role in digestion, nutrient absorption, immune system regulation and metabolite production, significantly impacting human health. The balance of the gut microbiota can be easily disturbed by external factors such as lifestyle, diet and drugs. Some medications-such as metformin used to treat...


--- Doc 558 (dist=37.250) ---
1. Pharmaceuticals (Basel). 2025 Sep 13;18(9):1372. Drugs Versus Microbiota: How Pharmacotherapy Affects G

🔎 Question:  What are the common complications of diabetes?”



Retrieved (doc_index, distance): [(546, 37.3448), (549, 38.0465), (558, 38.0465), (697, 38.0465), (1209, 38.1571)]

⚠️ No confident extractive answer found. Showing top retrieved snippets for manual inspection:

--- Doc 546 (dist=37.345) ---
1. Pharmaceutics. 2025 Aug 28;17(9):1125. From Current Therapeutics to Multitarget Ligands: A Review of Diabetes Pharmacological Treatments. Cabré F(1)(2), Centelles JJ(1)(3)(4), Cascante M(1)(3)(4). (1)Department of Biochemistry & Molecular Biomedicine, University of Barcelona, 08028 Barcelona, Spain. (2)Medical Area, Menarini Group, 08918 Badalona, Spain. (3)Institute of Biomedicine of University of Barcelona (IBUB), University of Barcelona (UB), 08028 Barcelona, Spain. (4)CIBER of Hepatic and Digestive Diseases (CIBEREHD), Institute of Health Carlos III (ISCIII), 28029 Madrid, Spain. Diabetes is a chronic and complex pathological...


--- Doc 549 (dist=38.046) ---
1. Pharmaceuticals (Basel). 2025 Sep 13;18(9):1372. Drugs Versus Microbiota: How 

🔎 Question:  exit


👋 Bye.


In [1]:
# Notebook 05: Hybrid RAG QA with Flan-T5
# ---------------------------------------
# Requirements: sentence-transformers, transformers, faiss-cpu, gradio, torch

import os
import pickle
import numpy as np
import faiss
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch
import gradio as gr

# ----------------------------
# 1️⃣ Load embeddings & texts
# ----------------------------
embeddings_file = "all_pubmed_embeddings.pkl"
faiss_index_file = "pubmed_faiss.index"

if not os.path.exists(embeddings_file) or not os.path.exists(faiss_index_file):
    raise FileNotFoundError("Embeddings or FAISS index not found. Run Notebook 03 first.")

with open(embeddings_file, "rb") as f:
    texts, embeddings = pickle.load(f)

# Load FAISS index
dim = embeddings.shape[1]
index = faiss.read_index(faiss_index_file)
print(f"✅ Loaded {len(texts)} texts, embeddings shape = {embeddings.shape}")
print(f"✅ FAISS index loaded, ntotal = {index.ntotal}")

# ----------------------------
# 2️⃣ Load Flan-T5 generative QA model
# ----------------------------
model_name = "google/flan-t5-base"  # can switch to smaller/larger variants
print(f"Loading {model_name} ...")
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(device)
print(f"✅ Model {model_name} loaded on {device}")

# ----------------------------
# 3️⃣ Define retrieval + generative function
# ----------------------------
def hybrid_rag_qa(query, top_k=5, max_input_chars=3000, max_output_tokens=150):
    # Compute query embedding using BioBERT-like embeddings
    # Here we reuse MiniLM embedding code or any CPU-friendly BioBERT embedding
    from sentence_transformers import SentenceTransformer
    embedding_model = SentenceTransformer("dmis-lab/biobert-base-cased-v1.1")
    query_emb = embedding_model.encode([query], convert_to_numpy=True).astype("float32")
    
    # Retrieve top-k docs
    D, I = index.search(query_emb, top_k)
    retrieved_texts = [texts[i] for i in I[0]]
    
    # Concatenate retrieved snippets (truncate if too long)
    context = " ".join(retrieved_texts)
    context = context[:max_input_chars]  # prevent very long inputs
    
    # Prepare input for Flan-T5
    input_text = f"Answer the medical question based on the context below:\n\nContext: {context}\n\nQuestion: {query}\nAnswer:"
    inputs = tokenizer(input_text, return_tensors="pt", truncation=True, max_length=1024).to(device)
    
    # Generate answer
    outputs = model.generate(
        **inputs,
        max_new_tokens=max_output_tokens,
        do_sample=True,
        top_p=0.9,
        temperature=0.7
    )
    answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
    
    return answer, retrieved_texts

# ----------------------------
# 4️⃣ Test function
# ----------------------------
query = "What is aspirin used for?"
answer, retrieved = hybrid_rag_qa(query)
print(f"\n💡 Answer: {answer}")
print("\nRetrieved snippets:")
for i, doc in enumerate(retrieved):
    print(f"{i+1}. {doc[:300]}...")

# ----------------------------
# 5️⃣ Optional: Gradio Deployment
# ----------------------------
def gradio_qa(query):
    answer, retrieved = hybrid_rag_qa(query)
    return answer

iface = gr.Interface(fn=gradio_qa, inputs="text", outputs="text", title="Medical QA (Hybrid RAG)")
iface.launch()


✅ Loaded 1500 texts, embeddings shape = (1500, 768)
✅ FAISS index loaded, ntotal = 1500
Loading google/flan-t5-base ...
✅ Model google/flan-t5-base loaded on cpu


No sentence-transformers model found with name dmis-lab/biobert-base-cased-v1.1. Creating a new one with mean pooling.



💡 Answer: to treat type 2 diabetes

Retrieved snippets:
1. 1. Pharmaceuticals (Basel). 2025 Sep 13;18(9):1372. doi: 10.3390/ph18091372.

Drugs Versus Microbiota: How Pharmacotherapy Affects Gut and Probiotic Bacteria.

Zawistowska-Rojek A(1), Tyski S(1).

Author information:
(1)Department of Pharmaceutical Microbiology and Laboratory Diagnostic, National 
M...
2. 1. Pharmaceuticals (Basel). 2025 Sep 13;18(9):1372. doi: 10.3390/ph18091372.

Drugs Versus Microbiota: How Pharmacotherapy Affects Gut and Probiotic Bacteria.

Zawistowska-Rojek A(1), Tyski S(1).

Author information:
(1)Department of Pharmaceutical Microbiology and Laboratory Diagnostic, National 
M...
3. 1. Pharmaceuticals (Basel). 2025 Sep 13;18(9):1372. doi: 10.3390/ph18091372.

Drugs Versus Microbiota: How Pharmacotherapy Affects Gut and Probiotic Bacteria.

Zawistowska-Rojek A(1), Tyski S(1).

Author information:
(1)Department of Pharmaceutical Microbiology and Laboratory Diagnostic, National 
M...
4. 1. Atenolol.

Drugs

