In [None]:
"""
===============================================================================
STEP 0 — INSTALL ALL DEPENDENCIES UP FRONT
-------------------------------------------------------------------------------
- This cell installs only the libraries needed for a fully local pipeline:
  pandas/numpy for data handling, transformers/torch for DPR encoders, faiss-cpu
  for vector search, and tqdm for progress bars.
- Intentionally omits OpenAI and external chat/LLM calls.
- Keeps FAISS CPU to avoid faiss-gpu install issues and ensure portability.
===============================================================================
"""
# IPython line magic: this line only works inside a notebook / IPython session,
# not under a plain `python` interpreter.
# -q keeps pip's output quiet; --upgrade asks pip to move the listed packages to
# their newest available releases (no versions are pinned here).
%pip install -q --upgrade pandas numpy tqdm torch transformers faiss-cpu


In [None]:
"""
===============================================================================
STEP 1 — IMPORTS, GLOBAL CONFIG, AND SEEDING
-------------------------------------------------------------------------------
- Imports core libraries and sets deterministic behavior where possible.
- Selects device automatically (GPU if available, else CPU).
- Defines model names for DPR question and context encoders.
- Sets default paths for input dataset and output artifacts.
===============================================================================
"""
import os
import json
import math
import random
import numpy as np
import pandas as pd
import torch
from tqdm import tqdm
import faiss

from transformers import (
    DPRQuestionEncoder,
    DPRQuestionEncoderTokenizer,
    DPRContextEncoder,
    DPRContextEncoderTokenizer,
)

# Reproducibility: feed one shared seed to the Python, NumPy and PyTorch RNGs
SEED = 42
for _seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    _seed_fn(SEED)
_cuda_ok = torch.cuda.is_available()
if _cuda_ok:
    torch.cuda.manual_seed_all(SEED)

# Device: use the GPU when PyTorch reports one, otherwise stay on the CPU
DEVICE = torch.device("cuda") if _cuda_ok else torch.device("cpu")

# Model ids of the DPR question / context encoder pair
QUESTION_ENCODER_NAME = "facebook/dpr-question_encoder-single-nq-base"
CONTEXT_ENCODER_NAME = "facebook/dpr-ctx_encoder-single-nq-base"

# Input dataset location — adjust INPUT_CSV if needed
# (on Kaggle: "/kaggle/input/medquad-medical-question-answer-for-ai-research/medquad.csv")
INPUT_CSV = "medquad.csv"

# Every generated artifact is written below this directory; create it up front
OUTPUT_DIR = "artifacts"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Tokenizer truncation length, encoding batch size and embedding dtype
MAX_LENGTH = 256
BATCH_SIZE = 64
DTYPE = torch.float32


In [None]:
"""
===============================================================================
STEP 2 — LOAD DATASET AND VERIFY COLUMNS
-------------------------------------------------------------------------------
- Loads the dataset.
- Validates presence of expected columns: 'question', 'answer', 'source', 'focus_area'.
- Drops rows that are missing a value in any of those required columns.
- Displays the basic shape to confirm successful load.
===============================================================================
"""
# Load CSV
df = pd.read_csv(INPUT_CSV)

# Validate the schema before touching any rows, so a wrong file fails fast
required_cols = {"question", "answer", "source", "focus_area"}
missing = required_cols - set(df.columns)
if missing:
    raise ValueError(f"Missing required columns: {missing}")

# Only the required columns decide whether a row is usable: a NaN in some
# unrelated extra column must not silently remove an otherwise valid Q/A pair.
# The index is reset so row labels are 0..N-1 and line up with FAISS positions.
df = df.dropna(subset=sorted(required_cols)).reset_index(drop=True)

print(f"Dataset shape: {df.shape}")
df.head(3)


In [None]:
"""
===============================================================================
STEP 3 — LIGHTWEIGHT TEXT PREPROCESSING
-------------------------------------------------------------------------------
- Performs minimal, robust normalization suitable for DPR encoders:
  lowercasing, trimming whitespace, and collapsing repeated spaces.
- Creates new columns 'question_clean' and 'answer_clean'.
- Keeps original text columns unchanged for traceability.
===============================================================================
"""
import re

def clean_text(text: str) -> str:
    """Return ``text`` lowercased, trimmed, with whitespace runs collapsed.

    The input is passed through ``str()`` first, so non-string values are
    converted rather than rejected. Splitting on whitespace drops leading and
    trailing blanks, and re-joining with a single space collapses every
    interior run (spaces, tabs, newlines) into one space.
    """
    lowered = str(text).lower()
    return " ".join(lowered.split())

# Add normalized copies of both text columns; the raw columns stay untouched
df["question_clean"] = df["question"].map(clean_text)
df["answer_clean"] = df["answer"].map(clean_text)

# Side-by-side preview of raw vs. cleaned text (last expression, so the notebook renders it)
_preview_cols = ["question", "question_clean", "answer", "answer_clean"]
df[_preview_cols].head(3)


In [None]:
"""
===============================================================================
STEP 4 — LOAD DPR QUESTION AND CONTEXT ENCODERS
-------------------------------------------------------------------------------
- Loads the proper tokenizer/encoder pairs for DPR question and context encoders.
- Moves models to the chosen device (GPU if available, otherwise CPU).
- Puts both encoders in inference mode with .eval(); torch.no_grad() is applied
  later, in the cells that actually call the encoders.
===============================================================================
"""
# Question side: tokenizer plus encoder, moved to DEVICE and switched to eval mode
q_tokenizer = DPRQuestionEncoderTokenizer.from_pretrained(QUESTION_ENCODER_NAME)
q_encoder = DPRQuestionEncoder.from_pretrained(QUESTION_ENCODER_NAME)
q_encoder.to(DEVICE)
q_encoder.eval()

# Context/answer side: same steps with the context checkpoint
c_tokenizer = DPRContextEncoderTokenizer.from_pretrained(CONTEXT_ENCODER_NAME)
c_encoder = DPRContextEncoder.from_pretrained(CONTEXT_ENCODER_NAME)
c_encoder.to(DEVICE)
c_encoder.eval()


In [None]:
"""
===============================================================================
STEP 5 — BATCHED ENCODING UTILITIES
-------------------------------------------------------------------------------
- Defines a memory-efficient, batched encoding function for arbitrary text arrays.
- For DPR models, the pooled CLS representation is accessed via .pooler_output.
- Returns a NumPy float32 array of shape [N, D].
===============================================================================
"""
def encode_texts(texts, tokenizer, model, batch_size=BATCH_SIZE, max_length=MAX_LENGTH, device=DEVICE):
    """Encode a collection of strings into a float32 matrix, one row per text.

    Args:
        texts: Iterable of strings. Any iterable is accepted (list, tuple,
            pandas Series, NumPy array, generator); it is materialized into a
            list because the tokenizer is called with plain list slices.
        tokenizer: Callable invoked as ``tokenizer(list_of_str,
            return_tensors="pt", truncation=True, padding=True,
            max_length=...)``; it must return a mapping of tensors.
        model: Encoder called as ``model(**inputs)`` whose output exposes
            ``pooler_output``. It is switched to eval mode.
        batch_size: Number of texts encoded per forward pass; must be > 0.
        max_length: Token length at which inputs are truncated.
        device: Device the tokenized tensors are moved to before the forward
            pass (the model is expected to already live there).

    Returns:
        ``np.ndarray`` of dtype float32 and shape ``[N, D]``. For empty input
        the shape is ``[0, D]``, where ``D`` is read from ``model.config``
        (``projection_dim`` if set and non-zero, else ``hidden_size``) and
        falls back to 0 when the model exposes neither.

    Raises:
        ValueError: If ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    # Materialize once: gives a length, cheap slicing, and list batches that the
    # tokenizer accepts regardless of the container the caller passed in.
    texts = list(texts)
    model.eval()

    if not texts:
        # np.vstack([]) raises, so return a well-formed empty matrix instead.
        cfg = getattr(model, "config", None)
        dim = getattr(cfg, "projection_dim", 0) or getattr(cfg, "hidden_size", 0) or 0
        return np.empty((0, int(dim)), dtype="float32")

    embeddings = []
    with torch.no_grad():
        for start in tqdm(range(0, len(texts), batch_size), desc="Encoding", unit="batch"):
            batch_texts = texts[start:start + batch_size]
            inputs = tokenizer(
                batch_texts,
                return_tensors="pt",
                truncation=True,
                padding=True,
                max_length=max_length,
            )
            inputs = {k: v.to(device) for k, v in inputs.items()}
            outputs = model(**inputs)
            # pooler_output is [batch, D]; move to CPU as float32 in one step
            pooled = outputs.pooler_output.detach().to("cpu", dtype=torch.float32).numpy()
            embeddings.append(pooled)
    # vstack already allocates a fresh array, so no extra copy is needed here
    return np.vstack(embeddings).astype("float32", copy=False)


In [None]:
"""
===============================================================================
STEP 6 — ENCODE ANSWERS (CONTEXTS) AND QUESTIONS
-------------------------------------------------------------------------------
- Encodes the cleaned answers with the context encoder (index documents).
- Encodes the cleaned questions with the question encoder (optional, saved for QA).
- Shapes match DPR hidden size, typically 768.
===============================================================================
"""
# Plain Python lists of the cleaned texts, in dataframe row order
answers = df["answer_clean"].tolist()
questions = df["question_clean"].tolist()

# Both encoding passes share the same batching / truncation / device settings
_encode_kwargs = {"batch_size": BATCH_SIZE, "max_length": MAX_LENGTH, "device": DEVICE}

# Answers go through the context encoder, questions through the question encoder
answer_embeddings = encode_texts(answers, c_tokenizer, c_encoder, **_encode_kwargs)
question_embeddings = encode_texts(questions, q_tokenizer, q_encoder, **_encode_kwargs)

for _label, _matrix in (("Answer embeddings:", answer_embeddings), ("Question embeddings:", question_embeddings)):
    print(_label, _matrix.shape)


In [None]:
"""
===============================================================================
STEP 7 — BUILD FAISS INDEX (CPU, COSINE VIA NORMALIZED INNER PRODUCT)
-------------------------------------------------------------------------------
- Normalizes the answer embeddings to unit length (in place) so that inner product
  with a unit-length query equals cosine similarity.
- Uses IndexFlatIP for high recall and simplicity.
- Adds all answer embeddings to the index to enable retrieval.
===============================================================================
"""
# Scale every answer vector to unit L2 norm so inner product acts as cosine similarity.
# The call rewrites answer_embeddings itself rather than returning a new array.
faiss.normalize_L2(answer_embeddings)

# Exhaustive inner-product index sized to the embedding width
_num_rows, dim = answer_embeddings.shape
index = faiss.IndexFlatIP(dim)
index.add(answer_embeddings)

_summary = f"FAISS CPU index built with {index.ntotal} vectors at dimension {dim}."
print(_summary)


In [None]:
"""
===============================================================================
STEP 8 — OPTIONAL QUICK SMOKE TEST (NO EXTERNAL LLM)
-------------------------------------------------------------------------------
- Runs a tiny retrieval check using a random dataset question.
- Encodes the sample question, normalizes it, and searches top-k in FAISS.
- Prints the top results with basic fields to verify end-to-end correctness.
===============================================================================
"""
do_quick_test = True
TOP_K = 5

# Skip when there is nothing to query with or nothing indexed to search over.
if do_quick_test and len(questions) > 0 and index.ntotal > 0:
    test_q = random.choice(questions)
    with torch.no_grad():
        qi = q_tokenizer([test_q], return_tensors="pt", truncation=True, padding=True, max_length=MAX_LENGTH)
        qi = {k: v.to(DEVICE) for k, v in qi.items()}
        qo = q_encoder(**qi).pooler_output.detach().cpu().numpy().astype("float32")
    # Unit-normalize the query too, so the inner-product score is a cosine similarity
    faiss.normalize_L2(qo)

    # Never request more neighbours than the index holds: FAISS pads the missing
    # slots with label -1, which is not a valid row of df.
    k_eff = min(TOP_K, index.ntotal)
    scores, idx = index.search(qo, k_eff)
    idx = idx[0].tolist()
    scores = scores[0].tolist()

    print("\nQuery:", test_q[:200], "...")
    print("\nTop results:")
    for rank, (i, s) in enumerate(zip(idx, scores), 1):
        if i < 0:
            # Defensive: a negative label means "no result" for this slot
            continue
        # FAISS labels are insertion positions, so look rows up positionally
        row = df.iloc[i]
        print(f"- {rank:>2}. score={s:.4f} | focus_area={row['focus_area']} | source={row['source']}")
        print(f"      answer: {row['answer_clean'][:200]} ...")


In [None]:
"""
===============================================================================
STEP 9 — SAVE EVERYTHING AT THE END
-------------------------------------------------------------------------------
- Persists all generated artifacts to OUTPUT_DIR:
  1) Cleaned dataset as CSV.
  2) Numpy .npy files for answer and question embeddings.
  3) FAISS index file for CPU.
  4) A small config.json with model names, dims, counts, and parameters.
  5) A row-to-id mapping CSV for explicit doc_id tracking.
===============================================================================
"""
def _rows_are_unit_norm(vectors):
    """Return True when every row of ``vectors`` has an L2 norm of about 1."""
    norms = np.linalg.norm(np.asarray(vectors, dtype="float32"), axis=1)
    return bool(np.allclose(norms, 1.0, atol=1e-3))


# 1) Cleaned dataset
clean_csv_path = os.path.join(OUTPUT_DIR, "dataset_clean.csv")
df.to_csv(clean_csv_path, index=False)

# 2) Embeddings (saved exactly as they are in memory at this point)
ans_path = os.path.join(OUTPUT_DIR, "answer_embeddings.npy")
que_path = os.path.join(OUTPUT_DIR, "question_embeddings.npy")
np.save(ans_path, answer_embeddings)
np.save(que_path, question_embeddings)

# 3) FAISS index
faiss_path = os.path.join(OUTPUT_DIR, "faiss_index_cpu.index")
faiss.write_index(index, faiss_path)

# 5) Explicit id mapping — written before the config so the config can list it.
#    doc_id is the row position, which is also the position used in the index.
id_map = pd.DataFrame({
    "doc_id": np.arange(len(df), dtype=np.int32),
    "source": df["source"],
    "focus_area": df["focus_area"],
})
id_map_path = os.path.join(OUTPUT_DIR, "id_map.csv")
id_map.to_csv(id_map_path, index=False)

# 4) Config metadata
config = {
    "question_encoder_name": QUESTION_ENCODER_NAME,
    "context_encoder_name": CONTEXT_ENCODER_NAME,
    "embedding_dim": int(answer_embeddings.shape[1]),
    "num_docs": int(answer_embeddings.shape[0]),
    "batch_size": int(BATCH_SIZE),
    "max_length": int(MAX_LENGTH),
    "device": str(DEVICE),
    # Record whether each saved matrix is already L2-normalized, so a consumer
    # knows if it still has to normalize before a cosine / inner-product search.
    "answer_embeddings_normalized": _rows_are_unit_norm(answer_embeddings),
    "question_embeddings_normalized": _rows_are_unit_norm(question_embeddings),
    "files": {
        "clean_csv": clean_csv_path,
        "answer_embeddings": ans_path,
        "question_embeddings": que_path,
        "faiss_index_cpu": faiss_path,
        "id_map": id_map_path,
    },
}
with open(os.path.join(OUTPUT_DIR, "config.json"), "w", encoding="utf-8") as f:
    json.dump(config, f, indent=2)

print("\nSaved artifacts:")
for k, v in config["files"].items():
    print(f"- {k}: {v}")
