In [11]:
import json
import numpy as np
import faiss
from pathlib import Path
from sentence_transformers import SentenceTransformer


In [12]:
# Location of the chunked corpus in JSON Lines format (one JSON object per line).
# NOTE(review): hardcoded absolute path — adjust for the machine running this notebook.
chunks_path = Path("E:/MiiHA/app/data/processed/chunks_medline.jsonl")

In [13]:
# Load the chunked corpus: one JSON object per line (JSON Lines).
docs = []
with open(chunks_path, "r", encoding="utf-8") as f:
    for line_no, line in enumerate(f, start=1):
        line = line.strip()
        if not line:
            # Tolerate blank lines (e.g. an extra trailing newline left by an
            # editor); json.loads would otherwise raise on an empty string.
            continue
        try:
            docs.append(json.loads(line))
        except json.JSONDecodeError as exc:
            # Re-raise the same exception type, but say which file and line is
            # malformed so the bad record is easy to locate.
            raise json.JSONDecodeError(
                f"{exc.msg} (line {line_no} of {chunks_path})", exc.doc, exc.pos
            ) from exc

# Create texts for embeddings: texts[i] is the "text" field of docs[i].
texts = [doc["text"] for doc in docs]


In [14]:
# Build the metadata list from scratch: entry i keeps only the "id", "title"
# and "url" fields of docs[i].
metadata = [
    {field: doc[field] for field in ("id", "title", "url")}
    for doc in docs
]


In [16]:
# Write the new metadata list to disk as indented JSON, creating the
# destination folder first if it does not exist yet.
metadata_path = Path("E:/MiiHA/app/data/metadata/medline_metadata.json")
metadata_path.parent.mkdir(exist_ok=True, parents=True)
with metadata_path.open(mode="w", encoding="utf-8") as f:
    json.dump(metadata, f, indent=2)

# Report how many chunks were loaded and how many metadata entries were written.
print(
    f"✅ Loaded {len(texts)} chunks",
    f"✅ Saved {len(metadata)} metadata entries",
    sep="\n",
)

✅ Loaded 2612 chunks
✅ Saved 2612 metadata entries


In [17]:
# Load the sentence-embedding model, then encode every chunk text with it.
# The progress bar is kept on so the batch progress stays visible.
model = SentenceTransformer(
    model_name_or_path="sentence-transformers/all-MiniLM-L6-v2",
)
embeddings = model.encode(
    sentences=texts,
    show_progress_bar=True,
)


Batches: 100%|██████████| 82/82 [00:28<00:00,  2.86it/s]


In [18]:
# Build FAISS index
# FAISS expects a C-contiguous float32 matrix of shape (n_vectors, dimension).
# ascontiguousarray converts only when needed, avoiding the two full copies
# that np.array(...).astype("float32") always makes.
vectors = np.ascontiguousarray(embeddings, dtype=np.float32)
if vectors.ndim != 2 or vectors.shape[0] == 0:
    # An empty corpus would otherwise surface as an opaque IndexError.
    raise ValueError(
        f"Expected a non-empty 2-D embedding matrix, got shape {vectors.shape}"
    )

dimension = vectors.shape[1]
index = faiss.IndexFlatL2(dimension)  # exact (brute-force) L2 distance search
index.add(vectors)

# Row i of the index is looked up as metadata[i] at query time, so the two must
# have the same length; stale kernel state could silently break this.
assert index.ntotal == len(metadata), (
    f"Index holds {index.ntotal} vectors but metadata has {len(metadata)} entries"
)

print("✅ FAISS index created and populated")

✅ FAISS index created and populated


In [19]:
# Persist the populated index to disk, making sure the target folder exists.
output_index_path = Path("E:/MiiHA/app/db/miiha_medline.index")
index_dir = output_index_path.parent
index_dir.mkdir(exist_ok=True, parents=True)
# write_index is given the path as a plain string.
faiss.write_index(index, f"{output_index_path}")

print("✅ Index saved successfully!")

✅ Index saved successfully!
