In [1]:
# --- Step 1: Imports ---
import json
import os
import pickle

import faiss
import numpy as np
from sentence_transformers import SentenceTransformer

In [2]:
# --- Step 2: Path to your dataset ---
# Folder that is scanned for the merged *.json files. The path is relative,
# so it resolves against the notebook's working directory; adjust it if your
# checkout is laid out differently.
json_dir = "../notebooks/merged_json_dataset/merged_json_dataset"

In [3]:

# --- Step 3: Load the embedding model ---
# The checkpoint id lives in a named constant so it is easy to spot and swap.
EMBEDDING_MODEL_NAME = "intfloat/multilingual-e5-base"
model = SentenceTransformer(EMBEDDING_MODEL_NAME)

In [5]:
# Collect every non-empty text passage from the merged JSON dataset, plus one
# metadata record per passage. `json_dir` is the folder set in the Step 2 cell.

texts = []      # stripped passages; texts[i] is described by metadata[i]
metadata = []   # disorder name, section, source and originating file per passage

# os.listdir() returns names in arbitrary order. Sorting keeps the passage
# order (and therefore the row order of anything built from `texts`)
# reproducible across runs and machines.
json_files = sorted(
    name for name in os.listdir(json_dir) if name.endswith(".json")
)

for file_name in json_files:
    with open(os.path.join(json_dir, file_name), "r", encoding="utf-8") as f:
        data = json.load(f)  # expected to be a list of dicts — TODO confirm for every file

    for entry in data:
        # Fields shared by every passage taken from this entry
        disorder_name = entry.get("condition", "Unknown")
        section_name = entry.get("section", "Unknown")
        source = entry.get("source", None)

        # Some entries have text1, text2, ... others have just "text"
        for key, value in entry.items():
            # Non-string values (null, numbers, nested lists) have no .strip();
            # skip them instead of crashing the whole load.
            if not (key.startswith("text") and isinstance(value, str)):
                continue

            passage = value.strip()
            if not passage:
                continue  # whitespace-only field

            texts.append(passage)
            metadata.append({
                "disorder": disorder_name,
                "section": section_name,
                "source": source,
                "file": file_name
            })

# Report the number of JSON files actually read, not every entry in the folder.
print(f"✅ Loaded {len(texts)} text chunks from {len(json_files)} JSON files.")


✅ Loaded 1610 text chunks from 9 JSON files.


In [11]:
# 2️⃣ Prepare "passage: " prefixed texts
# `model` is already loaded by the Step 3 cell; instantiating it again here
# would only reload the same checkpoint a second time.
# NOTE(review): the "passage: " prefix follows the E5 convention for indexed
# documents — queries presumably need the matching "query: " prefix; confirm
# against the model card.
texts_prefixed = [f"passage: {t}" for t in texts]  # texts = your dataset list


In [12]:
# 3️⃣ Embed all prefixed passages in a single call.
# The result is returned as a NumPy array; a progress bar is shown while the
# batches are encoded.
embeddings = model.encode(
    texts_prefixed,
    show_progress_bar=True,
    convert_to_numpy=True,
)
print(f"✅ Generated embeddings of shape: {embeddings.shape}")

Batches:   0%|          | 0/51 [00:00<?, ?it/s]

✅ Generated embeddings of shape: (1610, 768)


In [13]:
# 4️⃣ Create FAISS index (L2 distance)
# The index is sized from the second axis of the embedding matrix
# (the vector width), then every embedding row is added in order.
dimension = embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
# NOTE(review): vectors presumably receive ids 0..n-1 in insertion order, which
# is what ties a search hit back to texts[i] / metadata[i] — confirm in FAISS docs.
index.add(embeddings)
# ntotal is the number of vectors now stored in the index.
print(f"✅ FAISS index contains {index.ntotal} vectors")

✅ FAISS index contains 1610 vectors


In [14]:
# 5️⃣ Save FAISS + metadata + texts
# The three artifacts are linked only by position (index row i <-> texts[i]
# <-> metadata[i]). Refuse to write them if the counts disagree; otherwise
# search hits would silently be paired with the wrong passage.
assert index.ntotal == len(texts) == len(metadata), (
    f"Misaligned artifacts: index={index.ntotal}, "
    f"texts={len(texts)}, metadata={len(metadata)}"
)

faiss.write_index(index, "faiss_index.bin")

# `pickle` comes from the imports cell at the top of the notebook.
# NOTE: unpickling can execute arbitrary code — only load these files back
# from a location you trust.
with open("metadata.pkl", "wb") as f:
    pickle.dump(metadata, f)  # list of metadata dicts

with open("texts.pkl", "wb") as f:
    pickle.dump(texts, f)  # list of original text passages

print("💾 Saved FAISS index, metadata & texts")

💾 Saved FAISS index, metadata & texts
