In [16]:
from sentence_transformers import CrossEncoder

In [None]:
import os
import json
import faiss
import torch
import numpy as np
import math
from tqdm import tqdm
from typing import List, Dict
from transformers import AutoTokenizer, AutoModel
from sentence_transformers import CrossEncoder

embed_model_name = "intfloat/e5-base-v2"
tokenizer = AutoTokenizer.from_pretrained(embed_model_name)
embed_model = AutoModel.from_pretrained(embed_model_name)

def get_embedding(texts: List[str], max_length=512, batch_size=8):
    all_embeddings = []
    for i in tqdm(range(0, len(texts), batch_size), desc="Embedding"):
        batch = texts[i:i+batch_size]
        inputs = tokenizer(batch, return_tensors="pt", padding=True, truncation=True, max_length=max_length)
        with torch.no_grad():
            outputs = embed_model(**inputs)
        batch_embeddings = outputs.last_hidden_state[:, 0].cpu().numpy()
        all_embeddings.append(batch_embeddings)
    return np.vstack(all_embeddings)


reranker = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2", device='cpu')

def rerank(query: str, docs: List[str]) -> List[str]:
    pairs = [[query, doc] for doc in docs]
    scores = reranker.predict(pairs)
    scored_docs = list(zip(docs, scores))
    scored_docs.sort(key=lambda x: x[1], reverse=True)
    return [doc for doc, _ in scored_docs]

file_paths = [
    'Pdf/dataset (1).json',
    'Pdf/Merged Q& A data.json',
    'Pdf/public_health_batch_2_51_to_100.json',
    'Pdf/public_health_batch_3_101_to_150.json',
    'Pdf/public_health_batch_4_next_50.json',
    'Pdf/public_health_batch_5_next_50.json',
    'Pdf/public_health_batch_6_statewise_200_to_249.json',
    'Pdf/public_health_batch_7_statewise_250_to_299.json',
    'Pdf/public_health_batch_9_statewise_350_to_399.json',
    'Pdf/public_health_batch_10_countywise_400_to_449.json',
    'Pdf/public_health_batch_11_countywise_450_to_499.json',
    'Pdf/public_health_batch_12_countywise_500_to_549.json',
    'Pdf/public_health_batch_13_countywise_550_to_599.json',
    'Pdf/public_health_socioeconomic_dataset.json',
    'Pdf/testing.json'
]

def extract_text_from_file(path: str) -> List[Dict]:
    with open(path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    texts = []
    if isinstance(data, dict) and "entries" in data:
        for entry in data["entries"]:
            content = f"Q: {entry.get('question')} A: {entry.get('answer')}"
            texts.append({"text": content, "source": os.path.basename(path)})
    elif isinstance(data, dict):
        for doc in data.values():
            content = ' '.join([
                doc.get("title", ""),
                doc.get("abstract", ""),
                doc.get("summary", ""),
                doc.get("key_findings", ""),
                doc.get("conclusion", "")
            ])
            texts.append({"text": content, "source": os.path.basename(path)})
    elif isinstance(data, list):
        for item in data:
            content = ' '.join([
                item.get("socio_economic_indicator", ""),
                item.get("summary", ""),
                item.get("analysis", ""),
                item.get("statistical_findings", "")
            ])
            texts.append({"text": content, "source": os.path.basename(path)})
    return texts

documents = []
for path in file_paths:
    documents.extend(extract_text_from_file(path))

texts = [doc["text"] for doc in documents]
print(f"\nGenerating embeddings for {len(texts)} documents...")
embeddings = get_embedding(texts)

dimension = embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(np.array(embeddings))

faiss.write_index(index, "vector_index.faiss")
with open("vector_metadata.json", "w") as f:
    json.dump(documents, f, indent=2)


def dcg(scores):
    return sum([(rel / math.log2(rank + 2)) for rank, rel in enumerate(scores)])

def ndcg_at_k(predicted_ids, relevance_dict, k=5):
    scores = [relevance_dict.get(str(doc_id), 0) for doc_id in predicted_ids[:k]]
    dcg_val = dcg(scores)
    ideal_scores = sorted(relevance_dict.values(), reverse=True)[:k]
    idcg_val = dcg(ideal_scores)
    return dcg_val / idcg_val if idcg_val > 0 else 0.0


def evaluate_ndcg(index, documents, model, eval_queries, k=5):
    ndcg_total = 0
    num_queries = len(eval_queries)

    print("\nPer-query nDCG@5 scores:")

    for item in tqdm(eval_queries):
        query = item["query"]
        relevance = item["relevance"]

        query_embedding = get_embedding([query])
        distances, indices = index.search(np.array(query_embedding), 10)
        top_docs_raw = [documents[i]["text"] for i in indices[0]]

        reranked_texts = rerank(query, top_docs_raw)
        top_k_docs = reranked_texts[:k]

        predicted_ids = [
            idx for idx in indices[0]
            if documents[idx]["text"] in top_k_docs
        ][:k]

        score = ndcg_at_k(predicted_ids, relevance, k)
        ndcg_total += score

        print(f" - Query: {query[:50]}... → nDCG@{k}: {score:.4f}")

    print(f"\nFinal Evaluation Summary (Graded Relevance):")
    print(f" - Mean nDCG@{k}: {ndcg_total / num_queries:.4f}")


def convert_old_eval_to_graded(path="evaluation_queries.json"):
    with open(path, "r") as f:
        data = json.load(f)

    if "relevance" in data[0]:
        print("Already in graded format.")
        return

    new_data = []
    for item in data:
        new_data.append({
            "query": item["query"],
            "relevance": {str(doc_id): 3 for doc_id in item["relevant_doc_ids"]}
        })

    with open(path, "w") as f:
        json.dump(new_data, f, indent=2)
    print("Converted evaluation_queries.json to graded format.")

if os.path.exists("evaluation_queries.json"):
    convert_old_eval_to_graded("evaluation_queries.json")
    with open("evaluation_queries.json", "r") as f:
        eval_queries = json.load(f)
    index = faiss.read_index("vector_index.faiss")
    evaluate_ndcg(index, documents, embed_model, eval_queries, k=5)
else:
    print("No evaluation file found.")


You are trying to use a model that was created with Sentence Transformers version 4.1.0.dev0, but you're currently using version 4.0.2. This might cause unexpected behavior or errors. In that case, try to update to the latest version.



🔍 Generating embeddings for 1441 documents...


Embedding: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████| 181/181 [04:28<00:00,  1.48s/it]


✅ Converted evaluation_queries.json to graded format.


  0%|                                                                                                                                 | 0/5 [00:00<?, ?it/s]
Embedding: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 17.29it/s][A
 20%|████████████████████████▏                                                                                                | 1/5 [00:00<00:01,  3.68it/s]
Embedding: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 17.78it/s][A
 40%|████████████████████████████████████████████████▍                                                                        | 2/5 [00:00<00:00,  3.37it/s]
Embedding: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 17.26it/s][A
 60%|████████████████████████████████████████████


📊 Evaluation Summary (Graded Relevance, Top-5):
 - nDCG@5: 1.0000





In [21]:
np.savez_compressed("document_embeddings.npz", embeddings=embeddings)

In [22]:
with open("document_metadata.json", "w") as f:
    json.dump(documents, f, indent=2)

In [23]:
# Load embeddings
loaded = np.load("document_embeddings.npz")
embeddings = loaded["embeddings"]

# Load metadata
with open("document_metadata.json", "r") as f:
    documents = json.load(f)
