In [None]:
cd /content/drive/MyDrive/GenAI Project/

/content/drive/MyDrive/GenAI Project


In [None]:
import json

def read_json(filepath):
    """Read a JSON file and return the parsed data.

    The file is decoded as UTF-8 explicitly instead of with the platform's
    locale encoding, so files containing non-ASCII text load the same way on
    every machine.

    Args:
        filepath: Path of the JSON file to read.

    Returns:
        The value produced by ``json.load`` for the file's contents.

    Raises:
        FileNotFoundError: If ``filepath`` does not exist.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    with open(filepath, 'r', encoding='utf-8') as f:
        return json.load(f)

def write_json(data, filepath):
    """Serialize ``data`` as JSON into ``filepath`` using a four-space indent."""
    with open(filepath, 'w') as out_file:
        json.dump(obj=data, fp=out_file, indent=4)

In [None]:
# Files under Dataset/Corpus
entity_index = read_json('Dataset/Corpus/entity_index.json')
entities_kb = read_json('Dataset/Corpus/entities_kb.json')
chapter_entity_ids = read_json('Dataset/Corpus/chapter_entity_ids.json')

# Files under Dataset/Validation
mahabharata_questions = read_json('Dataset/Validation/mahabharata_questions.json')


In [None]:
import json

# Go through the notebook's read_json helper instead of repeating open()/json.load().
itihasa_data = read_json('Dataset/Validation/final_data.json')

In [None]:
!pip install sentence-transformers faiss-cpu tqdm


Collecting faiss-cpu
  Downloading faiss_cpu-1.12.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.1 kB)
Downloading faiss_cpu-1.12.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (31.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.4/31.4 MB[0m [31m42.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.12.0


In [None]:
import numpy as np
import pickle
import faiss
import pandas as pd
import json
from sentence_transformers import SentenceTransformer
from tqdm import tqdm


class EmbeddingSearchEvaluator:
    """Score questions against every stored document embedding.

    The constructor loads a SentenceTransformer model, a matrix of document
    embeddings and the parallel list of document IDs, L2-normalizes the
    embeddings and puts them in a FAISS inner-product index. Queries are
    normalized the same way, so each score is a cosine similarity.

    Attributes set by ``__init__``: ``model``, ``embeddings``, ``doc_ids``, ``index``.
    Attribute set by ``evaluate_full_scores``: ``full_scores``.
    """

    def __init__(self,
                 model_name="sentence-transformers/multi-qa-mpnet-base-dot-v1",
                 emb_path="embeddings.npy",
                 ids_path="doc_ids.pkl"):
        """Load the encoder, the embeddings and their IDs, then build the index.

        Args:
            model_name: Name or path handed to ``SentenceTransformer``.
            emb_path: ``.npy`` file holding the document embedding matrix.
            ids_path: Pickle file holding the document IDs; entry ``i`` names
                row ``i`` of the embedding matrix.

        Raises:
            ValueError: If the number of IDs differs from the number of
                embedding rows (scores could not be attributed correctly).
        """
        # Load model and data
        self.model = SentenceTransformer(model_name)
        # FAISS only accepts C-contiguous float32 matrices; this is a no-op when
        # the saved array already has that layout.
        self.embeddings = np.ascontiguousarray(np.load(emb_path), dtype=np.float32)
        # NOTE(security): pickle.load can execute arbitrary code; only point
        # ids_path at files you produced yourself.
        with open(ids_path, "rb") as f:
            self.doc_ids = pickle.load(f)

        # Row i of the index is reported as doc_ids[i]; a length mismatch would
        # mislabel or drop documents, so fail early.
        if len(self.doc_ids) != len(self.embeddings):
            raise ValueError(
                f"Got {len(self.doc_ids)} document IDs for "
                f"{len(self.embeddings)} embeddings; the two must match."
            )
        print(f"✅ Loaded {len(self.embeddings)} embeddings.")

        # Normalize (in place) so inner product equals cosine similarity
        faiss.normalize_L2(self.embeddings)

        # Build FAISS index
        dim = self.embeddings.shape[1]
        self.index = faiss.IndexFlatIP(dim)
        self.index.add(self.embeddings)

    @staticmethod
    def _scores_by_doc_id(doc_ids, scores, indices):
        """Map one row of FAISS search output to ``{doc_id: score}``.

        FAISS pads missing neighbours with label ``-1``; those entries are
        skipped because ``doc_ids[-1]`` would silently name the last document.
        """
        return {
            doc_ids[int(idx)]: float(score)
            for idx, score in zip(indices, scores)
            if idx >= 0
        }

    def evaluate_full_scores(self, questions_dict):
        """Compute similarity scores between each question and ALL documents.

        Args:
            questions_dict: Mapping whose values are dicts with an optional
                ``"questions"`` list of question strings. The keys are not used.

        Returns:
            dict[question][doc_id] = similarity_score. The same dict is also
            stored on ``self.full_scores``.
        """
        results = {}
        # Request exactly as many neighbours as the index holds, so FAISS never
        # has to pad the result with -1 labels.
        n_docs = self.index.ntotal

        for entry in tqdm(questions_dict.values(), desc="Evaluating questions"):
            for q in entry.get("questions", []):
                # Results are keyed by question text, so a repeated question
                # would only recompute and overwrite an identical entry.
                if q in results:
                    continue

                # Encode and normalize query
                query_vec = self.model.encode([q], convert_to_numpy=True)
                faiss.normalize_L2(query_vec)

                # Compute similarity with all docs
                D, I = self.index.search(query_vec, k=n_docs)

                # Map document IDs → similarity scores
                results[q] = self._scores_by_doc_id(self.doc_ids, D[0], I[0])

        self.full_scores = results
        return results

    def save_full_scores(self, path="question_scores.json"):
        """Save the full score dictionary to JSON.

        Args:
            path: Destination file. Missing parent directories are created.

        Raises:
            ValueError: If ``evaluate_full_scores()`` has not been run yet.
        """
        import os  # local import: only this method needs it

        if not hasattr(self, "full_scores"):
            raise ValueError("Run evaluate_full_scores() first.")

        # Create the target folder so a long evaluation is not lost to a
        # FileNotFoundError at save time.
        out_dir = os.path.dirname(path)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)

        with open(path, "w", encoding="utf-8") as f:
            json.dump(self.full_scores, f, indent=4, ensure_ascii=False)
        print(f"✅ Saved question–chapter scores to {path}")


In [None]:
# Construct the evaluator: loads the encoder, the stored embeddings and their
# document IDs, and builds the search index.
evaluator = EmbeddingSearchEvaluator(
    ids_path="Retrieval Experiments/Models/doc_ids.pkl",
    emb_path="Retrieval Experiments/Models/embeddings_multi_mp.npy",
    model_name="sentence-transformers/multi-qa-mpnet-base-dot-v1",
)

✅ Loaded 2108 embeddings.


In [None]:
# Compute the scores in this cell so it also works on a fresh kernel:
# save_full_scores() raises ValueError unless evaluate_full_scores() has run.
results = evaluator.evaluate_full_scores(mahabharata_questions)
evaluator.save_full_scores("Retrieval Experiments/1.9/embedding_scores.json")


✅ Saved question–chapter scores to Retrieval Experiments/1.8/embedding_scores.json


182