In [None]:
# Switch to the project folder so the relative paths used in later cells
# (e.g. 'Dataset/Validation/...') resolve against it.
# Written as an explicit %cd magic: the bare `cd` automagic is only recognised
# in single-line cells and stops working if automagic is disabled or a Python
# variable named `cd` exists.
%cd /content/drive/MyDrive/GenAI Project/

/content/drive/MyDrive/GenAI Project


In [None]:
import json

# Load the document corpus. Later cells treat it as a dict (they call
# .values() and len() on it and look documents up by id).
# JSON text is UTF-8 by specification, so decode it explicitly instead of
# relying on the platform's locale default.
with open('Dataset/Validation/final_data.json', 'r', encoding='utf-8') as f:
    itihasa_data = json.load(f)

In [None]:
# Preview only a few entries: echoing the whole corpus produces an output so
# large that the notebook front-end hides it.
# Assumes each value is sliceable (document text) - the notebook calls len()
# on the values and passes them to the reranker as text.
{doc_id: itihasa_data[doc_id][:200] for doc_id in list(itihasa_data)[:3]}

Output hidden; open in https://colab.research.google.com to view.

In [None]:
# Add up len() of every document stored in the corpus dict.
total_len = sum(len(document) for document in itihasa_data.values())

In [None]:
# Display the combined length of all documents (sum of len() over the corpus values).
total_len

13406678

In [None]:
# Number of entries (documents) in the corpus dict.
len(itihasa_data)

2108

In [None]:
# Install the retrieval dependencies into the environment of the running
# kernel: %pip always targets the kernel's interpreter, whereas !pip runs
# whichever `pip` is first on the shell PATH.
# faiss-cpu is pinned to the version this notebook's recorded output installed;
# the other packages were already present, so their versions are not known here.
%pip install -q sentence-transformers faiss-cpu==1.12.0 tqdm


Collecting faiss-cpu
  Downloading faiss_cpu-1.12.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.1 kB)
Downloading faiss_cpu-1.12.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (31.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.4/31.4 MB[0m [31m19.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.12.0


In [None]:
import json

# Load the validation questions. The evaluator iterates this as a dict of
# ground-truth document id -> entry, reading each entry's "questions" list.
# Decode explicitly as UTF-8 rather than relying on the locale default.
with open('Dataset/Validation/mahabharata_questions.json', 'r', encoding='utf-8') as f:
    mahabharata_questions = json.load(f)

In [None]:
import numpy as np
import pickle
import faiss
import pandas as pd
from sentence_transformers import SentenceTransformer
from tqdm import tqdm

class EmbeddingSearchEvaluator:
    """Evaluate bi-encoder retrieval over a precomputed embedding matrix.

    The evaluator loads a SentenceTransformer model, a ``.npy`` matrix of
    document embeddings and a pickled sequence of document ids (row ``i`` of
    the matrix is taken to belong to ``doc_ids[i]``). Embeddings and queries
    are L2-normalised and searched with an exact inner-product FAISS index,
    so every reported score is a cosine similarity.
    """

    def __init__(self,
                 model_name="sentence-transformers/multi-qa-mpnet-base-dot-v1",
                 emb_path="embeddings.npy",
                 ids_path="doc_ids.pkl"):
        """Load the model, embeddings and ids, then build the FAISS index.

        Args:
            model_name: SentenceTransformer model used to encode queries.
            emb_path: Path of the ``.npy`` file holding the document embeddings.
            ids_path: Path of the pickle holding the document ids.
        """
        self.model = SentenceTransformer(model_name)

        # FAISS only accepts C-contiguous float32 matrices; this is a no-op
        # when the saved array already has that layout.
        self.embeddings = np.ascontiguousarray(np.load(emb_path), dtype=np.float32)

        # NOTE(security): pickle.load can execute arbitrary code - only point
        # ids_path at files you created yourself.
        with open(ids_path, "rb") as f:
            self.doc_ids = pickle.load(f)
        print(f"Loaded {len(self.embeddings)} embeddings.")

        # Normalise in place so inner product == cosine similarity.
        faiss.normalize_L2(self.embeddings)

        # Exact (brute-force) inner-product index over all documents.
        dim = self.embeddings.shape[1]
        self.index = faiss.IndexFlatIP(dim)
        self.index.add(self.embeddings)

    def _encode_query(self, query):
        """Return the L2-normalised embedding of ``query`` as a (1, dim) array."""
        query_vec = self.model.encode([query], convert_to_numpy=True)
        faiss.normalize_L2(query_vec)
        return query_vec

    def _top_hits(self, query_vec, top_k):
        """Return ``[(doc_id, score), ...]`` for the best ``top_k`` documents.

        FAISS pads the result with label -1 when fewer than ``top_k`` vectors
        are available; those slots are dropped instead of being indexed as
        ``doc_ids[-1]`` (which would silently return the last document).
        """
        D, I = self.index.search(query_vec, k=top_k)
        return [
            (self.doc_ids[int(idx)], float(score))
            for idx, score in zip(I[0], D[0])
            if idx >= 0
        ]

    def search(self, query, top_k=3):
        """Search for top_k most similar documents for a query string.

        Returns:
            A list of dicts with keys ``rank`` (1-based), ``score`` and
            ``doc_id``, best match first. The list is shorter than ``top_k``
            when the index holds fewer documents.
        """
        hits = self._top_hits(self._encode_query(query), top_k)
        return [
            {"rank": position, "score": score, "doc_id": doc_id}
            for position, (doc_id, score) in enumerate(hits, start=1)
        ]

    def evaluate_questions(self, questions_dict, top_k=3):
        """Run retrieval for every question and collect per-question results.

        Args:
            questions_dict: Mapping of ground-truth document id to an entry
                dict; the entry's ``"questions"`` list (if any) is evaluated.
            top_k: Number of documents retrieved per question.

        Returns:
            The results DataFrame (also stored as ``self.results_df``), one
            row per question.
        """
        columns = ["query", "ground_truth", "top_ids", "top_scores",
                   "correct_in_top_k", "rank_of_correct", "ground_truth_score"]

        # Build the id -> row lookup once instead of scanning doc_ids for
        # every question. setdefault keeps the first occurrence, matching
        # what list.index() would return.
        id_to_row = {}
        for row, doc_id in enumerate(self.doc_ids):
            id_to_row.setdefault(doc_id, row)

        rows = []
        for true_id, entry in tqdm(questions_dict.items()):
            true_row = id_to_row.get(true_id)
            for q in entry.get("questions", []):
                query_vec = self._encode_query(q)
                hits = self._top_hits(query_vec, top_k)
                retrieved_ids = [doc_id for doc_id, _ in hits]
                retrieved_scores = [score for _, score in hits]

                # Similarity between the query and the ground-truth document.
                # Both operands are 1-D, so np.dot yields a true scalar (a
                # (1, 1) result passed to float() raises a NumPy deprecation
                # warning).
                if true_row is None:
                    correct_score = None  # ground truth is not in the index
                else:
                    correct_score = float(np.dot(query_vec[0], self.embeddings[true_row]))

                # Rank of the ground truth within the retrieved list, if present.
                correct = true_id in retrieved_ids
                rank = retrieved_ids.index(true_id) + 1 if correct else None

                rows.append({
                    "query": q,
                    "ground_truth": true_id,
                    "top_ids": retrieved_ids,
                    "top_scores": retrieved_scores,
                    "correct_in_top_k": correct,
                    "rank_of_correct": rank,
                    "ground_truth_score": correct_score
                })

        # Explicit columns keep the frame's schema stable even with no rows.
        self.results_df = pd.DataFrame(rows, columns=columns)
        return self.results_df

    def accuracy(self):
        """Compute Top-k retrieval accuracy.

        Returns:
            Fraction of evaluated questions whose ground-truth document was
            among the retrieved documents.

        Raises:
            ValueError: If ``evaluate_questions()`` has not been run yet.
        """
        if not hasattr(self, "results_df"):
            raise ValueError("Run evaluate_questions() first.")
        # The hit flag is stored under "correct_in_top_k"; there is no
        # "correct" column in the results frame.
        return self.results_df["correct_in_top_k"].mean()

    def save_results(self, path="retrieval_results.csv"):
        """Save results as CSV, creating the parent directory if needed."""
        import os

        if not hasattr(self, "results_df"):
            print("No results to save.")
            return

        # to_csv does not create missing directories; make sure a finished
        # evaluation is not lost to a missing output folder.
        out_dir = os.path.dirname(path)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)

        self.results_df.to_csv(path, index=False)
        print(f"✅ Saved results to {path}")


In [None]:
# Initialize evaluator: loads the bi-encoder, the precomputed embedding matrix
# and the pickled document ids, then builds the FAISS index.
# Both file paths are relative to the current working directory.
evaluator = EmbeddingSearchEvaluator(
    model_name="sentence-transformers/multi-qa-mpnet-base-dot-v1",
    emb_path="embeddings_multi_mp.npy",
    ids_path="doc_ids.pkl"
)

Loaded 2108 embeddings.


In [None]:
# Run retrieval for every validation question, keeping the 3 nearest documents per query.
df_results = evaluator.evaluate_questions(mahabharata_questions, top_k=3)
print(df_results.head())

# Persist the per-question results table as CSV.
evaluator.save_results("1.3/retrieval_results_3.csv")


  correct_score = float(np.dot(query_vec, true_emb.T))
100%|██████████| 182/182 [00:52<00:00,  3.43it/s]


                                               query ground_truth  \
0  What is the main premise and scope of the Maha...          1.1   
1  Which topics are briefly listed or summarized ...          1.1   
2  What glories or benefits are said to come from...          1.1   
3  What is the significance of the Samantpanchak ...          1.2   
4  How is the Akshauhini army described or enumer...          1.2   

                 top_ids                                         top_scores  \
0  [18.6, 12.275, 14.41]  [0.5655632615089417, 0.557400107383728, 0.5542...   
1    [14.44, 18.6, 1.59]  [0.5797037482261658, 0.5629180669784546, 0.553...   
2     [18.6, 1.1, 3.164]  [0.5846554040908813, 0.5622318983078003, 0.551...   
3     [1.2, 3.163, 9.55]  [0.6505209803581238, 0.5605310797691345, 0.537...   
4    [5.19, 5.155, 6.19]  [0.6155834794044495, 0.5672976970672607, 0.561...   

   correct_in_top_k  rank_of_correct  ground_truth_score  
0             False              NaN            0.4

In [None]:
# Repeat the evaluation with 5 retrieved documents per query.
# This rebinds df_results (and evaluator.results_df) to the new run.
df_results = evaluator.evaluate_questions(mahabharata_questions, top_k=5)
print(df_results.head())

# Persist the per-question results table as CSV.
evaluator.save_results("1.3/retrieval_results_5.csv")


  correct_score = float(np.dot(query_vec, true_emb.T))
100%|██████████| 182/182 [00:51<00:00,  3.52it/s]

                                               query ground_truth  \
0  What is the main premise and scope of the Maha...          1.1   
1  Which topics are briefly listed or summarized ...          1.1   
2  What glories or benefits are said to come from...          1.1   
3  What is the significance of the Samantpanchak ...          1.2   
4  How is the Akshauhini army described or enumer...          1.2   

                                top_ids  \
0  [18.6, 12.275, 14.41, 14.44, 12.231]   
1     [14.44, 18.6, 1.59, 1.62, 12.231]   
2      [18.6, 1.1, 3.164, 1.59, 12.271]   
3       [1.2, 3.163, 9.55, 6.59, 5.111]   
4      [5.19, 5.155, 6.19, 6.76, 6.100]   

                                          top_scores  correct_in_top_k  \
0  [0.5655632615089417, 0.557400107383728, 0.5542...             False   
1  [0.5797037482261658, 0.5629180669784546, 0.553...             False   
2  [0.5846554040908813, 0.5622318983078003, 0.551...              True   
3  [0.6505209803581238, 0.5605




In [None]:
import numpy as np
import pickle
import faiss
import pandas as pd
from tqdm import tqdm
from sentence_transformers import SentenceTransformer, CrossEncoder


class EmbeddingSearchEvaluator:
    """Two-stage retrieval evaluator: FAISS recall followed by cross-encoder reranking.

    Stage one encodes the query with a SentenceTransformer bi-encoder and
    fetches the ``top_k`` nearest documents from an exact inner-product FAISS
    index over L2-normalised embeddings (cosine similarity). Stage two scores
    each (query, document text) pair with a CrossEncoder and keeps the
    ``rerank_top_n`` best.
    """

    def __init__(self,
                 model_name="sentence-transformers/multi-qa-mpnet-base-dot-v1",
                 cross_encoder_name="cross-encoder/ms-marco-MiniLM-L-6-v2",
                 emb_path="embeddings.npy",
                 ids_path="doc_ids.pkl",
                 texts_path="doc_texts.pkl",
                 doc_texts=None):
        """Load both models, the embeddings and ids, and build the FAISS index.

        Args:
            model_name: SentenceTransformer model used to encode queries.
            cross_encoder_name: CrossEncoder model used for reranking.
            emb_path: Path of the ``.npy`` file holding the document embeddings.
            ids_path: Path of the pickle holding the document ids.
            texts_path: Accepted for interface compatibility; it is not read.
            doc_texts: Mapping of document id to document text. When omitted,
                the notebook-level ``itihasa_data`` variable is used, which
                therefore has to be defined before constructing the evaluator.
        """
        # Bi-encoder for first-stage retrieval.
        self.model = SentenceTransformer(model_name)

        # Cross-encoder for second-stage reranking.
        self.reranker = CrossEncoder(cross_encoder_name)

        # FAISS only accepts C-contiguous float32 matrices; this is a no-op
        # when the saved array already has that layout.
        self.embeddings = np.ascontiguousarray(np.load(emb_path), dtype=np.float32)

        # NOTE(security): pickle.load can execute arbitrary code - only point
        # ids_path at files you created yourself.
        with open(ids_path, "rb") as f:
            self.doc_ids = pickle.load(f)

        # Prefer explicitly supplied texts; fall back to the notebook global
        # so existing calls keep working unchanged.
        if doc_texts is None:
            doc_texts = itihasa_data
        self.doc_texts = doc_texts

        print(f"Loaded {len(self.embeddings)} embeddings and reranker '{cross_encoder_name}'.")

        # Normalise in place so inner product == cosine similarity.
        faiss.normalize_L2(self.embeddings)

        # Exact (brute-force) inner-product index over all documents.
        dim = self.embeddings.shape[1]
        self.index = faiss.IndexFlatIP(dim)
        self.index.add(self.embeddings)

    def search(self, query, top_k=10, rerank_top_n=3):
        """Search and rerank results.

        Args:
            query: Query string.
            top_k: Number of FAISS candidates handed to the cross-encoder.
            rerank_top_n: Number of reranked documents to return.

        Returns:
            Up to ``rerank_top_n`` dicts with keys ``doc_id``, ``text``,
            ``score`` (bi-encoder similarity) and ``rerank_score``, ordered by
            descending ``rerank_score``.
        """
        query_vec = self.model.encode([query], convert_to_numpy=True)
        faiss.normalize_L2(query_vec)

        # First stage: nearest neighbours from FAISS.
        D, I = self.index.search(query_vec, k=top_k)
        retrieved = []
        for idx, score in zip(I[0], D[0]):
            # FAISS pads with label -1 when fewer than top_k vectors exist;
            # indexing doc_ids[-1] would silently return the last document.
            if idx < 0:
                continue
            doc_id = self.doc_ids[int(idx)]
            retrieved.append({
                "doc_id": doc_id,
                "text": self.doc_texts[doc_id],
                "score": float(score)
            })

        # Nothing to rerank - skip the cross-encoder call entirely.
        if not retrieved:
            return []

        # Second stage: score every (query, document text) pair.
        pairs = [(query, doc["text"]) for doc in retrieved]
        rerank_scores = self.reranker.predict(pairs)

        for doc, rscore in zip(retrieved, rerank_scores):
            doc["rerank_score"] = float(rscore)

        reranked = sorted(retrieved, key=lambda x: x["rerank_score"], reverse=True)
        return reranked[:rerank_top_n]

    def evaluate_questions(self, questions_dict, top_k=10, rerank_top_n=3):
        """Run search-and-rerank for every question and collect the results.

        Args:
            questions_dict: Mapping of ground-truth document id to an entry
                dict; the entry's ``"questions"`` list (if any) is evaluated.
            top_k: Number of FAISS candidates per question.
            rerank_top_n: Number of reranked documents kept per question.

        Returns:
            The results DataFrame (also stored as ``self.results_df``), one
            row per question. ``correct_in_top_k`` and ``rank_of_correct``
            refer to the reranked list of at most ``rerank_top_n`` documents.
        """
        columns = ["query", "ground_truth", "top_ids",
                   "correct_in_top_k", "rank_of_correct"]
        rows = []
        for true_id, entry in tqdm(questions_dict.items()):
            for q in entry.get("questions", []):
                results = self.search(q, top_k=top_k, rerank_top_n=rerank_top_n)
                retrieved_ids = [r["doc_id"] for r in results]
                correct = true_id in retrieved_ids
                rank = retrieved_ids.index(true_id) + 1 if correct else None

                rows.append({
                    "query": q,
                    "ground_truth": true_id,
                    "top_ids": retrieved_ids,
                    "correct_in_top_k": correct,
                    "rank_of_correct": rank
                })

        # Explicit columns keep the schema stable even when no question was
        # evaluated, so topk_accuracy() does not fail on a column-less frame.
        self.results_df = pd.DataFrame(rows, columns=columns)
        return self.results_df

    def topk_accuracy(self, k=3):
        """Compute top-k retrieval accuracy after reranking.

        Returns:
            Fraction of questions whose ground-truth document was reranked
            into position ``k`` or better.

        Raises:
            ValueError: If ``evaluate_questions()`` has not been run yet.
        """
        if not hasattr(self, "results_df"):
            raise ValueError("Run evaluate_questions() first.")
        return (self.results_df["rank_of_correct"].notnull() &
                (self.results_df["rank_of_correct"] <= k)).mean()

    def save_results(self, path="retrieval_results.csv"):
        """Save results as CSV, creating the parent directory if needed."""
        import os

        if not hasattr(self, "results_df"):
            print("No results to save.")
            return

        # to_csv does not create missing directories; make sure a long
        # evaluation run is not lost to a missing output folder.
        out_dir = os.path.dirname(path)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)

        self.results_df.to_csv(path, index=False)
        print(f"✅ Saved results to {path}")


In [None]:
# Re-create the evaluator with the reranking class, replacing the default
# cross-encoder with ms-marco-electra-base.
# texts_path is not passed: the class takes the document texts from the
# notebook-level itihasa_data variable, so that cell must have been run first.
evaluator = EmbeddingSearchEvaluator(
    model_name="sentence-transformers/multi-qa-mpnet-base-dot-v1",
    emb_path="Retrieval Experiments/embeddings_multi_mp.npy",
    cross_encoder_name="cross-encoder/ms-marco-electra-base",
    ids_path="Retrieval Experiments/doc_ids.pkl",
)



config.json:   0%|          | 0.00/730 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/132 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

Loaded 2108 embeddings and reranker 'cross-encoder/ms-marco-electra-base'.


In [None]:
# Evaluate retrieval + reranking: top_k=5 is the number of FAISS candidates
# handed to the cross-encoder; rerank_top_n keeps its default of 3, so each
# row lists three document ids.
# The recorded run of this cell took about 1h33m.
df_results = evaluator.evaluate_questions(mahabharata_questions, top_k=5)
print(df_results.head())

# Persist the per-question results table as CSV.
evaluator.save_results("Retrieval Experiments/1.4/1.5_retrieval_results_5.csv")


100%|██████████| 182/182 [1:33:21<00:00, 30.78s/it]

                                               query ground_truth  \
0  What is the main premise and scope of the Maha...          1.1   
1  Which topics are briefly listed or summarized ...          1.1   
2  What glories or benefits are said to come from...          1.1   
3  What is the significance of the Samantpanchak ...          1.2   
4  How is the Akshauhini army described or enumer...          1.2   

                top_ids  correct_in_top_k  rank_of_correct  
0  [18.6, 14.41, 14.44]             False              NaN  
1    [1.62, 18.6, 1.59]             False              NaN  
2     [18.6, 1.1, 1.59]              True              2.0  
3    [9.55, 1.2, 3.163]              True              2.0  
4   [5.19, 5.155, 6.19]             False              NaN  
✅ Saved results to Retrieval Experiments/1.4/1.5_retrieval_results_5.csv



