In [1]:
cd /content/drive/MyDrive/GenAI Project/

/content/drive/MyDrive/GenAI Project


In [2]:
import json

def read_json(filepath):
    """Reads a JSON file and returns the data."""
    with open(filepath, 'r') as f:
        return json.load(f)

def write_json(data, filepath):
    """Writes data to a JSON file."""
    with open(filepath, 'w') as f:
        json.dump(data, f, indent=4)

In [4]:
mahabharata_questions =  read_json('Dataset/Test/questions.json')
itihasa_data = read_json('Dataset/Validation/final_data.json')

In [3]:
!pip install sentence-transformers faiss-cpu tqdm


Collecting faiss-cpu
  Downloading faiss_cpu-1.12.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.1 kB)
Downloading faiss_cpu-1.12.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (31.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.4/31.4 MB[0m [31m46.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.12.0


In [None]:
from transformers import AutoTokenizer, AutoModel
import torch
import numpy as np
from tqdm import tqdm

# Load fine-tuned model
model_path = "Retrieval Experiments/1.9/epoch_1"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

tokenizer = AutoTokenizer.from_pretrained(model_path)
encoder = AutoModel.from_pretrained(model_path).to(device)

# Convert texts to embeddings
doc_ids = list(itihasa_data.keys())
texts = list(itihasa_data.values())

all_embeddings = []

encoder.eval()
with torch.no_grad():
    for i in tqdm(range(0, len(texts), 32), desc="Encoding documents"):
        batch_texts = texts[i:i+32]
        inputs = tokenizer(batch_texts, padding=True, truncation=True, return_tensors="pt").to(device)
        outputs = encoder(**inputs)
        # Mean pooling
        emb = outputs.last_hidden_state.mean(dim=1)
        # Normalize
        emb = torch.nn.functional.normalize(emb, p=2, dim=1)
        all_embeddings.append(emb.cpu().numpy())

# Stack all batches
embeddings = np.vstack(all_embeddings)

# Save embeddings and IDs
np.save("Retrieval Experiments/1.9/embeddings_manual_mpnet_1.npy", embeddings)

print("✅ Saved embeddings:", embeddings.shape)


Encoding documents: 100%|██████████| 66/66 [01:27<00:00,  1.33s/it]


✅ Saved embeddings: (2108, 768)


In [12]:
from transformers import AutoTokenizer, AutoModel
import torch
import numpy as np
import faiss
from tqdm import tqdm
import json
import pickle
import pandas as pd

class EmbeddingSearchEvaluator:
    def __init__(self,
                 model_path="Retrieval Experiments/1.9/epoch_1",
                 emb_path="Retrieval Experiments/1.9/embeddings_manual_mpnet_1.npy",
                 ids_path="retrieval_modules_testing/embedding_models/doc_ids.pkl"):
        # Load fine-tuned model
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.tokenizer = AutoTokenizer.from_pretrained(model_path)
        self.encoder = AutoModel.from_pretrained(model_path).to(self.device)
        self.encoder.eval()

        # Load embeddings and IDs
        self.embeddings = np.load(emb_path)
        with open(ids_path, "rb") as f:
            self.doc_ids = pickle.load(f)
        print(f"✅ Loaded {len(self.embeddings)} embeddings.")

        # Normalize (if not already)
        faiss.normalize_L2(self.embeddings)

        # Build FAISS index
        dim = self.embeddings.shape[1]
        self.index = faiss.IndexFlatIP(dim)
        self.index.add(self.embeddings)

    def _encode_query(self, text):
        """Encode and normalize a single query string."""
        inputs = self.tokenizer([text], return_tensors="pt", truncation=True, padding=True).to(self.device)
        with torch.no_grad():
            outputs = self.encoder(**inputs)
            emb = outputs.last_hidden_state.mean(dim=1)
            emb = torch.nn.functional.normalize(emb, p=2, dim=1)
        return emb.cpu().numpy()

    def evaluate_full_scores(self, questions_dict):
        results = {}

        for q_id, q in tqdm(questions_dict.items(), desc="Evaluating questions"):
                # Encode query and normalize
                query_vec = self._encode_query(q['question'])
                D, I = self.index.search(query_vec, k=len(self.doc_ids))
                scores = D[0]
                ids = [self.doc_ids[i] for i in I[0]]
                results[q_id] = {doc_id: float(score) for doc_id, score in zip(ids, scores)}

        self.full_scores = results
        return results

    def save_full_scores(self, path="Retrieval Experiments/1.9/embedding_scores_1.json"):
        if not hasattr(self, "full_scores"):
            raise ValueError("Run evaluate_full_scores() first.")
        with open(path, "w", encoding="utf-8") as f:
            json.dump(self.full_scores, f, indent=4, ensure_ascii=False)
        print(f"✅ Saved question–chapter scores to {path}")

    def search(self, query, top_k=3):
        """Search for top_k most similar documents for a query string."""
        query_vec = self.model.encode([query], convert_to_numpy=True)
        faiss.normalize_L2(query_vec)
        D, I = self.index.search(query_vec, k=top_k)
        results = [
            {"rank": r + 1,
             "score": float(D[0][r]),
             "doc_id": self.doc_ids[I[0][r]]}
            for r in range(top_k)
        ]
        return results

    def evaluate_questions(self, questions_dict, top_k=3):
        rows = []
        for q_id, q in tqdm(questions_dict.items(), desc="Evaluating questions"):
                # Encode query and normalize
                query_vec = self._encode_query(q['question'])

                # Search top-k documents
                D, I = self.index.search(query_vec, k=top_k)
                retrieved_ids = [self.doc_ids[idx] for idx in I[0]]
                retrieved_scores = [float(score) for score in D[0]]

                # Compute ground-truth similarity score
                if true_id in self.doc_ids:
                    true_idx = self.doc_ids.index(true_id)
                    true_emb = self.embeddings[true_idx].reshape(1, -1)
                    correct_score = float(np.dot(query_vec, true_emb.T))
                else:
                    correct_score = None

                # Find rank if ground truth appears in top-k
                correct_in_top_k = true_id in retrieved_ids
                rank = retrieved_ids.index(true_id) + 1 if correct_in_top_k else None

                rows.append({
                    "query": q,
                    "ground_truth": true_id,
                    "top_ids": retrieved_ids,
                    "top_scores": retrieved_scores,
                    "correct_in_top_k": correct_in_top_k,
                    "rank_of_correct": rank,
                    "ground_truth_score": correct_score
                })

        self.results_df = pd.DataFrame(rows)
        return self.results_df

    # -------------------------------------------------------------------
    # 3️⃣ Compute Accuracy Metrics
    # -------------------------------------------------------------------
    def accuracy(self):
        """Compute Top-k retrieval accuracy (fraction of queries whose true doc appears in top-k)."""
        if not hasattr(self, "results_df"):
            raise ValueError("Run evaluate_questions() first.")
        return self.results_df["correct_in_top_k"].mean()

    # -------------------------------------------------------------------
    # 4️⃣ Save results as CSV
    # -------------------------------------------------------------------
    def save_results(self, path="retrieval_results.csv"):
        if hasattr(self, "results_df"):
            self.results_df.to_csv(path, index=False)
            print(f"✅ Saved retrieval results to {path}")
        else:
            print("⚠️ No results to save. Run evaluate_questions() first.")


In [13]:
# Initialize evaluator
evaluator = EmbeddingSearchEvaluator()
results = evaluator.evaluate_full_scores(mahabharata_questions)
evaluator.save_full_scores()

✅ Loaded 2108 embeddings.


Evaluating questions: 100%|██████████| 1536/1536 [00:19<00:00, 80.11it/s]


✅ Saved question–chapter scores to Retrieval Experiments/1.9/embedding_scores_1.json


In [14]:
ground_truth = {}
for qid, q in mahabharata_questions.items():
  ground_truth[qid] = q["ground_truth"]

In [15]:
import numpy as np
import math

def evaluate_retrieval(scores, ground_truth, top_k=5):
    """
    Evaluate retrieval performance using Precision@k, Recall@k, MRR, Hits@k, and nDCG@k.
    """
    hits = []
    reciprocal_ranks = []
    precision_at_k = []
    recall_at_k = []
    ndcg_at_k = []

    for qid, doc_scores in scores.items():
        if qid not in ground_truth:
            continue

        true_doc = ground_truth[qid]
        ranked_docs = sorted(doc_scores.items(), key=lambda x: x[1], reverse=True)
        ranked_doc_ids = [d for d, _ in ranked_docs]

        # Top-k
        topk_docs = ranked_doc_ids[:top_k]

        # Hit@k
        hit = 1 if true_doc in topk_docs else 0
        hits.append(hit)

        # Rank
        if true_doc in ranked_doc_ids:
            rank = ranked_doc_ids.index(true_doc) + 1
            reciprocal_ranks.append(1.0 / rank)
        else:
            reciprocal_ranks.append(0.0)
            rank = None

        # Precision@k and Recall@k
        retrieved_relevant = 1 if true_doc in topk_docs else 0
        precision = retrieved_relevant / top_k
        recall = retrieved_relevant / 1  # only one relevant doc
        precision_at_k.append(precision)
        recall_at_k.append(recall)

        # nDCG@k
        if rank and rank <= top_k:
            ndcg = 1 / math.log2(rank + 1)
        else:
            ndcg = 0.0
        ndcg_at_k.append(ndcg)

    metrics = {
        f"Precision@{top_k}": np.mean(precision_at_k),
        f"Recall@{top_k}": np.mean(recall_at_k),
        f"Hits@{top_k}": np.mean(hits),
        f"nDCG@{top_k}": np.mean(ndcg_at_k),
        "MRR": np.mean(reciprocal_ranks)
    }

    return metrics


In [16]:
scores = read_json('Retrieval Experiments/1.9/embedding_scores_1.json')
metrics = evaluate_retrieval(scores, ground_truth, top_k=3)
metrics

{'Precision@3': np.float64(0.09678819444444442),
 'Recall@3': np.float64(0.2903645833333333),
 'Hits@3': np.float64(0.2903645833333333),
 'nDCG@3': np.float64(0.23855858848121494),
 'MRR': np.float64(0.2600468996410918)}