In [1]:
cd /content/drive/MyDrive/GenAI Project/

/content/drive/MyDrive/GenAI Project


In [2]:
import json

def read_json(filepath):
    """Read a JSON file and return the parsed data.

    Args:
        filepath: Path of the JSON file to read.

    Returns:
        The Python object produced by ``json.load`` for the file's content.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's locale encoding (the default for text-mode open()).
    with open(filepath, 'r', encoding='utf-8') as f:
        return json.load(f)

def write_json(data, filepath):
    """Serialize ``data`` as JSON into ``filepath``.

    The file is opened in write mode (existing content is replaced) and the
    JSON is pretty-printed with a 4-space indent.
    """
    with open(filepath, mode='w') as json_file:
        json.dump(obj=data, fp=json_file, indent=4)

In [3]:
# Load the test questions; the file is read as a mapping of question id -> record.
mahabharata_questions =  read_json('Dataset/Test/questions.json')

# Keep only each question's "ground_truth" field, keyed by question id.
# This mapping is what the evaluation functions take as `ground_truth`.
ground_truth = {}
for qid, q in mahabharata_questions.items():
  ground_truth[qid] = q["ground_truth"]

In [None]:
# Display the question id -> ground-truth id mapping to sanity-check the load.
ground_truth

{'q1': 'M.1.1',
 'q2': 'M.1.1',
 'q3': 'M.1.2',
 'q4': 'M.1.2',
 'q5': 'M.1.3',
 'q6': 'M.1.3',
 'q7': 'M.1.5',
 'q8': 'M.1.5',
 'q9': 'M.1.6',
 'q10': 'M.1.6',
 'q11': 'M.1.7',
 'q12': 'M.1.7',
 'q13': 'M.1.8',
 'q14': 'M.1.8',
 'q15': 'M.1.9',
 'q16': 'M.1.9',
 'q17': 'M.1.10',
 'q18': 'M.1.10',
 'q19': 'M.1.11',
 'q20': 'M.1.11',
 'q21': 'M.1.12',
 'q22': 'M.1.12',
 'q23': 'M.1.13',
 'q24': 'M.1.13',
 'q25': 'M.1.14',
 'q26': 'M.1.14',
 'q27': 'M.1.23',
 'q28': 'M.1.23',
 'q29': 'M.1.24',
 'q30': 'M.1.24',
 'q31': 'M.1.25',
 'q32': 'M.1.25',
 'q33': 'M.1.26',
 'q34': 'M.1.26',
 'q35': 'M.1.27',
 'q36': 'M.1.27',
 'q37': 'M.1.28',
 'q38': 'M.1.28',
 'q39': 'M.1.29',
 'q40': 'M.1.29',
 'q41': 'M.1.72',
 'q42': 'M.1.72',
 'q43': 'M.1.73',
 'q44': 'M.1.73',
 'q45': 'M.1.74',
 'q46': 'M.1.74',
 'q47': 'M.1.118',
 'q48': 'M.1.118',
 'q49': 'M.1.119',
 'q50': 'M.1.119',
 'q51': 'M.1.121',
 'q52': 'M.1.121',
 'q53': 'M.1.126',
 'q54': 'M.1.126',
 'q55': 'M.1.127',
 'q56': 'M.1.127',
 'q57':

In [None]:
import numpy as np
import math

def evaluate_retrieval(scores, ground_truth, top_k=5):
    """
    Evaluate retrieval performance using Precision@k, Recall@k, MRR, Hits@k, and nDCG@k.

    Every query is treated as having exactly one relevant document, so
    Recall@k equals Hits@k and the ideal DCG is 1 (nDCG@k is simply
    ``1 / log2(rank + 1)`` when the relevant document is inside the cutoff).

    Args:
        scores: Mapping ``qid -> {doc_id: score}``; a higher score ranks
            earlier. Ties keep the dict's insertion order.
        ground_truth: Mapping ``qid -> relevant doc_id``. Queries present in
            ``scores`` but absent here are skipped.
        top_k: Cutoff for the @k metrics; must be >= 1.

    Returns:
        dict with keys ``Precision@{k}``, ``Recall@{k}``, ``Hits@{k}``,
        ``nDCG@{k}`` and ``MRR`` (means over the evaluated queries). MRR uses
        the full ranking, not the cutoff. Every value is nan when no query
        could be evaluated.

    Raises:
        ValueError: If ``top_k`` is smaller than 1.
    """
    # top_k == 0 would divide by zero below and a negative value would slice
    # from the wrong end, so reject both up front with a clear message.
    if top_k < 1:
        raise ValueError(f"top_k must be a positive integer, got {top_k!r}")

    def _mean(values):
        # np.mean([]) yields nan but also emits RuntimeWarnings; keep the nan
        # result for "nothing evaluated" without the warning noise.
        return np.mean(values) if values else np.float64("nan")

    hits = []
    reciprocal_ranks = []
    precision_at_k = []
    recall_at_k = []
    ndcg_at_k = []

    for qid, doc_scores in scores.items():
        if qid not in ground_truth:
            continue

        true_doc = ground_truth[qid]

        # Doc ids ordered by descending score (stable for ties).
        ranked_doc_ids = sorted(doc_scores, key=doc_scores.get, reverse=True)

        # 1-based rank of the relevant document, or None if it was never scored.
        try:
            rank = ranked_doc_ids.index(true_doc) + 1
        except ValueError:
            rank = None

        # Doc ids are unique, so "in the top-k slice" == "rank within cutoff".
        in_top_k = rank is not None and rank <= top_k
        hit = 1 if in_top_k else 0

        hits.append(hit)
        reciprocal_ranks.append(1.0 / rank if rank is not None else 0.0)
        precision_at_k.append(hit / top_k)
        recall_at_k.append(hit / 1)  # only one relevant doc
        ndcg_at_k.append(1 / math.log2(rank + 1) if in_top_k else 0.0)

    metrics = {
        f"Precision@{top_k}": _mean(precision_at_k),
        f"Recall@{top_k}": _mean(recall_at_k),
        f"Hits@{top_k}": _mean(hits),
        f"nDCG@{top_k}": _mean(ndcg_at_k),
        "MRR": _mean(reciprocal_ranks)
    }

    return metrics


In [None]:
# Top-3 metrics for the scores stored in multi-qa-mpnet-base-dot.json.
scores = read_json('retrieval_modules_testing/scores/multi-qa-mpnet-base-dot.json')
evaluate_retrieval(scores, ground_truth, top_k=3)


{'Precision@3': np.float64(0.11957465277777778),
 'Recall@3': np.float64(0.3587239583333333),
 'Hits@3': np.float64(0.3587239583333333),
 'nDCG@3': np.float64(0.30501116468099193),
 'MRR': np.float64(0.31989618080939924)}

In [None]:
# Top-5 metrics for the scores stored in multi-qa-mpnet-base-dot.json.
scores = read_json('retrieval_modules_testing/scores/multi-qa-mpnet-base-dot.json')
evaluate_retrieval(scores, ground_truth, top_k=5)

{'Precision@5': np.float64(0.08307291666666666),
 'Recall@5': np.float64(0.4153645833333333),
 'Hits@5': np.float64(0.4153645833333333),
 'nDCG@5': np.float64(0.3280925240649751),
 'MRR': np.float64(0.31989618080939924)}

In [None]:
# Top-3 metrics for the scores stored in ada_large.json.
scores = read_json('retrieval_modules_testing/scores/ada_large.json')
evaluate_retrieval(scores, ground_truth, top_k=3)

{'Precision@3': np.float64(0.20182291666666666),
 'Recall@3': np.float64(0.60546875),
 'Hits@3': np.float64(0.60546875),
 'nDCG@3': np.float64(0.5178851101795053),
 'MRR': np.float64(0.5236444713425317)}

In [None]:
# Top-5 metrics for the scores stored in ada_large.json.
scores = read_json('retrieval_modules_testing/scores/ada_large.json')
evaluate_retrieval(scores, ground_truth, top_k=5)

{'Precision@5': np.float64(0.13749999999999998),
 'Recall@5': np.float64(0.6875),
 'Hits@5': np.float64(0.6875),
 'nDCG@5': np.float64(0.5517304300191065),
 'MRR': np.float64(0.5236444713425317)}

In [None]:
# Top-3 metrics for the scores stored in ada_graph.json.
scores = read_json('retrieval_modules_testing/scores/ada_graph.json')
evaluate_retrieval(scores, ground_truth, top_k=3)

{'Precision@3': np.float64(0.20399305555555558),
 'Recall@3': np.float64(0.6119791666666666),
 'Hits@3': np.float64(0.6119791666666666),
 'nDCG@3': np.float64(0.5247908463378944),
 'MRR': np.float64(0.530957865258701)}

In [None]:
# Top-5 metrics for the scores stored in ada_graph.json.
scores = read_json('retrieval_modules_testing/scores/ada_graph.json')
evaluate_retrieval(scores, ground_truth, top_k=5)

{'Precision@5': np.float64(0.13880208333333333),
 'Recall@5': np.float64(0.6940104166666666),
 'Hits@5': np.float64(0.6940104166666666),
 'nDCG@5': np.float64(0.5587502905286384),
 'MRR': np.float64(0.530957865258701)}

In [None]:
# Top-5 metrics for the scores stored in mpnet_graph.json.
scores = read_json('retrieval_modules_testing/scores/mpnet_graph.json')
evaluate_retrieval(scores, ground_truth, top_k=5)

{'Precision@5': np.float64(0.09322916666666665),
 'Recall@5': np.float64(0.4661458333333333),
 'Hits@5': np.float64(0.4661458333333333),
 'nDCG@5': np.float64(0.3586656062390368),
 'MRR': np.float64(0.3446133606416084)}

In [None]:
# Top-3 metrics for the scores stored in mpnet_graph.json.
scores = read_json('retrieval_modules_testing/scores/mpnet_graph.json')
evaluate_retrieval(scores, ground_truth, top_k=3)

{'Precision@3': np.float64(0.1323784722222222),
 'Recall@3': np.float64(0.3971354166666667),
 'Hits@3': np.float64(0.3971354166666667),
 'nDCG@3': np.float64(0.33019980538039734),
 'MRR': np.float64(0.3446133606416084)}

In [4]:
from sklearn.cluster import KMeans
import numpy as np

def cluster_top_docs(scores, top_n=20, num_clusters=3):
    """
    For each query, take top-N docs and cluster their scores.
    Return the docs in the best-scoring cluster.

    Args:
        scores: Mapping ``qid -> {doc_id: score}``; higher score ranks earlier.
        top_n: Number of highest-scoring docs per query fed to the clustering.
        num_clusters: Requested number of KMeans clusters; capped at the
            number of docs available for the query.

    Returns:
        Tuple ``(clustered_results, cluster_sizes)``:
        ``clustered_results`` maps ``qid`` to the doc ids (in descending score
        order) that fall in the cluster with the highest mean score, and
        ``cluster_sizes`` maps ``qid`` to the length of that list. A query
        with no scored docs maps to ``[]`` / ``0``.
    """

    clustered_results = {}     # qid → filtered doc list
    cluster_sizes = {}         # qid → size of selected cluster

    for qid, doc_scores in scores.items():
        # Sort top N
        ranked = sorted(doc_scores.items(), key=lambda x: x[1], reverse=True)[:top_n]

        # Nothing to cluster: k would be 0, which KMeans does not accept.
        # Report an empty selection instead of aborting the whole run.
        if not ranked:
            clustered_results[qid] = []
            cluster_sizes[qid] = 0
            continue

        docs = [d for d, _ in ranked]
        # KMeans expects a 2-D array: one row per doc, a single score feature.
        vals = np.array([s for _, s in ranked]).reshape(-1, 1)

        # Edge case: if fewer docs than clusters
        k = min(num_clusters, len(vals))

        # KMeans clustering (fixed seed so repeated runs agree)
        kmeans = KMeans(n_clusters=k, random_state=0)
        labels = kmeans.fit_predict(vals)

        # Find best cluster = one with highest mean score. Only labels that
        # were actually assigned are considered: with duplicate scores some of
        # the k clusters can be empty, and the mean of an empty selection is
        # nan, which would make the max() comparison unreliable.
        best_label = max(np.unique(labels), key=lambda c: vals[labels == c].mean())

        # Docs belonging to that cluster (original ranking order preserved)
        best_docs = [doc for doc, label in zip(docs, labels) if label == best_label]

        clustered_results[qid] = best_docs
        cluster_sizes[qid] = len(best_docs)

    return clustered_results, cluster_sizes

import numpy as np
import math

def evaluate_cluster_retrieval(scores, clustered_docs, ground_truth):
    """
    Evaluate retrieval using the clustered results instead of top-k.

    Every query is treated as having exactly one relevant document.

    Args:
        scores: Mapping ``qid -> {doc_id: score}``; used to derive the full
            ranking (higher score ranks earlier, ties keep insertion order).
        clustered_docs: Mapping ``qid -> list of retrieved doc ids``. A query
            missing from this mapping counts as having retrieved nothing.
        ground_truth: Mapping ``qid -> relevant doc_id``. Queries present in
            ``scores`` but absent here are skipped.

    Returns:
        dict with keys ``Precision_cluster``, ``Recall_cluster``,
        ``Hits_cluster``, ``nDCG_cluster`` and ``MRR_cluster`` (means over the
        evaluated queries). ``MRR_cluster`` is computed on the full ranking
        and does not depend on ``clustered_docs``. ``nDCG_cluster`` only
        credits a query when the relevant document is in its retrieved set.
        Every value is nan when no query could be evaluated.
    """

    def _mean(values):
        # np.mean([]) yields nan but also emits RuntimeWarnings; keep the nan
        # result for "nothing evaluated" without the warning noise.
        return np.mean(values) if values else np.float64("nan")

    hits = []
    reciprocal_ranks = []
    precision_scores = []
    recall_scores = []
    ndcg_scores = []

    for qid, doc_scores in scores.items():

        if qid not in ground_truth:
            continue

        true_doc = ground_truth[qid]
        retrieved_docs = clustered_docs.get(qid, [])

        # full ranking for MRR and nDCG
        ranked_doc_ids = sorted(doc_scores, key=doc_scores.get, reverse=True)

        # 1-based rank of the relevant document, or None if it was never scored.
        try:
            rank = ranked_doc_ids.index(true_doc) + 1
        except ValueError:
            rank = None

        # Hit: relevant document is part of the retrieved cluster
        hit = 1 if true_doc in retrieved_docs else 0
        hits.append(hit)

        reciprocal_ranks.append(1.0 / rank if rank else 0.0)

        # Precision & Recall (precision is 0 when nothing was retrieved)
        precision_scores.append(hit / len(retrieved_docs) if retrieved_docs else 0)
        recall_scores.append(hit / 1)

        # nDCG@cluster: gain is only earned when the relevant document is in
        # the retrieved set, mirroring the cutoff check of a top-k nDCG.
        # Without this gate the metric would not depend on the clustering.
        if hit and rank:
            ndcg_scores.append(1 / math.log2(rank + 1))
        else:
            ndcg_scores.append(0.0)

    metrics = {
        "Precision_cluster": _mean(precision_scores),
        "Recall_cluster": _mean(recall_scores),
        "Hits_cluster": _mean(hits),
        "nDCG_cluster": _mean(ndcg_scores),
        "MRR_cluster": _mean(reciprocal_ranks)
    }

    return metrics


In [8]:

# Cluster evaluation for multi-qa-mpnet-base-dot.json: per query, split the
# 20 highest scores into 2 KMeans clusters, keep the higher-scoring cluster,
# and score that variable-size set against the ground truth.
scores = read_json('retrieval_modules_testing/scores/multi-qa-mpnet-base-dot.json')

clustered_docs, cluster_sizes = cluster_top_docs(scores, top_n=20, num_clusters=2)

metrics = evaluate_cluster_retrieval(
    scores=scores,
    clustered_docs=clustered_docs,
    ground_truth=ground_truth
)

print(cluster_sizes)
print(metrics)


{'q1': 4, 'q2': 5, 'q3': 1, 'q4': 6, 'q5': 4, 'q6': 7, 'q7': 4, 'q8': 5, 'q9': 6, 'q10': 3, 'q11': 5, 'q12': 4, 'q13': 1, 'q14': 6, 'q15': 3, 'q16': 2, 'q17': 7, 'q18': 7, 'q19': 6, 'q20': 3, 'q21': 3, 'q22': 7, 'q23': 2, 'q24': 3, 'q25': 3, 'q26': 3, 'q27': 5, 'q28': 7, 'q29': 6, 'q30': 5, 'q31': 8, 'q32': 3, 'q33': 5, 'q34': 3, 'q35': 5, 'q36': 4, 'q37': 6, 'q38': 3, 'q39': 2, 'q40': 7, 'q41': 5, 'q42': 3, 'q43': 1, 'q44': 3, 'q45': 4, 'q46': 8, 'q47': 7, 'q48': 4, 'q49': 4, 'q50': 6, 'q51': 4, 'q52': 4, 'q53': 2, 'q54': 8, 'q55': 1, 'q56': 4, 'q57': 7, 'q58': 6, 'q59': 3, 'q60': 3, 'q61': 6, 'q62': 4, 'q63': 8, 'q64': 5, 'q65': 2, 'q66': 7, 'q67': 8, 'q68': 7, 'q69': 3, 'q70': 3, 'q71': 3, 'q72': 5, 'q73': 5, 'q74': 4, 'q75': 3, 'q76': 5, 'q77': 8, 'q78': 6, 'q79': 1, 'q80': 10, 'q81': 7, 'q82': 10, 'q83': 10, 'q84': 5, 'q85': 4, 'q86': 3, 'q87': 5, 'q88': 7, 'q89': 5, 'q90': 3, 'q91': 2, 'q92': 6, 'q93': 10, 'q94': 5, 'q95': 7, 'q96': 1, 'q97': 3, 'q98': 6, 'q99': 3, 'q100': 3, 'q1

In [9]:
# Cluster evaluation for mpnet_graph.json: per query, split the 20 highest
# scores into 2 KMeans clusters, keep the higher-scoring cluster, and score
# that variable-size set against the ground truth.
scores = read_json('retrieval_modules_testing/scores/mpnet_graph.json')

clustered_docs, cluster_sizes = cluster_top_docs(scores, top_n=20, num_clusters=2)

metrics = evaluate_cluster_retrieval(
    scores=scores,
    clustered_docs=clustered_docs,
    ground_truth=ground_truth
)

print(cluster_sizes)
print(metrics)


{'q1': 5, 'q2': 5, 'q3': 1, 'q4': 7, 'q5': 4, 'q6': 7, 'q7': 6, 'q8': 5, 'q9': 4, 'q10': 3, 'q11': 7, 'q12': 4, 'q13': 1, 'q14': 6, 'q15': 3, 'q16': 2, 'q17': 5, 'q18': 6, 'q19': 6, 'q20': 3, 'q21': 1, 'q22': 7, 'q23': 4, 'q24': 5, 'q25': 7, 'q26': 7, 'q27': 5, 'q28': 6, 'q29': 4, 'q30': 6, 'q31': 8, 'q32': 3, 'q33': 3, 'q34': 4, 'q35': 6, 'q36': 7, 'q37': 4, 'q38': 4, 'q39': 2, 'q40': 5, 'q41': 4, 'q42': 5, 'q43': 1, 'q44': 4, 'q45': 4, 'q46': 3, 'q47': 4, 'q48': 4, 'q49': 4, 'q50': 6, 'q51': 4, 'q52': 5, 'q53': 4, 'q54': 5, 'q55': 4, 'q56': 6, 'q57': 9, 'q58': 5, 'q59': 3, 'q60': 3, 'q61': 6, 'q62': 4, 'q63': 12, 'q64': 5, 'q65': 5, 'q66': 7, 'q67': 7, 'q68': 7, 'q69': 1, 'q70': 5, 'q71': 7, 'q72': 5, 'q73': 5, 'q74': 4, 'q75': 3, 'q76': 5, 'q77': 8, 'q78': 6, 'q79': 6, 'q80': 10, 'q81': 7, 'q82': 9, 'q83': 12, 'q84': 5, 'q85': 5, 'q86': 8, 'q87': 9, 'q88': 7, 'q89': 5, 'q90': 5, 'q91': 2, 'q92': 6, 'q93': 10, 'q94': 5, 'q95': 7, 'q96': 1, 'q97': 3, 'q98': 6, 'q99': 6, 'q100': 3, 'q1

In [11]:
# Cluster evaluation for ada_large.json: per query, split the 20 highest
# scores into 2 KMeans clusters, keep the higher-scoring cluster, and score
# that variable-size set against the ground truth.
scores = read_json('retrieval_modules_testing/scores/ada_large.json')

clustered_docs, cluster_sizes = cluster_top_docs(scores, top_n=20, num_clusters=2)

metrics = evaluate_cluster_retrieval(
    scores=scores,
    clustered_docs=clustered_docs,
    ground_truth=ground_truth
)

print(cluster_sizes)
print(metrics)


{'q1': 9, 'q2': 6, 'q3': 1, 'q4': 2, 'q5': 5, 'q6': 7, 'q7': 5, 'q8': 8, 'q9': 6, 'q10': 2, 'q11': 4, 'q12': 6, 'q13': 2, 'q14': 3, 'q15': 2, 'q16': 4, 'q17': 3, 'q18': 3, 'q19': 6, 'q20': 3, 'q21': 4, 'q22': 4, 'q23': 2, 'q24': 4, 'q25': 4, 'q26': 4, 'q27': 7, 'q28': 7, 'q29': 9, 'q30': 7, 'q31': 2, 'q32': 6, 'q33': 3, 'q34': 3, 'q35': 1, 'q36': 11, 'q37': 9, 'q38': 3, 'q39': 6, 'q40': 10, 'q41': 4, 'q42': 3, 'q43': 4, 'q44': 4, 'q45': 4, 'q46': 5, 'q47': 2, 'q48': 4, 'q49': 3, 'q50': 4, 'q51': 4, 'q52': 2, 'q53': 11, 'q54': 1, 'q55': 2, 'q56': 8, 'q57': 2, 'q58': 15, 'q59': 7, 'q60': 3, 'q61': 5, 'q62': 4, 'q63': 2, 'q64': 7, 'q65': 3, 'q66': 4, 'q67': 5, 'q68': 4, 'q69': 1, 'q70': 4, 'q71': 3, 'q72': 2, 'q73': 4, 'q74': 1, 'q75': 3, 'q76': 4, 'q77': 4, 'q78': 4, 'q79': 5, 'q80': 4, 'q81': 6, 'q82': 4, 'q83': 6, 'q84': 6, 'q85': 3, 'q86': 8, 'q87': 4, 'q88': 4, 'q89': 3, 'q90': 6, 'q91': 4, 'q92': 7, 'q93': 4, 'q94': 5, 'q95': 7, 'q96': 3, 'q97': 5, 'q98': 1, 'q99': 6, 'q100': 2, 'q1

In [10]:
# Cluster evaluation for ada_graph.json: per query, split the 20 highest
# scores into 2 KMeans clusters, keep the higher-scoring cluster, and score
# that variable-size set against the ground truth.
scores = read_json('retrieval_modules_testing/scores/ada_graph.json')

clustered_docs, cluster_sizes = cluster_top_docs(scores, top_n=20, num_clusters=2)

metrics = evaluate_cluster_retrieval(
    scores=scores,
    clustered_docs=clustered_docs,
    ground_truth=ground_truth
)

print(cluster_sizes)
print(metrics)


{'q1': 6, 'q2': 8, 'q3': 6, 'q4': 6, 'q5': 7, 'q6': 6, 'q7': 5, 'q8': 8, 'q9': 7, 'q10': 2, 'q11': 3, 'q12': 6, 'q13': 3, 'q14': 3, 'q15': 2, 'q16': 4, 'q17': 4, 'q18': 3, 'q19': 6, 'q20': 3, 'q21': 2, 'q22': 4, 'q23': 9, 'q24': 7, 'q25': 11, 'q26': 9, 'q27': 7, 'q28': 7, 'q29': 10, 'q30': 8, 'q31': 2, 'q32': 6, 'q33': 5, 'q34': 7, 'q35': 1, 'q36': 15, 'q37': 11, 'q38': 4, 'q39': 6, 'q40': 11, 'q41': 4, 'q42': 4, 'q43': 4, 'q44': 5, 'q45': 5, 'q46': 5, 'q47': 3, 'q48': 4, 'q49': 3, 'q50': 4, 'q51': 3, 'q52': 1, 'q53': 9, 'q54': 1, 'q55': 4, 'q56': 8, 'q57': 2, 'q58': 10, 'q59': 6, 'q60': 9, 'q61': 6, 'q62': 3, 'q63': 3, 'q64': 7, 'q65': 1, 'q66': 4, 'q67': 5, 'q68': 4, 'q69': 1, 'q70': 4, 'q71': 3, 'q72': 4, 'q73': 4, 'q74': 1, 'q75': 3, 'q76': 4, 'q77': 4, 'q78': 4, 'q79': 5, 'q80': 4, 'q81': 6, 'q82': 4, 'q83': 8, 'q84': 6, 'q85': 4, 'q86': 8, 'q87': 4, 'q88': 4, 'q89': 3, 'q90': 3, 'q91': 4, 'q92': 7, 'q93': 4, 'q94': 5, 'q95': 7, 'q96': 1, 'q97': 5, 'q98': 1, 'q99': 5, 'q100': 2, '