In [1]:
cd /content/drive/MyDrive/GenAI Project/

/content/drive/MyDrive/GenAI Project


In [2]:
import json

def read_json(filepath):
    """Read a JSON file and return the parsed data.

    Args:
        filepath: Path of the JSON file to read.

    Returns:
        The deserialized Python object produced by ``json.load``.
    """
    # Decode explicitly as UTF-8 so non-ASCII content does not depend on the
    # platform's default locale encoding.
    with open(filepath, 'r', encoding='utf-8') as f:
        return json.load(f)

def write_json(data, filepath):
    """Serialize ``data`` as 4-space-indented JSON into ``filepath``, overwriting it."""
    encoder = json.JSONEncoder(indent=4)
    with open(filepath, 'w') as out:
        # Stream the encoder's chunks to the file one by one.
        for chunk in encoder.iterencode(data):
            out.write(chunk)

In [3]:
chapters = read_json('Dataset/Corpus/chapters.json')
mahabharata_questions =  read_json('Dataset/Test/questions.json')

In [None]:
!pip install sentence-transformers faiss-cpu tqdm


Collecting faiss-cpu
  Downloading faiss_cpu-1.12.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.1 kB)
Downloading faiss_cpu-1.12.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (31.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.4/31.4 MB[0m [31m24.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.12.0


### Naive

In [None]:
import numpy as np
import pickle
import faiss
import pandas as pd
import json
from sentence_transformers import SentenceTransformer
from tqdm import tqdm


class EmbeddingSearchEvaluator:
    """Score questions against a precomputed document-embedding matrix.

    Loads a SentenceTransformer model, a saved embedding matrix and the
    parallel list of document IDs, L2-normalizes the embeddings and puts them
    in a flat inner-product FAISS index, so the scores returned by the index
    are cosine similarities.

    Attributes:
        model: The SentenceTransformer used to encode questions.
        embeddings: L2-normalized float32 document embeddings.
        doc_ids: Document IDs; ``doc_ids[i]`` labels ``embeddings[i]``.
        index: FAISS ``IndexFlatIP`` holding ``embeddings``.
        full_scores: Result of the last ``evaluate_full_scores`` call, or None.
    """

    def __init__(self,
                 model_name="sentence-transformers/multi-qa-mpnet-base-dot-v1",
                 emb_path="embeddings.npy",
                 ids_path="doc_ids.pkl"):
        """Load the model, embeddings and IDs, then build the FAISS index.

        Args:
            model_name: SentenceTransformer model name or path.
            emb_path: ``.npy`` file holding the document embedding matrix.
            ids_path: Pickle file holding the list of document IDs.

        Raises:
            ValueError: If the number of IDs differs from the number of
                embedding rows (scores would be attributed to wrong documents).
        """
        # Load model and data
        self.model = SentenceTransformer(model_name)
        # FAISS's Python API works on C-contiguous float32 arrays; coerce here
        # so a float64 or non-contiguous .npy file does not break normalize/add.
        self.embeddings = np.ascontiguousarray(np.load(emb_path), dtype=np.float32)
        # NOTE(review): pickle.load can execute arbitrary code; only point
        # ids_path at files produced by this project.
        with open(ids_path, "rb") as f:
            self.doc_ids = pickle.load(f)
        if len(self.doc_ids) != len(self.embeddings):
            raise ValueError(
                f"doc_ids has {len(self.doc_ids)} entries but embeddings has "
                f"{len(self.embeddings)} rows; they must be parallel."
            )
        print(f"✅ Loaded {len(self.embeddings)} embeddings.")

        # Normalize for cosine/dot similarity (in place)
        faiss.normalize_L2(self.embeddings)

        # Build FAISS index
        dim = self.embeddings.shape[1]
        self.index = faiss.IndexFlatIP(dim)
        self.index.add(self.embeddings)

        # Filled in by evaluate_full_scores(); None means "not evaluated yet".
        self.full_scores = None

    def evaluate_full_scores(self, questions_dict):
        """Compute the similarity of every question to every document.

        Args:
            questions_dict: Mapping of question id -> dict containing the
                question text under the ``'question'`` key.

        Returns:
            dict[question_id][doc_id] = similarity score (float). The result
            is also stored on ``self.full_scores``.
        """
        results = {}
        n_docs = len(self.doc_ids)

        for q_id, q in tqdm(questions_dict.items(), desc="Evaluating questions"):
            # Encode and normalize query
            query_vec = self.model.encode([q['question']], convert_to_numpy=True)
            query_vec = np.ascontiguousarray(query_vec, dtype=np.float32)
            faiss.normalize_L2(query_vec)

            # Ask for as many neighbours as there are documents, i.e. score
            # the query against the whole corpus.
            D, I = self.index.search(query_vec, k=n_docs)

            # Map doc IDs -> similarity scores. FAISS pads missing results
            # with index -1; skip those rather than hitting doc_ids[-1].
            results[q_id] = {
                self.doc_ids[i]: float(score)
                for i, score in zip(I[0], D[0])
                if i >= 0
            }

        self.full_scores = results
        return results

    def save_full_scores(self, path="question_scores.json"):
        """Save the full score dictionary to JSON.

        Args:
            path: Destination file.

        Raises:
            ValueError: If ``evaluate_full_scores`` has not been run yet.
        """
        if getattr(self, "full_scores", None) is None:
            raise ValueError("Run evaluate_full_scores() first.")
        with open(path, "w", encoding="utf-8") as f:
            json.dump(self.full_scores, f, indent=4, ensure_ascii=False)
        print(f"✅ Saved question–chapter scores to {path}")


In [None]:
# Initialize evaluator
evaluator = EmbeddingSearchEvaluator(
    model_name="sentence-transformers/multi-qa-mpnet-base-dot-v1",
    emb_path="retrieval_modules_testing/embedding_models/embeddings_multi_mp.npy",
    ids_path="retrieval_modules_testing/embedding_models/doc_ids.pkl"
)

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/212 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

✅ Loaded 2108 embeddings.


In [None]:
# Score every test question against all documents with the evaluator built in
# the previous cell (it was constructed with multi-qa-mpnet-base-dot-v1, not
# msmarco-distilbert-dot-v5), then persist the scores as JSON.
results = evaluator.evaluate_full_scores(mahabharata_questions)
evaluator.save_full_scores("retrieval_modules_testing/scores/multi-qa-mpnet-base-dot.json")


Evaluating questions: 100%|██████████| 1536/1536 [03:45<00:00,  6.80it/s]


✅ Saved question–chapter scores to retrieval_modules_testing/scores/multi-qa-mpnet-base-dot.json


In [None]:
from google.colab import userdata
import os
os.environ['OPENAI_API_KEY'] = userdata.get('OPENAI_API_KEY')

In [None]:
import numpy as np
import pickle
import faiss
import pandas as pd
from tqdm import tqdm
from openai import OpenAI

class OpenAIEmbeddingSearchEvaluator:
    """Retrieve and score documents using OpenAI query embeddings.

    Document embeddings, their IDs and a FAISS index are loaded from disk;
    queries are embedded on the fly through the OpenAI embeddings endpoint.

    NOTE(review): ``search`` uses the FAISS index read from ``index_path``,
    whereas ``evaluate_questions`` uses the L2-normalized in-memory
    ``embeddings``. The two only produce comparable scores if the index was
    built from normalized vectors in the same order as ``doc_ids`` — confirm.

    Attributes:
        client: OpenAI client used to embed queries.
        model_name: Name of the OpenAI embedding model.
        embeddings: L2-normalized float32 document embeddings.
        doc_ids: Document IDs; ``doc_ids[i]`` labels ``embeddings[i]``.
        index: FAISS index loaded from ``index_path``.
        all_scores_dict: Result of the last ``evaluate_questions`` call
            (empty until it has been run).
    """

    def __init__(self,
                 model_name="text-embedding-3-large",
                 emb_path="embeddings.npy",
                 ids_path="doc_ids.pkl",
                 index_path="corpus.index"):
        """Load embeddings, IDs and the FAISS index.

        Args:
            model_name: OpenAI embedding model used for queries.
            emb_path: ``.npy`` file with the document embedding matrix.
            ids_path: Pickle file with the list of document IDs.
            index_path: Serialized FAISS index file.

        Raises:
            ValueError: If the number of IDs differs from the number of
                embedding rows.
        """
        self.client = OpenAI()
        self.model_name = model_name

        # Load embeddings and metadata. FAISS's Python API works on
        # C-contiguous float32 arrays, so coerce once here.
        self.embeddings = np.ascontiguousarray(np.load(emb_path), dtype=np.float32)
        # NOTE(review): pickle.load can execute arbitrary code; only load
        # files produced by this project.
        with open(ids_path, "rb") as f:
            self.doc_ids = pickle.load(f)
        if len(self.doc_ids) != len(self.embeddings):
            raise ValueError(
                f"doc_ids has {len(self.doc_ids)} entries but embeddings has "
                f"{len(self.embeddings)} rows; they must be parallel."
            )
        self.index = faiss.read_index(index_path)
        print(f"Loaded {len(self.embeddings)} embeddings with dimension {self.embeddings.shape[1]}")

        # Normalize (in place) so dot products in evaluate_questions are cosines
        faiss.normalize_L2(self.embeddings)

        # Defined up front so reading it before evaluate_questions() yields an
        # empty dict instead of an AttributeError.
        self.all_scores_dict = {}

    # -----------------------------
    # 🔍 Search
    # -----------------------------
    def search(self, query, top_k=5):
        """Search for the top_k most similar documents for a query string.

        Args:
            query: Query text.
            top_k: Maximum number of hits to return.

        Returns:
            List of ``{"rank", "score", "doc_id"}`` dicts, best first. Fewer
            than ``top_k`` entries are returned when the index holds fewer
            vectors.
        """
        q_emb = self._embed_query(query)
        faiss.normalize_L2(q_emb)
        D, I = self.index.search(q_emb, k=top_k)

        results = []
        for score, idx in zip(D[0], I[0]):
            # FAISS pads missing neighbours with index -1; indexing doc_ids
            # with it would silently return the last document.
            if idx < 0:
                continue
            results.append({
                "rank": len(results) + 1,
                "score": float(score),
                "doc_id": self.doc_ids[idx],
            })
        return results

    # -----------------------------
    # 🧠 Evaluation
    # -----------------------------
    def evaluate_questions(self, questions_dict, save_query_embs=True,
                           query_emb_path="retrieval_modules_testing/embedding_models/query_embeddings.npy",
                           query_texts_path="retrieval_modules_testing/embedding_models/query_texts.pkl"):
        """Score every question against every document.

        Args:
            questions_dict: Mapping of question id -> dict containing the
                question text under the ``'question'`` key.
            save_query_embs: When True, also persist the query embeddings and
                their texts. Embeddings are keyed by question text, so
                questions with identical text are stored once.
            query_emb_path: Destination ``.npy`` file for query embeddings.
            query_texts_path: Destination pickle for the matching texts.

        Returns:
            dict[question_id][doc_id] = similarity (float); also stored on
            ``self.all_scores_dict``.
        """
        self.all_scores_dict = {}
        query_emb_store = {}  # question text -> normalized query embedding

        for qid, ques in tqdm(questions_dict.items()):
            q = ques['question']
            q_emb = self._embed_query(q)
            faiss.normalize_L2(q_emb)

            # Save embedding
            if save_query_embs:
                query_emb_store[q] = q_emb.flatten().astype("float32")

            sims = np.dot(q_emb, self.embeddings.T).flatten()
            self.all_scores_dict[qid] = {
                doc_id: float(sim) for doc_id, sim in zip(self.doc_ids, sims)
            }

        # ✅ Save query embeddings to disk (np.stack fails on an empty list,
        # so skip the write when nothing was embedded).
        if save_query_embs and query_emb_store:
            np.save(query_emb_path, np.stack(list(query_emb_store.values())))
            with open(query_texts_path, "wb") as f:
                pickle.dump(list(query_emb_store.keys()), f)
            print(f"💾 Saved {len(query_emb_store)} query embeddings to {query_emb_path} and {query_texts_path}")

        return self.all_scores_dict

    # -----------------------------
    # 🧩 Internal: embed query
    # -----------------------------
    def _embed_query(self, query):
        """Embed one query string; returns a float32 array of shape (1, dim)."""
        resp = self.client.embeddings.create(input=[query], model=self.model_name)
        q_emb = np.array(resp.data[0].embedding, dtype="float32").reshape(1, -1)
        return q_emb


In [None]:
# Build the OpenAI-based retriever from the saved text-embedding-3-large
# document embeddings, their IDs and the serialized FAISS index.
retriever = OpenAIEmbeddingSearchEvaluator(
    model_name="text-embedding-3-large",
    emb_path="retrieval_modules_testing/embedding_models/embeddings_ada_large.npy",
    ids_path="retrieval_modules_testing/embedding_models/doc_ids.pkl",
    index_path="retrieval_modules_testing/embedding_models/corpus.index"
)


# NOTE(review): later cells read retriever.all_scores_dict, which is only
# created by evaluate_questions(). With the call below commented out, the
# notebook does not survive Restart & Run All — re-enable it before a full run.
# all_scores_dict = retriever.evaluate_questions(mahabharata_questions)

Loaded 2108 embeddings with dimension 3072


In [None]:
results = retriever.search(
    """After escaping the fire at Varanavata, Pandava brothers were living in disguise. During this period they heard about the Swayamvar of King Drupada's daughter being held in Panchala and decided to attend the ceremony.
This was one of the most crucial events in the history of Mahabharata paving the way for re-emergence of Pandava into public life and many events that followed.
This ceremony is also quoted by many for the treatment of Karna, but what exactly happened? Is there any merit in the argument that Karna had to face discrimination because of being a Suta?
""",
    top_k=5
)
results

[{'rank': 1, 'score': 0.6601585745811462, 'doc_id': 'M.12.4'},
 {'rank': 2, 'score': 0.647534966468811, 'doc_id': 'M.3.309'},
 {'rank': 3, 'score': 0.6470394134521484, 'doc_id': 'M.1.111'},
 {'rank': 4, 'score': 0.640983521938324, 'doc_id': 'M.1.185'},
 {'rank': 5, 'score': 0.6408836245536804, 'doc_id': 'M.1.189'}]

In [None]:
len(retriever.all_scores_dict)

1536

In [None]:
write_json(retriever.all_scores_dict, "retrieval_modules_testing/scores/ada_large.json")


### Fuse scores

#### Entities

In [4]:
entity_index = read_json('Dataset/Corpus/entity_index.json')
mahabharata_questions = read_json('Dataset/Test/questions.json')
entities_kb = read_json('Dataset/Corpus/entities_kb.json')
chapter_entity_ids = read_json('Dataset/Corpus/chapter_entity_ids.json')


In [None]:
chapter_entity_ids

{'1.1': {'e12207': 5,
  'e4679': 5,
  'e7587': 2,
  'e9540': 1,
  'e2579': 1,
  'e7498': 2,
  'e8267': 1,
  'e9817': 11,
  'e11327': 4,
  'e4777': 3,
  'e6721': 16,
  'e6276': 1,
  'e6239': 3,
  'e7969': 15,
  'e1743': 5,
  'e8769': 9,
  'e10362': 2,
  'e1782': 2,
  'e11773': 13,
  'e12416': 9,
  'e1697': 1,
  'e9365': 1,
  'e4618': 6,
  'e5907': 23,
  'e8806': 2,
  'e12635': 4,
  'e2661': 3,
  'e11959': 1,
  'e123': 1,
  'e1741': 9,
  'e177': 1,
  'e3015': 1,
  'e7118': 2,
  'e12536': 2,
  'e4285': 1,
  'e7849': 1,
  'e8386': 2,
  'e8699': 1,
  'e9215': 1,
  'e1899': 1,
  'e10478': 3,
  'e11755': 3,
  'e1924': 1,
  'e9137': 1,
  'e10113': 1,
  'e2525': 1,
  'e2920': 1,
  'e6950': 1,
  'e9268': 1,
  'e12461': 1,
  'e12607': 1,
  'e1329': 1,
  'e6414': 1,
  'e12613': 1,
  'e11672': 1,
  'e900': 1,
  'e11176': 3,
  'e11782': 1,
  'e12531': 1,
  'e9135': 1,
  'e9342': 1,
  'e7783': 1,
  'e7810': 2,
  'e4028': 4,
  'e2802': 3,
  'e9460': 70,
  'e903': 1,
  'e9195': 1,
  'e9353': 1,
  'e110

In [None]:
# Prefix chapter keys with "M." (e.g. '1.1' -> 'M.1.1'); the retrieval outputs
# in this notebook use IDs of that form. Keys that already carry the prefix
# are left alone so re-running this cell cannot produce 'M.M.1.1'.
chapter_entity_ids = {
    (_id if _id.startswith("M.") else f"M.{_id}"): v
    for _id, v in chapter_entity_ids.items()
}
chapter_entity_ids

{'M.1.1': {'e12207': 5,
  'e4679': 5,
  'e7587': 2,
  'e9540': 1,
  'e2579': 1,
  'e7498': 2,
  'e8267': 1,
  'e9817': 11,
  'e11327': 4,
  'e4777': 3,
  'e6721': 16,
  'e6276': 1,
  'e6239': 3,
  'e7969': 15,
  'e1743': 5,
  'e8769': 9,
  'e10362': 2,
  'e1782': 2,
  'e11773': 13,
  'e12416': 9,
  'e1697': 1,
  'e9365': 1,
  'e4618': 6,
  'e5907': 23,
  'e8806': 2,
  'e12635': 4,
  'e2661': 3,
  'e11959': 1,
  'e123': 1,
  'e1741': 9,
  'e177': 1,
  'e3015': 1,
  'e7118': 2,
  'e12536': 2,
  'e4285': 1,
  'e7849': 1,
  'e8386': 2,
  'e8699': 1,
  'e9215': 1,
  'e1899': 1,
  'e10478': 3,
  'e11755': 3,
  'e1924': 1,
  'e9137': 1,
  'e10113': 1,
  'e2525': 1,
  'e2920': 1,
  'e6950': 1,
  'e9268': 1,
  'e12461': 1,
  'e12607': 1,
  'e1329': 1,
  'e6414': 1,
  'e12613': 1,
  'e11672': 1,
  'e900': 1,
  'e11176': 3,
  'e11782': 1,
  'e12531': 1,
  'e9135': 1,
  'e9342': 1,
  'e7783': 1,
  'e7810': 2,
  'e4028': 4,
  'e2802': 3,
  'e9460': 70,
  'e903': 1,
  'e9195': 1,
  'e9353': 1,
  'e1

In [5]:
!pip install RapidFuzz

Collecting RapidFuzz
  Downloading rapidfuzz-3.14.3-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (12 kB)
Downloading rapidfuzz-3.14.3-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (3.2 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/3.2 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.7/3.2 MB[0m [31m21.7 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m3.2/3.2 MB[0m [31m60.1 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.2/3.2 MB[0m [31m41.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: RapidFuzz
Successfully installed RapidFuzz-3.14.3


In [6]:
from rapidfuzz import fuzz, process
from collections import defaultdict

class ProcessQuery:
    """Fuzzy-match the words of a query against an entity-name index.

    Attributes:
        query: The raw query text.
        entity_index: Mapping of entity name -> iterable of entity IDs
            (presumably a list; verify against the index file).
        query_entities: Mapping of entity ID -> fuzzy match score, filled in
            by ``score_query_entities``.
    """

    def __init__(self, query, entity_index):
        self.query = query
        self.entity_index = entity_index
        # Materialize the candidate names once instead of once per query word.
        self._entity_names = list(entity_index.keys())
        self.query_entities = {}

    def extract_query_entities(self, query_word, threshold=90):
        """Return the index entries whose name fuzzily matches ``query_word``.

        Args:
            query_word: A single word of the query.
            threshold: Minimum ``token_sort_ratio`` score for a match to count.

        Returns:
            ``(matched_entities, scores)``: two parallel lists, where
            ``matched_entities[i]`` is the index value (entity IDs) of the
            i-th accepted match and ``scores[i]`` its score. At most 5
            candidates are considered.
        """
        matches = process.extract(
            query_word, self._entity_names, scorer=fuzz.token_sort_ratio, limit=5
        )
        matched_entities = []
        scores = []
        for _match, score, _ in matches:
            if float(score) >= threshold:
                matched_entities.append(self.entity_index[_match])
                scores.append(score)
        return matched_entities, scores

    def score_query_entities(self):
        """Match every whitespace-separated, lower-cased query word.

        Each entity ID receives the score of the match it came from (the
        previous code gave every entity the score of the word's first match).
        If an ID is matched more than once, its highest score is kept.

        Returns:
            The ``query_entities`` dict (entity ID -> score), which is also
            stored on the instance.
        """
        self.query_entities = {}
        for query_word in self.query.split():
            entities, scores = self.extract_query_entities(query_word.lower())
            for entity_ids, score in zip(entities, scores):
                for entity_id in entity_ids:
                    if score > self.query_entities.get(entity_id, float("-inf")):
                        self.query_entities[entity_id] = score
        return self.query_entities



In [7]:
query =  """What is the story of Prahlada and Bali told by Draupadi to Yudhishtira
"""
p = ProcessQuery(query, entity_index)
p.score_query_entities()

p.query_entities

{'e8487': 100.0,
 'e8488': 100.0,
 'e8489': 100.0,
 'e8490': 100.0,
 'e1197': 100.0,
 'e1198': 100.0,
 'e1199': 100.0,
 'e3659': 100.0,
 'e3660': 100.0,
 'e12633': 95.65217391304348,
 'e12585': 95.65217391304348,
 'e12586': 95.65217391304348,
 'e12587': 95.65217391304348}

In [None]:
import pandas as pd

# Fuzzy-match every test question against the entity index and collect one
# record per question: its ground truth plus the matched entity IDs and scores
# (two parallel lists in the same order).
data = []

for qid, ques in mahabharata_questions.items():
    q = ques['question']
    p = ProcessQuery(q, entity_index)
    p.score_query_entities()
    # The former `entities_names` lookup into entities_kb was never used and
    # could raise KeyError for IDs missing from the knowledge base; removed.
    data.append({
          "qid": qid,
          "ground_truth": ques["ground_truth"],
          "question": q,
          "entity_ids": list(p.query_entities.keys()),
          "entity_scores": list(p.query_entities.values())
    })

df = pd.DataFrame(data)

In [None]:
query =  """After escaping the fire at Varanavata, Pandava brothers were living in disguise. During this period they heard about the Swayamvar of King Drupada's daughter being held in Panchala and decided to attend the ceremony.
This was one of the most crucial events in the history of Mahabharata paving the way for re-emergence of Pandava into public life and many events that followed.
This ceremony is also quoted by many for the treatment of Karna, but what exactly happened? Is there any merit in the argument that Karna had to face discrimination because of being a Suta?
"""
p = ProcessQuery(query, entity_index)
p.score_query_entities()

p.query_entities

{'e11566': 95.23809523809523,
 'e7969': 100.0,
 'e7970': 100.0,
 'e7971': 100.0,
 'e7972': 100.0,
 'e7973': 100.0,
 'e7974': 100.0,
 'e7975': 100.0,
 'e7976': 100.0,
 'e7900': 94.11764705882352,
 'e7901': 94.11764705882352,
 'e7902': 94.11764705882352,
 'e7903': 94.11764705882352,
 'e7904': 94.11764705882352,
 'e7905': 94.11764705882352,
 'e7906': 94.11764705882352,
 'e6721': 100.0,
 'e5432': 100.0,
 'e5433': 100.0,
 'e5401': 100.0,
 'e5402': 100.0,
 'e5403': 100.0,
 'e5404': 100.0}

In [None]:
# Rank chapters for the example query above using its matched entity IDs and
# fuzzy scores.
# NOTE(review): chapter_similarity_propagation is defined in a later cell, so
# this cell fails under Restart & Run All; move it below the definitions.
topk, final_scores = chapter_similarity_propagation(chapter_entity_ids, list(p.query_entities.keys()), list(p.query_entities.values()))

In [None]:
topk, final_scores

(['M.18.2', 'M.4.30', 'M.18.4'],
 {'M.18.2': 1.0,
  'M.4.30': 1.0,
  'M.18.4': 1.0,
  'M.4.26': 1.0,
  'M.18.5': 1.0,
  'M.4.52': 1.0,
  'M.4.51': 1.0,
  'M.1.1': 1.0,
  'M.6.109': 1.0,
  'M.6.110': 1.0,
  'M.6.124': 1.0,
  'M.7.1': 1.0,
  'M.6.97': 1.0,
  'M.6.98': 1.0,
  'M.6.99': 1.0,
  'M.6.100': 1.0,
  'M.3.86': 1.0,
  'M.3.91': 1.0,
  'M.6.106': 1.0,
  'M.7.2': 1.0,
  'M.7.4': 1.0,
  'M.10.9': 1.0,
  'M.10.10': 1.0,
  'M.11.1': 1.0,
  'M.4.60': 1.0,
  'M.4.63': 1.0,
  'M.3.241': 1.0,
  'M.3.247': 1.0,
  'M.3.249': 1.0,
  'M.3.250': 1.0,
  'M.3.251': 1.0,
  'M.11.21': 1.0,
  'M.11.27': 1.0,
  'M.12.1': 1.0,
  'M.12.2': 1.0,
  'M.12.3': 1.0,
  'M.5.2': 1.0,
  'M.5.3': 1.0,
  'M.5.8': 1.0,
  'M.3.252': 1.0,
  'M.3.253': 1.0,
  'M.3.254': 1.0,
  'M.3.255': 1.0,
  'M.3.256': 1.0,
  'M.4.54': 1.0,
  'M.4.55': 1.0,
  'M.4.59': 1.0,
  'M.1.61': 1.0,
  'M.1.62': 1.0,
  'M.11.16': 1.0,
  'M.11.18': 1.0,
  'M.9.32': 1.0,
  'M.9.33': 1.0,
  'M.9.7': 1.0,
  'M.9.8': 1.0,
  'M.9.9': 1.0,
  'M.

In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics.pairwise import cosine_similarity


def build_entity_matrix(ch_entities):
    """Create raw and normalized entity frequency matrices (chapters × entities).

    Args:
        ch_entities: Mapping of chapter id -> {entity id: frequency}.

    Returns:
        ``(df, df_norm, all_entities)`` where ``df`` holds the raw
        frequencies (0 where an entity does not occur in a chapter),
        ``df_norm`` holds each row divided by its row sum (all-zero rows stay
        0), and ``all_entities`` is the sorted list of entity ids used as
        columns.
    """
    chapters = list(ch_entities.keys())
    all_entities = sorted({e for ents in ch_entities.values() for e in ents})
    column_of = {ent: j for j, ent in enumerate(all_entities)}

    # Gather (row, column, value) triples and fill the array in one shot;
    # writing cell by cell through df.loc is orders of magnitude slower.
    rows, cols, freqs = [], [], []
    for i, ents in enumerate(ch_entities.values()):
        for ent, freq in ents.items():
            rows.append(i)
            cols.append(column_of[ent])
            freqs.append(freq)

    freqs = np.asarray(freqs)
    # Integer counts stay int64; float frequencies promote the matrix to float.
    dtype = np.result_type(np.int64, freqs.dtype) if freqs.size else np.int64
    counts = np.zeros((len(chapters), len(all_entities)), dtype=dtype)
    if rows:
        counts[rows, cols] = freqs
    df = pd.DataFrame(counts, index=chapters, columns=all_entities)

    # Normalize per chapter (term frequency normalization); chapters without
    # entities give 0/0 = NaN, which is mapped back to 0.
    df_norm = df.div(df.sum(axis=1), axis=0).fillna(0)
    return df, df_norm, all_entities

df_entities, df_norm, all_entities = build_entity_matrix(chapter_entity_ids)

def compute_tfidf(df_norm):
    """Fit a TF-IDF transform on the frequency table and return the weighted matrix.

    The transformer is created with ``norm=None`` and applied to the
    underlying values of ``df_norm``.
    """
    return TfidfTransformer(norm=None).fit_transform(df_norm.values)


def get_valid_query_indices(query_entities, query_weights, all_entities):
    """Return indices and weights of query entities that exist in the corpus.

    Args:
        query_entities: Entity ids extracted from the query.
        query_weights: Weights parallel to ``query_entities``.
        all_entities: List of corpus entity ids; positions are column indices.

    Returns:
        ``(indices, weights)``: a list of positions in ``all_entities`` and a
        numpy array of the matching weights, in query order. Returns
        ``(None, None)`` when no query entity occurs in the corpus.
    """
    # One pass to build an id -> position lookup (first occurrence wins, like
    # list.index) instead of a linear scan of the list per query entity.
    position = {}
    for idx, ent in enumerate(all_entities):
        position.setdefault(ent, idx)

    valid_pairs = [
        (position[e], w)
        for e, w in zip(query_entities, query_weights)
        if e in position
    ]
    if not valid_pairs:
        return None, None
    query_indices, valid_weights = zip(*valid_pairs)
    return list(query_indices), np.array(valid_weights)

def group_consecutive_entities(entities):
    """Split an entity-id list into runs of consecutive numeric ids.

    The number is read from everything after the first character of each id
    (e.g. ``'e7969'`` -> 7969). Adjacent ids whose numbers differ by exactly
    +1 are placed in the same group.

    Args:
        entities: List of entity ids, in the order they should be scanned.

    Returns:
        List of groups (lists of ids); an empty list for empty input.
    """
    if not entities:
        # Nothing to group; previously this raised IndexError on entities[0].
        return []

    groups = [[entities[0]]]
    for prev, curr in zip(entities, entities[1:]):
        if int(curr[1:]) == int(prev[1:]) + 1:  # check consecutive numbers
            groups[-1].append(curr)
        else:
            groups.append([curr])
    return groups

def compute_group_mask(df_entities, groups):
    """
    Build a 0/1 integer array of shape [num_chapters x num_groups].

    Entry (i, g) is 1 when the summed frequency of group g's entities in
    chapter i is positive. Entities that are not columns of ``df_entities``
    are ignored.
    """
    n_chapters = len(df_entities)
    mask = np.zeros((n_chapters, len(groups)), dtype=int)
    known_columns = df_entities.columns

    for col, members in enumerate(groups):
        present = [member for member in members if member in known_columns]
        group_total = df_entities.loc[:, present].sum(axis=1)
        mask[:, col] = np.asarray(group_total.gt(0), dtype=int)

    return mask


def compute_direct_relevance(X_tfidf, query_entities, query_indices, query_weights):
    """Compute direct relevance of each chapter using weighted query entities.

    The weighted sum of the TF-IDF columns of the query entities is scaled by
    the fraction of query entity groups (runs of consecutive ids) that each
    chapter covers, so chapters missing some groups are down-weighted
    proportionally and chapters covering none get 0.

    NOTE: group coverage is computed from the module-level ``df_entities``
    frame, not from an argument.

    Args:
        X_tfidf: Chapter × entity TF-IDF matrix.
        query_entities: All entity ids of the query (used for grouping).
        query_indices: Column indices of the query entities present in the corpus.
        query_weights: Weights parallel to ``query_indices``.

    Returns:
        1-D numpy array with one relevance value per chapter.
    """
    # Step 1: weighted sum of the query entities' TF-IDF columns
    relevance = np.ravel(X_tfidf[:, query_indices].dot(query_weights))

    # Step 2: which chapters contain at least one entity of each group
    groups = group_consecutive_entities(query_entities)
    if not groups:
        # No groups means no coverage to measure; avoid dividing by zero.
        return np.zeros_like(relevance, dtype=float)
    group_mask = compute_group_mask(df_entities, groups)

    # Step 3: scale relevance by the fraction of groups each chapter covers
    group_coverage = group_mask.sum(axis=1) / len(groups)
    return np.ravel(relevance * group_coverage)


def propagate_relevance(X_tfidf, df_entities, query_entities, relevance, k=5, sim_matrix=None):
    """Propagate relevance scores through the chapter similarity graph (masked).

    The ``k`` chapters with the highest direct relevance act as seeds. Every
    chapter receives the sum over seeds of ``relevance[seed] * similarity``,
    and chapters that contain none of the query entities are forced to 0.

    Args:
        X_tfidf: Chapter × entity TF-IDF matrix.
        df_entities: Chapter × entity raw frequency frame.
        query_entities: Entity ids of the query.
        relevance: 1-D array of direct relevance per chapter.
        k: Number of seed chapters.
        sim_matrix: Optional precomputed chapter × chapter cosine similarity
            of ``X_tfidf``; computed here when omitted.

    Returns:
        1-D array of propagated scores, one per chapter.
    """
    if sim_matrix is None:
        sim_matrix = cosine_similarity(X_tfidf)
    topk_idx = np.argsort(-relevance)[:k]

    # Mask: chapters that contain at least one query entity
    valid_cols = [e for e in query_entities if e in df_entities.columns]
    chapter_mask = (df_entities[valid_cols].sum(axis=1) > 0).astype(int).values

    # Relevance-weighted sum of the seeds' similarity columns, then masked.
    return chapter_mask * (sim_matrix[:, topk_idx] @ relevance[topk_idx])


def normalize_scores(prop_scores, alpha=2.0, standardize=False):
    """Apply a mean-centred sigmoid to propagated scores.

    Args:
        prop_scores: Array-like of propagated scores.
        alpha: Sigmoid steepness.
        standardize: When True, divide the centred scores by their standard
            deviation before the sigmoid. NOTE(review): with query weights on
            a 0-100 scale the unscaled sigmoid saturates (the notebook output
            shows many chapters at exactly 1.0, i.e. ties); enabling this
            avoids that but changes the scores, so it is opt-in.

    Returns:
        numpy array of values in [0, 1], same shape as the input.
    """
    scores = np.asarray(prop_scores, dtype=float)
    centered = scores - np.mean(scores)
    if standardize:
        spread = np.std(scores)
        if spread > 0:
            centered = centered / spread
    z = alpha * centered

    # Numerically stable sigmoid: only exponentiate non-positive numbers so
    # large negative z cannot overflow. For z >= 0 this is the same expression
    # as 1 / (1 + exp(-z)).
    tail = np.exp(-np.abs(z))
    return np.where(z >= 0, 1.0 / (1.0 + tail), tail / (1.0 + tail))


# Query-independent matrices, computed once per df_norm object.
_PROPAGATION_CACHE = {}


def _cached_tfidf_and_similarity():
    """Return (X_tfidf, sim_matrix) for the module-level df_norm, computing them once."""
    # Keyed on object identity: rebinding df_norm triggers a recompute, but
    # mutating it in place does not.
    if _PROPAGATION_CACHE.get("source") is not df_norm:
        X_tfidf = compute_tfidf(df_norm)
        _PROPAGATION_CACHE["X_tfidf"] = X_tfidf
        _PROPAGATION_CACHE["sim_matrix"] = cosine_similarity(X_tfidf)
        _PROPAGATION_CACHE["source"] = df_norm
    return _PROPAGATION_CACHE["X_tfidf"], _PROPAGATION_CACHE["sim_matrix"]


def chapter_similarity_propagation(chapter_entities, query_entities, query_weights,
                                   k=5, alpha=2.0, top_n=3, standardize=False):
    """
    Compute chapter relevance for a set of weighted query entities.

    Stage 1: Direct relevance using TF-IDF
    Stage 2: Propagated relevance via similarity graph (masked)
    Stage 3: Sigmoid normalization and ranking

    NOTE(review): ``chapter_entities`` is accepted for interface
    compatibility but is not read; the function works on the module-level
    ``df_norm``, ``df_entities`` and ``all_entities`` built by
    ``build_entity_matrix``. Confirm callers always pass the same data.

    Args:
        chapter_entities: Unused (see note above).
        query_entities: Entity ids of the query.
        query_weights: Weights parallel to ``query_entities``.
        k: Number of seed chapters used for propagation.
        alpha: Sigmoid steepness for normalization.
        top_n: Number of best chapter ids to return.
        standardize: Forwarded to ``normalize_scores``.

    Returns:
        ``(top_ids, scores)``: the ``top_n`` best chapter ids and a dict of
        chapter id -> normalized score for every chapter, best first. When no
        query entity occurs in the corpus, ``([], {chapter: 0.0, ...})``.
    """
    # Query entity matching first: it is cheap and lets us bail out early.
    query_indices, query_weights = get_valid_query_indices(query_entities, query_weights, all_entities)
    if query_indices is None:
        zero_scores = pd.Series(0.0, index=df_entities.index)
        return [], zero_scores.to_dict()

    # TF-IDF and the chapter similarity matrix do not depend on the query;
    # recomputing them per question dominated the runtime of the scoring loop.
    X_tfidf, sim_matrix = _cached_tfidf_and_similarity()

    # Stage 1: Direct relevance
    relevance = compute_direct_relevance(X_tfidf, query_entities, query_indices, query_weights)

    # Stage 2: Propagation
    prop_scores = propagate_relevance(
        X_tfidf, df_entities, query_entities, relevance, k=k, sim_matrix=sim_matrix
    )

    # Stage 3: Normalization and output
    scores_normalized = normalize_scores(prop_scores, alpha=alpha, standardize=standardize)
    final_scores = pd.Series(scores_normalized, index=df_entities.index).sort_values(ascending=False)
    top_ids = list(final_scores.index[:top_n])
    return top_ids, final_scores.to_dict()


In [None]:
# Run the entity-graph scorer for every question in `df`.
#   top_chapters_list: per-row list of top chapter ids (None if scoring failed)
#   final_data:        qid -> {chapter id: score} ({} if scoring failed)
top_chapters_list = []
final_data = {}
from tqdm import tqdm

for index, row in tqdm(df.iterrows(), total = df.shape[0]):
    query_entities = row['entity_ids']
    query_weights = row['entity_scores']
    qid = row['qid']
    try:
        topk, final_scores = chapter_similarity_propagation(chapter_entity_ids, query_entities, query_weights)
        top_chapters_list.append(topk)
        final_data[qid] = final_scores
    except ValueError as e:
        # Best effort: log the failing question and record empty results so
        # both containers still have one entry per question.
        print(f"Skipping row {qid} due to error: {e}")
        final_data[qid] = {}
        top_chapters_list.append(None)


100%|██████████| 1536/1536 [15:25<00:00,  1.66it/s]


In [None]:
# Collect the per-question graph scores in the question order of `df` and
# persist them. Iterating the 'qid' column directly replaces the former
# iterrows() loop and its progress bar, which only copied dict entries.
graph_scores = {qid: final_data[qid] for qid in df['qid']}

with open('retrieval_modules_testing/scores/graph_scores.json', 'w') as f:
    json.dump(graph_scores, f, indent = 4)

100%|██████████| 1536/1536 [00:00<00:00, 11754.92it/s]


#### Fuse

In [None]:
def fuse(embedding_scores, graph_scores, a=0.9, b=0.1):
    """Linearly combine embedding scores with graph scores.

    For every question and chapter present in ``embedding_scores`` the fused
    score is ``a * embedding + b * graph``. A chapter (or a whole question)
    without a graph score contributes 0 for the graph term, so all fused
    scores of a question stay on the same scale. Previously such chapters
    kept the unweighted embedding score, which ranked them above chapters
    that had a low graph score.

    Args:
        embedding_scores: dict[question][chapter] -> embedding similarity.
        graph_scores: dict[question][chapter] -> graph score; questions or
            chapters may be missing.
        a: Weight of the embedding score.
        b: Weight of the graph score.

    Returns:
        dict[question][chapter] -> fused score.
    """
    combined_scores = {}

    for question, scores in embedding_scores.items():
        question_graph = graph_scores.get(question) or {}
        combined_scores[question] = {
            chapter: (a * score) + (b * question_graph.get(chapter, 0.0))
            for chapter, score in scores.items()
        }

    return combined_scores

In [None]:
combined_scores = fuse(evaluator.full_scores, graph_scores)

In [None]:
write_json(combined_scores, 'retrieval_modules_testing/scores/mpnet_graph.json')

In [None]:
# Top-5 hit test on the fused scores.
#   top_5_chapters_combined: qid -> {chapter: score} for the 5 best chapters
#   chapters_ids_net:        list of the top-5 chapter ids per question
#   accuracies:              1 if the ground truth is among the top 5, else 0
top_5_chapters_combined = {}

chapters_ids_net = []
accuracies = []
for i, (qid, scores) in enumerate(combined_scores.items()):
    sorted_chapters = sorted(scores.items(), key=lambda item: item[1], reverse=True)[:5]
    top_5_chapters_combined[qid] = dict(sorted_chapters)
    cids = [chapter for chapter, _ in sorted_chapters]
    chapters_ids_net.append(cids)
    # Ground truth comes from the first row of `df` with this qid.
    gt = df.loc[df['qid'] == qid, 'ground_truth'].iloc[0]
    # NOTE(review): `gt in cids` assumes ground_truth is a single chapter id
    # comparable to the score keys — confirm against the questions file.
    acc = 1 if gt in cids else 0
    accuracies.append(acc)


In [None]:
sum(accuracies)

716

In [None]:
1124 / 1536

0.7317708333333334

In [None]:
embedding_scores = retriever.all_scores_dict

def get_combined_scores(a,b):
  """Fuse the module-level ``embedding_scores`` and ``graph_scores``.

  The fused score of a chapter is ``a * embedding + b * graph``. A chapter
  (or a whole question) without a graph score contributes 0 for the graph
  term, so all fused scores of a question stay on the same scale; previously
  such chapters kept the unweighted embedding score.

  Args:
      a: Weight of the embedding score.
      b: Weight of the graph score.

  Returns:
      dict[question][chapter] -> fused score.
  """
  combined_scores = {}
  for question, scores in embedding_scores.items():
      question_graph = graph_scores.get(question) or {}
      combined_scores[question] = {
          chapter: (a * score) + (b * question_graph.get(chapter, 0.0))
          for chapter, score in scores.items()
      }

  return combined_scores

def get_acc(a, b, top_k):
  """Top-k hit rate of the fused scores against the ground truth in ``df``.

  Prints the number of questions whose ground truth is among the ``top_k``
  highest fused scores and returns that count divided by the number of
  questions.

  Args:
      a: Weight of the embedding score.
      b: Weight of the graph score.
      top_k: Number of best chapters considered per question.

  Returns:
      Fraction of questions with a hit; 0.0 when there are no questions.

  Raises:
      KeyError: If a scored question id is missing from ``df``.
  """
  combined_scores = get_combined_scores(a,b)

  # One lookup table instead of filtering the DataFrame for every question;
  # drop_duplicates keeps the first row per qid, as .iloc[0] did.
  ground_truth = df.drop_duplicates('qid').set_index('qid')['ground_truth'].to_dict()

  hits = 0
  for qid, scores in combined_scores.items():
      sorted_chapters = sorted(scores.items(), key=lambda item: item[1], reverse=True)[:top_k]
      cids = [chapter for chapter, _ in sorted_chapters]
      if ground_truth[qid] in cids:
          hits += 1
  print(hits)
  if not combined_scores:
      return 0.0
  return hits / len(combined_scores)


In [None]:
A = [0.9, 1.0]

for a in A:
  # get_acc(a, 1-a)
  print(a, get_acc(a, 1-a, 5))

1066
0.9 0.6940104166666666
1056
1.0 0.6875


In [None]:
A = [0.9, 1.0]

for a in A:
  # get_acc(a, 1-a)
  print(a, 1-a, get_acc(a, 1-a, 3))

940
0.9 0.09999999999999998 0.6119791666666666
930
1.0 0.0 0.60546875


In [None]:
716 / 1536

0.4661458333333333

In [None]:
638 / 1536

0.4153645833333333

In [None]:
A = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]

for a in A:
  # get_acc(a, 1-a)
  print(a, 1-a, get_acc(a, 1-a, 3))

0.1 0.9 555
0.2 0.8 556
0.3 0.7 555
0.4 0.6 555
0.5 0.5 555
0.6 0.4 563
0.7 0.30000000000000004 575
0.8 0.19999999999999996 586
0.9 0.09999999999999998 610
1.0 0.0 551


In [None]:
610 / 1536

0.3971354166666667

In [None]:
551 / 1536

0.3587239583333333