In [None]:
# Work from the project folder so the relative paths used by later cells
# ("Dataset/...", "Retrieval Experiments/...") resolve.
# NOTE(review): presumably Google Drive is already mounted at /content/drive — confirm.
cd /content/drive/MyDrive/GenAI Project/

/content/drive/MyDrive/GenAI Project


In [None]:
import json


def _load_json(path):
    """Parse the JSON file at ``path`` and return the decoded object.

    The file is opened explicitly as UTF-8: JSON text is UTF-8 by
    specification, whereas ``open()`` without ``encoding`` falls back to the
    platform locale and can mis-decode non-ASCII characters.
    """
    with open(path, 'r', encoding='utf-8') as f:
        return json.load(f)


# Validation questions; later cells iterate this as
# {ground-truth id: {"questions": [...]}}.
mahabharata_questions = _load_json('Dataset/Validation/mahabharata_questions.json')

# Corpus files. Only `chapters` is used by the cells visible below.
entities_speakers_verses = _load_json('Dataset/Corpus/entities_speakers_verses.json')
verses = _load_json('Dataset/Corpus/verses.json')
chapters = _load_json('Dataset/Corpus/chapters.json')

### Explore

In [None]:
!pip install faiss-cpu

In [None]:
# Copy the OpenAI key from Colab's per-user secret store into the process
# environment instead of hard-coding it in the notebook.
# NOTE(review): the OpenAI() clients created later are constructed without an
# explicit key, so presumably they read it from this variable — confirm.
from google.colab import userdata
import os
os.environ['OPENAI_API_KEY'] = userdata.get('OPENAI_API_KEY')

In [None]:
# pip install openai faiss-cpu tqdm numpy
from openai import OpenAI
import faiss, numpy as np, pickle
from tqdm import tqdm

# Client used for every embeddings request in this cell.
client = OpenAI()

# Parallel lists derived from the `chapters` mapping: doc_ids[i] is the key
# whose value is texts[i] (both follow the dict's iteration order).
doc_ids = list(chapters.keys())
texts = list(chapters.values())

import textwrap

def chunk_text(text, max_tokens=7500, approx_chars_per_token=4):
    """Break ``text`` into pieces that fit an approximate token budget.

    The token budget is converted into a character budget
    (``max_tokens * approx_chars_per_token``) and the text is wrapped at that
    width with the standard ``textwrap`` rules.

    Returns:
        A list of strings, each at most the character budget long; an empty
        list when ``text`` contains nothing but whitespace.
    """
    char_budget = approx_chars_per_token * max_tokens
    wrapper = textwrap.TextWrapper(width=char_budget)
    return wrapper.wrap(text)

model = "text-embedding-3-large"
batch_size = 50
# One vector per entry of `texts`, in the same order, so row i lines up with
# doc_ids[i].
embeddings = []

# batch_size only controls the granularity of the progress bar; every text
# (or chunk) is still sent in its own request.
for i in tqdm(range(0, len(texts), batch_size)):
    for text in texts[i:i + batch_size]:
        if len(text) > 30000:  # Rough safeguard
            # Too long for a single request: embed each chunk and use the
            # element-wise mean of the chunk vectors as the document vector.
            chunk_embs = [
                client.embeddings.create(input=[chunk], model=model).data[0].embedding
                for chunk in chunk_text(text)
            ]
            embeddings.append(np.mean(chunk_embs, axis=0))
        else:
            response = client.embeddings.create(input=[text], model=model)
            embeddings.append(response.data[0].embedding)

# `embeddings` is a plain list (it has no .shape attribute), so build an array
# view just for reporting; the list itself is left for the next cell to convert.
print("Embeddings shape:", np.asarray(embeddings).shape)


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 43/43 [11:29<00:00, 16.04s/it]


AttributeError: 'list' object has no attribute 'shape'

In [None]:
import os

embeddings = np.array(embeddings, dtype="float32")

# Row i of the matrix must describe doc_ids[i]; a mismatch would silently
# misalign every retrieval result, so stop before anything is written.
if embeddings.ndim != 2 or len(embeddings) != len(doc_ids):
    raise ValueError(
        f"Expected one embedding per document ({len(doc_ids)}), "
        f"got array of shape {embeddings.shape}"
    )

# Unit-length rows make the inner-product index rank by cosine similarity.
faiss.normalize_L2(embeddings)
dim = embeddings.shape[1]
index = faiss.IndexFlatIP(dim)
index.add(embeddings)

# Make sure the output folder exists so the expensive embeddings are not lost
# to a missing-directory error at save time.
os.makedirs("Retrieval Experiments/Models", exist_ok=True)
np.save("Retrieval Experiments/Models/embeddings.npy", embeddings)
with open("Retrieval Experiments/Models/doc_ids.pkl", "wb") as f:
    pickle.dump(doc_ids, f)
faiss.write_index(index, "Retrieval Experiments/Models/corpus.index")
print("‚úÖ Saved FAISS index and metadata.")

‚úÖ Saved FAISS index and metadata.


In [None]:
import numpy as np
import pickle
import faiss
import pandas as pd
from tqdm import tqdm
from openai import OpenAI

class OpenAIEmbeddingSearchEvaluator:
    """Search a saved FAISS index with OpenAI query embeddings and score retrieval.

    The evaluator loads three artefacts from disk: the document embedding
    matrix, the list of document ids (row ``i`` of the matrix belongs to
    ``doc_ids[i]``) and the FAISS index. Queries are embedded with the
    configured OpenAI model and L2-normalised before every lookup.

    NOTE(review): scores are cosine similarities only if the index on disk is
    an inner-product index over L2-normalised vectors — confirm against the
    code that built it.
    """

    # Column order of the per-question results frame. Fixing it up front keeps
    # the schema stable even when no question was evaluated.
    _RESULT_COLUMNS = [
        "query",
        "ground_truth",
        "top_ids",
        "top_scores",
        "correct_in_top_k",
        "rank_of_correct",
        "ground_truth_score",
    ]

    def __init__(self,
                 model_name="text-embedding-3-large",
                 emb_path="embeddings.npy",
                 ids_path="doc_ids.pkl",
                 index_path="corpus.index"):
        """Load embeddings, document ids and the FAISS index.

        Args:
            model_name: OpenAI embedding model used for queries.
            emb_path: ``.npy`` file holding the document embedding matrix.
            ids_path: pickle file holding the list of document ids.
            index_path: file holding the serialised FAISS index.

        Raises:
            ValueError: if the number of ids differs from the number of
                embedding rows.
        """
        self.client = OpenAI()
        self.model_name = model_name

        # Load embeddings and metadata
        self.embeddings = np.load(emb_path)
        # SECURITY: pickle.load can execute arbitrary code; only point
        # ids_path at files produced by a trusted run of this project.
        with open(ids_path, "rb") as f:
            self.doc_ids = pickle.load(f)
        self.index = faiss.read_index(index_path)

        if len(self.doc_ids) != len(self.embeddings):
            raise ValueError(
                f"{len(self.doc_ids)} doc ids but {len(self.embeddings)} embeddings; "
                "the files do not belong together"
            )
        print(f"Loaded {len(self.embeddings)} embeddings with dimension {self.embeddings.shape[1]}")

        # Normalise rows so dot products against normalised queries are
        # cosine similarities (a no-op if the rows are already unit length).
        faiss.normalize_L2(self.embeddings)

    # -----------------------------
    # Search
    # -----------------------------
    def search(self, query, top_k=3):
        """Return the best matches for ``query``, most similar first.

        Args:
            query: query text.
            top_k: maximum number of results requested from the index.

        Returns:
            A list of dicts with keys ``rank`` (1-based), ``score`` and
            ``doc_id``. It can be shorter than ``top_k`` when the index holds
            fewer vectors than requested.
        """
        q_emb = self._embed_normalized(query)
        D, I = self.index.search(q_emb, k=top_k)

        results = []
        for score, idx in zip(D[0], I[0]):
            # FAISS pads missing neighbours with label -1; indexing doc_ids
            # with it would silently return the last document.
            if idx < 0:
                continue
            results.append({
                "rank": len(results) + 1,
                "score": float(score),
                "doc_id": self.doc_ids[idx],
            })
        return results

    # -----------------------------
    # Evaluation
    # -----------------------------
    def evaluate_questions(self, questions_dict, top_k=3, return_all_scores=True, save_query_embs=True):
        """Run every question through the index and record whether its source was retrieved.

        Args:
            questions_dict: mapping of ground-truth document id to a dict whose
                ``"questions"`` entry is a list of query strings (a missing
                entry is treated as no questions).
            top_k: number of neighbours retrieved per question.
            return_all_scores: also compute the similarity of each question to
                every document.
            save_query_embs: write the query embeddings and their texts to
                ``query_embeddings.npy`` / ``query_texts.pkl`` in the current
                directory. Nothing is written when no question was evaluated.

        Returns:
            ``(results_df, all_scores_dict)`` when ``return_all_scores`` is
            true, otherwise just ``results_df``. ``all_scores_dict`` maps
            question text to ``{doc_id: score}``. The frame is also stored on
            ``self.results_df``.

        Note:
            Stored embeddings and full score tables are keyed by question
            text, so a question that appears more than once keeps only its
            last entry there; ``results_df`` still has one row per occurrence.
        """
        rows = []
        all_scores_dict = {}
        query_emb_store = {}  # question text -> normalised query embedding

        # First position of every doc id, built once instead of scanning the
        # id list for each question.
        doc_position = {}
        for pos, doc_id in enumerate(self.doc_ids):
            doc_position.setdefault(doc_id, pos)

        for true_id, entry in tqdm(questions_dict.items()):
            true_idx = doc_position.get(true_id)
            for q in entry.get("questions", []):
                q_emb = self._embed_normalized(q)
                q_vec = q_emb[0]

                # Save embedding
                if save_query_embs:
                    query_emb_store[q] = q_emb.flatten().astype("float32")

                # ---- Top-k retrieval ----
                D, I = self.index.search(q_emb, k=top_k)
                # Skip the -1 padding FAISS emits when fewer than top_k
                # vectors exist.
                hits = [
                    (self.doc_ids[idx], float(score))
                    for score, idx in zip(D[0], I[0])
                    if idx >= 0
                ]
                retrieved_ids = [doc_id for doc_id, _ in hits]
                retrieved_scores = [score for _, score in hits]

                # ---- Ground truth check ----
                if true_idx is not None:
                    # 1-D dot product yields a true scalar; converting a
                    # (1, 1) array with float() is deprecated in NumPy.
                    correct_score = float(np.dot(q_vec, self.embeddings[true_idx]))
                else:
                    correct_score = None

                correct = true_id in retrieved_ids
                rank = retrieved_ids.index(true_id) + 1 if correct else None

                rows.append({
                    "query": q,
                    "ground_truth": true_id,
                    "top_ids": retrieved_ids,
                    "top_scores": retrieved_scores,
                    "correct_in_top_k": correct,
                    "rank_of_correct": rank,
                    "ground_truth_score": correct_score
                })

                # ---- full similarity scores ----
                if return_all_scores:
                    sims = self.embeddings @ q_vec
                    all_scores_dict[q] = {
                        doc_id: float(sim) for doc_id, sim in zip(self.doc_ids, sims)
                    }

        self.results_df = pd.DataFrame(rows, columns=self._RESULT_COLUMNS)

        # Save query embeddings to disk (np.stack rejects an empty list, so
        # skip the write when nothing was embedded).
        if save_query_embs and query_emb_store:
            np.save("query_embeddings.npy", np.stack(list(query_emb_store.values())))
            with open("query_texts.pkl", "wb") as f:
                pickle.dump(list(query_emb_store.keys()), f)
            print(f"üíæ Saved {len(query_emb_store)} query embeddings to query_embeddings.npy and query_texts.pkl")

        return (self.results_df, all_scores_dict) if return_all_scores else self.results_df

    # -----------------------------
    # Accuracy + Save
    # -----------------------------
    def accuracy(self):
        """Return the fraction of evaluated questions whose source was in the top-k.

        Raises:
            ValueError: if ``evaluate_questions()`` has not been run yet.
        """
        if not hasattr(self, "results_df"):
            raise ValueError("Run evaluate_questions() first.")
        return self.results_df["correct_in_top_k"].mean()

    def save_results(self, path="retrieval_results.csv"):
        """Write the latest results frame to ``path`` as CSV, if there is one."""
        if hasattr(self, "results_df"):
            self.results_df.to_csv(path, index=False)
            print(f"‚úÖ Saved results to {path}")
        else:
            print("No results to save.")

    # -----------------------------
    # Internal: embed query
    # -----------------------------
    def _embed_query(self, query):
        """Embed ``query`` with the configured model; returns a (1, dim) float32 array."""
        resp = self.client.embeddings.create(input=[query], model=self.model_name)
        q_emb = np.array(resp.data[0].embedding, dtype="float32").reshape(1, -1)
        return q_emb

    def _embed_normalized(self, query):
        """Embed ``query`` and L2-normalise the (1, dim) result in place."""
        q_emb = self._embed_query(query)
        faiss.normalize_L2(q_emb)
        return q_emb


In [None]:
# Build the evaluator from the artefacts written by the indexing cell
# (same "Retrieval Experiments/Models" paths).
retriever = OpenAIEmbeddingSearchEvaluator(
    model_name="text-embedding-3-large",
    emb_path="Retrieval Experiments/Models/embeddings.npy",
    ids_path="Retrieval Experiments/Models/doc_ids.pkl",
    index_path="Retrieval Experiments/Models/corpus.index"
)

# Smoke test: one ad-hoc query, printing each hit dict (rank, score, doc_id).
results = retriever.search("When  did Draupadi meet Satyabhama?", top_k=3)
for r in results:
    print(r)


Loaded 2108 embeddings with dimension 3072
{'rank': 1, 'score': 0.6559776067733765, 'doc_id': '3.235'}
{'rank': 2, 'score': 0.6144251823425293, 'doc_id': '3.267'}
{'rank': 3, 'score': 0.5938724279403687, 'doc_id': '3.233'}


In [None]:
# Evaluate every validation question with top-5 retrieval. With the default
# return_all_scores=True the call returns both the per-question results frame
# and the full {question: {doc_id: score}} table.
df_results, all_scores_dict = retriever.evaluate_questions(mahabharata_questions, top_k=5)
print(df_results.head())

# Save to CSV
retriever.save_results("Retrieval Experiments/1.7/retrieval_results_5.csv")


  correct_score = float(np.dot(q_emb, true_emb.T))
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 182/182 [01:59<00:00,  1.53it/s]


üíæ Saved 450 query embeddings to query_embeddings.npy and query_texts.pkl
                                               query ground_truth  \
0  What is the main premise and scope of the Maha...          1.1   
1  Which topics are briefly listed or summarized ...          1.1   
2  What glories or benefits are said to come from...          1.1   
3  What is the significance of the Samantpanchak ...          1.2   
4  How is the Akshauhini army described or enumer...          1.2   

                           top_ids  \
0   [1.59, 1.62, 1.61, 1.2, 14.11]   
1    [1.59, 6.15, 1.61, 6.1, 1.63]   
2  [18.5, 8.96, 1.62, 18.6, 15.33]   
3   [9.53, 8.47, 3.87, 14.83, 6.7]   
4   [6.18, 5.19, 6.16, 5.56, 6.17]   

                                          top_scores  correct_in_top_k  \
0  [0.542840838432312, 0.536239743232727, 0.49386...             False   
1  [0.5005851984024048, 0.4749210476875305, 0.463...             False   
2  [0.6013244390487671, 0.5991901159286499, 0.594...      

In [None]:
# Quick check of the container type before serialising it to JSON below.
type(all_scores_dict)

dict

In [None]:
# Persist the full question -> {doc_id: score} table for offline analysis.
serialized_scores = json.dumps(all_scores_dict, indent=4)
with open("Retrieval Experiments/1.7/all_scores_dict.json", "w") as f:
    f.write(serialized_scores)

In [None]:
# Re-run the top-5 evaluation, keeping only the results frame.
# evaluate_questions() returns a (DataFrame, scores) tuple unless
# return_all_scores=False is passed; without the flag `df_results` would be a
# tuple and `.head()` would fail.
df_results = retriever.evaluate_questions(
    mahabharata_questions, top_k=5, return_all_scores=False
)
print(df_results.head())

# Save to CSV
retriever.save_results("Retrieval Experiments/1.5/retrieval_results_5.csv")


  correct_score = float(np.dot(q_emb, true_emb.T))
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 182/182 [01:56<00:00,  1.56it/s]

                                               query ground_truth  \
0  What is the main premise and scope of the Maha...          1.1   
1  Which topics are briefly listed or summarized ...          1.1   
2  What glories or benefits are said to come from...          1.1   
3  What is the significance of the Samantpanchak ...          1.2   
4  How is the Akshauhini army described or enumer...          1.2   

                           top_ids  \
0   [1.59, 1.62, 1.61, 1.2, 14.11]   
1    [1.59, 6.15, 1.61, 6.1, 1.63]   
2  [18.5, 8.96, 1.62, 18.6, 15.33]   
3   [9.53, 8.47, 3.87, 14.83, 6.7]   
4   [6.18, 5.19, 6.16, 5.56, 6.17]   

                                          top_scores  correct_in_top_k  \
0  [0.5428798794746399, 0.5362762212753296, 0.493...             False   
1  [0.50065016746521, 0.47499576210975647, 0.4633...             False   
2  [0.6012923717498779, 0.5991548299789429, 0.593...             False   
3  [0.5492348670959473, 0.4511876702308655, 0.450...        




In [None]:
# Similarity of one validation question to every document in the corpus.
query_scores = all_scores_dict['How did Uttanka inspire Janamejaya to perform the Sarpayajna (snake sacrifice)?']

# Convert the dictionary items to a list of tuples and sort by score (the second element)
sorted_scores = sorted(query_scores.items(), key=lambda item: item[1], reverse=True)

# Show only the head of the ranking; the full list has one entry per corpus
# document and would flood the cell output. `sorted_scores` still holds all of it.
sorted_scores[:20]

[('1.3', 0.7164081335067749),
 ('1.53', 0.6441617608070374),
 ('1.51', 0.6318194270133972),
 ('1.54', 0.6220879554748535),
 ('1.38', 0.614304780960083),
 ('1.12', 0.6141235828399658),
 ('1.13', 0.6048241257667542),
 ('1.58', 0.5998374223709106),
 ('1.37', 0.5911877155303955),
 ('1.15', 0.5895086526870728),
 ('1.60', 0.5840315818786621),
 ('1.50', 0.5833141207695007),
 ('1.56', 0.57966148853302),
 ('1.44', 0.5708365440368652),
 ('1.20', 0.5693553686141968),
 ('1.40', 0.5662754774093628),
 ('1.48', 0.5577189922332764),
 ('14.58', 0.5533550977706909),
 ('1.57', 0.547672688961029),
 ('1.39', 0.5438976287841797),
 ('14.56', 0.5438263416290283),
 ('3.201', 0.5424317121505737),
 ('9.36', 0.5422331690788269),
 ('1.43', 0.5415443181991577),
 ('1.49', 0.5350316762924194),
 ('1.11', 0.5330500602722168),
 ('3.203', 0.5284730195999146),
 ('1.16', 0.5268726348876953),
 ('1.47', 0.5258104801177979),
 ('1.52', 0.5257957577705383),
 ('1.42', 0.5255434513092041),
 ('15.35', 0.5249353647232056),
 ('12.15