In [25]:
import pickle

model = "dunzhang/stella_en_400M_v5"
number = 5000
with open(f'clustering/graph/connected_graph.pkl', 'rb') as file: connected_graph = pickle.load(file)
with open(f'clustering/graph/cluster_graph.pkl', 'rb') as file: cluster_graph = pickle.load(file)
with open(f'embeddings/{model}/hard/{number}/df.pkl', 'rb') as file: hotpot_qa_df = pickle.load(file)
with open(f'embeddings/{model}/hard/{number}/contexts.pkl', 'rb') as file: contexts = pickle.load(file)

hotpot_qa_df['actual_contexts'] = hotpot_qa_df['actual_contexts'].apply(lambda x: [int(i) for i in x])
hotpot_qa_df.head()

Unnamed: 0,level,question,answer,actual_contexts
0,hard,"George Boscawen, 9th Viscount Falmouth is a fo...","the Guards Division, Foot Guards regiments","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]"
1,hard,When Vladimir Kashpur portrayed Baba Yaga she ...,trio of sisters,"[10, 11, 12, 13, 14, 15, 16, 17, 18, 19]"
2,hard,Which musician has a solo punk rock project: T...,"Frank Anthony Iero, Jr.","[20, 21, 22, 23, 24, 25, 26, 27, 28, 29]"
3,hard,A Disney voice actor has won which Emmy award?,Outstanding Supporting Actor,"[30, 31, 32, 33, 34, 35, 36, 37, 38, 39]"
4,hard,Which north-western suburb of Adelaide lies wi...,Birkenhead,"[40, 41, 42, 43, 44, 45, 46, 47, 48, 49]"


In [2]:
from typing import Dict,List
import networkx
import random

cluster_node_embedding_sample:Dict[int,List[float]] = {}
for cluster_id in cluster_graph.keys():
    graph:networkx.Graph = cluster_graph[cluster_id]
    node:int = list(graph.nodes)[0]#random.choice(list(graph.nodes)) # select a random node from the cluster
    cluster_node_embedding_sample[cluster_id] = contexts[str(node)].embedding

start_node = 0

# A* Traversal

In [26]:
from tqdm import tqdm
import torch
import torch.nn.functional as F

def cosine_similarity_gpu(list_A, list_B):
    # Convert lists to tensors and move to GPU
    A = torch.tensor(list_A, dtype=torch.float32).cuda()
    B = torch.tensor(list_B, dtype=torch.float32).cuda()

    # Compute cosine similarity on GPU
    cos_sim = F.cosine_similarity(A.unsqueeze(0), B.unsqueeze(0))

    return cos_sim.item()  # Convert tensor result to a Python float

def get_nodes(question_embedding,clusters,contexts=contexts):
    cosine_similarities = []
    nodes = []
    for cluster in clusters:
        for node in list(cluster_graph[cluster].nodes):
            nodes.append(node)

    # Calculate cosine similarities and store them with node identifiers
    for node in nodes:
        node_embedding = contexts[str(node)].embedding
        cosine_sim = cosine_similarity_gpu(question_embedding, node_embedding)
        cosine_similarities.append((node, cosine_sim))  # Store as (node, similarity)

    # Sort the nodes based on the highest similarity
    sorted_nodes = sorted(cosine_similarities, key=lambda x: x[1], reverse=True)

    # Extract sorted node identifiers and their similarities
    sorted_node_ids = [node for node, sim in sorted_nodes]
    sorted_similarities = [sim for node, sim in sorted_nodes]

    return sorted_node_ids[:10]


In [None]:
THRESHOLD = 0.60

retrieved_contexts = []
for question in tqdm(hotpot_qa_df['question'], desc="Retrieving contexts", unit="question"):

    question_embedding = question.embedding

    max_cluster_similarities:List[int] = []
    max_similarity:float = -float('inf')
    max_similirity_node:int = None

    for id,sample_embedding in zip(cluster_node_embedding_sample.keys(),cluster_node_embedding_sample.values()):
        cosine_sim = cosine_similarity_gpu(question_embedding,sample_embedding)
        if cosine_sim > max_similarity: 
            max_similarity = cosine_sim
            max_similirity_node = id
        if cosine_sim > THRESHOLD:max_cluster_similarities.append(id)

    if len(max_cluster_similarities) == 0: 
        max_cluster_similarities.append(max_similirity_node)
    
    retrieved_contexts.append(get_nodes(question_embedding,max_cluster_similarities))

hotpot_qa_df["Cluster_Retrieval"] = retrieved_contexts

In [None]:
evaluate_index(retrieved_contexts)

# FAISS

In [26]:
import faiss
from tqdm import tqdm
import numpy as np
import pandas as pd
from utils.evaluation_metrics.retriever import RetrieverEvaluator

context_embeddings = []

for text_node in contexts.values():
    context_embeddings.append(text_node.embedding)
    
def lists_to_arrays(list_of_lists):
    return np.array([np.array(lst) for lst in list_of_lists], dtype=object)

query_embeddings = [query.embedding for query in hotpot_qa_df['question'].tolist()]

def retrieve_contexts(index):
    retrieved_results = [] 
    for query_embedding in tqdm(query_embeddings):
        D, I = index.search(np.array(query_embedding).reshape(1, -1), 10)
        results = I[0].tolist()
        retrieved_results.append(results)
    return retrieved_results

actual_contexts = hotpot_qa_df['actual_contexts'].tolist()
def evaluate_index(retrieved_contexts,actual_contexts=actual_contexts):
    df = pd.DataFrame({
                        'actual_contexts': actual_contexts,
                        'retrieved_contexts': retrieved_contexts
                    })
    evaluator = RetrieverEvaluator(df,'retrieved_contexts')
    order_unaware_metrics = evaluator.get_order_unaware_metrics(k=10) 
    order_aware_metrics = evaluator.get_order_aware_metrics() 

    print(f"order unaware metrics : {order_unaware_metrics}")
    print(f"order aware metrics   : {order_aware_metrics}")

from beir.retrieval.evaluation import EvaluateRetrieval

def beir_evaluation(actual_contexts,results):
    actual_contexts_dict = {
        str(i): {str(doc_id): 1 for doc_id in context} for i, context in enumerate(actual_contexts)
    }
    results_dict = {
        str(i): {str(doc_id): rank + 1 for rank, doc_id in enumerate(result)} for i, result in enumerate(results)
    }

    ndcg, map_score, recall, precision = EvaluateRetrieval.evaluate(
        actual_contexts_dict, results_dict, k_values=[10]
    )
    print("ndcg:", ndcg)
    print("map:", map_score)
    print("mrr:", EvaluateRetrieval.evaluate_custom(actual_contexts_dict, results_dict, [5,10], metric="mrr"))

    print()

    print("recall:", recall)
    print("precision:", precision)
    print("acc:", EvaluateRetrieval.evaluate_custom(actual_contexts_dict, results_dict, [5,10], metric="acc"))


## Flat Methods : IP & L2

### Inner Product (IP)

In [10]:
index = faiss.IndexFlatIP(len(query_embeddings[0]))
index.add(lists_to_arrays(context_embeddings))

contexts = retrieve_contexts(index)
beir_evaluation(actual_contexts,contexts)

100%|██████████| 5000/5000 [00:45<00:00, 109.92it/s]


ndcg: {'NDCG@10': 0.31884}
map: {'MAP@10': 0.17334}
mrr: {'MRR@5': 0.3094, 'MRR@10': 0.35986}

recall: {'Recall@10': 0.3848}
precision: {'P@10': 0.38146}
acc: {'Accuracy@5': 0.6036, 'Accuracy@10': 0.9938}


### Euclidean (L2)

In [11]:
from utils.evaluation_metrics.retriever import RetrieverEvaluator

index = faiss.IndexFlatL2(len(query_embeddings[0]))
index.add(lists_to_arrays(context_embeddings))

retrieved_contexts = retrieve_contexts(index)
beir_evaluation(actual_contexts,retrieved_contexts)

100%|██████████| 5000/5000 [00:46<00:00, 108.69it/s]


ndcg: {'NDCG@10': 0.32358}
map: {'MAP@10': 0.17793}
mrr: {'MRR@5': 0.32001, 'MRR@10': 0.37008}

recall: {'Recall@10': 0.38762}
precision: {'P@10': 0.38436}
acc: {'Accuracy@5': 0.6048, 'Accuracy@10': 0.9938}


## Hierarchical Navigable Small World 

In [12]:
from utils.evaluation_metrics.retriever import RetrieverEvaluator

max_neighbours = 16
ef_search = 10
ef_construction = 256

index = faiss.IndexHNSWFlat(len(query_embeddings[0]), max_neighbours)
index.hnsw.efSearch = ef_search
index.hnsw.efConstruction = ef_construction

index.add(lists_to_arrays(context_embeddings))

contexts = retrieve_contexts(index)
beir_evaluation(actual_contexts,contexts)

100%|██████████| 5000/5000 [00:00<00:00, 5116.40it/s]


ndcg: {'NDCG@10': 0.30287}
map: {'MAP@10': 0.16825}
mrr: {'MRR@5': 0.30056, 'MRR@10': 0.34862}

recall: {'Recall@10': 0.36139}
precision: {'P@10': 0.3586}
acc: {'Accuracy@5': 0.5656, 'Accuracy@10': 0.9434}


# Hybrid Search

## BM25 then IndexFlat

In [None]:
import numpy as np
from rank_bm25 import BM25Okapi
from sentence_transformers import SentenceTransformer
import faiss

class HybridSearch:
    def __init__(self, documents):
        self.documents = documents

        # BM25 initialization
        tokenized_corpus = [text_node.text.split(" ") for text_node in documents]
        self.bm25 = BM25Okapi(tokenized_corpus)

        self.document_embeddings = [text_node.embedding for text_node in documents]
        
        # FAISS initialization
        self.index = faiss.IndexFlatIP(len(self.document_embeddings[0]))
        self.index.add(lists_to_arrays(self.document_embeddings))

    def search(self, query, top_n=10):
        # BM25 search
        bm25_scores = self.bm25.get_scores(query.query_str.split(" "))
        top_docs_indices = np.argsort(bm25_scores)[-top_n*5:]
        print(top_docs_indices)
        print()
        
        # Get embeddings of top documents from BM25 search
        top_docs_embeddings = [self.document_embeddings[i] for i in top_docs_indices]

        query_embedding = np.array(query.embedding).reshape(1, -1)

        # FAISS search on the top documents
        sub_index = faiss.IndexFlatIP(len(self.document_embeddings[0]))
        sub_index.add(np.array(top_docs_embeddings))
        distances, sub_dense_ranked_indices = sub_index.search(np.array(query_embedding), top_n)

        # Map FAISS results back to original document indices
        final_ranked_indices = [top_docs_indices[i] for i in sub_dense_ranked_indices[0]]

        # Retrieve the actual documents
        ranked_docs = [int(self.documents[i].id_) for i in final_ranked_indices]
        return ranked_docs

def retrieve_contexts(hs):
    retrieved_results = [] 
    questions = hotpot_qa_df["question"].tolist()
    for query in tqdm(questions[:2]):
        retrieved_results.append(hs.search(query, top_n=10))
    return retrieved_results

hs = HybridSearch(list(contexts.values()))
results = retrieve_contexts(hs)
evaluate_index(results)

In [18]:
from baguetter.indices import FaissDenseIndex,BMXSparseIndex,MultiIndex
import pickle

def load_data(model_name):
    with open(f'embeddings/{model_name}/hard/5000/df.pkl', 'rb') as file: hotpot_qa_df = pickle.load(file)
    with open(f'embeddings/{model_name}/hard/5000/contexts.pkl', 'rb') as file: contexts = pickle.load(file)

    questions = hotpot_qa_df["question"].tolist()


    return [q.embedding for q in questions],contexts

dense_model = "dunzhang/stella_en_400M_v5"
dense_questions,dense_contexts = load_data(dense_model)
result = {i: i + 1 for i in range(len(context_embeddings))}
# Create an index
dense_index = FaissDenseIndex(index,"dense_index",len(query_embeddings[0]),result)

_ = dense_index.search(np.array(dense_questions[0]),top_k=10)
_

SearchResults(keys=[1, 3, 7, 9, 11095, 24286, 38539, 10, 29819, 2771], scores=array([0.00727236, 0.00646719, 0.00515218, 0.00448899, 0.00323097,
       0.0031733 , 0.00315333, 0.00310439, 0.00306923, 0.00300407],
      dtype=float32), normalized=True)

In [21]:
from baguetter.indices import *


context_str = [x.text for x in list(dense_contexts.values())]
sparse_index = BMXSparseIndex(index_name="BMX_Test")
sparse_index.add_many(keys=result,values=context_str,show_progress=True)

questions = hotpot_qa_df["question"].tolist()
question_str = [q.query_str for q in questions]
x = sparse_index.search(question_str[0],top_k=10)
print(x)

Tokenization: 100%|██████████| 49776/49776 [00:12<00:00, 4094.39it/s]
Building doc-term matrix: 100%|██████████| 49776/49776 [00:01<00:00, 48878.58it/s]
Building inverted index: 100%|██████████| 116039/116039 [00:18<00:00, 6383.51it/s]


SearchResults(keys=[6, 0, 2, 8, 7, 4, 1, 3, 9, 32657], scores=array([44.847828, 42.857597, 40.865738, 39.84939 , 28.379486, 26.272427,
       26.180256, 24.131647, 23.960619, 19.673334], dtype=float32), normalized=False)


In [23]:
multi_index = MultiIndex()

idx = multi_index.add_index(sparse_index)
idx = multi_index.add_index(dense_index)

embeddings = [q.embedding for q in hotpot_qa_df["question"].tolist()]
question_strs = [q.query_str for q in hotpot_qa_df["question"].tolist()]

results = []
for question,embedding in zip(question_strs,embeddings):
    embedding_np = np.array(embedding)
    query = {
             "BMX_Test":question,"dense_index":embedding_np
             }
    x = idx.search(query=query,
                top_k=10)
    res = x.keys[:10]
    results.append(res)

beir_evaluation(actual_contexts,results)

ndcg: {'NDCG@10': 0.51022}
map: {'MAP@10': 0.34057}
mrr: {'MRR@5': 0.57872, 'MRR@10': 0.58973}

recall: {'Recall@10': 0.56587}
precision: {'P@10': 0.56196}
acc: {'Accuracy@5': 0.9198, 'Accuracy@10': 0.9984}


In [None]:
ndcg: {'NDCG@10': 0.58479}
map: {'MAP@10': 0.43666}
mrr: {'MRR@5': 0.59957, 'MRR@10': 0.61119}

recall: {'Recall@10': 0.64619}
precision: {'P@10': 0.64222}
acc: {'Accuracy@5': 0.9164, 'Accuracy@10': 0.9984}
