In [None]:
import pickle

model = "dunzhang/stella_en_400M_v5"


def _load_pickle(path):
    """Deserialize and return the object stored in the pickle file at `path`."""
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Graph artefacts produced by the clustering step.
connected_graph = _load_pickle('clustering/graph/connected_graph.pkl')
cluster_graph = _load_pickle('clustering/graph/cluster_graph.pkl')

# Question frame and context nodes embedded with `model`.
hotpot_qa_df = _load_pickle(f'embeddings/{model}/hard/3000/df.pkl')
contexts = _load_pickle(f'embeddings/{model}/hard/3000/contexts.pkl')

# Store the gold context ids as ints.
hotpot_qa_df['actual_contexts'] = hotpot_qa_df['actual_contexts'].apply(lambda ids: list(map(int, ids)))
hotpot_qa_df.head()

In [2]:
from typing import Dict,List
import networkx
import random

# One representative embedding per cluster, keyed by cluster id.
cluster_node_embedding_sample:Dict[int,List[float]] = {}
for cluster_id, graph in cluster_graph.items():
    # The first node of the cluster graph is used as the representative
    # (random sampling is not active here).
    node:int = list(graph.nodes)[0]
    cluster_node_embedding_sample[cluster_id] = contexts[str(node)].embedding

start_node = 0

# A* Traversal

In [26]:
from tqdm import tqdm
import torch
import torch.nn.functional as F

def cosine_similarity_gpu(list_A, list_B):
    """Return the cosine similarity between two embedding vectors.

    The computation runs on the GPU when CUDA is available and falls back
    to the CPU otherwise, so the notebook also works on machines without a GPU.

    Args:
        list_A: First vector (sequence of numbers or 1-D tensor).
        list_B: Second vector, same length as `list_A`.

    Returns:
        The cosine similarity as a Python float.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Convert both inputs to float32 tensors on the selected device
    A = torch.as_tensor(list_A, dtype=torch.float32, device=device)
    B = torch.as_tensor(list_B, dtype=torch.float32, device=device)

    # cosine_similarity compares along dim=1, hence the added batch dimension
    cos_sim = F.cosine_similarity(A.unsqueeze(0), B.unsqueeze(0))

    return cos_sim.item()  # Convert tensor result to a Python float

def get_nodes(question_embedding,clusters,contexts=contexts,top_k=10):
    """Rank the nodes of the given clusters by similarity to a question.

    Args:
        question_embedding: Embedding vector of the question.
        clusters: Iterable of cluster ids; each must be a key of `cluster_graph`.
        contexts: Mapping from `str(node)` to an object exposing `.embedding`.
            Defaults to the `contexts` object bound when this function was defined.
        top_k: Maximum number of node ids to return (default 10).

    Returns:
        Up to `top_k` node ids, ordered by decreasing cosine similarity.
    """
    # Gather every node of every selected cluster, preserving cluster order
    nodes = [node for cluster in clusters for node in cluster_graph[cluster].nodes]

    # Pair each node with its similarity to the question
    cosine_similarities = [
        (node, cosine_similarity_gpu(question_embedding, contexts[str(node)].embedding))
        for node in nodes
    ]

    # Stable sort: ties keep their original relative order
    cosine_similarities.sort(key=lambda pair: pair[1], reverse=True)

    return [node for node, _ in cosine_similarities[:top_k]]


In [None]:
# Minimum cosine similarity between a question and a cluster's sample
# embedding for that cluster to be searched.
THRESHOLD = 0.60

retrieved_contexts = []
for question in tqdm(hotpot_qa_df['question'], desc="Retrieving contexts", unit="question"):

    question_embedding = question.embedding

    max_cluster_similarities:List[int] = []   # ids of clusters above THRESHOLD
    max_similarity:float = -float('inf')
    max_similirity_node = None                # id of the single most similar cluster

    # `cluster_id` (not `id`) so the builtin `id` is not rebound in the notebook namespace
    for cluster_id, sample_embedding in cluster_node_embedding_sample.items():
        cosine_sim = cosine_similarity_gpu(question_embedding, sample_embedding)
        if cosine_sim > max_similarity:
            max_similarity = cosine_sim
            max_similirity_node = cluster_id
        if cosine_sim > THRESHOLD:
            max_cluster_similarities.append(cluster_id)

    # No cluster passed the threshold: fall back to the single best one
    if len(max_cluster_similarities) == 0:
        max_cluster_similarities.append(max_similirity_node)

    retrieved_contexts.append(get_nodes(question_embedding, max_cluster_similarities))

hotpot_qa_df["Cluster_Retrieval"] = retrieved_contexts

In [None]:
# Score the cluster-based retrieval results.
# NOTE: `evaluate_index` is defined in the FAISS section further down, so that
# cell has to be executed before this one.
evaluate_index(retrieved_contexts)

# ChromaDB Results

In [None]:
import chromadb
from tqdm import tqdm

# Split every context node into the parallel lists passed to `collection.add`.
context_nodes = list(contexts.values())
documents = [node.text for node in context_nodes]
ids = [node.id_ for node in context_nodes]
embeddings = [node.embedding for node in context_nodes]
metadatas = [{'caption': node.metadata['caption']} for node in context_nodes]

# Collection configured with cosine distance for its HNSW index.
chroma_client = chromadb.Client()
collection = chroma_client.create_collection(
    name="my_collection",
    metadata={"hnsw:space": "cosine"},
)
collection.add(
    documents=documents,
    ids=ids,
    embeddings=embeddings,
    metadatas=metadatas,
)

# Query the collection with each question's precomputed embedding and keep the ids of the 10 results.
retrieved_contexts = []
for question in tqdm(hotpot_qa_df['question'], desc="Retrieving contexts", unit="question"):

    question_embedding = question.embedding
    result = collection.query(
                                query_embeddings=[question_embedding], # precomputed embedding is passed in directly
                                n_results=10 # how many results to return
                              )
    # result["ids"][0] holds the ids returned for the single query; cast them to int
    retrieved_contexts.append([int(node) for node in result["ids"][0]] )

hotpot_qa_df["Vector_Similarity_Retrieval"] = retrieved_contexts


from utils.evaluation_metrics.retriever import RetrieverEvaluator

# Evaluate the Chroma results stored in this DataFrame column; the same name
# is used for the printed label so the two cannot drift apart.
retrieval_column = 'Vector_Similarity_Retrieval'
evaluator = RetrieverEvaluator(hotpot_qa_df, retrieval_column)

order_unaware_metrics = evaluator.get_order_unaware_metrics(k=10)
order_aware_metrics = evaluator.get_order_aware_metrics()

print(f"{retrieval_column}:")
print(f"order unaware metrics : {order_unaware_metrics}")
print(f"order aware metrics   : {order_aware_metrics}")

# FAISS

In [3]:
import faiss
from tqdm import tqdm
import numpy as np
import pandas as pd
from utils.evaluation_metrics.retriever import RetrieverEvaluator

# Embedding of every context node, in the iteration order of `contexts`.
context_embeddings = [text_node.embedding for text_node in contexts.values()]
    
def lists_to_arrays(list_of_lists):
    """Convert a sequence of equal-length vectors into a 2-D float32 matrix.

    FAISS indexes operate on C-contiguous float32 arrays, so the result is
    produced in exactly that layout instead of as an object-dtype array.

    Args:
        list_of_lists: Sequence of equal-length numeric sequences.

    Returns:
        A C-contiguous `numpy.ndarray` of dtype float32 with one row per vector.
    """
    return np.ascontiguousarray(list_of_lists, dtype=np.float32)

# Embedding of every question, in DataFrame row order.
question_nodes = hotpot_qa_df['question'].tolist()
query_embeddings = [question_node.embedding for question_node in question_nodes]

def retrieve_contexts(index, top_k=10):
    """Search `index` with every vector in `query_embeddings`.

    Args:
        index: A FAISS index whose dimensionality matches the query embeddings.
        top_k: Number of neighbours to request per query (default 10).

    Returns:
        One list per query holding the labels returned by `index.search`,
        in the order FAISS returned them.
    """
    retrieved_results = []
    for query_embedding in tqdm(query_embeddings):
        # FAISS expects a (1, d) float32 matrix for a single query
        query = np.asarray(query_embedding, dtype=np.float32).reshape(1, -1)
        _, neighbour_ids = index.search(query, top_k)
        retrieved_results.append(neighbour_ids[0].tolist())
    return retrieved_results

# Gold context ids per question; bound as the default of `evaluate_index` below.
actual_contexts = hotpot_qa_df['actual_contexts'].tolist()
def evaluate_index(retrieved_contexts,actual_contexts=actual_contexts):
    """Print order-unaware (k=10) and order-aware retrieval metrics.

    Args:
        retrieved_contexts: Per-question lists of retrieved context ids.
        actual_contexts: Per-question lists of gold context ids; defaults to
            the list taken from `hotpot_qa_df` when this function was defined.
    """
    columns = {
        'actual_contexts': actual_contexts,
        'retrieved_contexts': retrieved_contexts,
    }
    evaluator = RetrieverEvaluator(pd.DataFrame(columns), 'retrieved_contexts')

    unordered = evaluator.get_order_unaware_metrics(k=10)
    ordered = evaluator.get_order_aware_metrics()

    print(f"order unaware metrics : {unordered}")
    print(f"order aware metrics   : {ordered}")


## Flat Methods : IP & L2

### Inner Product (IP)

In [None]:
# Flat inner-product index over all context embeddings.
index = faiss.IndexFlatIP(len(query_embeddings[0]))
index.add(lists_to_arrays(context_embeddings))

# Store the results under their own name: rebinding `contexts` here would
# replace the loaded context mapping that other cells read via `contexts.values()`.
retrieved_contexts = retrieve_contexts(index)
evaluate_index(retrieved_contexts)

### Euclidean (L2)

In [None]:
from utils.evaluation_metrics.retriever import RetrieverEvaluator

# Flat L2 index over all context embeddings.
embedding_dim = len(query_embeddings[0])
index = faiss.IndexFlatL2(embedding_dim)

context_matrix = lists_to_arrays(context_embeddings)
index.add(context_matrix)

retrieved_contexts = retrieve_contexts(index)
evaluate_index(retrieved_contexts)

## Hierarchical Navigable Small World 

In [None]:
from utils.evaluation_metrics.retriever import RetrieverEvaluator

# HNSW parameters: links per node, search-time and build-time candidate list sizes.
max_neighbours = 16
ef_search = 10
ef_construction = 256

index = faiss.IndexHNSWFlat(len(query_embeddings[0]), max_neighbours)
index.hnsw.efSearch = ef_search
# Set before `add`, i.e. before the graph is built.
index.hnsw.efConstruction = ef_construction

index.add(lists_to_arrays(context_embeddings))

# Store the results under their own name: rebinding `contexts` here would
# replace the loaded context mapping that other cells read via `contexts.values()`.
retrieved_contexts = retrieve_contexts(index)
evaluate_index(retrieved_contexts)

# Hybrid Search

## BM25 then IndexFlat

In [None]:
import numpy as np
from rank_bm25 import BM25Okapi
from sentence_transformers import SentenceTransformer
import faiss

class HybridSearch:
    """Two-stage retriever: BM25 shortlists candidates, FAISS inner product re-ranks them.

    Attributes:
        documents: The context nodes passed in (each exposes `.text`,
            `.embedding` and `.id_`).
        bm25: `BM25Okapi` model over the whitespace-tokenised document texts.
        document_embeddings: List with the embedding of each document.
        index: Flat inner-product FAISS index over all document embeddings.
    """

    def __init__(self, documents):
        """Build the BM25 model and the dense index for `documents`."""
        self.documents = documents

        # BM25 initialization (whitespace tokenisation, same as in `search`)
        tokenized_corpus = [text_node.text.split(" ") for text_node in documents]
        self.bm25 = BM25Okapi(tokenized_corpus)

        self.document_embeddings = [text_node.embedding for text_node in documents]
        # float32 matrix built once, so each query only has to slice rows out of it
        self._embedding_matrix = np.ascontiguousarray(self.document_embeddings, dtype=np.float32)

        # FAISS initialization
        self.index = faiss.IndexFlatIP(self._embedding_matrix.shape[1])
        self.index.add(self._embedding_matrix)

    def search(self, query, top_n=10):
        """Return the ids of the `top_n` best documents for `query`.

        Args:
            query: Object exposing `.query_str` (text) and `.embedding` (vector).
            top_n: Number of document ids to return.

        Returns:
            List of at most `top_n` document ids (as ints), best match first.
        """
        if top_n <= 0:
            return []

        # BM25 search: keep the 5 * top_n highest-scoring documents as candidates
        bm25_scores = self.bm25.get_scores(query.query_str.split(" "))
        top_docs_indices = np.argsort(bm25_scores)[-top_n*5:]

        query_embedding = np.asarray(query.embedding, dtype=np.float32).reshape(1, -1)

        # FAISS search restricted to the BM25 candidates
        sub_index = faiss.IndexFlatIP(self._embedding_matrix.shape[1])
        sub_index.add(np.ascontiguousarray(self._embedding_matrix[top_docs_indices]))
        _, sub_dense_ranked_indices = sub_index.search(query_embedding, top_n)

        # Map FAISS positions back to original document indices. FAISS pads
        # with -1 when fewer than top_n candidates exist; skip those entries.
        final_ranked_indices = [top_docs_indices[i] for i in sub_dense_ranked_indices[0] if i >= 0]

        return [int(self.documents[i].id_) for i in final_ranked_indices]

def retrieve_contexts(hs):
    """Run the hybrid searcher over every question in `hotpot_qa_df`.

    Note: once this cell runs, this definition replaces the FAISS helper of
    the same name that takes an index.

    Args:
        hs: Object with a `search(query, top_n)` method, e.g. `HybridSearch`.

    Returns:
        One list of retrieved document ids per question, in DataFrame order,
        so the result lines up with the gold contexts used by `evaluate_index`.
    """
    questions = hotpot_qa_df["question"].tolist()
    # Every question is searched; a truncated run cannot be paired with the
    # full list of gold contexts during evaluation.
    return [hs.search(query, top_n=10) for query in tqdm(questions)]

# Build the hybrid retriever over every context node, run it over the questions
# and score the result. `contexts` must be a mapping here, since `.values()` is called on it.
hs = HybridSearch(list(contexts.values()))
results = retrieve_contexts(hs)
evaluate_index(results)

In [None]:
from baguetter.indices import FaissDenseIndex,BMXSparseIndex,MultiIndex
import pickle

def load_data(model_name, subset_size=3000):
    """Load the pickled question embeddings and contexts for one model.

    Args:
        model_name: Model identifier used as the directory name under `embeddings/`.
        subset_size: Name of the subset directory under `hard/` (default 3000).

    Returns:
        A tuple `(question_embeddings, contexts)`: the `.embedding` of every
        entry in the DataFrame's "question" column, and the unpickled contexts object.
    """
    base_dir = f'embeddings/{model_name}/hard/{subset_size}'

    # NOTE: pickle.load can execute arbitrary code; only load files you trust.
    with open(f'{base_dir}/df.pkl', 'rb') as file:
        hotpot_qa_df = pickle.load(file)
    with open(f'{base_dir}/contexts.pkl', 'rb') as file:
        contexts = pickle.load(file)

    questions = hotpot_qa_df["question"].tolist()
    return [q.embedding for q in questions], contexts

dense_model = "dunzhang/stella_en_400M_v5"
dense_questions,dense_contexts = load_data(dense_model)

# Maps each position in `context_embeddings` to position + 1.
result = {i: i + 1 for i in range(len(context_embeddings))}

# Create an index
# NOTE(review): `index` is whichever FAISS index the last executed FAISS cell
# left behind, and all four arguments are passed positionally - confirm the
# order against the FaissDenseIndex constructor signature.
dense_index = FaissDenseIndex(index,"dense_index",len(query_embeddings[0]),result)

# Bound to a real name instead of `_`, which IPython uses for the last cell output.
dense_hits = dense_index.search(np.array(dense_questions[0]),top_k=10)
dense_hits

In [None]:
from baguetter.indices import *


# Text of every dense context node, in the mapping's iteration order.
context_str = [text_node.text for text_node in dense_contexts.values()]

# Sparse index named "BMX_Test", filled with the context texts.
sparse_index = BMXSparseIndex(index_name="BMX_Test")
sparse_index.add_many(keys=result, values=context_str, show_progress=True)

questions = hotpot_qa_df["question"].tolist()
question_str = [question_node.query_str for question_node in questions]

# Sanity check: search with the first question only.
x = sparse_index.search(question_str[0], top_k=10)
print(x)

In [None]:
# Only the sparse index is registered; the dense index line is left disabled.
multi_index = MultiIndex()

idx = multi_index.add_index(sparse_index)
#idx = multi_index.add_index(dense_index)

question_nodes = hotpot_qa_df["question"].tolist()
embeddings = [question_node.embedding for question_node in question_nodes]
question_strs = [question_node.query_str for question_node in question_nodes]

results = []
for question, embedding in zip(question_strs, embeddings):
    # The key matches the `index_name` given to `sparse_index`.
    query = {"BMX_Test": question}
    x = idx.search(query=query, top_k=10)
    res = x.keys[:10]
    results.append(res)

In [None]:
# Score the key lists collected in `results` by the MultiIndex search loop.
evaluate_index(results)

In [None]:
# Recorded output of an earlier evaluation run, kept as comments so this cell stays executable:
# order unaware metrics : {'avg precision@10': 0.8364, 'avg recall@10': 0.8388, 'avg F1@10': 0.8368000000000001}
# order aware metrics   : {'avg mrr': 0.33138762975560593, 'avg ndcg': 0.877741837170104, 'mean avg precision': 0.8128007936507937}


# SPLADE then IndexFlat

In [4]:
import pickle

def load_data(model_name, subset_size=500):
    """Load the pickled question embeddings and contexts for one model.

    Args:
        model_name: Model identifier used as the directory name under `embeddings/`.
        subset_size: Name of the subset directory under `hard/` (default 500).

    Returns:
        A tuple `(question_embeddings, contexts)`: the `.embedding` of every
        entry in the DataFrame's "question" column, and the unpickled contexts object.
    """
    base_dir = f'embeddings/{model_name}/hard/{subset_size}'

    # NOTE: pickle.load can execute arbitrary code; only load files you trust.
    with open(f'{base_dir}/df.pkl', 'rb') as file:
        hotpot_qa_df = pickle.load(file)
    with open(f'{base_dir}/contexts.pkl', 'rb') as file:
        contexts = pickle.load(file)

    questions = hotpot_qa_df["question"].tolist()
    return [q.embedding for q in questions], contexts

# Question embeddings and contexts for the dense model, read via `load_data`.
dense_model = "dunzhang/stella_en_400M_v5"
dense_questions,dense_contexts = load_data(dense_model)

# Same for the SPLADE model; `load_data` returns whatever each question's
# `.embedding` attribute holds for this model.
sparse_model = "naver/splade-v3"
sparse_questions,sparse_contexts = load_data(sparse_model)