In [1]:
import pickle

model = "dunzhang/stella_en_400M_v5"

with open(f'clustering/graph/connected_graph.pkl', 'rb') as file: connected_graph = pickle.load(file)
with open(f'clustering/graph/cluster_graph.pkl', 'rb') as file: cluster_graph = pickle.load(file)
with open(f'embeddings/{model}/hard/3000/df.pkl', 'rb') as file: hotpot_qa_df = pickle.load(file)
with open(f'embeddings/{model}/hard/3000/contexts.pkl', 'rb') as file: contexts = pickle.load(file)

hotpot_qa_df['actual_contexts'] = hotpot_qa_df['actual_contexts'].apply(lambda x: [int(i) for i in x])
hotpot_qa_df.head()

Unnamed: 0,level,question,answer,actual_contexts
0,hard,"George Boscawen, 9th Viscount Falmouth is a fo...","the Guards Division, Foot Guards regiments","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]"
1,hard,When Vladimir Kashpur portrayed Baba Yaga she ...,trio of sisters,"[10, 11, 12, 13, 14, 15, 16, 17, 18, 19]"
2,hard,Which musician has a solo punk rock project: T...,"Frank Anthony Iero, Jr.","[20, 21, 22, 23, 24, 25, 26, 27, 28, 29]"
3,hard,A Disney voice actor has won which Emmy award?,Outstanding Supporting Actor,"[30, 31, 32, 33, 34, 35, 36, 37, 38, 39]"
4,hard,Which north-western suburb of Adelaide lies wi...,Birkenhead,"[40, 41, 42, 43, 44, 45, 46, 47, 48, 49]"


In [2]:
from typing import Dict,List
import networkx
import random

cluster_node_embedding_sample:Dict[int,List[float]] = {}
for cluster_id in cluster_graph.keys():
    graph:networkx.Graph = cluster_graph[cluster_id]
    node:int = list(graph.nodes)[0]#random.choice(list(graph.nodes)) # select a random node from the cluster
    cluster_node_embedding_sample[cluster_id] = contexts[str(node)].embedding

start_node = 0

# A* Traversal

In [26]:
from tqdm import tqdm
import torch
import torch.nn.functional as F

def cosine_similarity_gpu(list_A, list_B):
    # Convert lists to tensors and move to GPU
    A = torch.tensor(list_A, dtype=torch.float32).cuda()
    B = torch.tensor(list_B, dtype=torch.float32).cuda()

    # Compute cosine similarity on GPU
    cos_sim = F.cosine_similarity(A.unsqueeze(0), B.unsqueeze(0))

    return cos_sim.item()  # Convert tensor result to a Python float

def get_nodes(question_embedding,clusters,contexts=contexts):
    cosine_similarities = []
    nodes = []
    for cluster in clusters:
        for node in list(cluster_graph[cluster].nodes):
            nodes.append(node)

    # Calculate cosine similarities and store them with node identifiers
    for node in nodes:
        node_embedding = contexts[str(node)].embedding
        cosine_sim = cosine_similarity_gpu(question_embedding, node_embedding)
        cosine_similarities.append((node, cosine_sim))  # Store as (node, similarity)

    # Sort the nodes based on the highest similarity
    sorted_nodes = sorted(cosine_similarities, key=lambda x: x[1], reverse=True)

    # Extract sorted node identifiers and their similarities
    sorted_node_ids = [node for node, sim in sorted_nodes]
    sorted_similarities = [sim for node, sim in sorted_nodes]

    return sorted_node_ids[:10]


In [None]:
THRESHOLD = 0.60

retrieved_contexts = []
for question in tqdm(hotpot_qa_df['question'], desc="Retrieving contexts", unit="question"):

    question_embedding = question.embedding

    max_cluster_similarities:List[int] = []
    max_similarity:float = -float('inf')
    max_similirity_node:int = None

    for id,sample_embedding in zip(cluster_node_embedding_sample.keys(),cluster_node_embedding_sample.values()):
        cosine_sim = cosine_similarity_gpu(question_embedding,sample_embedding)
        if cosine_sim > max_similarity: 
            max_similarity = cosine_sim
            max_similirity_node = id
        if cosine_sim > THRESHOLD:max_cluster_similarities.append(id)

    if len(max_cluster_similarities) == 0: 
        max_cluster_similarities.append(max_similirity_node)
    
    retrieved_contexts.append(get_nodes(question_embedding,max_cluster_similarities))

hotpot_qa_df["Cluster_Retrieval"] = retrieved_contexts

In [None]:
evaluate_index(retrieved_contexts)

# ChromaDB Results

In [2]:
import chromadb
from tqdm import tqdm

documents = []
ids = []
embeddings = []
metadatas = []

for context in contexts.values():
    documents.append(context.text)
    ids.append(context.id_)
    embeddings.append(context.embedding)
    metadatas.append({'caption': context.metadata['caption']})

chroma_client = chromadb.Client()
collection = chroma_client.create_collection(name="my_collection",metadata={"hnsw:space": "cosine"})
collection.add(documents=documents,
               ids=ids,
               embeddings=embeddings,
               metadatas=metadatas)

retrieved_contexts = []
for question in tqdm(hotpot_qa_df['question'], desc="Retrieving contexts", unit="question"):

    question_embedding = question.embedding
    result = collection.query(
                                query_embeddings=[question_embedding], # Chroma will embed this for you
                                n_results=10 # how many results to return
                              )
    retrieved_contexts.append([int(node) for node in result["ids"][0]] )

hotpot_qa_df["Vector_Similarity_Retrieval"] = retrieved_contexts


from utils.evaluation_metrics.retriever import RetrieverEvaluator

evaluator = RetrieverEvaluator(hotpot_qa_df,'Vector_Similarity_Retrieval')

order_unaware_metrics = evaluator.get_order_unaware_metrics(k=10) 
order_aware_metrics = evaluator.get_order_aware_metrics() 

print(f"Cluster_Retrieval:")
print(f"order unaware metrics : {order_unaware_metrics}")
print(f"order aware metrics   : {order_aware_metrics}")

Retrieving contexts: 100%|██████████| 3000/3000 [00:05<00:00, 583.86question/s]


Cluster_Retrieval:
order unaware metrics : {'avg precision@10': 0.41919999999999996, 'avg recall@10': 0.42191333333333336, 'avg F1@10': 0.4198677938994976}
order aware metrics   : {'avg mrr': 0.47018115866717053, 'avg ndcg': 0.5143042362129897, 'mean avg precision': 0.3659910714285714}


# FAISS

In [2]:
import faiss
from tqdm import tqdm
import numpy as np
import pandas as pd
from utils.evaluation_metrics.retriever import RetrieverEvaluator

context_embeddings = []

for text_node in contexts.values():
    context_embeddings.append(text_node.embedding)
    
def lists_to_arrays(list_of_lists):
    return np.array([np.array(lst) for lst in list_of_lists], dtype=object)

query_embeddings = [query.embedding for query in hotpot_qa_df['question'].tolist()]

def retrieve_contexts(index):
    retrieved_results = [] 
    for query_embedding in tqdm(query_embeddings):
        D, I = index.search(np.array(query_embedding).reshape(1, -1), 10)
        results = I[0].tolist()
        retrieved_results.append(results)
    return retrieved_results

actual_contexts = hotpot_qa_df['actual_contexts'].tolist()
def evaluate_index(retrieved_contexts,actual_contexts=actual_contexts):
    df = pd.DataFrame({
                        'actual_contexts': actual_contexts,
                        'retrieved_contexts': retrieved_contexts
                    })
    evaluator = RetrieverEvaluator(df,'retrieved_contexts')
    order_unaware_metrics = evaluator.get_order_unaware_metrics(k=10) 
    order_aware_metrics = evaluator.get_order_aware_metrics() 

    print(f"order unaware metrics : {order_unaware_metrics}")
    print(f"order aware metrics   : {order_aware_metrics}")


## Flat Methods : IP & L2

### Inner Product (IP)

In [4]:
index = faiss.IndexFlatIP(len(query_embeddings[0]))
index.add(lists_to_arrays(context_embeddings))

contexts = retrieve_contexts(index)
evaluate_index(contexts)

100%|██████████| 3000/3000 [00:17<00:00, 175.01it/s]

order unaware metrics : {'avg precision@10': 0.4238, 'avg recall@10': 0.42674666666666666, 'avg F1@10': 0.4245090175062565}
order aware metrics   : {'avg mrr': 0.4852274370643319, 'avg ndcg': 0.525069685827108, 'mean avg precision': 0.3673632671957672}





### Euclidean (L2)

In [5]:
from utils.evaluation_metrics.retriever import RetrieverEvaluator

index = faiss.IndexFlatL2(len(query_embeddings[0]))
index.add(lists_to_arrays(context_embeddings))

retrieved_contexts = retrieve_contexts(index)
evaluate_index(retrieved_contexts)

100%|██████████| 3000/3000 [00:19<00:00, 156.29it/s]

order unaware metrics : {'avg precision@10': 0.4288, 'avg recall@10': 0.4316933333333333, 'avg F1@10': 0.42950981507991104}
order aware metrics   : {'avg mrr': 0.4875897411711598, 'avg ndcg': 0.5294633020383592, 'mean avg precision': 0.37254224867724867}





## Hierarchical Navigable Small World 

In [6]:
from utils.evaluation_metrics.retriever import RetrieverEvaluator

max_neighbours = 16
ef_search = 10
ef_construction = 256

index = faiss.IndexHNSWFlat(len(query_embeddings[0]), max_neighbours)
index.hnsw.efSearch = ef_search
index.hnsw.efConstruction = ef_construction

index.add(lists_to_arrays(context_embeddings))

contexts = retrieve_contexts(index)
evaluate_index(contexts)

100%|██████████| 3000/3000 [00:00<00:00, 6971.26it/s]


order unaware metrics : {'avg precision@10': 0.40723333333333334, 'avg recall@10': 0.40941666666666665, 'avg F1@10': 0.4077799617129026}
order aware metrics   : {'avg mrr': 0.472127469345763, 'avg ndcg': 0.5019419625831757, 'mean avg precision': 0.3533193896447468}


# Hybrid Search

## BM25 then IndexFlat

In [None]:
import numpy as np
from rank_bm25 import BM25Okapi
from sentence_transformers import SentenceTransformer
import faiss

class HybridSearch:
    def __init__(self, documents):
        self.documents = documents

        # BM25 initialization
        tokenized_corpus = [text_node.text.split(" ") for text_node in documents]
        self.bm25 = BM25Okapi(tokenized_corpus)

        self.document_embeddings = [text_node.embedding for text_node in documents]
        
        # FAISS initialization
        self.index = faiss.IndexFlatIP(len(self.document_embeddings[0]))
        self.index.add(lists_to_arrays(self.document_embeddings))

    def search(self, query, top_n=10):
        # BM25 search
        bm25_scores = self.bm25.get_scores(query.query_str.split(" "))
        top_docs_indices = np.argsort(bm25_scores)[-top_n*5:]
        print(top_docs_indices)
        print()
        
        # Get embeddings of top documents from BM25 search
        top_docs_embeddings = [self.document_embeddings[i] for i in top_docs_indices]

        query_embedding = np.array(query.embedding).reshape(1, -1)

        # FAISS search on the top documents
        sub_index = faiss.IndexFlatIP(len(self.document_embeddings[0]))
        sub_index.add(np.array(top_docs_embeddings))
        distances, sub_dense_ranked_indices = sub_index.search(np.array(query_embedding), top_n)

        # Map FAISS results back to original document indices
        final_ranked_indices = [top_docs_indices[i] for i in sub_dense_ranked_indices[0]]

        # Retrieve the actual documents
        ranked_docs = [int(self.documents[i].id_) for i in final_ranked_indices]
        print(ranked_docs)
        print()
        print()
        return ranked_docs

def retrieve_contexts(hs):
    retrieved_results = [] 
    questions = hotpot_qa_df["question"].tolist()
    for query in tqdm(questions[:2]):
        retrieved_results.append(hs.search(query, top_n=10))
    return retrieved_results

hs = HybridSearch(list(contexts.values()))
results = retrieve_contexts(hs)
evaluate_index(results)

In [6]:
from baguetter.indices import FaissDenseIndex,BMXSparseIndex,MultiIndex
import pickle

def load_data(model_name):
    with open(f'embeddings/{model_name}/hard/3000/df.pkl', 'rb') as file: hotpot_qa_df = pickle.load(file)
    with open(f'embeddings/{model_name}/hard/3000/contexts.pkl', 'rb') as file: contexts = pickle.load(file)

    questions = hotpot_qa_df["question"].tolist()


    return [q.embedding for q in questions],contexts

dense_model = "dunzhang/stella_en_400M_v5"
dense_questions,dense_contexts = load_data(dense_model)
result = {i: i + 1 for i in range(len(context_embeddings))}
# Create an index
dense_index = FaissDenseIndex(index,"dense_index",len(query_embeddings[0]),result)

_ = dense_index.search(np.array(dense_questions[0]),top_k=10)
_

SearchResults(keys=[1, 3, 7, 9, 11095, 24286, 10, 29819, 2771, 4689], scores=array([0.00727236, 0.00646719, 0.00515218, 0.00448899, 0.00323097,
       0.0031733 , 0.00310439, 0.00306923, 0.00300407, 0.00300362],
      dtype=float32), normalized=True)

In [17]:
from baguetter.indices import *


context_str = [x.text for x in list(dense_contexts.values())]
sparse_index = BM25SparseIndex(index_name="BMX_Test")
sparse_index.add_many(keys=result,values=context_str,show_progress=True)

questions = hotpot_qa_df["question"].tolist()
question_str = [q.query_str for q in questions]
x = sparse_index.search(question_str[0],top_k=10)
print(x)

Tokenization:   0%|          | 0/29874 [00:00<?, ?it/s]

Tokenization: 100%|██████████| 29874/29874 [00:10<00:00, 2985.82it/s]
Calculating Unique Tokens: 100%|██████████| 29874/29874 [00:00<00:00, 170039.58it/s]
Converting tokens to token IDs: 100%|██████████| 29874/29874 [00:01<00:00, 20010.72it/s]
Counting Tokens: 100%|██████████| 29874/29874 [00:00<00:00, 88972.03it/s]
Computing IDF: 100%|██████████| 85399/85399 [00:00<00:00, 1745102.98it/s]
Computing BM25 Scores: 100%|██████████| 29874/29874 [00:00<00:00, 38807.05it/s]


SearchResults(keys=[6, 0, 2, 8, 7, 4, 1, 3, 9, 5483], scores=array([23.43219 , 21.867594, 20.710741, 20.615828, 14.102193, 13.447546,
       13.258555, 11.743535, 11.357604,  9.480198], dtype=float32), normalized=False)


In [18]:
multi_index = MultiIndex()

idx = multi_index.add_index(sparse_index)
#idx = multi_index.add_index(dense_index)

embeddings = [q.embedding for q in hotpot_qa_df["question"].tolist()]
question_strs = [q.query_str for q in hotpot_qa_df["question"].tolist()]

results = []
for question,embedding in zip(question_strs,embeddings):
    embedding_np = np.array(embedding)
    query = {
             "BMX_Test":question#,"dense_index":embedding_np
             }
    x = idx.search(query=query,
                top_k=10)
    res = x.keys[:10]
    results.append(res)

In [19]:
results[0]

[6, 0, 2, 8, 7, 4, 1, 3, 9, 5483]

In [21]:
hotpot_qa_df.head()

Unnamed: 0,level,question,answer,actual_contexts
0,hard,"George Boscawen, 9th Viscount Falmouth is a fo...","the Guards Division, Foot Guards regiments","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]"
1,hard,When Vladimir Kashpur portrayed Baba Yaga she ...,trio of sisters,"[10, 11, 12, 13, 14, 15, 16, 17, 18, 19]"
2,hard,Which musician has a solo punk rock project: T...,"Frank Anthony Iero, Jr.","[20, 21, 22, 23, 24, 25, 26, 27, 28, 29]"
3,hard,A Disney voice actor has won which Emmy award?,Outstanding Supporting Actor,"[30, 31, 32, 33, 34, 35, 36, 37, 38, 39]"
4,hard,Which north-western suburb of Adelaide lies wi...,Birkenhead,"[40, 41, 42, 43, 44, 45, 46, 47, 48, 49]"


In [24]:
actual_contexts[0]

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

In [39]:
from beir.retrieval.evaluation import EvaluateRetrieval

actual_contexts_dict = {
    str(i): {str(doc_id): 1 for doc_id in context} for i, context in enumerate(actual_contexts)
}
results_dict = {
    str(i): {str(doc_id): rank + 1 for rank, doc_id in enumerate(result)} for i, result in enumerate(results)
}

ndcg, map_score, recall, precision = EvaluateRetrieval.evaluate(
    actual_contexts_dict, results_dict, k_values=[10]
)
print("ndcg:", ndcg)
print("map:", map_score)
print("mrr:", EvaluateRetrieval.evaluate_custom(actual_contexts_dict, results_dict, [5,10], metric="mrr"))

print()

print("recall:", recall)
print("precision:", precision)
print("acc:", EvaluateRetrieval.evaluate_custom(actual_contexts_dict, results_dict, [5,10], metric="acc"))


ndcg: {'NDCG@10': 0.64287}
map: {'MAP@10': 0.5064}
mrr: {'MRR@5': 0.64042, 'MRR@10': 0.64858}

recall: {'Recall@10': 0.70542}
precision: {'P@10': 0.7017}
acc: {'Accuracy@5': 0.94233, 'Accuracy@10': 0.99967}


In [37]:
evaluate_index(results)

order unaware metrics : {'avg precision@10': 0.7017333333333333, 'avg recall@10': 0.70545, 'avg F1@10': 0.7027293429726666}
order aware metrics   : {'avg mrr': 0.3558409853972453, 'avg ndcg': 0.7594046268482914, 'mean avg precision': 0.6463913189720333}


In [None]:
BMX Alone
order unaware metrics : {'avg precision@10': 0.6945666666666666, 'avg recall@10': 0.6983166666666666, 'avg F1@10': 0.6955756880019065}
order aware metrics   : {'avg mrr': 0.35843464600655073, 'avg ndcg': 0.7521629937272413, 'mean avg precision': 0.6377886370937264}

BM25 Alone  
order unaware metrics : {'avg precision@10': 0.7017666666666668, 'avg recall@10': 0.7054833333333332, 'avg F1@10': 0.7027626763060001}
order aware metrics   : {'avg mrr': 0.35583251795162507, 'avg ndcg': 0.7594267115894429, 'mean avg precision': 0.6464246523053666}

# SPLADE then IndexFlat

In [4]:
import pickle

def load_data(model_name):
    with open(f'embeddings/{model_name}/hard/500/df.pkl', 'rb') as file: hotpot_qa_df = pickle.load(file)
    with open(f'embeddings/{model_name}/hard/500/contexts.pkl', 'rb') as file: contexts = pickle.load(file)

    questions = hotpot_qa_df["question"].tolist()


    return [q.embedding for q in questions],contexts

dense_model = "dunzhang/stella_en_400M_v5"
dense_questions,dense_contexts = load_data(dense_model)

sparse_model = "naver/splade-v3"
sparse_questions,sparse_contexts = load_data(sparse_model)