In [1]:
import os
import numpy
from utils.hotpot_data_loader import load_test_data

model_name="mixedbread-ai/mxbai-embed-large-v1"

query_embeddings = numpy.load(f"test/{model_name}/query_embeddings.npy")
context_embeddings = numpy.load(f"test/{model_name}/context_embeddings.npy")

questions,contexts,benchmarks = load_test_data()

# Retriever Evaluation

In [None]:

from utils.evaluation_metrics.retriever import RetrieverEvaluator
import numpy as np
import pickle
from tqdm import tqdm

def retrieve_contexts(index):
    retrieved_results = [] 
    for query_embedding in tqdm(query_embeddings):
        #D, I = index.search(np.array(query_embedding).reshape(1, -1), 10)
        D, I = index.search(query_embedding.reshape(1, -1), 10)
        results = I[0].tolist()
        retrieved_results.append(results)
    return retrieved_results



save_dir = f"{f"test_results/{model_name}/"}"
if not os.path.exists(save_dir):os.makedirs(save_dir)


Recall@10   : 1.00
Precision@10: 1.00

NDCG@10     : 1.00
MAP@10      : 1.00
MRR@10      : 1.00


# Flat Methods

## Inner Product (IP)

In [4]:
import faiss

index = faiss.IndexFlatIP(len(query_embeddings[0]))
index.add(context_embeddings)

retrieved_contexts = retrieve_contexts(index)

with open(f"{save_dir}/index_flat_ip.pkl", 'wb') as file: pickle.dump(retrieved_contexts, file)

100%|██████████| 7405/7405 [01:37<00:00, 75.87it/s]


## Euclidean Distance

In [5]:
index_l2 = faiss.IndexFlatL2(len(query_embeddings[0]))
index_l2.add(context_embeddings)

retrieved_contexts = retrieve_contexts(index_l2)

with open(f"{save_dir}/index_flat_l2.pkl", 'wb') as file: pickle.dump(retrieved_contexts, file)

100%|██████████| 7405/7405 [01:43<00:00, 71.87it/s]


# Hierarchical Navigable Small World 

In [7]:
import faiss

max_neighbours = 10 # maximum number of neighbour connections a vector can have
search_ef = 1000 # number of neighbours in the HNSW graph to explore when searching.
search_ef = 500 # number of neighbours in the HNSW graph to explore when adding new vectors. 

index = faiss.IndexHNSWFlat(len(query_embeddings[0]), max_neighbours)
index.hnsw.efSearch = search_ef
index.hnsw.efConstruction = search_ef

index.add(context_embeddings)

retrieved_contexts = retrieve_contexts(index)
with open(f"{save_dir}/index_hnsw.pkl", 'wb') as file: pickle.dump(retrieved_contexts, file)

100%|██████████| 7405/7405 [00:11<00:00, 617.75it/s]


# Inverted File System

In [8]:
nlist = 5000  # how many voronoi cells
nprobe = 50 # how many nearby voronoi cells to search

quantizer = faiss.IndexFlatIP(len(query_embeddings[0]))
index_ivf = faiss.IndexIVFFlat(quantizer, len(query_embeddings[0]), nlist)
index_ivf.train(context_embeddings)
index_ivf.add(context_embeddings)
index_ivf.nprobe = nprobe

retrieved_contexts = retrieve_contexts(index_ivf)
with open(f"{save_dir}/index_ivf.pkl", 'wb') as file: pickle.dump(retrieved_contexts, file)

100%|██████████| 7405/7405 [00:07<00:00, 960.01it/s] 


# Lexical (BMX)

In [9]:
from baguetter.indices import *

index_lexical =  BMXSparseIndex(index_name="BMX_Test")
keys = {i: i + 1 for i in range(len(context_embeddings))}
index_lexical.add_many(keys=keys,
                       values=[f"Title: {doc.metadata["Title"]}\nExtract: {doc.page_content}" for doc in contexts],
                       show_progress=True)

retrieved_contexts = []

for question in tqdm(questions):
    x = index_lexical.search(query=question,top_k=10*2)

    keys = x.keys[:10]
    retrieved_contexts.append(x.keys)

with open(f"{save_dir}/index_bmx.pkl", 'wb') as file: pickle.dump(retrieved_contexts, file)

Tokenization: 100%|██████████| 73774/73774 [00:17<00:00, 4220.13it/s]
Building doc-term matrix: 100%|██████████| 73774/73774 [00:01<00:00, 43249.97it/s]
Building inverted index: 100%|██████████| 147615/147615 [00:24<00:00, 6088.05it/s]
100%|██████████| 7405/7405 [00:09<00:00, 744.95it/s]


# Ensemble

In [None]:
from langchain_huggingface import HuggingFaceEmbeddings

model_name = "WhereIsAI/UAE-Large-V1"
model_kwargs = {'device': 'cuda',
                'trust_remote_code':True,}
hf = HuggingFaceEmbeddings(
    model_name=f"D:/Users/nikhi/hugging_face_embedding_models/{model_name}",
    model_kwargs=model_kwargs,
)

In [None]:
from langchain.vectorstores import FAISS
from langchain_community.retrievers import BM25Retriever
from langchain.retrievers import EnsembleRetriever

contexts = [f"Title: {doc.metadata["Title"]}\nExtract: {doc.page_content}" for doc in contexts]
metadatas = [{"id":i} for i in range(len(contexts))]
bm25_retriever = BM25Retriever.from_texts(
    contexts, metadatas=metadatas
)
bm25_retriever.k = 10

kwargs = {"embedding_function":hf}
text_embedding_pairs = zip(contexts, context_embeddings)
faiss_vector_store = FAISS.from_embeddings(text_embedding_pairs,context_embeddings)
faiss_vector_store.embedding_function = hf
faiss_retriever = faiss_vector_store.as_retriever(search_kwargs={"k": 10})


In [None]:
from langchain_core.documents import Document
from langchain_core.runnables import chain

ensemble_retriever = EnsembleRetriever(
    retrievers=[bm25_retriever,faiss_retriever], 
    weights=[0.5, 0.5],
)

@chain
def search_by_vector(vector: list[float]) -> list[Document]:
    return ensemble_retriever.similarity_search_by_vector(vector)

x = search_by_vector.invoke(query_embeddings[0])

In [None]:
from tqdm import tqdm

def ensemble_retrieval(question):
    x = ensemble_retriever.invoke(question)
    retrieved_contexts = []

    for _ in x:
        retireved = []
        try:
            id = _.metadata["id"]
            retrieved_contexts.append(id)
        except Exception:
            continue

    return retrieved_contexts


contexts = []
for query in tqdm(questions, desc="Processing queries"):
    result = ensemble_retrieval(query)
    contexts.append(result)

beir_evaluation(contexts)

# Hybrid

In [10]:
import faiss
import numpy
from tqdm import tqdm
from baguetter.indices import *

query_embeddings = numpy.load(f"test/{model_name}/query_embeddings.npy")
context_embeddings = numpy.load(f"test/{model_name}/context_embeddings.npy")

questions,contexts,benchmarks = load_test_data()

class HybridSearch:
    def __init__(self, 
                 top_n,
                 contexts=contexts,
                 context_embeddings=context_embeddings,
                 questions=questions,
                 query_embeddings=query_embeddings):
        
        self.top_n = top_n
        self.contexts = contexts
        self.context_embeddings = context_embeddings
        self.questions = questions
        self.query_embeddings = query_embeddings
        
    def create_lexical_index(self,contexts):

        index_lexical =  BMXSparseIndex(index_name="BMX_Test")
        keys = {i: i + 1 for i in range(len(contexts))}
        index_lexical.add_many(keys=keys,
                            values=[f"Title: {doc.metadata["Title"]}\nExtract: {doc.page_content}" for doc in contexts],
                            show_progress=True)
        
        return index_lexical

    def create_semantic_index(self,context_embeddings):
        
        semantic_index = faiss.IndexFlatIP(len(context_embeddings[0]))
        semantic_index.add(context_embeddings)

        return semantic_index
        
    def lexical_then_semantic(self,query_num:int):

        lexical_index = self.lexical_index
        lexical_retrived_docs = lexical_index.search(query=self.questions[query_num],top_k=self.top_n*2)
        lexical_retrieved_embeddings = numpy.array([self.context_embeddings[i] for i in list(lexical_retrived_docs.keys)])

        semantic_index = self.create_semantic_index(lexical_retrieved_embeddings)
        D, I = semantic_index.search(self.query_embeddings[query_num].reshape(1, -1), self.top_n)
        semantic_retrieved_docs = I[0].tolist()
        _ = [lexical_retrived_docs.keys[i] for i in semantic_retrieved_docs]

        return _
        
    def retrieve_contexts(self):

        self.lexical_index = self.create_lexical_index(self.contexts)
        self.semantic_index = self.create_semantic_index(self.context_embeddings)
        print(f"semantic and lexical indices created\n")

        retrieved_results = [] 
        for query_num in tqdm(range(len(questions))):
            retrieved_results.append(self.lexical_then_semantic(query_num))
            
        return retrieved_results

hs = HybridSearch(top_n=10)
results = hs.retrieve_contexts()
with open(f"{save_dir}/index_hybrid.pkl", 'wb') as file: pickle.dump(retrieved_contexts, file)

Tokenization: 100%|██████████| 73774/73774 [00:16<00:00, 4494.27it/s] 
Building doc-term matrix: 100%|██████████| 73774/73774 [00:01<00:00, 46154.01it/s]
Building inverted index: 100%|██████████| 147615/147615 [00:23<00:00, 6159.63it/s]


semantic and lexical indices created



100%|██████████| 7405/7405 [00:09<00:00, 806.40it/s]
