In [135]:
import pickle

model = "dunzhang/stella_en_400M_v5"
number = 5000
difficulty = "hard"

with open(f'embeddings/{model}/{difficulty}/{number}/df.pkl', 'rb') as file: hotpot_qa_df = pickle.load(file)
with open(f'embeddings/{model}/{difficulty}/{number}/contexts.pkl', 'rb') as file: contexts = pickle.load(file)

hotpot_qa_df['actual_contexts'] = hotpot_qa_df['actual_contexts'].apply(lambda x: [int(i) for i in x])

query_embeddings = [query.embedding for query in hotpot_qa_df['question'].tolist()]
query_strs = [query.query_str for query in hotpot_qa_df['question'].tolist()]

context_texts = [textnode.text for textnode in contexts.values()]
context_embeddings = [textnode.embedding for textnode in contexts.values()]

benchmark_contexts = hotpot_qa_df['actual_contexts'].tolist()

hotpot_qa_df.head()

Unnamed: 0,level,question,answer,actual_contexts
0,hard,"George Boscawen, 9th Viscount Falmouth is a fo...","the Guards Division, Foot Guards regiments","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]"
1,hard,When Vladimir Kashpur portrayed Baba Yaga she ...,trio of sisters,"[10, 11, 12, 13, 14, 15, 16, 17, 18, 19]"
2,hard,Which musician has a solo punk rock project: T...,"Frank Anthony Iero, Jr.","[20, 21, 22, 23, 24, 25, 26, 27, 28, 29]"
3,hard,A Disney voice actor has won which Emmy award?,Outstanding Supporting Actor,"[30, 31, 32, 33, 34, 35, 36, 37, 38, 39]"
4,hard,Which north-western suburb of Adelaide lies wi...,Birkenhead,"[40, 41, 42, 43, 44, 45, 46, 47, 48, 49]"


In [136]:
from beir.retrieval.evaluation import EvaluateRetrieval
import numpy as np
from tqdm import tqdm

def lists_to_arrays(list_of_lists):
    return np.array([np.array(lst) for lst in list_of_lists], dtype=object)

def retrieve_contexts(index):
    retrieved_results = [] 
    for query_embedding in tqdm(query_embeddings):
        D, I = index.search(np.array(query_embedding).reshape(1, -1), 10)
        results = I[0].tolist()
        retrieved_results.append(results)
    return retrieved_results

def beir_evaluation(retrieved_contexts,benchmark_contexts = benchmark_contexts):
    actual_contexts_dict = {
        str(i): {str(doc_id): 1 for doc_id in context} for i, context in enumerate(retrieved_contexts)
    }
    results_dict = {
        str(i): {str(doc_id): rank + 1 for rank, doc_id in enumerate(result)} for i, result in enumerate(benchmark_contexts)
    }

    ndcg, map_score, recall, precision = EvaluateRetrieval.evaluate(
        actual_contexts_dict, results_dict, k_values=[10]
    )

    print("recall:", recall)
    print("precision:", precision)
    #print("acc:", EvaluateRetrieval.evaluate_custom(actual_contexts_dict, results_dict, [10], metric="acc"))

    print()

    print("ndcg:", ndcg)
    print("map:", map_score)
    print("mrr:", EvaluateRetrieval.evaluate_custom(actual_contexts_dict, results_dict, [10], metric="mrr"))

# Flat Methods

## Inner Product (IP)

In [3]:
import faiss

index = faiss.IndexFlatIP(len(query_embeddings[0]))
index.add(lists_to_arrays(context_embeddings))

retrieved_contexts = retrieve_contexts(index)

beir_evaluation(retrieved_contexts)

100%|██████████| 5000/5000 [00:48<00:00, 103.94it/s]


recall: {'Recall@10': 0.38146}
precision: {'P@10': 0.38146}

ndcg: {'NDCG@10': 0.38303}
map: {'MAP@10': 0.22887}
mrr: {'MRR@10': 0.59057}


## Euclidean Distance

In [4]:
index_l2 = faiss.IndexFlatL2(len(query_embeddings[0]))
index_l2.add(lists_to_arrays(context_embeddings))

retrieved_contexts = retrieve_contexts(index_l2)

beir_evaluation(retrieved_contexts)

100%|██████████| 5000/5000 [00:49<00:00, 101.34it/s]


recall: {'Recall@10': 0.38436}
precision: {'P@10': 0.38436}

ndcg: {'NDCG@10': 0.38624}
map: {'MAP@10': 0.23262}
mrr: {'MRR@10': 0.59279}


# Hierarchical Navigable Small World 

In [5]:
max_neighbours = 10 # maximum number of neighbour connections a vector can have
search_ef = 1000 # number of neighbours in the HNSW graph to explore when searching.
search_ef = 500 # number of neighbours in the HNSW graph to explore when adding new vectors. 

index = faiss.IndexHNSWFlat(len(query_embeddings[0]), max_neighbours)
index.hnsw.efSearch = search_ef
index.hnsw.efConstruction = search_ef

index.add(lists_to_arrays(context_embeddings))

retrieved_contexts = retrieve_contexts(index)
beir_evaluation(retrieved_contexts)

100%|██████████| 5000/5000 [00:08<00:00, 620.83it/s]

recall: {'Recall@10': 0.38414}
precision: {'P@10': 0.38414}

ndcg: {'NDCG@10': 0.38595}
map: {'MAP@10': 0.23256}
mrr: {'MRR@10': 0.59194}





# Inverted File System

In [6]:
nlist = 5000  # how many voronoi cells
nprobe = 50 # how many nearby voronoi cells to search

quantizer = faiss.IndexFlatIP(len(query_embeddings[0]))
index_ivf = faiss.IndexIVFFlat(quantizer, len(query_embeddings[0]), nlist)
index_ivf.train(lists_to_arrays(context_embeddings))
index_ivf.add(lists_to_arrays(context_embeddings))
index_ivf.nprobe = nprobe

retrieved_contexts = retrieve_contexts(index_ivf)
beir_evaluation(retrieved_contexts)

100%|██████████| 5000/5000 [00:05<00:00, 921.09it/s]


recall: {'Recall@10': 0.3712}
precision: {'P@10': 0.3712}

ndcg: {'NDCG@10': 0.37269}
map: {'MAP@10': 0.22263}
mrr: {'MRR@10': 0.57608}


# Hybrid Retrievers

## Lexical (BMX)

In [10]:
from baguetter.indices import *

index_lexical =  BM25SparseIndex(index_name="BMX_Test")
keys = {i: i + 1 for i in range(len(context_embeddings))}
index_lexical.add_many(keys=keys,values=context_texts,show_progress=True)

retrieved_contexts = []
scores = []
for question,embedding in zip(query_strs,query_embeddings):
    x = index_lexical.search(query=question,top_k=15)
    retrieved_contexts.append(x.keys)
    scores.append(x.scores)

beir_evaluation(retrieved_contexts)

recall: {'Recall@10': 0.49952}
precision: {'P@10': 0.74928}

ndcg: {'NDCG@10': 0.75008}
map: {'MAP@10': 0.42671}
mrr: {'MRR@10': 0.85524}


In [56]:
from langchain_huggingface import HuggingFaceEmbeddings

model_name = "dunzhang/stella_en_400M_v5"
model_kwargs = {'device': 'cuda',
                'trust_remote_code':True,}
hf = HuggingFaceEmbeddings(
    model_name=f"D:/Users/nikhi/hugging_face_embedding_models/{model_name}",
    model_kwargs=model_kwargs,
)

Traceback (most recent call last):
  File "d:\Users\nikhi\anaconda3\envs\masters\Lib\site-packages\xformers\__init__.py", line 57, in _is_triton_available
    import triton  # noqa
    ^^^^^^^^^^^^^
ModuleNotFoundError: No module named 'triton'


In [129]:
from langchain.vectorstores import FAISS
from langchain_community.retrievers import BM25Retriever
from langchain.retrievers import EnsembleRetriever

metadatas = [{"id":i} for i in range(len(context_texts))]
bm25_retriever = BM25Retriever.from_texts(
    context_texts, metadatas=metadatas
)
bm25_retriever.k = 10

kwargs = {"embedding_function":hf}
text_embedding_pairs = zip(context_texts, context_embeddings)
faiss_vector_store = FAISS.from_embeddings(text_embedding_pairs,context_embeddings)
faiss_vector_store.embedding_function = hf
faiss_retriever = faiss_vector_store.as_retriever(search_kwargs={"k": 10})




In [132]:
ensemble_retriever = EnsembleRetriever(
    retrievers=[bm25_retriever,faiss_retriever], 
    weights=[0.5, 0.5],
)

In [131]:
from tqdm import tqdm

def ensemble_retrieval(question):
    x = ensemble_retriever.invoke(question)
    retrieved_contexts = []

    for _ in x:
        retireved = []
        try:
            id = _.metadata["id"]
            retrieved_contexts.append(id)
        except Exception:
            continue

    return retrieved_contexts


contexts = []
for query in tqdm(query_strs, desc="Processing queries"):
    result = ensemble_retrieval(query)
    contexts.append(result)

beir_evaluation(contexts)

Processing queries: 100%|██████████| 5000/5000 [36:15<00:00,  2.30it/s]

recall: {'Recall@10': 0.48571}
precision: {'P@10': 0.4435}

ndcg: {'NDCG@10': 0.46943}
map: {'MAP@10': 0.31503}
mrr: {'MRR@10': 0.63537}





In [125]:
#0.4 0. 6 

recall: {'Recall@10': 0.48571}
precision: {'P@10': 0.4435}

ndcg: {'NDCG@10': 0.46943}
map: {'MAP@10': 0.31503}
mrr: {'MRR@10': 0.63537}

#0.3, 0.7
recall: {'Recall@10': 0.48571}
precision: {'P@10': 0.4435}

ndcg: {'NDCG@10': 0.46943}
map: {'MAP@10': 0.31503}
mrr: {'MRR@10': 0.63537}

#05 0.5

recall: {'Recall@10': 0.48571}
precision: {'P@10': 0.4435}

ndcg: {'NDCG@10': 0.46943}
map: {'MAP@10': 0.31503}
mrr: {'MRR@10': 0.63537}


## Hybrid

In [141]:
import faiss
import concurrent.futures
from tqdm import tqdm

class HybridSearch:
    def __init__(self, lexical_index,contexts_dict=contexts):
        self.lexical_index = lexical_index
        self.contexts_dict = contexts_dict

    def search(self, query, top_n):

        lexical_retrived_docs = self.lexical_index.search(query=query.query_str,top_k=top_n*1)
        lexical_retrieved_embeddings = [self.contexts_dict[str(i)].embedding for i in list(lexical_retrived_docs.keys)]
    
        semantic_index = faiss.IndexFlatIP(len(query.embedding))
        semantic_index.add(lists_to_arrays(lexical_retrieved_embeddings))

        D, I = semantic_index.search(np.array(query.embedding).reshape(1, -1), top_n)
        semantic_retrieved_docs = I[0].tolist()
        _ = [lexical_retrived_docs.keys[i] for i in semantic_retrieved_docs]

        return _
    
def retrieve_contexts_parallel(hs):
    questions = hotpot_qa_df["question"].tolist()
    retrieved_results = [None] * len(questions)
    
    def fetch_result(idx, query):
        return idx, hs.search(query, top_n=10)

    with concurrent.futures.ThreadPoolExecutor() as executor:
        futures = [executor.submit(fetch_result, idx, query) for idx, query in enumerate(questions)]
        for future in tqdm(concurrent.futures.as_completed(futures), total=len(questions)):
            idx, result = future.result()
            retrieved_results[idx] = result

    return retrieved_results

def retrieve_contexts(hs):
    retrieved_results = [] 
    questions = hotpot_qa_df["question"].tolist()
    for query in tqdm(questions):
        retrieved_results.append(hs.search(query, top_n=10))
        
    return retrieved_results


In [142]:
hs = HybridSearch(index_lexical)
results = retrieve_contexts(hs)
beir_evaluation(results)

100%|██████████| 5000/5000 [00:07<00:00, 682.43it/s]


recall: {'Recall@10': 0.64222}
precision: {'P@10': 0.64222}

ndcg: {'NDCG@10': 0.64413}
map: {'MAP@10': 0.50007}
mrr: {'MRR@10': 0.7928}


In [None]:
recall: {'Recall@10': 0.64222}
precision: {'P@10': 0.64222}

ndcg: {'NDCG@10': 0.64413}
map: {'MAP@10': 0.50007}
mrr: {'MRR@10': 0.7928}