# Load HotPotQA Data

In [2]:
import pandas as pd

# Read the HotpotQA JSON file and pull out the two columns used by the cells below.
df = pd.read_json("data/hotpot_test_fullwiki_v1.json")
questions, contexts = (df[column].tolist() for column in ("question", "context"))
df.head()

Unnamed: 0,_id,question,context
0,5adf9ba1554299025d62a2db,What position on the Billboard Top 100 did Ali...,"[[The Other Side of Love, [""The Other Side of ..."
1,5a7befac5542996dd594b857,What year did the British politician born in 1...,"[[Philip Cowley, [Philip Cowley is a British p..."
2,5ab80e565542991d32223821,"Which franchise was founded in 1978, Chuck E. ...","[[The Rock-afire Explosion, [The Rock-a-fire E..."
3,5ae09e595542993d6555ebc0,Roden Cutler House is owned by an electricity ...,"[[Askin–Cutler ministry (1965–68), [The Askin–..."
4,5ae6094555429929b0807a96,What year was the inspiration for the 2009 dra...,"[[Mick Fanning, [Michael Eugene ""Mick"" Fanning..."


In [3]:
from typing import List
from langchain_core.documents import Document

# Flatten every question's context into one corpus-wide list of Documents and
# remember, per question, which document IDs came from its own context.
context_docs: List[Document] = []
benchmarks: List[List[int]] = []
context_id: int = 0  # running ID, unique across the whole corpus

for paragraphs in contexts:
    first_id = context_id

    for paragraph in paragraphs:
        # paragraph[0] is used as the title, paragraph[1] as the list of sentences.
        # Join the sentences, then collapse every run of whitespace into one space.
        text = " ".join(" ".join(paragraph[1]).split())
        context_docs.append(
            Document(page_content=text, metadata={"ID": context_id, "Title": paragraph[0]})
        )
        context_id += 1

    # IDs are handed out consecutively, so this question owns [first_id, context_id).
    benchmarks.append(list(range(first_id, context_id)))

# Check machine configs

In [4]:
import torch

# Report the PyTorch build and, when a CUDA device is usable, its name.
print(torch.__version__)
print("CUDA Version: ", torch.version.cuda)
if torch.cuda.is_available():
    print("Device name:", torch.cuda.get_device_properties("cuda").name)
else:
    # Asking for device properties without a usable CUDA device raises;
    # report the situation instead of ending the cell with a traceback.
    print("Device name:", "no CUDA device available")
# NOTE(review): flash_sdp_enabled() reports whether the flash scaled-dot-product
# backend is enabled — confirm that is what "available" is meant to convey here.
print("FlashAttention available:", torch.backends.cuda.flash_sdp_enabled())

2.4.1+cu124
CUDA Version:  12.4
Device name: NVIDIA GeForce RTX 3050 Laptop GPU
FlashAttention available: True


# Select Embedding Model

In [5]:
import os

from sentence_transformers import SentenceTransformer
from utils.embedders.sentence_transformer_embedder import SentenceTransformerEmbedder

# SECURITY: a Hugging Face access token used to be hardcoded in this cell.
# That token must be treated as leaked and revoked. The token is now read from
# the HF_TOKEN environment variable and is None when the variable is unset.
HF_TOKEN = os.environ.get("HF_TOKEN")

# Machine-specific locations, named once so they are easy to find and change.
HF_CACHE_DIR = "D:/Users/nikhi/.cache/huggingface/hub/"
EMBEDDING_MODEL_DIR = "D:/Users/nikhi/hugging_face_embedding_models"

embedder = SentenceTransformerEmbedder(huggingface_token = HF_TOKEN,
                                       cache_loc         = HF_CACHE_DIR,
                                       model_save_loc    = EMBEDDING_MODEL_DIR)

model_name="mixedbread-ai/mxbai-embed-large-v1" #dunzhang/stella_en_400M_v5
#embedder.download_embedding_model(model_name=model_name)

# The model is loaded from the local save directory, not from the Hub.
# NOTE(review): trust_remote_code=True permits running code shipped with the
# model files — confirm it is still required for the selected model.
model = SentenceTransformer(model_name_or_path = f"{EMBEDDING_MODEL_DIR}/{model_name}",
                            trust_remote_code=True,
                            device="cuda",
                            model_kwargs={"attn_implementation": "eager"})


  from tqdm.autonotebook import tqdm, trange





## Embed queries

In [6]:
# Encode every question into an embedding, with a progress bar.
# prompt_name="query" selects the model's prompt registered under that name —
# presumably the retrieval-query instruction; verify against the model card.
query_embeddings = model.encode(questions,prompt_name="query",show_progress_bar=True)

Batches:   0%|          | 0/232 [00:00<?, ?it/s]

## Embed Actual Contexts

In [11]:
# Render each context document as "Title: ...\nExtract: ..." before embedding.
# The metadata key uses single quotes so the f-string also parses on Python
# versions older than 3.12, where reusing the enclosing quote character inside
# a replacement field is a SyntaxError.
docs = [f"Title: {doc.metadata['Title']}\nExtract: {doc.page_content}" for doc in context_docs]
doc_embeddings = model.encode(docs,show_progress_bar=True)
# Similarity of the document embeddings against themselves.
# NOTE(review): presumably an N x N result for N = len(docs) — confirm it fits
# in memory for the full corpus.
context_similarities = model.similarity(doc_embeddings,doc_embeddings)

Batches:   0%|          | 0/2306 [00:00<?, ?it/s]

## Save Embeddings

In [7]:
import numpy as np
import os

# Output directory for this model's artefacts. The trailing slash is kept so
# the value of save_dir is unchanged for later cells that concatenate onto it.
save_dir = f"test/{model_name}/"
# exist_ok=True creates the directory tree only when it is missing, without a
# separate (race-prone) existence check.
os.makedirs(save_dir, exist_ok=True)

# NOTE(review): only the query embeddings are written here, yet the load cell
# below also reads context_embeddings.npy — re-enable the first line when the
# context embeddings need to be regenerated.
#np.save(os.path.join(save_dir, "context_embeddings.npy"),doc_embeddings)
np.save(os.path.join(save_dir, "query_embeddings.npy"),query_embeddings)
#np.save(os.path.join(save_dir, "context_similarities.npy"),context_similarities)

## Load Embeddings

In [18]:
import numpy

# Reload the saved query and context embeddings for the chosen model.
model_name = "mixedbread-ai/mxbai-embed-large-v1"

query_embeddings, context_embeddings = map(
    numpy.load,
    (
        f"test/{model_name}/query_embeddings.npy",
        f"test/{model_name}/context_embeddings.npy",
    ),
)

In [None]:
from llama_index.retrievers.bm25 import BM25Retriever
from utils.embedding_stores.graph import graph_db

from typing import List
from llama_index.core.schema import TextNode
from llama_index.core.schema import QueryBundle

def create_retriever(chunk_nodes,k):
    """Build a BM25 retriever over ``chunk_nodes``.

    Args:
        chunk_nodes: Nodes handed to ``BM25Retriever.from_defaults`` as ``nodes``.
        k: Value passed on as ``similarity_top_k``.

    Returns:
        The retriever produced by ``BM25Retriever.from_defaults``.
    """
    retriever = BM25Retriever.from_defaults(similarity_top_k=k, nodes=chunk_nodes)
    print("BM-25 Retriever created")
    return retriever

In [None]:
from utils.embedding_retrievers import bm25
from utils.embedding_retrievers.graph import a_star,bfs
from concurrent.futures import ProcessPoolExecutor
from tqdm import tqdm
import pickle


def retrieve_chunks_bm25(db,query_bundles:QueryBundle):
    """Run BM25 retrieval and add the retrieved node IDs to the cached benchmark data.

    Args:
        db: Retriever forwarded to ``bm25.perform_retrieval``.
        query_bundles: Queries forwarded to ``bm25.perform_retrieval``.
            NOTE(review): the name is plural but the annotation is a single
            ``QueryBundle`` — confirm whether a list is expected.

    Returns:
        A ``pandas.DataFrame`` built from the unpickled benchmark dict after one
        list of integer node IDs per retrieval result has been appended to its
        ``"retrieved_contexts"`` entry.
    """
    results = bm25.perform_retrieval(query_bundles,db)

    # NOTE(review): pickle.load can execute arbitrary code from the file; this is
    # only safe while .tmp/benchmarks.pkl is produced by this project itself.
    with open('.tmp/benchmarks.pkl', 'rb') as handle:
        benchmarks_dict = pickle.load(handle)

    for hits in results:
        node_ids = list(map(lambda hit: int(hit.node.id_), hits))
        benchmarks_dict["retrieved_contexts"].append(node_ids)

    return pd.DataFrame(benchmarks_dict)

In [None]:
from utils.embedding_retrievers.vector import vec_similarity
from utils.embedding_stores.vector import vec_db

# Create (or reopen) the vector store over the contexts, then store what it
# retrieves in a new 'vec_retrieved' column.
# NOTE(review): in the visible notebook `hotpot_qa_df` is only assigned in a
# later cell (the HotPotQA(...).get_data() call), so this cell depends on that
# cell having been run first.
V = vec_db.create_or_load_vector_db(
    docs=contexts,
    db_name="chromadb2",
    save_loc="chromadb",
)

hotpot_qa_df['vec_retrieved'] = vec_similarity.perform_retrieval(hotpot_qa_df, V)
hotpot_qa_df.head()

In [None]:
from utils.embedding_stores.graph import graph_db
from utils.embedding_retrievers.graph import a_star

THRESHOLD = 0.5
# The graph is written to and immediately re-read from the same location.
graph_store_dir = save_dir+f"/graph_store/{THRESHOLD}"

# NOTE(review): `contexts_list` is not assigned anywhere in the visible notebook.
G = graph_db.create_graph(contexts_list,context_similarities,THRESHOLD,graph_store_dir,True)
G = graph_db.load_graph(graph_store_dir)

nodes_with_score = a_star.perform_retrieval(hotpot_qa_df,G,THRESHOLD)
# Keep the IDs (as strings) of the first ten nodes of every result list.
node_ids = [[str(hit.id_) for hit in ranked[:10]] for ranked in nodes_with_score]
hotpot_qa_df['a_star_retrieved'] = node_ids

In [None]:
from utils.evaluation_metrics.retriever import RetrieverEvaluator

# Score both retrieval columns with identical evaluator settings. The A* column
# is evaluated first and the vector-similarity column second, so `evaluator`
# and the two metric variables end up holding the vector-similarity results.
for heading, retrieved_column in (("A star", 'a_star_retrieved'), ("Vec sim", 'vec_retrieved')):
    evaluator = RetrieverEvaluator(hotpot_qa_df,retrieved_column)

    order_unaware_metrics = evaluator.get_order_unaware_metrics(k=10)
    order_aware_metrics = evaluator.get_order_aware_metrics()

    print(f"\n{heading}:")
    print(f"order unaware metrics : {order_unaware_metrics}")
    print(f"order aware metrics   : {order_aware_metrics}")

hotpot_qa_df.head(10)

In [None]:
from cdlib import algorithms

# NOTE(review): `x` is not referenced anywhere else in the visible notebook —
# looks like a leftover from exploring the cdlib API.
x = algorithms.surprise_communities
def make_communities(alg,G):
    """Detect communities in ``G`` with the ``cdlib.algorithms`` function named ``alg``.

    Args:
        alg: Name of a callable exposed by ``cdlib.algorithms``
            (for example ``'louvain'``).
        G: Graph passed as the only argument to that callable.

    Returns:
        Whatever the selected algorithm returns.

    Raises:
        AttributeError: If ``cdlib.algorithms`` has no attribute named ``alg``.
    """
    print(f"creating community")
    # Resolve the algorithm by attribute lookup instead of eval()-ing a formatted
    # string: identical result for plain names, but `alg` can no longer inject
    # arbitrary code.
    detect = getattr(algorithms, alg)
    community = detect(G)
    print(f"community created")
    analyze_clusters(community)
    show_community_member_counts(community)

    return community

In [None]:
import torch
from sentence_transformers import util

def get_embeddings_from_community(contexts,community,device='cuda'):
    """Collect the embeddings of a community's nodes into a single tensor.

    Args:
        contexts: Mapping indexed by ``str(node)`` whose values expose an
            ``embedding`` attribute.
        community: Iterable of node identifiers.
        device: Device the tensor is moved to. Defaults to ``'cuda'``, the value
            that used to be hard-coded, so existing callers are unaffected.

    Returns:
        A ``torch.Tensor`` built from the collected embeddings, placed on ``device``.
    """
    embeddings = [contexts[str(node)].embedding for node in community]
    return torch.tensor(embeddings).to(device)

def get_similarity_matrix(tensors):
    """Return the cosine similarity of ``tensors`` against itself.

    Delegates to ``sentence_transformers.util.cos_sim(tensors, tensors)``.
    """
    return util.cos_sim(tensors, tensors)

def create_community_graph(community,similarity_matrix):
    """Build a fully connected, weighted graph over the members of ``community``.

    Args:
        community: Indexable sequence of node identifiers.
        similarity_matrix: Object indexed as ``similarity_matrix[i][j]`` whose
            entries provide ``.item()``; entry ``[i][j]`` is used as the weight of
            the edge between ``community[i]`` and ``community[j]``.

    Returns:
        The graph, with one node per member and one edge per unordered pair.

    NOTE(review): ``nx`` is not imported anywhere in the visible notebook —
    presumably ``import networkx as nx`` is expected to have been run.
    """
    graph = nx.Graph()
    graph.add_nodes_from(community)

    size = len(community)
    for row in range(size):
        # Only the upper triangle is visited, so each pair gets exactly one edge.
        for col in range(row + 1, size):
            weight = similarity_matrix[row][col].item()
            graph.add_edge(community[row], community[col], weight=weight)

    return graph

# embeddings = get_embeddings_from_community(contexts=contexts,community=clusters[0])
# embeddings_tensor = torch.tensor(embeddings).to('cuda')
# sim_mat = get_similarity_matrix(embeddings_tensor)
# comm_graph  = create_community_graph(clusters[0],sim_mat)
# graph_db.visualize_graph(comm_graph)


# Louvain Community

In [None]:
# Communities of G found by the algorithm named 'louvain'.
louvain_comm = make_communities(alg='louvain', G=G)

# Leiden Community

In [None]:
# Communities of G found by the algorithm named 'leiden'.
leiden_comm = make_communities(alg='leiden', G=G)

# Surprise Community

In [None]:
# Communities of G found by the algorithm named 'surprise_communities'.
surprise_comm = make_communities(alg='surprise_communities', G=G)

# Walktrap Community

In [None]:
# Communities of G found by the algorithm named 'walktrap'.
walktrap_comm = make_communities(alg='walktrap', G=G)

# Clustering

In [None]:
# NOTE(review): `community`, `pos`, `get_num_communities`, `draw_graph`,
# `analyze_clusters` and `get_community_member_counts` are not defined in the
# visible notebook — this cell relies on state created elsewhere.
get_num_communities(community)  # result is discarded: not the cell's last expression
draw_graph(G, pos, community.communities)
analyze_clusters(community)
# NOTE(review): the variable is named after Louvain, but the counts come from
# whatever `community` currently refers to.
louvain_cluster_counts = get_community_member_counts(community)

# Retriever Evaluation

In [None]:
from utils.evaluation_metrics.retriever import RetrieverEvaluator

# Score the 'a_star_retrieved' column again, this time with k=1 for the
# order-unaware metrics; the order-aware metrics are disabled in this cell.
evaluator = RetrieverEvaluator(hotpot_qa_df,'a_star_retrieved')

order_unaware_metrics = evaluator.get_order_unaware_metrics(k=1) 
#order_aware_metrics = evaluator.get_order_aware_metrics() 

print(order_unaware_metrics)
#print(order_aware_metrics)

In [None]:
# Build a top-10 BM25 retriever, run it over the queries and evaluate the result.
# NOTE(review): `embedded_chunks`, `embedded_queries` and `evaluate_results` are
# not defined in the visible notebook.
BM25 = create_retriever(chunk_nodes=embedded_chunks, k=10)
df_results_bm25 = retrieve_chunks_bm25(db=BM25, query_bundles=embedded_queries)
evaluate_results(df_results_bm25)

In [None]:
THRESHOLD = 0.5
#G = graph_db.create_graph(embedded_chunks,chunk_similarities,THRESHOLD,save_dir)
# NOTE(review): `retrieve_chunk_graph`, `embedded_queries` and `evaluate_results`
# are not defined in the visible notebook; `G` is whichever graph an earlier
# cell left behind, since the rebuild above is commented out.
df_results = retrieve_chunk_graph(db=G,
                                  traversal_method='a star',
                                  threshold=THRESHOLD,
                                  query_bundles=embedded_queries)
evaluate_results(df_results)

In [None]:
# NOTE(review): `HotPotQA`, `SAMPLE`, `DIFFICULTY` and `SEED` are neither defined
# nor imported in the visible notebook. This is also the only visible assignment
# of `hotpot_qa_df`, which earlier cells already read, and it rebinds `contexts`.
contexts, hotpot_qa_df = HotPotQA(SAMPLE=SAMPLE, DIFFICULTY=DIFFICULTY, SEED=SEED).get_data()
hotpot_qa_df.head()

In [None]:
import pandas as pd

# Reload the raw HotpotQA JSON (the same file the first cell reads) and preview it.
df = pd.read_json("data/hotpot_test_fullwiki_v1.json")
df.head()