# Load HotPotQA Data

In [1]:
import pandas as pd
from utils.hotpot_data_loader import HotPotQA

# Sampling configuration for the HotPotQA loader: random seed, number of
# sampled questions, and the difficulty level to keep.
SEED, SAMPLE, DIFFICULTY = 42, 5000, "hard"

# get_data() yields a pair: the context collection and the question dataframe.
contexts, hotpot_qa_df = HotPotQA(
    SEED=SEED,
    DIFFICULTY=DIFFICULTY,
    SAMPLE=SAMPLE,
).get_data()
hotpot_qa_df.head()

Unnamed: 0,level,question,answer,actual_contexts
0,hard,"George Boscawen, 9th Viscount Falmouth is a fo...","the Guards Division, Foot Guards regiments","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]"
1,hard,When Vladimir Kashpur portrayed Baba Yaga she ...,trio of sisters,"[10, 11, 12, 13, 14, 15, 16, 17, 18, 19]"
2,hard,Which musician has a solo punk rock project: T...,"Frank Anthony Iero, Jr.","[20, 21, 22, 23, 24, 25, 26, 27, 28, 29]"
3,hard,A Disney voice actor has won which Emmy award?,Outstanding Supporting Actor,"[30, 31, 32, 33, 34, 35, 36, 37, 38, 39]"
4,hard,Which north-western suburb of Adelaide lies wi...,Birkenhead,"[40, 41, 42, 43, 44, 45, 46, 47, 48, 49]"


# Check machine configs

In [2]:
import torch

# Report the PyTorch build, the CUDA toolkit it was built against, the GPU
# name, and whether the flash scaled-dot-product-attention backend is enabled.
print(torch.__version__)
print("CUDA Version: ", torch.version.cuda)
# get_device_properties raises when no CUDA device is usable, so guard it:
# a diagnostics cell should report the situation instead of crashing.
if torch.cuda.is_available():
    print("Device name:", torch.cuda.get_device_properties("cuda").name)
else:
    print("Device name:", "CUDA device not available")
# flash_sdp_enabled() reports whether the flash SDP backend is *enabled* in
# torch; it does not by itself prove the installed GPU can run it.
print("FlashAttention available:", torch.backends.cuda.flash_sdp_enabled())

2.4.1+cu124
CUDA Version:  12.4
Device name: NVIDIA GeForce RTX 3050 Laptop GPU
FlashAttention available: True


# Select Embedding Model

In [3]:
from sentence_transformers import SentenceTransformer
from utils.embedders.sentence_transformer_embedder import SentenceTransformerEmbedder
from llama_index.core.schema import TextNode
from llama_index.core.schema import QueryBundle

import os

# Local Hugging Face locations: hub cache and the folder where downloaded
# embedding models are stored.
HF_CACHE_LOC = "D:/Users/nikhi/.cache/huggingface/hub/"
MODEL_SAVE_LOC = "D:/Users/nikhi/hugging_face_embedding_models"

# SECURITY: the access token must never be committed in the notebook. It is
# read from the HF_TOKEN environment variable instead (None when unset).
# Any token that was previously hardcoded here should be revoked.
embedder = SentenceTransformerEmbedder(huggingface_token = os.environ.get("HF_TOKEN"),
                                       cache_loc         = HF_CACHE_LOC,
                                       model_save_loc    = MODEL_SAVE_LOC)

model_name="dunzhang/stella_en_1.5B_v5" # alternative: dunzhang/stella_en_400M_v5
# Uncomment on first use to fetch the model into MODEL_SAVE_LOC.
#embedder.download_embedding_model(model_name=model_name)

# Load the model from the local download folder onto the GPU.
# trust_remote_code=True executes code shipped with the model repository.
# attn_implementation="eager" is passed through to the underlying model.
model = SentenceTransformer(model_name_or_path = f"{MODEL_SAVE_LOC}/{model_name}",
                            trust_remote_code=True,
                            device="cuda",
                            model_kwargs={"attn_implementation": "eager"})


  from tqdm.autonotebook import tqdm, trange





A matching Triton is not available, some optimizations will not be enabled
Traceback (most recent call last):
  File "d:\Users\nikhi\anaconda3\envs\masters\Lib\site-packages\xformers\__init__.py", line 57, in _is_triton_available
    import triton  # noqa
    ^^^^^^^^^^^^^
ModuleNotFoundError: No module named 'triton'


## Embed queries

In [4]:
# Name of the prompt handed to model.encode() for the question side.
query_prompt_name = "s2p_query"

# Extract the raw text of every question object: TextNode instances expose
# `.text`, anything else is read through `.query_str`.
queries = [
    (doc.query_str if not isinstance(doc, TextNode) else doc.text)
    for doc in hotpot_qa_df['question']
]
query_embeddings = model.encode(
    queries,
    prompt_name=query_prompt_name,
    show_progress_bar=True,
)

# Store each vector back on its question object as a plain Python list.
for query, embedding in zip(hotpot_qa_df['question'], query_embeddings):
    query.embedding = embedding.tolist()

Batches:   0%|          | 0/157 [00:00<?, ?it/s]

## Embed Actual Contexts

In [5]:
# Raw text of every context, in the iteration order of `contexts.values()`:
# TextNode instances expose `.text`, anything else `.query_str`.
docs = [
    (doc.query_str if not isinstance(doc, TextNode) else doc.text)
    for doc in contexts.values()
]
# Contexts are encoded without a prompt name (unlike the queries).
doc_embeddings = model.encode(docs, show_progress_bar=True)

# Store each vector back on its context object as a plain Python list.
for document, embedding in zip(contexts.values(), doc_embeddings):
    document.embedding = embedding.tolist()

# Pairwise similarity of every context against every other context; rows and
# columns follow the same order as `contexts.values()`.
context_similarities = model.similarity(doc_embeddings, doc_embeddings)

Batches:   0%|          | 0/1556 [00:00<?, ?it/s]

## Save Embeddings

In [6]:
import pickle

# Output folder layout: embeddings/<model name>/<difficulty>/<sample size>.
save_dir = f"embeddings/{model_name}/{DIFFICULTY}/{SAMPLE}"

# Persist the embedded contexts, the question dataframe (stored under the
# name 'df') and the context-to-context similarity matrix.
embedder.save_embeddings(
    save_dir=save_dir,
    contexts=contexts,
    df=hotpot_qa_df,
    df_name='df',
    similarity_matrix=context_similarities,
)

embedded chunks saved to : embeddings/dunzhang/stella_en_400M_v5/hard/5000 as contexts.pkl
df saved to : embeddings/dunzhang/stella_en_400M_v5/hard/5000 as embedded_queries.pkl
similarity matrix saved to : embeddings/dunzhang/stella_en_400M_v5/hard/5000 as similarity_matrix.pkl


## Load Embeddings

In [6]:
import pickle
import pandas as pd


# Select which saved embedding run to load. The folder layout is
# embeddings/<model name>/<difficulty>/<sample size>.
model_name="dunzhang/stella_en_400M_v5" # alternative: dunzhang/stella_en_1.5B_v5
SAMPLE = 500
# Was "500", which made save_dir point at ".../500/500" while the pickles were
# read from ".../hard/500"; save_dir is reused later for the graph store.
DIFFICULTY = "hard"
save_dir = f"embeddings/{model_name}/{DIFFICULTY}/{SAMPLE}"

# NOTE: pickle.load can execute arbitrary code; only load files produced by
# this project.
with open(f"{save_dir}/df.pkl", 'rb') as file:
    hotpot_qa_df = pickle.load(file)
with open(f"{save_dir}/contexts.pkl", 'rb') as file:
    contexts = pickle.load(file)
with open(f"{save_dir}/similarity_matrix.pkl", 'rb') as file:
    context_similarities = pickle.load(file)

# Normalise the ground-truth context ids of every question to ints.
hotpot_qa_df['actual_contexts'] = hotpot_qa_df['actual_contexts'].apply(lambda x: [int(i) for i in x])

In [None]:
from llama_index.retrievers.bm25 import BM25Retriever
from utils.embedding_stores.graph import graph_db

from typing import List
from llama_index.core.schema import TextNode
from llama_index.core.schema import QueryBundle

def create_retriever(chunk_nodes,k):
    """Build a BM25 retriever over the given nodes.

    Args:
        chunk_nodes: nodes handed to ``BM25Retriever.from_defaults``.
        k: value passed as ``similarity_top_k``.

    Returns:
        The retriever produced by ``BM25Retriever.from_defaults``.
    """
    retriever = BM25Retriever.from_defaults(similarity_top_k=k, nodes=chunk_nodes)
    print("BM-25 Retriever created")
    return retriever

In [5]:
from utils.embedding_retrievers import bm25
from utils.embedding_retrievers.graph import a_star,bfs
from concurrent.futures import ProcessPoolExecutor
from tqdm import tqdm
import pickle


def retrieve_chunks_bm25(db,query_bundles:QueryBundle):
    """Run BM25 retrieval and return the benchmark table with the hits added.

    Args:
        db: retriever handed to ``bm25.perform_retrieval``.
        query_bundles: queries handed to ``bm25.perform_retrieval``.

    Returns:
        A ``pandas.DataFrame`` built from the dict stored in
        ``.tmp/benchmarks.pkl`` after one list of integer node ids per
        retrieval result has been appended to its ``"retrieved_contexts"``
        entry.
    """
    results_per_query = bm25.perform_retrieval(query_bundles,db)

    # The benchmark dict is read from disk on every call.
    with open('.tmp/benchmarks.pkl', 'rb') as handle:
        benchmarks_dict = pickle.load(handle)

    retrieved = benchmarks_dict["retrieved_contexts"]
    for result in results_per_query:
        # Each hit wraps a node whose id_ is converted to int.
        retrieved.append([int(hit.node.id_) for hit in result])

    return pd.DataFrame(benchmarks_dict)

In [None]:
from utils.embedding_retrievers.vector import vec_similarity
from utils.embedding_stores.vector import vec_db

# Create (or reopen) the vector store named "chromadb2" under the "chromadb"
# folder, populated from the embedded contexts.
V = vec_db.create_or_load_vector_db(
    docs=contexts,
    db_name="chromadb2",
    save_loc="chromadb",
)

# Store the vector-similarity retrieval result for every question as a column.
hotpot_qa_df['vec_retrieved'] = vec_similarity.perform_retrieval(hotpot_qa_df, V)
hotpot_qa_df.head()

In [None]:
from utils.embedding_stores.graph import graph_db
from utils.embedding_retrievers.graph import a_star

# Similarity threshold handed to both graph construction and A* retrieval.
THRESHOLD = 0.5
# Number of top-ranked nodes kept per question.
A_STAR_TOP_K = 10
graph_dir = save_dir+f"/graph_store/{THRESHOLD}"

# `contexts_list` was never defined in the notebook. The similarity matrix is
# built from `contexts.values()`, so the node list must follow that order.
contexts_list = list(contexts.values())

# NOTE(review): the meaning of the trailing `True` argument is not visible
# here; confirm against graph_db.create_graph.
G = graph_db.create_graph(contexts_list,context_similarities,THRESHOLD,graph_dir,True)
# The freshly built graph is immediately replaced by the copy loaded from disk.
G = graph_db.load_graph(graph_dir)

nodes_with_score = a_star.perform_retrieval(hotpot_qa_df,G,THRESHOLD)
# NOTE(review): ids are stored as strings here while `actual_contexts` holds
# ints; confirm the evaluator compares them as intended.
node_ids = [[str(node.id_) for node in node_with_score[:A_STAR_TOP_K]] for node_with_score in nodes_with_score]
hotpot_qa_df['a_star_retrieved'] = node_ids

In [None]:
from utils.evaluation_metrics.retriever import RetrieverEvaluator

# Evaluate both retrieval columns with the same procedure: order-unaware
# metrics at k=10 followed by order-aware metrics, printed under a label.
for label, column in (("A star", 'a_star_retrieved'), ("Vec sim", 'vec_retrieved')):
    evaluator = RetrieverEvaluator(hotpot_qa_df, column)

    order_unaware_metrics = evaluator.get_order_unaware_metrics(k=10)
    order_aware_metrics = evaluator.get_order_aware_metrics()

    print(f"\n{label}:")
    print(f"order unaware metrics : {order_unaware_metrics}")
    print(f"order aware metrics   : {order_aware_metrics}")

hotpot_qa_df.head(10)

In [None]:
from cdlib import algorithms

# NOTE(review): `x` is not referenced anywhere else in the visible notebook;
# this binding looks like leftover scratch work — confirm before removing.
x = algorithms.surprise_communities
def make_communities(alg,G):
    """Run a community-detection algorithm from ``cdlib.algorithms`` on ``G``.

    Args:
        alg: name of a callable attribute of ``cdlib.algorithms``
            (e.g. ``'louvain'``); it is called with ``G`` as its only argument.
        G: graph handed to the selected algorithm.

    Returns:
        Whatever the selected algorithm returns.

    Raises:
        AttributeError: if ``algorithms`` has no attribute named ``alg``.
    """
    print("creating community")
    # Look the function up by name instead of eval()-ing a constructed string,
    # so an unexpected `alg` value cannot execute arbitrary code.
    detect = getattr(algorithms, alg)
    community = detect(G)
    print("community created")
    # NOTE(review): analyze_clusters and show_community_member_counts are not
    # defined in the visible part of the notebook; they must exist in the
    # kernel before this function is called.
    analyze_clusters(community)
    show_community_member_counts(community)

    return community

In [None]:
import torch
from sentence_transformers import util

def get_embeddings_from_community(contexts,community,device='cuda'):
    """Stack the embeddings of a community's members into one tensor.

    Args:
        contexts: mapping from ``str(node)`` to an object with an
            ``embedding`` attribute.
        community: iterable of node identifiers; each is converted with
            ``str`` before the lookup in ``contexts``.
        device: torch device the tensor is created on. Defaults to ``'cuda'``
            (the previous hard-coded behaviour); pass ``'cpu'`` on machines
            without a GPU.

    Returns:
        A ``torch.Tensor`` with one row per community member, in the order
        the members are iterated.

    Raises:
        KeyError: if a member has no entry in ``contexts``.
    """
    embeddings = [contexts[str(node)].embedding for node in community]
    return torch.tensor(embeddings, device=device)

def get_similarity_matrix(tensors):
    """Return the cosine similarity of ``tensors`` against itself.

    Args:
        tensors: embeddings passed as both arguments of ``util.cos_sim``.

    Returns:
        The result of ``util.cos_sim(tensors, tensors)``.
    """
    return util.cos_sim(tensors, tensors)

def create_community_graph(community,similarity_matrix):
    """Build a weighted graph connecting every pair of community members.

    Args:
        community: indexable sequence of node identifiers.
        similarity_matrix: pairwise similarities indexed as ``[i][j]`` by
            position in ``community``; each entry must provide ``.item()``.

    Returns:
        An ``nx.Graph`` containing every member as a node and one edge per
        unordered pair, with the similarity stored in the ``weight`` attribute.
    """
    graph = nx.Graph()
    graph.add_nodes_from(community)

    member_count = len(community)
    for row in range(member_count):
        # Only visit the upper triangle so each pair is added once.
        for col in range(row + 1, member_count):
            weight = similarity_matrix[row][col].item()
            graph.add_edge(community[row], community[col], weight=weight)
    return graph

# embeddings = get_embeddings_from_community(contexts=contexts,community=clusters[0])
# embeddings_tensor = torch.tensor(embeddings).to('cuda')
# sim_mat = get_similarity_matrix(embeddings_tensor)
# comm_graph  = create_community_graph(clusters[0],sim_mat)
# graph_db.visualize_graph(comm_graph)


# Louvain Community

In [None]:
# Detect communities on graph G with the 'louvain' algorithm.
louvain_comm = make_communities('louvain',G)

# Leiden Community

In [None]:
# Detect communities on graph G with the 'leiden' algorithm.
leiden_comm = make_communities('leiden',G)

# Surprise Community

In [None]:
# Detect communities on graph G with the 'surprise_communities' algorithm.
surprise_comm = make_communities('surprise_communities',G)

# Walktrap Community

In [None]:
# Detect communities on graph G with the 'walktrap' algorithm.
walktrap_comm = make_communities('walktrap',G)

# Clustering

In [None]:
# NOTE(review): `community`, `pos`, get_num_communities, draw_graph,
# analyze_clusters and get_community_member_counts are not defined in the
# visible part of the notebook; on a fresh kernel this cell raises NameError
# unless they are provided elsewhere. The result is stored under a
# Louvain-specific name although `community` is not tied to any algorithm
# here — confirm which community object is intended.
get_num_communities(community)
draw_graph(G, pos, community.communities)
analyze_clusters(community)
louvain_cluster_counts = get_community_member_counts(community)

# Retriever Evaluation

In [None]:
from utils.evaluation_metrics.retriever import RetrieverEvaluator

# Evaluate the A* retrieval column with order-unaware metrics at k=1.
evaluator = RetrieverEvaluator(hotpot_qa_df,'a_star_retrieved')

order_unaware_metrics = evaluator.get_order_unaware_metrics(k=1) 
# Order-aware metrics are disabled for this run.
#order_aware_metrics = evaluator.get_order_aware_metrics() 

print(order_unaware_metrics)
#print(order_aware_metrics)

In [None]:
# Build a BM25 retriever with top-k of 10, run it over the queries and
# evaluate the resulting benchmark table.
# NOTE(review): `embedded_chunks`, `embedded_queries` and `evaluate_results`
# are not defined in the visible part of the notebook; this cell fails on a
# fresh kernel unless they are provided elsewhere.
BM25 = create_retriever(embedded_chunks,10)
df_results_bm25 = retrieve_chunks_bm25(BM25,embedded_queries)
evaluate_results(df_results_bm25)

In [None]:
# Run graph retrieval with the 'a star' traversal at the given threshold and
# evaluate the result.
# NOTE(review): `retrieve_chunk_graph`, `embedded_queries` and
# `evaluate_results` are not defined in the visible part of the notebook, and
# `G` must already exist because the graph construction below is commented out.
THRESHOLD = 0.5
#G = graph_db.create_graph(embedded_chunks,chunk_similarities,THRESHOLD,save_dir)
df_results = retrieve_chunk_graph(db=G,
                                  traversal_method='a star',
                                  threshold=THRESHOLD,
                                  query_bundles=embedded_queries)
evaluate_results(df_results)