In [1]:
import pickle

def _load_pickle(path):
    """Return the object stored in the pickle file at ``path``."""
    # NOTE(review): pickle.load can execute arbitrary code; only load files from a trusted source.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Graph artifacts.
connected_graph = _load_pickle('clustering/graph/connected_graph.pkl')
cluster_graph = _load_pickle('clustering/graph/cluster_graph.pkl')

# Question dataframe and the contexts mapping stored alongside it.
hotpot_qa_df = _load_pickle('embeddings/dunzhang/stella_en_400M_v5/hard/500/df.pkl')
contexts = _load_pickle('embeddings/dunzhang/stella_en_400M_v5/hard/500/contexts.pkl')

# Cast every id inside each row's `actual_contexts` list to int.
hotpot_qa_df['actual_contexts'] = hotpot_qa_df['actual_contexts'].apply(
    lambda context_ids: [int(context_id) for context_id in context_ids]
)

In [2]:
from typing import Dict,List
import networkx
import random

# One representative embedding per cluster: the embedding of the first node
# listed in that cluster's graph. The choice is deterministic (no random
# sampling is performed here).
cluster_node_embedding_sample: Dict[int, List[float]] = {
    cluster_id: contexts[str(list(graph.nodes)[0])].embedding
    for cluster_id, graph in cluster_graph.items()
}

start_node = 0



# A* Traversal

In [3]:
from tqdm import tqdm
import torch
import torch.nn.functional as F

def cosine_similarity_gpu(list_A, list_B):
    """Return the cosine similarity of two equal-length 1-D vectors.

    The computation runs on the GPU when CUDA is available and falls back to
    the CPU otherwise, so the function also works on machines without a GPU.

    Args:
        list_A: First vector, as a sequence of numbers.
        list_B: Second vector, same length as ``list_A``.

    Returns:
        The cosine similarity as a Python float.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Build both tensors directly on the target device.
    A = torch.tensor(list_A, dtype=torch.float32, device=device)
    B = torch.tensor(list_B, dtype=torch.float32, device=device)

    # cosine_similarity reduces over dim=1 by default, so add a batch dimension of 1.
    cos_sim = F.cosine_similarity(A.unsqueeze(0), B.unsqueeze(0))

    return cos_sim.item()  # Convert the single-element tensor to a Python float

def get_nodes(question_embedding,clusters,contexts=contexts,top_k=10):
    """Return the nodes of the given clusters most similar to a question.

    Every node of every listed cluster is scored by cosine similarity between
    its embedding and ``question_embedding``; the best-scoring node ids are
    returned in descending order of similarity.

    Args:
        question_embedding: Embedding vector of the question.
        clusters: Iterable of keys into ``cluster_graph``.
        contexts: Mapping from ``str(node)`` to an object exposing ``.embedding``.
        top_k: Maximum number of node ids to return (default 10).

    Returns:
        A list of at most ``top_k`` node ids, most similar first. A node that
        belongs to more than one of the selected clusters is listed once.
    """
    # Gather candidates in cluster order; dict.fromkeys drops repeated nodes
    # while keeping first-seen order, so the stable sort below stays deterministic.
    nodes = list(dict.fromkeys(
        node for cluster in clusters for node in cluster_graph[cluster].nodes
    ))
    if not nodes:
        return []

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Score all candidates in one batched call instead of one device
    # round trip per node.
    question = torch.tensor(question_embedding, dtype=torch.float32, device=device)
    candidates = torch.tensor(
        [contexts[str(node)].embedding for node in nodes],
        dtype=torch.float32,
        device=device,
    )
    similarities = F.cosine_similarity(question.unsqueeze(0), candidates, dim=1).tolist()

    # Highest similarity first; ties keep their original relative order.
    ranked = sorted(zip(nodes, similarities), key=lambda pair: pair[1], reverse=True)

    return [node for node, _ in ranked[:top_k]]


In [4]:
# Minimum cosine similarity between a question and a cluster's sample
# embedding for that cluster to be searched.
THRESHOLD = 0.50

retrieved_contexts = []
for question in tqdm(hotpot_qa_df['question'], desc="Retrieving contexts", unit="question"):

    question_embedding = question.embedding

    # Ids of every cluster whose sample embedding clears THRESHOLD.
    max_cluster_similarities:List[int] = []
    # Best-scoring cluster seen so far, used as a fallback when none clears THRESHOLD.
    max_similarity:float = -float('inf')
    best_cluster_id = None

    # `cluster_id` (not `id`) so the builtin id() is not shadowed in the notebook namespace.
    for cluster_id,sample_embedding in cluster_node_embedding_sample.items():
        cosine_sim = cosine_similarity_gpu(question_embedding,sample_embedding)
        if cosine_sim > max_similarity:
            max_similarity = cosine_sim
            best_cluster_id = cluster_id
        if cosine_sim > THRESHOLD:
            max_cluster_similarities.append(cluster_id)

    # No cluster cleared the threshold: search only the single closest cluster.
    if len(max_cluster_similarities) == 0:
        max_cluster_similarities.append(best_cluster_id)

    retrieved_contexts.append(get_nodes(question_embedding,max_cluster_similarities))

# One list of retrieved node ids per question, in dataframe row order.
hotpot_qa_df["Cluster_Retrieval"] = retrieved_contexts

Retrieving contexts: 100%|██████████| 500/500 [10:52<00:00,  1.31s/question]


In [7]:
import chromadb
from tqdm import tqdm

# Parallel lists describing every context, in the order Chroma expects them.
documents = []
ids = []
embeddings = []
metadatas = []

for context in contexts.values():
    documents.append(context.text)
    ids.append(context.id_)
    embeddings.append(context.embedding)
    metadatas.append({'caption': context.metadata['caption']})

chroma_client = chromadb.Client()
# get_or_create_collection + upsert keep this cell re-runnable in the same
# kernel: create_collection raises when a collection with this name already
# exists, whereas upsert writes the records whether or not their ids are present.
collection = chroma_client.get_or_create_collection(name="my_collection2",metadata={"hnsw:space": "cosine"})
collection.upsert(documents=documents,
                  ids=ids,
                  embeddings=embeddings,
                  metadatas=metadatas)

retrieved_contexts = []
for question in tqdm(hotpot_qa_df['question'], desc="Retrieving contexts", unit="question"):

    question_embedding = question.embedding
    result = collection.query(
                                query_embeddings=[question_embedding], # precomputed embedding; Chroma does not embed anything here
                                n_results=10 # how many results to return
                              )
    # A single query was sent, so its hits are at index 0; ids are converted to int.
    retrieved_contexts.append([int(node) for node in result["ids"][0]] )

# One list of retrieved context ids per question, in dataframe row order.
hotpot_qa_df["Vector_Similarity_Retrieval"] = retrieved_contexts

Retrieving contexts: 100%|██████████| 500/500 [00:00<00:00, 580.53question/s]


In [10]:
from utils.evaluation_metrics.retriever import RetrieverEvaluator

# Evaluate the ids stored in the 'Cluster_Retrieval' column of hotpot_qa_df.
evaluator = RetrieverEvaluator(hotpot_qa_df, 'Cluster_Retrieval')

order_unaware_metrics = evaluator.get_order_unaware_metrics(k=10)
order_aware_metrics = evaluator.get_order_aware_metrics()

# Emit the three report lines with a single print call, one per line.
print(
    "Cluster_Retrieval:",
    f"order unaware metrics : {order_unaware_metrics}",
    f"order aware metrics   : {order_aware_metrics}",
    sep="\n",
)

Cluster_Retrieval:
order unaware metrics : {'avg precision@10': 0.461, 'avg recall@10': 0.95, 'avg F1@10': 0.5770785160831601}
order aware metrics   : {'avg mrr': 0.5007951284958427, 'avg ndcg': 0.5650973212401913, 'mean avg precision': 0.44296857142857143}


In [11]:
# Evaluate the ids stored in the 'Vector_Similarity_Retrieval' column of hotpot_qa_df.
evaluator = RetrieverEvaluator(hotpot_qa_df, 'Vector_Similarity_Retrieval')

order_unaware_metrics = evaluator.get_order_unaware_metrics(k=10)
order_aware_metrics = evaluator.get_order_aware_metrics()

# Emit the three report lines with a single print call, one per line.
print(
    "Vector_Similarity_Retrieval:",
    f"order unaware metrics : {order_unaware_metrics}",
    f"order aware metrics   : {order_aware_metrics}",
    sep="\n",
)

Vector_Similarity_Retrieval:
order unaware metrics : {'avg precision@10': 0.5963999999999999, 'avg recall@10': 0.992, 'avg F1@10': 0.7160102046216597}
order aware metrics   : {'avg mrr': 0.41551405549256737, 'avg ndcg': 0.6828619205100309, 'mean avg precision': 0.5532635714285714}


In [2]:
from langchain_core.documents import Document

# Materialize the contexts once so the four parallel lists share the same order.
all_contexts = list(contexts.values())

# LangChain documents: context text plus its caption as metadata.
documents = [
    Document(page_content=ctx.text, metadata={'caption': ctx.metadata['caption']})
    for ctx in all_contexts
]
ids = [ctx.id_ for ctx in all_contexts]
embeddings = [ctx.embedding for ctx in all_contexts]
# Separate dict objects from the ones attached to the Document instances above.
metadatas = [{'caption': ctx.metadata['caption']} for ctx in all_contexts]

In [9]:
from langchain_huggingface import HuggingFaceEmbeddings

import os

# Hugging Face Hub id of the embedding model; used when the local directory below is absent.
model_name = "dunzhang/stella_en_400M_v5"
# Machine-specific directory, presumably a local copy of the same model (verify before relying on it).
local_model_dir = "D:/Users/nikhi/hugging_face_embedding_models/dunzhang/stella_en_400M_v5"

model_kwargs = {
    "trust_remote_code": True,
    "device": "cuda",
    "model_kwargs": {"attn_implementation": "eager"}
}

# Prefer the local directory when it exists so behavior on the original machine
# is unchanged; otherwise fall back to the hub id so the cell runs elsewhere.
model = HuggingFaceEmbeddings(show_progress=True,
                              model_name=local_model_dir if os.path.isdir(local_model_dir) else model_name,
                              model_kwargs = model_kwargs)


In [11]:
!pip install scann


ERROR: Could not find a version that satisfies the requirement scann (from versions: none)
ERROR: No matching distribution found for scann


In [10]:
from langchain_community.vectorstores import ScaNN

# Index only the first three documents, embedding them with `model`.
sample_documents = documents[:3]
db = ScaNN.from_documents(documents=sample_documents, embedding=model)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

ImportError: Could not import scann python package. Please install it with `pip install scann`.