In [1]:
import os
from itertools import islice

from beir.datasets.data_loader import GenericDataLoader

# Define dataset paths.
# The dataset root can be overridden with the BOTGAUGE_DATA_DIR environment
# variable so the notebook also runs on machines where the data lives somewhere
# else; when the variable is unset the original location is used.
base_path = os.environ.get("BOTGAUGE_DATA_DIR", r"T:\BotGauge")
nq_path = os.path.join(base_path, "NaturalQuestions")
hotpotqa_path = os.path.join(base_path, "HotpotQA")
fiqa_path = os.path.join(base_path, "fiQA")

# Function to load and print dataset statistics
def load_and_print_stats(dataset_name, dataset_path, split="test"):
    """Load one split of a BEIR-format dataset and print its size.

    Args:
        dataset_name: Human-readable label used in the printed summary line.
        dataset_path: Folder handed to ``GenericDataLoader`` as ``data_folder``.
        split: Name of the split to load. Defaults to ``"test"``, the split
            this function loaded before the parameter was introduced.

    Returns:
        The ``(corpus, queries, qrels)`` tuple returned by
        ``GenericDataLoader.load``.
    """
    loader = GenericDataLoader(data_folder=dataset_path)
    corpus, queries, qrels = loader.load(split=split)
    print(f"{dataset_name} Corpus: {len(corpus)} Queries: {len(queries)} Qrels: {len(qrels)}")
    return corpus, queries, qrels

# Load and print statistics for each dataset
nq_corpus, nq_queries, nq_qrels = load_and_print_stats("Natural Questions", nq_path)
hotpotqa_corpus, hotpotqa_queries, hotpotqa_qrels = load_and_print_stats("HotpotQA", hotpotqa_path)
fiqa_corpus, fiqa_queries, fiqa_qrels = load_and_print_stats("FiQA", fiqa_path)

# Example: Accessing first few entries from Natural Questions corpus.
# islice stops after three entries instead of first copying every (id, doc)
# pair of the corpus into a temporary list only to slice three items off it.
print("\nSample from Natural Questions Corpus:")
for doc_id, doc_info in islice(nq_corpus.items(), 3):
    print(f"Doc ID: {doc_id}, Content: {doc_info['text'][:200]}...")  # Print first 200 chars of the document


  from tqdm.autonotebook import tqdm
100%|██████████| 2681468/2681468 [00:12<00:00, 220585.45it/s]


Natural Questions Corpus: 2681468 Queries: 3452 Qrels: 3452


100%|██████████| 5233329/5233329 [00:22<00:00, 232610.82it/s]


HotpotQA Corpus: 5233329 Queries: 7405 Qrels: 7405


100%|██████████| 57638/57638 [00:00<00:00, 204690.62it/s]


FiQA Corpus: 57638 Queries: 648 Qrels: 648

Sample from Natural Questions Corpus:
Doc ID: doc0, Content: In accounting, minority interest (or non-controlling interest) is the portion of a subsidiary corporation's stock that is not owned by the parent corporation. The magnitude of the minority interest in...
Doc ID: doc1, Content: It is, however, possible (such as through special voting rights) for a controlling interest requiring consolidation to be achieved without exceeding 50% ownership, depending on the accounting standard...
Doc ID: doc2, Content: The reporting of 'minority interest' is a consequence of the requirement by accounting standards to 'fully' consolidate partly owned subsidiaries. Full consolidation, as opposed to partial consolidati...


In [None]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Load the NVIDIA embedding model
small_model = SentenceTransformer('nvidia/NV-Embed-v2', trust_remote_code=True)

# Define a sample query
query = "What is quantum computing?"

# Encode the query
query_embedding = small_model.encode(query)

# Materialise the document ids once, so that row i of the embedding matrix,
# column i of the similarity matrix and doc_ids[i] all refer to the same document.
# max_docs = None embeds the whole corpus (the original behaviour); set it to an
# int to work on only the first N documents while testing.
max_docs = None
doc_ids = list(nq_corpus)
if max_docs is not None:
    doc_ids = doc_ids[:max_docs]

# Encode the documents with a single encode() call over a list so the library
# batches the forward passes instead of invoking the model once per document.
# NOTE(review): batched inputs get padded; the NV-Embed-v2 model card reportedly
# sets tokenizer.padding_side = "right" and appends an EOS token - confirm
# whether that is needed for this use.
encode_batch_size = 8  # tune to the available accelerator memory
doc_embeddings = small_model.encode(
    [nq_corpus[doc_id]['text'] for doc_id in doc_ids],
    batch_size=encode_batch_size,
    show_progress_bar=True,
)

# Compute cosine similarity between the query and documents
similarities = cosine_similarity([query_embedding], doc_embeddings)

# Get top-k most relevant documents (indices ordered from most to least similar)
top_k = 5
top_k_indices = np.argsort(similarities[0])[-top_k:][::-1]
# Index into the id list built above rather than rebuilding
# list(nq_corpus.items()) once for every selected document.
top_k_docs = [(doc_ids[i], nq_corpus[doc_ids[i]]) for i in top_k_indices]

# Print the top-k document IDs and their similarity scores
print("\nTop-k Documents:")
for rank, (doc_index, (doc_id, _)) in enumerate(zip(top_k_indices, top_k_docs), start=1):
    print(f"Rank {rank}: Document ID: {doc_id}, Similarity Score: {similarities[0][doc_index]:.4f}")


In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch

# Load the ranking model.
# The original calls were missing the comma before trust_remote_code=True,
# which made the whole cell a SyntaxError.
rank_model_name = 'cross-encoder/ms-marco-MiniLM-L-12-v2'
rank_model = AutoModelForSequenceClassification.from_pretrained(rank_model_name, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(rank_model_name, trust_remote_code=True)

# Rerank the top-k documents: score every (query, passage) pair with the cross-encoder.
# NOTE(review): the "Query: " / "Passage: " prefixes are kept from the original;
# confirm that this cross-encoder expects them.
reranked_docs = []
for doc_id, doc in top_k_docs:
    inputs = tokenizer(
        "Query: " + query,
        "Passage: " + doc['text'],
        truncation=True,  # keep over-long passages within the model's maximum input length
        return_tensors="pt",
    )
    with torch.no_grad():  # inference only, no gradients needed
        score = rank_model(**inputs).logits.item()
    reranked_docs.append((doc_id, score))

# Sort by reranking scores, best first
reranked_docs.sort(key=lambda x: x[1], reverse=True)

# Print reranked document IDs and their scores
print("\nReranked Documents:")
for i, (doc_id, score) in enumerate(reranked_docs):
    print(f"Rank {i+1}: Document ID: {doc_id}, Reranking Score: {score:.4f}")


In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch

# NOTE(review): this cell repeats the preceding reranking cell line for line;
# consider deleting one of the two.

# Load the ranking model.
# The original calls were missing the comma before trust_remote_code=True,
# which made the whole cell a SyntaxError.
rank_model_name = 'cross-encoder/ms-marco-MiniLM-L-12-v2'
rank_model = AutoModelForSequenceClassification.from_pretrained(rank_model_name, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(rank_model_name, trust_remote_code=True)

# Rerank the top-k documents: score every (query, passage) pair with the cross-encoder.
# NOTE(review): the "Query: " / "Passage: " prefixes are kept from the original;
# confirm that this cross-encoder expects them.
reranked_docs = []
for doc_id, doc in top_k_docs:
    inputs = tokenizer(
        "Query: " + query,
        "Passage: " + doc['text'],
        truncation=True,  # keep over-long passages within the model's maximum input length
        return_tensors="pt",
    )
    with torch.no_grad():  # inference only, no gradients needed
        score = rank_model(**inputs).logits.item()
    reranked_docs.append((doc_id, score))

# Sort by reranking scores, best first
reranked_docs.sort(key=lambda x: x[1], reverse=True)

# Print reranked document IDs and their scores
print("\nReranked Documents:")
for i, (doc_id, score) in enumerate(reranked_docs):
    print(f"Rank {i+1}: Document ID: {doc_id}, Reranking Score: {score:.4f}")


In [None]:
from sklearn.metrics import ndcg_score

# Placeholder ground-truth labels (one row = one query, one column per document).
# They are illustrative only: replace them with the real relevance of the
# documents, listed in the same order as reranked_docs.
true_relevance = np.asarray([[1, 0, 1, 0, 0]])

# Model scores for the same documents, taken from the (doc_id, score) pairs.
predicted_scores = np.asarray([[entry[1] for entry in reranked_docs]])

# NDCG with a cut-off of 10 results
ndcg = ndcg_score(y_true=true_relevance, y_score=predicted_scores, k=10)
print(f"NDCG@10: {ndcg:.4f}")
