In [None]:
!pip install datasets

In [None]:
from datasets import load_dataset

# Load the HotpotQA dataset from the Hugging Face Hub.
# NOTE(review): `ds` is not referenced again in the visible notebook; the same
# dataset is loaded a second time in a later cell as `hotpotqa_dataset`.
ds = load_dataset("lucadiliello/hotpotqa")

In [None]:
import nltk
from nltk.tokenize import sent_tokenize

# sent_tokenize needs the Punkt sentence-tokenizer data. Recent NLTK releases
# look it up under 'punkt_tab', older ones under 'punkt', so fetch both.
# nltk.download() reports (rather than raises on) a resource name it does not know.
for punkt_resource in ('punkt', 'punkt_tab'):
    nltk.download(punkt_resource)

def chunk_text(text, chunk_size=150):
    """Group the sentences of ``text`` into chunks of at most ``chunk_size`` words.

    Sentences are produced by ``sent_tokenize`` and are never split: a single
    sentence longer than ``chunk_size`` becomes a chunk of its own. Word counts
    are whitespace-separated token counts (``str.split``).

    Args:
        text: The text to split.
        chunk_size: Maximum number of words per chunk (exceeded only by a
            single over-long sentence).

    Returns:
        A list of chunk strings, each made of whole sentences joined by a
        single space. Empty when ``text`` yields no sentences. No empty
        chunks are produced.
    """
    chunks = []
    current_chunk = []
    current_length = 0

    for sentence in sent_tokenize(text):
        sentence_length = len(sentence.split())

        # Flush only when there is something buffered; otherwise an over-long
        # sentence arriving on an empty buffer would emit an empty-string chunk.
        if current_chunk and current_length + sentence_length > chunk_size:
            chunks.append(" ".join(current_chunk))
            current_chunk = []
            current_length = 0

        current_chunk.append(sentence)
        current_length += sentence_length

    # Emit whatever is still buffered after the last sentence.
    if current_chunk:
        chunks.append(" ".join(current_chunk))

    return chunks

def preprocess_dataset(dataset, chunk_size=150):
    """Chunk the context of every item in the lucadiliello/hotpotqa dataset.

    Args:
        dataset: Iterable of items exposing the keys ``'question'``,
            ``'context'`` and ``'answers'``.
        chunk_size: Maximum number of words per chunk, forwarded to
            ``chunk_text``.

    Returns:
        A list with one dict per item, holding ``'question'``,
        ``'chunked_passages'`` (the list returned by ``chunk_text``) and
        ``'answers'`` (copied through unchanged).
    """
    processed_data = []

    for item in dataset:
        context = item['context']

        # " ".join() over a plain string would put a space between every
        # character, so only join when the context is a collection of passages.
        # NOTE(review): the Hub copy of this dataset appears to store `context`
        # as a single string - confirm against the dataset card.
        if isinstance(context, str):
            passages = context
        else:
            passages = " ".join(context)

        processed_data.append({
            'question': item['question'],
            'chunked_passages': chunk_text(passages, chunk_size),
            'answers': item['answers'],
        })

    return processed_data


In [None]:
!pip install transformers==4.42.4

In [None]:
pip install sentence-transformers

In [None]:
from sentence_transformers import SentenceTransformer, util
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch

class MultiStageRetrieval:
    """Two-stage retrieval: embedding similarity search followed by reranking.

    A small and a large model are held for each stage; ``retrieve_candidates``
    and ``rerank`` pick between them through their ``model_type`` argument.
    """

    def __init__(self, embed_model_small, embed_model_large, rank_model_small, rank_model_large):
        """Load the four models.

        Args:
            embed_model_small: Model name or path given to ``SentenceTransformer``.
            embed_model_large: Model name or path given to ``SentenceTransformer``
                with ``trust_remote_code=True``.
            rank_model_small: Model name or path given to
                ``AutoModelForSequenceClassification`` and ``AutoTokenizer``.
            rank_model_large: Same as ``rank_model_small``, for the large reranker.
        """
        # Embedding Models (Small and Large)
        self.embed_model_small = SentenceTransformer(embed_model_small)

        # Loaded through SentenceTransformer as well, so both embedders expose the
        # same ``encode(..., convert_to_tensor=True)`` call used by
        # retrieve_candidates. trust_remote_code=True executes modelling code
        # shipped in the model repository - only use with repositories you trust.
        self.embed_model_large = SentenceTransformer(embed_model_large, trust_remote_code=True)

        # Ranking Models (Small and Large)
        self.rank_model_small = AutoModelForSequenceClassification.from_pretrained(rank_model_small)
        self.rank_tokenizer_small = AutoTokenizer.from_pretrained(rank_model_small)
        self.rank_model_large = AutoModelForSequenceClassification.from_pretrained(rank_model_large)
        self.rank_tokenizer_large = AutoTokenizer.from_pretrained(rank_model_large)

    def retrieve_candidates(self, query, chunked_passages, top_k=5, model_type='small'):
        """
        Retrieve top-k relevant chunked passages using embedding models.
        Args:
            query: The input question/query from the dataset.
            chunked_passages: A list of text chunks from the context passages.
            top_k: The number of top-k passages to retrieve. Clamped to the
                number of passages available.
            model_type: 'small' selects the small embedding model; any other
                value selects the large one.
        Returns:
            ``(indices, values)``: tensors with the positions (into
            ``chunked_passages``) and cosine similarities of the best
            passages, most similar first. Both are empty when there is
            nothing to retrieve.
        """
        embed_model = self.embed_model_small if model_type == 'small' else self.embed_model_large

        # torch.topk raises when k exceeds the number of scores, so never ask
        # for more passages than exist.
        k = min(top_k, len(chunked_passages))
        if k <= 0:
            return torch.empty(0, dtype=torch.long), torch.empty(0)

        query_embedding = embed_model.encode(query, convert_to_tensor=True)
        passage_embeddings = embed_model.encode(chunked_passages, convert_to_tensor=True)

        # Row 0 holds the similarities of the single query against every passage.
        scores = util.pytorch_cos_sim(query_embedding, passage_embeddings)[0]
        best = torch.topk(scores, k=k)
        return best.indices, best.values

    def rerank(self, query, retrieved_passages, model_type='small'):
        """
        Rerank the retrieved passages using ranking models.
        Args:
            query: The input question/query.
            retrieved_passages: Passages retrieved from the candidate retrieval stage.
            model_type: 'small' selects the small ranking model; any other
                value selects the large one.
        Returns:
            ``(ranked_indices, relevance_scores)``: ``relevance_scores[i]`` is
            the score of ``retrieved_passages[i]`` and ``ranked_indices`` lists
            passage positions from highest to lowest score.
        """
        if model_type == 'small':
            model, tokenizer = self.rank_model_small, self.rank_tokenizer_small
        else:
            model, tokenizer = self.rank_model_large, self.rank_tokenizer_large

        relevance_scores = []
        with torch.no_grad():
            for passage in retrieved_passages:
                # Encode the (query, passage) pair as one sequence-pair input.
                encoded = tokenizer(query, passage, return_tensors='pt', truncation=True, padding=True)
                outputs = model(**encoded)
                # Assumes the model emits a single relevance logit per pair;
                # .item() raises if there is more than one.
                relevance_scores.append(outputs.logits[0].item())

        # Sort the passages by relevance score in descending order
        ranked_indices = sorted(
            range(len(relevance_scores)),
            key=relevance_scores.__getitem__,
            reverse=True,
        )
        return ranked_indices, relevance_scores


In [None]:
import numpy as np

# Define the Recall@K computation function
def compute_recall_at_k(retrieved_docs, relevant_docs, k=10):
    """Compute Recall@K: the share of relevant documents found in the top K.

    Args:
        retrieved_docs: Retrieved document identifiers, best first. Only the
            first ``k`` are considered.
        relevant_docs: Identifiers of the relevant documents. Duplicates are
            counted once.
        k: Cut-off rank.

    Returns:
        ``|top-k retrieved ∩ relevant| / |relevant|`` as a float, or ``0.0``
        when there are no relevant documents.
    """
    # De-duplicate so the denominator matches the set-based numerator; otherwise
    # a repeated relevant id could never be fully "recalled".
    relevant = set(relevant_docs)
    if not relevant:
        return 0.0

    hits = relevant.intersection(retrieved_docs[:k])
    return len(hits) / len(relevant)

# Evaluate the retrieve-then-rerank pipeline with Recall@K
def evaluate_retrieval(retrieval_system, dataset, k=10, batch_size=64, num_candidates=None):
    """Evaluate a retrieval system on preprocessed items using mean Recall@K.

    For every item the chunks are first narrowed down with
    ``retrieval_system.retrieve_candidates`` and then ordered with
    ``retrieval_system.rerank``. A chunk counts as relevant when it contains
    one of the item's answers (case-insensitive substring match); Recall@K is
    the share of relevant chunks that appear among the top ``k`` reranked ones.

    Args:
        retrieval_system: Object exposing
            ``retrieve_candidates(query, chunked_passages, top_k=...)`` returning
            ``(indices, scores)`` and ``rerank(query, passages)`` returning
            ``(ranked_positions, scores)``.
        dataset: Sized iterable of dicts with the keys ``'question'``,
            ``'chunked_passages'`` and ``'answers'``.
            Assumes answers are strings - TODO confirm against the dataset.
        k: Cut-off rank for Recall@K.
        batch_size: Accepted for backward compatibility and ignored; items are
            processed one at a time because the retrieval system is called
            through its per-query methods.
        num_candidates: Number of chunks handed from the embedding stage to the
            reranker. Defaults to ``2 * k`` so that reranking can change which
            chunks end up in the top ``k``; never fewer than ``k`` and never
            more than the item has.

    Returns:
        The mean Recall@K over all items as a float; ``0.0`` for an empty
        dataset. Items without chunks or without any answer-bearing chunk
        contribute a recall of 0.
    """
    num_samples = len(dataset)
    if num_samples == 0:
        return 0.0
    if num_candidates is None:
        num_candidates = 2 * k

    total_recall = 0.0
    for item in dataset:
        chunks = item['chunked_passages']
        if not chunks:
            continue  # nothing to retrieve; the item still counts in the mean

        question = item['question']

        # Stage 1: embedding search. The pool is capped at the number of chunks
        # because top-k selection cannot return more entries than exist.
        pool_size = min(max(num_candidates, k), len(chunks))
        candidate_indices, _ = retrieval_system.retrieve_candidates(question, chunks, top_k=pool_size)
        candidate_indices = [int(index) for index in candidate_indices]

        # Stage 2: rerank the candidates, then map the reranker's positions
        # (relative to the candidate list) back to chunk indices.
        order, _ = retrieval_system.rerank(question, [chunks[index] for index in candidate_indices])
        ranked_chunk_indices = [candidate_indices[position] for position in order]

        answers = [answer.lower() for answer in item['answers'] if answer]
        relevant_chunk_indices = [
            index for index, chunk in enumerate(chunks)
            if any(answer in chunk.lower() for answer in answers)
        ]

        total_recall += compute_recall_at_k(ranked_chunk_indices, relevant_chunk_indices, k=k)

    # Return the average Recall@K over all samples
    return total_recall / num_samples



In [None]:
pip install einops

In [None]:
from datasets import load_dataset

# Load the training split of the HotpotQA dataset from the Hugging Face Hub.
# trust_remote_code is not passed: the argument only matters for datasets that
# ship a loading script.
# NOTE(review): this assumes lucadiliello/hotpotqa is served as plain data files,
# and that recent `datasets` releases reject the argument - confirm both.
hotpotqa_dataset = load_dataset("lucadiliello/hotpotqa", "default", split="train")

# Assuming preprocess_dataset is defined in your notebook
# Example preprocess function (adjust based on your actual implementation)

def processed_dataset(dataset, chunk_size=150):
    """Preprocess the lucadiliello/hotpotqa dataset by chunking the context passages.

    Kept as an alias of ``preprocess_dataset`` so both names yield the same
    result instead of maintaining two copies of the loop.

    Args:
        dataset: Iterable of items with ``'question'``, ``'context'`` and
            ``'answers'`` keys.
        chunk_size: Maximum number of words per chunk.

    Returns:
        Whatever ``preprocess_dataset`` returns for the same arguments (a list
        with one dict per item).
    """
    # Return the processed data; never this function object itself.
    return preprocess_dataset(dataset, chunk_size)
# Preprocess the HotpotQA dataset
hotpotqa_data = preprocess_dataset(hotpotqa_dataset)

# Initialize multi-stage retrieval system with selected models
from transformers import AutoModel, AutoTokenizer

# MultiStageRetrieval loads every model itself from the identifier it is given,
# so pass identifiers. Handing it preloaded AutoModel objects does not match
# what its constructor does with the arguments and would load each model twice.
EMBED_MODEL_SMALL = 'sentence-transformers/all-MiniLM-L6-v2'
EMBED_MODEL_LARGE = 'nvidia/NV-Embed-v2'
RANK_MODEL_SMALL = 'cross-encoder/ms-marco-MiniLM-L-12-v2'
# NOTE(review): confirm this reranker can be downloaded from the Hugging Face Hub
# and loads without trust_remote_code; MultiStageRetrieval.__init__ does not
# forward that flag to the ranking models.
RANK_MODEL_LARGE = 'nvidia/nv-rerankqa-mistral-4b-v3'

retrieval_system = MultiStageRetrieval(
    embed_model_small=EMBED_MODEL_SMALL,
    embed_model_large=EMBED_MODEL_LARGE,
    rank_model_small=RANK_MODEL_SMALL,
    rank_model_large=RANK_MODEL_LARGE,
)

avg_recall_hotpotqa = evaluate_retrieval(retrieval_system, hotpotqa_data, k=10)

# Output the evaluation result
print(f'Average Recall@10 on HotpotQA: {avg_recall_hotpotqa:.4f}')