## Install dependencies.

In [None]:
# pip install -q torch transformers langchain_chroma bitsandbytes langchain langchain_huggingface langchain-community sentence-transformers  pacmap tqdm matplotlib

Note: you may need to restart the kernel to use updated packages.


In [11]:
from tqdm.notebook import tqdm
import pandas as pd
import os
import csv
import sys
import numpy as np
import time
import random
from typing import Optional, List, Tuple
import matplotlib.pyplot as plt
import textwrap
import torch

# Seed every random number generator used in this notebook (Python, NumPy, PyTorch CPU and CUDA).
seed = 42
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
# Ask cuDNN for deterministic kernels and switch off its benchmark mode.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# Disable Hugging Face tokenizers parallelism.
os.environ["TOKENIZERS_PARALLELISM"] = "false"


# Load the meetings dataset

In [12]:
from langchain.docstore.document import Document
import csv
import sys

def set_csv_field_limit():
    """
    Raise the csv module's per-field size cap as high as the platform allows.

    Starts from ``sys.maxsize`` and divides the candidate by ten each time
    ``csv.field_size_limit`` rejects it with ``OverflowError``.

    :return: The limit value that was finally accepted.
    """
    candidate = sys.maxsize
    while True:
        try:
            csv.field_size_limit(candidate)
        except OverflowError:
            # Too large for the underlying C type on this platform; shrink and retry.
            candidate = int(candidate / 10)
        else:
            return candidate

def load_documents(doc_file):
    """
    Load documents from a tab-separated file into a dictionary.

    Blank rows are skipped. If the same document ID appears more than once,
    the last occurrence wins.

    :param doc_file: Path to the document file (document ID <TAB> document contents).
    :return: A dictionary {document_id: document_contents}.
    :raises ValueError: If a non-empty row does not have exactly two fields.
    """
    # Raise csv's per-field size cap before reading (presumably because the
    # transcripts are long; the default cap would reject oversized fields).
    set_csv_field_limit()

    documents = {}
    # newline='' hands line-ending handling to the csv module, as its documentation requires.
    with open(doc_file, 'r', encoding='utf-8', newline='') as f:
        reader = csv.reader(f, delimiter='\t')
        for row in reader:
            if not row:
                continue
            if len(row) != 2:
                raise ValueError(
                    f"{doc_file}, line {reader.line_num}: expected 2 tab-separated "
                    f"fields (document ID, contents), got {len(row)}"
                )
            doc_id, content = row
            documents[doc_id] = content
    return documents

# Read the transcripts, then wrap each one in a Document whose metadata records its ID
doc_file = 'meetings.tsv'
documents = load_documents(doc_file)

docs = [
    Document(page_content=transcript, metadata={'source': meeting_id})
    for meeting_id, transcript in documents.items()
]

print(f"Total meetings (docs): {len(documents)}")

Total meetings (docs): 230


In [13]:
# Display the raw transcript stored under 'doc_0' to sanity-check the load.
documents['doc_0']

"project manager: yep . soon as i get this . okay . this is our last meeting . um i 'll go ahead and go through the minutes from the previous meeting . uh and then we 'll have a , the prototype presentation . um then we will um do an evaluation . uh or we 'll see what , what we need to have under the criteria for the evaluation . then we 'll go through the finance and see if we fall within the budget . um then we 'll do the evaluation , and then we can finish up after that with um any changes that we 'll need to make , or hopefully everything will fall right in line . um let 's see , minutes from the last meeting . um we looked at uh the the trends . we had uh the fashion trends that people want a fancy look-and-feel . it was twice as important as anything else . um they liked fruit and vegetables in the new styles . um and a spongy feel . so we were talking about trying to incorporate those into our prototype . um they wanted limited buttons and simplicity . um then we looked at the u

# Retriever - Building the retriever 🗂️

 ### 1. Specify an Embedding Model and Visualize Document Lengths


In [14]:
# Name of the sentence-embedding model used for indexing and querying.
EMBEDDING_MODEL_NAME = "thenlper/gte-small"

from sentence_transformers import SentenceTransformer

# Instantiate the model only to report its maximum sequence length.
print(
    f"Model's maximum sequence length: {SentenceTransformer(EMBEDDING_MODEL_NAME).max_seq_length}"
)

from transformers import AutoTokenizer

# Length of every full meeting, measured with the embedding model's tokenizer.
tokenizer = AutoTokenizer.from_pretrained(EMBEDDING_MODEL_NAME)
lengths = [len(tokenizer.encode(doc.page_content)) for doc in tqdm(docs)]

Model's maximum sequence length: 512


  0%|          | 0/230 [00:00<?, ?it/s]

### 2. Split the Documents into Chunks


In [15]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Split each meeting into overlapping snippets. Sizes are in the splitter's
# length unit (characters with its default length function — TODO confirm).
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 768,
    chunk_overlap = 128,
)

doc_snippets = text_splitter.split_documents(docs)
print(f"Total {len(doc_snippets)} snippets to be stored in our vector store.")

# Recompute token lengths per snippet; this overwrites the per-meeting `lengths`.
lengths = [len(tokenizer.encode(doc.page_content)) for doc in tqdm(doc_snippets)]

Total 18070 snippets to be stored in our vector store.


  0%|          | 0/18070 [00:00<?, ?it/s]

### 3. Build the Vector Database


In [16]:
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain_community.vectorstores.utils import DistanceStrategy

# Automatically set the device to 'cuda' if available, otherwise use 'cpu'
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Found device: {device}")


# Embedding model shared by the index build here and the query-encoding helpers further down.
embedding_model = HuggingFaceEmbeddings(
    model_name=EMBEDDING_MODEL_NAME,
    multi_process=True,
    model_kwargs={"device": device},
    encode_kwargs={"normalize_embeddings": True},  # Set `True` for cosine similarity
)

# Time the index build.
start_time = time.time()

# Embed every snippet and store the vectors in a FAISS-backed vector store.
KNOWLEDGE_VECTOR_DATABASE = FAISS.from_documents(
    doc_snippets, embedding_model, distance_strategy=DistanceStrategy.COSINE
)

end_time = time.time()

# Wall-clock duration converted from seconds to minutes.
elapsed_time = (end_time - start_time)/60
print(f"Time taken: {elapsed_time} minutes")


Found device: cuda
Time taken: 0.3376580516497294 minutes


### 4. Querying the Vector Database



In [17]:
## The function for ranking documents given a query:
def rank_documents_biencoder(user_query, top_k = 5):
    """
    Rank documents for a query using the bi-encoder vector store.

    :param user_query: The query text to search with.
    :param top_k: Number of snippets to request from the vector store.
    :return: The ``source`` document ID of each retrieved snippet, in the order
        the vector store returned them. Because the store holds snippets, the
        same document ID can appear more than once.
    """
    hits = KNOWLEDGE_VECTOR_DATABASE.similarity_search(query=user_query, k=top_k)
    return [hit.metadata['source'] for hit in hits]


# Smoke-test the bi-encoder ranker on a single sample question.
user_query = "what did kirsty williams am say about her plan for quality assurance ?"
retrieved_docs = rank_documents_biencoder(user_query)

print("\n==================================Top-5 documents==================================")
print("\n\nRetrieved documents:", retrieved_docs)
print("\n====================================================================\n")




Retrieved documents: ['doc_211', 'doc_2', 'doc_43', 'doc_160', 'doc_43']




### Simplified ColBERT as a Reranker

In [18]:
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModel


# Load tokenizer and model BERT from HuggingFace
# NOTE: this rebinds the module-level `tokenizer`, which until now held the embedding model's tokenizer.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModel.from_pretrained("bert-base-uncased")


def rank_documents_finegrained_interactions(user_query, shortlist = 15, top_k=5):
    """
    Rerank bi-encoder candidates with token-level (ColBERT-style MaxSim) scoring.

    The vector store supplies `shortlist` candidate snippets. For each candidate,
    every query token embedding is matched with its highest dot-product among the
    candidate's token embeddings, and those per-token maxima are summed into the
    candidate's score.

    Args:
    - user_query (str): The user query string.
    - shortlist (int): Number of candidates to fetch from the vector store.
    - top_k (int): Maximum number of reranked document IDs to return.

    Returns:
    - list of `source` document IDs, best score first. Holds fewer than `top_k`
      entries when the vector store returns fewer candidates.
    """
    retrieved_docs = KNOWLEDGE_VECTOR_DATABASE.similarity_search(query=user_query, k=shortlist)

    # Tokenize the user query
    query_inputs = tokenizer(user_query, return_tensors='pt', truncation=True, padding=True)

    ranked_scores = []

    # Pure inference: keep both the query and the document forward passes out of
    # autograd so no graph is built per candidate.
    with torch.no_grad():
        query_embeddings = model(**query_inputs).last_hidden_state  # Shape: (1, seq_len_query, hidden_dim)

        for doc in retrieved_docs:
            doc_inputs = tokenizer(doc.page_content, return_tensors='pt', truncation=True, padding=True)
            doc_embeddings = model(**doc_inputs).last_hidden_state

            # Query-token x document-token dot products.
            similarity_matrix = torch.matmul(query_embeddings, doc_embeddings.transpose(-2, -1))

            # MaxSim: best-matching document token per query token, summed over the query.
            max_similarities = torch.max(similarity_matrix, dim=-1).values
            ranked_scores.append((max_similarities.sum().item(), doc.metadata['source']))

    ranked_scores.sort(reverse=True)

    # Slicing (rather than indexing range(top_k)) tolerates short candidate lists.
    return [source for _, source in ranked_scores[:top_k]]


# Smoke-test the token-level reranker on a single sample question.
user_query = "how did project manager and user interface introduce the prototype of the remote control ?"
retrieved_docs = rank_documents_finegrained_interactions(user_query)

print("\n==================================Top-5 documents==================================")
print("\n\nRetrieved documents:", retrieved_docs)
print("\n====================================================================\n")




Retrieved documents: ['doc_166', 'doc_9', 'doc_218', 'doc_56', 'doc_15']




### Full evaluation pipeline


In [None]:

def load_questions_answers(qa_file):
    """
    Load (document ID, question, answer) triples from a tab-separated file.

    Blank rows are skipped. The returned list is shuffled in place with the
    global `random` generator, so its order depends on that generator's state.

    :param qa_file: Path to the question-answer file (document ID <TAB> question <TAB> answer).
    :return: A shuffled list of tuples [(document_id, question, answer)].
    :raises ValueError: If a non-empty row does not have exactly three fields.
    """
    qa_pairs = []
    # newline='' hands line-ending handling to the csv module, as its documentation requires.
    with open(qa_file, 'r', encoding='utf-8', newline='') as f:
        reader = csv.reader(f, delimiter='\t')
        for row in reader:
            if not row:
                # Tolerate blank lines (e.g. a trailing newline) instead of failing on unpack.
                continue
            if len(row) != 3:
                raise ValueError(
                    f"{qa_file}, line {reader.line_num}: expected 3 tab-separated "
                    f"fields (document ID, question, answer), got {len(row)}"
                )
            doc_id, question, answer = row
            qa_pairs.append((doc_id, question, answer))

    random.shuffle(qa_pairs)

    return qa_pairs

def precision_at_k(ground_truth, retrieved_docs, k):
    """
    Score one query: did the ground-truth document make the top k?

    :param ground_truth: The name of the ground truth document.
    :param retrieved_docs: The list of document names returned by the model in ranked order.
    :param k: The cutoff; only the first k retrieved names are considered.
    :return: 1 if the ground truth is among the first k names, otherwise 0.
    """
    top_slice = retrieved_docs[:k]
    if ground_truth in top_slice:
        return 1
    return 0

def evaluate(doc_file, qa_pairs, ranking_fuction = None, k= 5):
    """
    Evaluate a ranking function over question-answer pairs with Precision@k.

    Prints a running average every 10 queries and the final average at the end.

    :param doc_file: Path to the document file. Not used by this function; kept
        so existing calls keep working.
    :param qa_pairs: Iterable of (document_id, question, answer) tuples.
    :param ranking_fuction: Callable taking a question and returning ranked
        document IDs. It is called with the question only, so `k` cannot
        surface more results than the callable returns by default.
    :param k: The cutoff for Precision@k.
    :return: The average Precision@k over all queries (0.0 when there are none).
    :raises TypeError: If no ranking function is supplied.
    """
    if ranking_fuction is None:
        raise TypeError(
            "evaluate() needs a ranking function: pass ranking_fuction=<callable taking a question>"
        )

    # Running totals avoid re-summing the whole score list on every query.
    hits = 0
    n_done = 0

    for doc_id, question, _ in qa_pairs:
        retrieved_docs = ranking_fuction(question)
        hits += precision_at_k(doc_id, retrieved_docs, k)
        n_done += 1

        if n_done % 10 == 0:
            print(f"After {n_done} queries, Precision@{k}: {hits / n_done}")

    if n_done == 0:
        # Nothing to average; report zero instead of dividing by zero.
        print(f"Precision@{k}: 0.0 (no queries to evaluate)")
        return 0.0

    # Compute average Precision@k
    avg_precision_at_k = hits / n_done

    print(f"Precision@{k}: {avg_precision_at_k}")
    return avg_precision_at_k


qa_file = 'questions_answers.tsv'  # document ID <TAB> question <TAB> answer
qa_pairs = load_questions_answers(qa_file)
print(len(qa_pairs))
# Show the first three (document ID, question, answer) tuples.
qa_pairs[:3]

# Per-query evaluation of the bi-encoder ranker, currently disabled.
# start_time = time.time()
# evaluate(doc_file, qa_pairs,rank_documents_biencoder)
# end_time = time.time()
# elapsed_time = (end_time - start_time)/60
# print(f"Time taken: {elapsed_time} minutes")

### Bi-Encoder

In [None]:
import torch
import numpy as np
from tqdm import tqdm
import time

def batch_encode_queries_v2(queries, embedding_model, batch_size=256, embedding_dim=None):
    """
    Embed a list of query strings in batches and return them as one array.

    :param queries: List of query strings.
    :param embedding_model: Object exposing ``embed_documents(list_of_str)``
        that returns one vector per input string.
    :param batch_size: Number of queries sent to the model per call.
    :param embedding_dim: Optional vector width. When omitted it is read from
        the first encoded batch, so the function works with any embedding model
        rather than only 384-dimensional ones.
    :return: ``float32`` array of shape (len(queries), embedding_dim). With no
        queries and no explicit `embedding_dim`, the shape is (0, 0).
    """
    num_queries = len(queries)

    # Allocate up front only when the width is known; otherwise wait for the first batch.
    all_embeddings = None
    if embedding_dim is not None:
        all_embeddings = np.zeros((num_queries, embedding_dim), dtype=np.float32)

    for i in tqdm(range(0, num_queries, batch_size), desc="Encoding queries"):
        end_idx = min(i + batch_size, num_queries)
        batch_embeddings = np.asarray(
            embedding_model.embed_documents(queries[i:end_idx]), dtype=np.float32
        )

        if all_embeddings is None:
            all_embeddings = np.zeros((num_queries, batch_embeddings.shape[1]), dtype=np.float32)

        all_embeddings[i:end_idx] = batch_embeddings

    if all_embeddings is None:
        # No queries were encoded, so no width could be inferred.
        all_embeddings = np.zeros((0, 0), dtype=np.float32)

    return all_embeddings

def evaluate_gpu_optimized_v2(qa_pairs, ks=(1, 5, 10, 15, 20), batch_size=256, search_batch_size=512):
    """
    Batched bi-encoder evaluation reporting Precision@k for several cutoffs.

    Encodes all questions with `batch_encode_queries_v2`, searches the raw
    FAISS index of `KNOWLEDGE_VECTOR_DATABASE` in batches, maps each hit to the
    `source` ID of the snippet at the same position in `doc_snippets`, and
    scores a query 1 at cutoff k when its ground-truth ID is among the first k.

    NOTE(review): the mapping assumes FAISS row i corresponds to doc_snippets[i],
    which relies on the index having been built from `doc_snippets` in order —
    confirm if the store is ever modified after construction.

    :param qa_pairs: List of (document_id, question, answer) tuples.
    :param ks: Cutoffs to report; retrieval uses the largest one.
    :param batch_size: Batch size for query encoding.
    :param search_batch_size: Number of queries per index search call.
    :return: Dictionary {k: mean Precision@k}.
    """
    def _rate(count, seconds):
        # A stage can finish within timer resolution; avoid dividing by zero.
        return count / seconds if seconds > 0 else float("inf")

    questions = [q for _, q, _ in qa_pairs]
    ground_truths = [doc_id for doc_id, _, _ in qa_pairs]
    max_k = max(ks)  # Use maximum k for retrieval
    num_queries = len(questions)

    print(f"Starting evaluation with batch_size={batch_size}, search_batch_size={search_batch_size}")
    start_time = time.time()

    # 1. Batch encode queries
    print("Encoding queries...")
    query_embeddings = batch_encode_queries_v2(questions, embedding_model, batch_size)
    encoding_time = time.time() - start_time
    print(f"Encoding completed in {encoding_time:.1f} seconds")

    # 2. Batch similarity search
    print("Performing batch similarity search...")
    search_start = time.time()

    all_I = []
    for i in tqdm(range(0, num_queries, search_batch_size), desc="Searching"):
        batch_embeddings = query_embeddings[i:i + search_batch_size]

        # Retrieve max_k neighbours per query; the returned distances are not needed.
        _, I = KNOWLEDGE_VECTOR_DATABASE.index.search(batch_embeddings, max_k)
        all_I.extend(I)

    search_time = time.time() - search_start
    print(f"Search completed in {search_time:.1f} seconds")

    # 3. Process results
    snippet_sources = [doc.metadata['source'] for doc in doc_snippets]
    # FAISS pads with -1 when it has fewer than max_k neighbours; drop those slots.
    retrieved_docs = [
        [snippet_sources[idx] for idx in query_indices if idx >= 0]
        for query_indices in all_I
    ]

    # Calculate precision for each k
    precision_scores = {}
    for k in ks:
        scores = [
            1 if gt in retrieved[:k] else 0
            for gt, retrieved in zip(ground_truths, retrieved_docs)
        ]
        precision_scores[k] = np.mean(scores)

    # Calculate timing metrics
    total_time = time.time() - start_time
    qps = _rate(num_queries, total_time)

    # Print results
    print("\nPerformance Breakdown:")
    print(f"- Encoding time: {encoding_time:.1f}s ({_rate(num_queries, encoding_time):.1f} queries/s)")
    print(f"- Search time: {search_time:.1f}s ({_rate(num_queries, search_time):.1f} queries/s)")
    print(f"\nPrecision Results:")
    for k in ks:
        print(f"Precision@{k}: {precision_scores[k]:.3f}")
    print(f"\nTiming:")
    print(f"Total time: {total_time:.1f} seconds")
    print(f"Average speed: {qps:.1f} queries/second")

    return precision_scores


qa_file = 'questions_answers.tsv'
# Reloading reshuffles the pairs: load_questions_answers shuffles on every call.
qa_pairs = load_questions_answers(qa_file)

# Batched bi-encoder evaluation at cutoffs 1 through 5, timed end to end.
start_time = time.time()
precision_scores = evaluate_gpu_optimized_v2(
    qa_pairs, 
    ks=[1,2,3,4,5],
    batch_size=256,
    search_batch_size=512
)
print(f"Total evaluation time: {(time.time() - start_time)/60:.2f} minutes")

In [None]:
from typing import List, Dict, Any
import numpy as np
from rouge_score import rouge_scorer
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from nltk.tokenize import word_tokenize
from sentence_transformers import SentenceTransformer
import torch.nn.functional as F
import torch
from dataclasses import dataclass
from collections import defaultdict

@dataclass
class RAGEvalMetrics:
    """Stores evaluation metrics for a single answer.

    One instance is produced per prediction by RAGEvaluator.evaluate_single.
    """
    rouge1: float  # ROUGE-1 F-measure
    rouge2: float  # ROUGE-2 F-measure
    rougeL: float  # ROUGE-L F-measure
    bleu: float  # sentence-level BLEU computed with a smoothing function
    semantic_similarity: float  # cosine similarity between sentence embeddings
    retrieval_accuracy: float  # 1.0 if the ground-truth doc ID was retrieved, else 0.0

class RAGEvaluator:
    """Scores generated answers against references with ROUGE, BLEU, embedding
    similarity, and a retrieval hit flag, per example and in aggregate."""

    def __init__(self, semantic_model_name: str = "sentence-transformers/all-mpnet-base-v2"):
        """
        Set up the scorers used by the evaluator.

        Args:
            semantic_model_name: Sentence-embedding model used for the
                semantic similarity metric.
        """
        rouge_variants = ['rouge1', 'rouge2', 'rougeL']
        self.rouge_scorer = rouge_scorer.RougeScorer(rouge_variants, use_stemmer=True)
        self.semantic_model = SentenceTransformer(semantic_model_name)
        self.smoothing = SmoothingFunction().method1

    def _compute_rouge_scores(self, prediction: str, reference: str) -> Dict[str, float]:
        """Return the F-measure of ROUGE-1, ROUGE-2 and ROUGE-L, keyed by variant name."""
        scored = self.rouge_scorer.score(reference, prediction)
        return {
            variant: scored[variant].fmeasure
            for variant in ('rouge1', 'rouge2', 'rougeL')
        }

    def _compute_bleu_score(self, prediction: str, reference: str) -> float:
        """Return smoothed sentence-level BLEU of the prediction against one reference."""
        references = [word_tokenize(reference)]
        hypothesis = word_tokenize(prediction)
        return sentence_bleu(references, hypothesis, smoothing_function=self.smoothing)

    def _compute_semantic_similarity(self, prediction: str, reference: str) -> float:
        """Return the cosine similarity between the two texts' sentence embeddings."""
        pred_vec, ref_vec = self.semantic_model.encode([prediction, reference])
        pred_tensor = torch.tensor(pred_vec).unsqueeze(0)
        ref_tensor = torch.tensor(ref_vec).unsqueeze(0)
        return F.cosine_similarity(pred_tensor, ref_tensor).item()

    def evaluate_single(
        self,
        prediction: str,
        reference: str,
        retrieved_doc_ids: List[str],
        ground_truth_doc_id: str
    ) -> RAGEvalMetrics:
        """
        Score one generated answer.

        Args:
            prediction: Generated answer
            reference: Ground truth answer
            retrieved_doc_ids: List of retrieved document IDs
            ground_truth_doc_id: Ground truth document ID

        Returns:
            RAGEvalMetrics holding the three ROUGE F-measures, BLEU, semantic
            similarity, and 1.0/0.0 for whether the ground-truth document ID
            is among the retrieved IDs.
        """
        rouge = self._compute_rouge_scores(prediction, reference)
        bleu = self._compute_bleu_score(prediction, reference)
        similarity = self._compute_semantic_similarity(prediction, reference)
        hit = 1.0 if ground_truth_doc_id in retrieved_doc_ids else 0.0

        return RAGEvalMetrics(
            bleu=bleu,
            semantic_similarity=similarity,
            retrieval_accuracy=hit,
            **rouge,
        )

    def evaluate_batch(self, results: List[Dict[str, Any]]) -> Dict[str, Any]:
        """
        Score a list of pipeline results and aggregate the metrics.

        Args:
            results: List of dictionaries with the keys 'question',
                'generated_answer', 'ground_truth_answer', 'retrieved_doc_ids'
                and 'ground_truth_doc_id'.

        Returns:
            Dictionary with 'mean_metrics' and 'std_metrics' (per metric name)
            and 'detailed_results' (one entry per input result).
        """
        per_example = []
        collected = defaultdict(list)

        for item in results:
            metrics = self.evaluate_single(
                prediction=item['generated_answer'],
                reference=item['ground_truth_answer'],
                retrieved_doc_ids=item['retrieved_doc_ids'],
                ground_truth_doc_id=item['ground_truth_doc_id'],
            )

            per_example.append({
                'question': item['question'],
                'prediction': item['generated_answer'],
                'reference': item['ground_truth_answer'],
                'metrics': metrics,
            })

            # Collect each metric's value under its field name for aggregation.
            for name in metrics.__dataclass_fields__:
                collected[name].append(getattr(metrics, name))

        mean_metrics = {name: np.mean(values) for name, values in collected.items()}
        std_metrics = {name: np.std(values) for name, values in collected.items()}

        return {
            'mean_metrics': mean_metrics,
            'std_metrics': std_metrics,
            'detailed_results': per_example,
        }

    def print_evaluation_summary(self, eval_results: Dict[str, Any]) -> None:
        """Print mean ± std for every metric, followed by the first three examples."""
        print("\nRAG Evaluation Summary")
        print("=" * 50)

        print("\nMean Metrics:")
        print("-" * 20)
        for name, mean_value in eval_results['mean_metrics'].items():
            print(f"{name:20s}: {mean_value:.4f} ± {eval_results['std_metrics'][name]:.4f}")

        print("\nSample Predictions:")
        print("-" * 20)
        for example in eval_results['detailed_results'][:3]:  # only the first 3 examples
            print(f"\nQuestion: {example['question']}")
            print(f"Prediction: {example['prediction']}")
            print(f"Reference: {example['reference']}")
            print(f"ROUGE-L: {example['metrics'].rougeL:.4f}")
            print(f"Semantic Similarity: {example['metrics'].semantic_similarity:.4f}")

In [None]:
# Initialize the evaluator
evaluator = RAGEvaluator()

# Run RAG pipeline and get results
# NOTE(review): evaluate_rag_pipeline is not defined anywhere above this cell in the
# visible notebook — confirm it is defined before this cell runs, otherwise this
# line raises NameError on a fresh kernel.
rag_results = evaluate_rag_pipeline(qa_pairs)

# Evaluate the results
evaluation_results = evaluator.evaluate_batch(rag_results)

# Print evaluation summary
evaluator.print_evaluation_summary(evaluation_results)

### Simplified ColBERT 

In [None]:
import faiss
# Alternative encoder (disabled):
# tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-mpnet-base-v2")
# model = AutoModel.from_pretrained("sentence-transformers/all-mpnet-base-v2")

# Active encoder for reranking; rebinds the module-level `tokenizer` and `model`.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModel.from_pretrained("bert-base-uncased")

# Alternative encoder (disabled):
# tokenizer = AutoTokenizer.from_pretrained("microsoft/DialogRPT-updown")
# model = AutoModel.from_pretrained("microsoft/DialogRPT-updown")


def batch_initial_retrieval_v2(questions, k=15, batch_size=512):
    """
    Retrieve the top-k snippets for every question with one batched FAISS search.

    Questions are embedded in batches with `embedding_model`, then searched
    against the index of `KNOWLEDGE_VECTOR_DATABASE`. The search runs on a GPU
    copy of the index when this faiss build exposes GPU support and a GPU is
    present; otherwise it runs on the existing CPU index.

    NOTE(review): hits are mapped to snippets by position in `doc_snippets`,
    which relies on the index having been built from `doc_snippets` in order.

    :param questions: List of question strings.
    :param k: Number of neighbours to retrieve per question.
    :param batch_size: Number of questions embedded per model call.
    :return: One list of snippet objects per question, nearest first. A list is
        shorter than k when the index holds fewer than k vectors.
    """
    cpu_index = KNOWLEDGE_VECTOR_DATABASE.index
    search_index = cpu_index

    # GPU FAISS is optional: CPU-only builds do not provide StandardGpuResources.
    if hasattr(faiss, "StandardGpuResources") and faiss.get_num_gpus() > 0:
        res = faiss.StandardGpuResources()  # must stay referenced while the GPU index is in use
        search_index = faiss.index_cpu_to_gpu(res, 0, cpu_index)

    # Pre-allocate embeddings array
    num_queries = len(questions)
    all_embeddings = np.zeros((num_queries, cpu_index.d), dtype=np.float32)

    print("Computing embeddings...")
    for i in tqdm(range(0, num_queries, batch_size)):
        end_idx = min(i + batch_size, num_queries)
        all_embeddings[i:end_idx] = embedding_model.embed_documents(questions[i:end_idx])

    print("Performing batch search...")
    if num_queries == 0:
        return []

    # Single batch search for all queries; the returned distances are not needed.
    _, I = search_index.search(all_embeddings, k)

    # FAISS pads missing neighbours with -1. Indexing doc_snippets[-1] would
    # silently return the last snippet, so those slots are dropped.
    return [[doc_snippets[idx] for idx in indices if idx >= 0] for indices in I]

def optimized_colbert_rerank_v3(questions, docs_list, max_k=20, batch_size=64):
    """
    Rerank each question's candidate snippets with ColBERT-style MaxSim scoring.

    For every question, candidates are encoded in small padded batches. Each
    query token is matched with its highest dot-product among the candidate's
    real (non-padding) tokens, and those maxima are summed into the score.
    Padding positions are excluded via the tokenizer's attention mask so a
    candidate's score does not depend on which other candidates share its batch.

    Side effect: moves the module-level `model` to CUDA when available.

    :param questions: List of question strings.
    :param docs_list: For each question, the list of candidate snippet objects
        (each with `.page_content` and `.metadata['source']`).
    :param max_k: Maximum number of reranked document IDs kept per question.
    :param batch_size: Number of questions handled per progress-bar step.
    :return: For each question, the `source` IDs of its candidates, best first,
        truncated to `max_k`.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    doc_batch_size = 8  # candidates encoded per forward pass
    all_reranked = []

    for i in tqdm(range(0, len(questions), batch_size), desc="Reranking"):
        batch_questions = questions[i:i + batch_size]
        batch_docs = docs_list[i:i + batch_size]

        for question, docs in zip(batch_questions, batch_docs):
            # Get query embeddings
            q_inputs = tokenizer(
                question,
                padding=True,
                truncation=True,
                return_tensors="pt",
                max_length=512
            ).to(device)

            with torch.no_grad():
                q_embeds = model(**q_inputs).last_hidden_state  # (1, q_len, hidden)

            doc_scores = []

            for j in range(0, len(docs), doc_batch_size):
                doc_batch = docs[j:j + doc_batch_size]

                d_inputs = tokenizer(
                    [d.page_content for d in doc_batch],
                    padding=True,
                    truncation=True,
                    return_tensors="pt",
                    max_length=512
                ).to(device)

                with torch.no_grad():
                    d_embeds = model(**d_inputs).last_hidden_state  # (n_docs, d_len, hidden)

                    # Broadcast the single query over the batch: (n_docs, q_len, d_len).
                    sim_matrix = torch.matmul(q_embeds, d_embeds.transpose(-2, -1))

                    # Padding tokens must never win the max; mask them to -inf.
                    pad_positions = d_inputs["attention_mask"].eq(0).unsqueeze(1)
                    sim_matrix = sim_matrix.masked_fill(pad_positions, float("-inf"))

                    batch_scores = sim_matrix.max(dim=-1).values.sum(dim=-1).tolist()

                doc_scores.extend(
                    (score, d.metadata['source']) for score, d in zip(batch_scores, doc_batch)
                )

            # Sort and keep the top max_k docs
            doc_scores.sort(reverse=True)
            all_reranked.append([source for _, source in doc_scores[:max_k]])

    return all_reranked

def evaluate_optimized_v3(qa_pairs, initial_k=20, ks=(1, 5, 10, 15, 20), batch_size=512, rerank_batch_size=64):
    """
    Two-stage evaluation: batched bi-encoder retrieval, then ColBERT-style reranking.

    Retrieves candidates with `batch_initial_retrieval_v2`, reranks them with
    `optimized_colbert_rerank_v3`, and scores a query 1 at cutoff k when its
    ground-truth document ID is among the first k reranked IDs.

    :param qa_pairs: List of (document_id, question, answer) tuples.
    :param initial_k: Candidate pool size; raised to max(ks) if smaller.
    :param ks: Cutoffs to report.
    :param batch_size: Batch size for the initial retrieval.
    :param rerank_batch_size: Batch size passed to the reranker.
    :return: Dictionary {k: mean Precision@k}.
    """
    def _rate(count, seconds):
        # A stage can finish within timer resolution; avoid dividing by zero.
        return count / seconds if seconds > 0 else float("inf")

    max_k = max(ks)
    initial_k = max(initial_k, max_k)

    questions = [q for _, q, _ in qa_pairs]
    ground_truths = [doc_id for doc_id, _, _ in qa_pairs]
    num_queries = len(questions)

    print("1. Initial retrieval (batched)...")
    init_start = time.time()
    initial_retrieved = batch_initial_retrieval_v2(
        questions,
        k=initial_k,
        batch_size=batch_size
    )
    init_time = time.time() - init_start

    print("\n2. ColBERT reranking...")
    rerank_start = time.time()
    reranked_docs = optimized_colbert_rerank_v3(
        questions,
        initial_retrieved,
        max_k=max_k,  # keep enough reranked IDs for the largest cutoff
        batch_size=rerank_batch_size
    )
    rerank_time = time.time() - rerank_start

    # Calculate metrics
    precision_scores = {}
    for k in ks:
        scores = [
            1 if gt in reranked[:k] else 0
            for gt, reranked in zip(ground_truths, reranked_docs)
        ]
        precision_scores[k] = np.mean(scores)

    # Sample the clock once so both timing lines describe the same interval.
    total_time = time.time() - init_start

    # Print results
    print("\nPerformance Breakdown:")
    print(f"Initial Retrieval: {init_time:.1f}s ({_rate(num_queries, init_time):.1f} q/s)")
    print(f"Reranking: {rerank_time:.1f}s ({_rate(num_queries, rerank_time):.1f} q/s)")

    print("\nPrecision Results:")
    for k in sorted(ks):
        print(f"Precision@{k}: {precision_scores[k]:.3f}")

    print("\nTiming:")
    print(f"Total time: {total_time:.1f}s")
    print(f"Average speed: {_rate(num_queries, total_time):.1f} queries/second")

    return precision_scores


# Test with multiple k values
# Runs the two-stage (retrieve, then rerank) evaluation on the loaded QA pairs.
scores = evaluate_optimized_v3(
    qa_pairs,
    initial_k=20,            # Initial retrieval pool size
    ks=[1,2,3,4,5],  # K values to evaluate
    batch_size=512,         # Initial retrieval batch size
    rerank_batch_size=64    # Reranking batch size
)

# Reader

In [None]:
from dataclasses import dataclass
from typing import List, Optional, Dict

@dataclass
class PromptTemplate:
    """A prompt string with named ``{placeholders}`` plus the names it expects."""
    template: str
    input_variables: List[str]

    def format(self, **kwargs) -> str:
        """Substitute the keyword arguments into the template and return the text."""
        rendered = self.template.format(**kwargs)
        return rendered

class PromptManager:
    """Registry of named prompt styles, each taking a context and a question."""

    def __init__(self):
        """Build the style-name -> PromptTemplate registry."""
        style_texts = {
            # Zero-shot prompting
            "basic": "Answer the question based on the given context.\n\nContext: {context}\n\nQuestion: {question}\n\nAnswer:",
            # Chain-of-thought prompting
            "cot": "Let's approach this step-by-step:\n\n1) First, understand the question: {question}\n\n2) Here's the relevant context: {context}\n\n3) Let's analyze the context and break down the key points\n\n4) Based on this analysis, provide a detailed answer.\n\nReasoning and answer:",
            # Role-based prompting
            "expert": "As an expert in meeting analysis, review the following context and answer the question.\n\nContext: {context}\n\nQuestion: {question}\n\nExpert analysis and answer:",
            # Self-reflection prompting
            "reflective": "Question: {question}\n\nContext: {context}\n\nLet me think about this carefully:\n1. What are the key points in the context?\n2. How do they relate to the question?\n3. What might I be missing?\n\nConsidering these points, here's my answer:",
            # Structured output prompting
            "structured": "Based on the context below, provide a structured answer to the question.\n\nContext: {context}\n\nQuestion: {question}\n\nAnswer in the following format:\n- Main point:\n- Supporting details:\n- Additional context:\n- Confidence level:\n",
        }

        # Every style expects the same two variables; each template gets its own list.
        self.templates = {
            style: PromptTemplate(template=text, input_variables=["context", "question"])
            for style, text in style_texts.items()
        }

    def get_prompt(self, style: str, **kwargs) -> str:
        """
        Render the prompt for a style.

        Raises ValueError when `style` is not a registered style name.
        """
        chosen = self.templates.get(style)
        if chosen is None:
            raise ValueError(f"Unknown prompt style: {style}")
        return chosen.format(**kwargs)

In [None]:
from typing import List, Dict, Optional
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM  
from dataclasses import dataclass

@dataclass
class ReaderConfig:
    """Configuration for the Reader component.

    The `device` default is resolved once, when this class is defined, from
    torch.cuda.is_available().
    """
    model_name: str = "google/flan-t5-base"  # Can also use larger variants
    # Token cap used when truncating the context and when tokenizing the prompt.
    max_source_length: int = 1024
    # Passed to generate() as max_length.
    max_target_length: int = 256
    # Generation settings forwarded to generate().
    num_beams: int = 4
    device: str = "cuda" if torch.cuda.is_available() else "cpu"
    temperature: float = 0.7
    do_sample: bool = True
    top_p: float = 0.95
    # NOTE(review): not referenced by Reader or EnhancedReader in the code visible here;
    # EnhancedReader takes its prompts from PromptManager instead.
    prompt_template: str = "Answer the question based on the given context.\n\nContext: {context}\n\nQuestion: {question}\n\nAnswer:"

class Reader:
    """Loads a seq2seq model and tokenizer and builds a length-bounded context string."""

    def __init__(self, config: "ReaderConfig"):
        """Initialize the Reader with a config: load the tokenizer and model onto the configured device."""
        self.config = config
        self.device = torch.device(config.device)

        # Load model and tokenizer
        self.tokenizer = AutoTokenizer.from_pretrained(config.model_name)
        # AutoModelForSeq2SeqLM is the class this notebook imports from transformers.
        self.model = AutoModelForSeq2SeqLM.from_pretrained(config.model_name)
        self.model.to(self.device)

    def _prepare_context(self, retrieved_docs: List, max_length: Optional[int] = None) -> str:
        """
        Join retrieved documents into one context string of at most `max_length` tokens.

        :param retrieved_docs: Snippet objects exposing `.page_content`; plain
            strings are accepted as well and used as-is.
        :param max_length: Token budget for the context. Falls back to
            `config.max_source_length` when omitted (or zero).
        :return: The documents joined with " [DOC] ", truncated to the budget
            if necessary. The returned text never contains special tokens.
        """
        if not max_length:
            max_length = self.config.max_source_length

        # Join documents with separator
        texts = [getattr(doc, "page_content", doc) for doc in retrieved_docs]
        context = " [DOC] ".join(texts)

        # Truncate if needed. Special tokens are left out on both encode and decode:
        # the context is embedded in a larger prompt that is tokenized again later,
        # so an end-of-sequence marker must not end up in the middle of that prompt.
        tokens = self.tokenizer.encode(context, add_special_tokens=False)
        if len(tokens) > max_length:
            context = self.tokenizer.decode(tokens[:max_length], skip_special_tokens=True)

        return context
    
class EnhancedReader(Reader):
    """Reader that builds its prompt from a named PromptManager template."""

    def __init__(self, config: ReaderConfig):
        """Initialize the base Reader and attach a PromptManager."""
        super().__init__(config)
        self.prompt_manager = PromptManager()

    def generate_answer_with_prompt_style(
        self,
        question: str,
        retrieved_docs: List,
        prompt_style: str = "basic",
        return_context: bool = False
    ) -> Dict[str, str]:
        """
        Generate an answer using a specific prompting strategy.

        Args:
            question: The user question inserted into the prompt.
            retrieved_docs: Retrieved documents, forwarded to ``_prepare_context``.
            prompt_style: Template key understood by the prompt manager.
            return_context: When True, also return the context and full prompt.

        Returns:
            Dictionary with 'answer' and 'prompt_style', plus 'context' and
            'full_prompt' when ``return_context`` is True.

        Raises:
            ValueError: Propagated from the prompt manager for an unknown style.
        """
        # Measure the prompt with an empty context first. Giving the context the
        # whole max_source_length budget would push the rest of the template
        # (and the question, if it follows the context) past the limit, where
        # the tokenizer's truncation below silently drops it.
        scaffold = self.prompt_manager.get_prompt(
            style=prompt_style,
            context="",
            question=question
        )
        scaffold_tokens = len(self.tokenizer.encode(scaffold))
        # Keep the budget >= 1: a falsy value makes _prepare_context fall back
        # to the full max_source_length.
        context_budget = max(self.config.max_source_length - scaffold_tokens, 1)
        context = self._prepare_context(retrieved_docs, context_budget)

        # Get the appropriate prompt
        prompt = self.prompt_manager.get_prompt(
            style=prompt_style,
            context=context,
            question=question
        )

        # Tokenize the prompt; truncation stays on as a safety net because
        # decoding and re-encoding the context is not guaranteed to be
        # token-count preserving.
        inputs = self.tokenizer(
            prompt,
            max_length=self.config.max_source_length,
            padding=True,
            truncation=True,
            return_tensors="pt"
        ).to(self.device)

        with torch.no_grad():
            outputs = self.model.generate(
                input_ids=inputs.input_ids,
                attention_mask=inputs.attention_mask,
                max_length=self.config.max_target_length,
                num_beams=self.config.num_beams,
                temperature=self.config.temperature,
                do_sample=self.config.do_sample,
                top_p=self.config.top_p
            )

        answer = self.tokenizer.decode(outputs[0], skip_special_tokens=True)

        result = {
            "answer": answer,
            "prompt_style": prompt_style
        }

        if return_context:
            result["context"] = context
            result["full_prompt"] = prompt

        return result


In [None]:
reader_config = ReaderConfig()
reader = EnhancedReader(config=reader_config)

# EnhancedReader's generation entry point is generate_answer_with_prompt_style;
# neither it nor Reader defines generate_answer, so the old call raised
# AttributeError. The default prompt style ("basic") is used here.
# `user_query` and `retrieved_docs` are expected to come from earlier cells.
# return_context=True keeps the context in the result for the evaluation step.
rag_output = reader.generate_answer_with_prompt_style(
    user_query,
    retrieved_docs,
    return_context=True
)
rag_output


In [None]:
from typing import List, Tuple, Dict
import numpy as np
from rouge_score import rouge_scorer
from nltk.translate.bleu_score import sentence_bleu
import spacy
from collections import defaultdict

class RAGQAEvaluator:
    """Scores a RAG answer against ground-truth QA pairs with ROUGE, BLEU and spaCy similarity."""

    def __init__(self):
        """Initialize the RAG QA evaluation system.

        When the spaCy model is not installed, an install hint is printed and
        ``self.nlp`` is set to None; similarity-based metrics are then reported
        as NaN instead of failing later with AttributeError.
        """
        self.rouge_scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
        try:
            self.nlp = spacy.load('en_core_web_sm')
        except OSError:
            # spacy.load raises OSError for a missing model package; other
            # errors are left to propagate instead of being swallowed.
            self.nlp = None
            print("Please install spaCy model: python -m spacy download en_core_web_sm")

    def evaluate_answer(self, 
                       rag_output: Dict[str, str],
                       qa_pairs: List[Tuple[str, str, str]]) -> Dict:
        """
        Evaluate RAG output against ground truth QA pairs.
        
        Args:
            rag_output: Dictionary containing 'answer' and 'context' from RAG
            qa_pairs: List of tuples (doc_id, question, answer) from ground truth
            
        Returns:
            Dictionary containing evaluation metrics, or a dictionary with
            'error' and 'retrieved_docs' when no ground-truth pair matches
        """
        metrics = {}
        
        # A missing context is treated as empty, so it ends in the "no match"
        # result below rather than a KeyError.
        context_text = rag_output.get('context', '')
        context_tokens = set(context_text.split())
        
        # NOTE(review): matching relies on a ground-truth doc_id appearing as a
        # whitespace-separated token of the context text - confirm the caller
        # actually embeds document ids in the context.
        known_ids = {doc_id for doc_id, _, _ in qa_pairs}
        # Report only tokens that are real document ids (not every word of the
        # context), sorted so the output is stable between runs.
        mentioned_docs = sorted(token for token in context_tokens if token in known_ids)
        
        # Find matching QA pairs based on document overlap
        matching_pairs = [
            (q, a) for doc_id, q, a in qa_pairs 
            if doc_id in context_tokens
        ]
        
        if not matching_pairs:
            return {
                'error': 'No matching ground truth QA pairs found for the retrieved documents',
                'retrieved_docs': mentioned_docs
            }
        
        # Calculate metrics for the generated answer against all matching pairs
        answer_metrics = [
            self._calculate_metrics(
                generated_answer=rag_output['answer'],
                reference_question=question,
                reference_answer=answer
            )
            for question, answer in matching_pairs
        ]
        
        # Take the best scores across all matching pairs
        metrics['best_match'] = {
            metric: max(m[metric] for m in answer_metrics)
            for metric in answer_metrics[0].keys()
        }
        
        # Calculate relevance to retrieved context (needs the spaCy model)
        if 'context' in rag_output and self.nlp is not None:
            context_doc = self.nlp(' '.join(context_text.split()))
            answer_doc = self.nlp(rag_output['answer'])
            metrics['context_relevance'] = context_doc.similarity(answer_doc)
        
        # Add document coverage metrics
        metrics['document_coverage'] = {
            'num_retrieved': len(mentioned_docs),
            'retrieved_docs': mentioned_docs
        }
        
        return metrics

    def _calculate_metrics(self, 
                         generated_answer: str,
                         reference_question: str, 
                         reference_answer: str) -> Dict[str, float]:
        """Calculate various similarity metrics between generated and reference text.

        Returns a dictionary with 'rouge1_f1', 'rouge2_f1', 'rougeL_f1', 'bleu',
        'semantic_similarity' and 'question_relevance'. The last two are NaN
        when the spaCy model could not be loaded.
        """
        metrics = {}
        
        # ROUGE scores
        rouge_scores = self.rouge_scorer.score(reference_answer, generated_answer)
        metrics['rouge1_f1'] = rouge_scores['rouge1'].fmeasure
        metrics['rouge2_f1'] = rouge_scores['rouge2'].fmeasure
        metrics['rougeL_f1'] = rouge_scores['rougeL'].fmeasure
        
        # BLEU score on whitespace tokens
        metrics['bleu'] = sentence_bleu(
            [reference_answer.split()],
            generated_answer.split()
        )
        
        if self.nlp is None:
            # Keep the keys the summary expects, but flag them as unavailable
            metrics['semantic_similarity'] = float('nan')
            metrics['question_relevance'] = float('nan')
            return metrics
        
        # Semantic similarity
        ref_answer_doc = self.nlp(reference_answer)
        gen_answer_doc = self.nlp(generated_answer)
        metrics['semantic_similarity'] = ref_answer_doc.similarity(gen_answer_doc)
        
        # Question relevance
        ref_question_doc = self.nlp(reference_question)
        metrics['question_relevance'] = gen_answer_doc.similarity(ref_question_doc)
        
        return metrics

    def get_evaluation_summary(self, metrics: Dict) -> str:
        """Generate a human-readable summary of the evaluation metrics.

        Accepts either a metrics dictionary or the 'error' dictionary returned
        by ``evaluate_answer`` and returns a multi-line string.
        """
        if 'error' in metrics:
            return f"Error: {metrics['error']}\nRetrieved documents: {', '.join(metrics['retrieved_docs'])}"
        
        summary = []
        
        if 'best_match' in metrics:
            best = metrics['best_match']
            summary.append("Best Matching Scores:")
            summary.append("Content Overlap:")
            summary.append(f"- ROUGE-1 F1: {best['rouge1_f1']:.3f}")
            summary.append(f"- ROUGE-2 F1: {best['rouge2_f1']:.3f}")
            summary.append(f"- ROUGE-L F1: {best['rougeL_f1']:.3f}")
            summary.append(f"- BLEU Score: {best['bleu']:.3f}")
            summary.append("\nSemantic Evaluation:")
            summary.append(f"- Semantic Similarity: {best['semantic_similarity']:.3f}")
            summary.append(f"- Question Relevance: {best['question_relevance']:.3f}")
        
        if 'context_relevance' in metrics:
            summary.append(f"\nContext Relevance: {metrics['context_relevance']:.3f}")
        
        if 'document_coverage' in metrics:
            coverage = metrics['document_coverage']
            summary.append("\nDocument Coverage:")
            summary.append(f"- Number of Retrieved Documents: {coverage['num_retrieved']}")
            summary.append(f"- Retrieved Documents: {', '.join(coverage['retrieved_docs'])}")
        
        return '\n'.join(summary)

In [None]:
# Initialize the evaluator (its constructor builds the ROUGE scorer and tries to load the spaCy model)
evaluator = RAGQAEvaluator()

# Get evaluation metrics for the generated answer.
# evaluate_answer reads rag_output['answer'] and rag_output['context'];
# qa_pairs is documented there as a list of (doc_id, question, answer) tuples
# and is expected to come from an earlier cell (not shown here).
metrics = evaluator.evaluate_answer(rag_output, qa_pairs)

# Print human-readable summary (an "Error: ..." message when no QA pair matched)
print(evaluator.get_evaluation_summary(metrics))