In [1]:
from tqdm.notebook import tqdm
import pandas as pd
import os
import csv
import sys
import numpy as np
import time
import random
from typing import Optional, List, Tuple
import matplotlib.pyplot as plt
import textwrap
import torch

seed = 42
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# Disable huffingface tokenizers parallelism <- should huggingface
os.environ["TOKENIZERS_PARALLELISM"] = "false"


# Load the meetings dataset

In [2]:
from langchain.docstore.document import Document
import csv
import sys

def set_csv_field_limit():
    maxInt = sys.maxsize
    while True:
        try:
            csv.field_size_limit(maxInt)
            break
        except OverflowError:
            maxInt = int(maxInt/10)
    return maxInt

def load_documents(doc_file):
    """
    Loads the document contents from the first file.

    :param doc_file: Path to the document file (document ID <TAB> document contents).
    :return: A dictionary {document_id: document_contents}.
    """
    # Set the field size limit first
    set_csv_field_limit()

    documents = {}
    with open(doc_file, 'r', encoding='utf-8') as f:
        reader = csv.reader(f, delimiter='\t')
        for row in reader:
            if len(row)==0: continue
            doc_id, content = row
            documents[doc_id] = content
    return documents

# Load and process the documents
docs = []
doc_file = 'meetings.tsv'
documents = load_documents(doc_file)

for doc_id in documents:
    doc = Document(page_content=documents[doc_id])
    metadata = {'source': doc_id}
    doc.metadata = metadata
    docs.append(doc)

print(f"Total meetings (docs): {len(documents)}")

Total meetings (docs): 230


In [3]:
documents['doc_0']

"project manager: yep . soon as i get this . okay . this is our last meeting . um i 'll go ahead and go through the minutes from the previous meeting . uh and then we 'll have a , the prototype presentation . um then we will um do an evaluation . uh or we 'll see what , what we need to have under the criteria for the evaluation . then we 'll go through the finance and see if we fall within the budget . um then we 'll do the evaluation , and then we can finish up after that with um any changes that we 'll need to make , or hopefully everything will fall right in line . um let 's see , minutes from the last meeting . um we looked at uh the the trends . we had uh the fashion trends that people want a fancy look-and-feel . it was twice as important as anything else . um they liked fruit and vegetables in the new styles . um and a spongy feel . so we were talking about trying to incorporate those into our prototype . um they wanted limited buttons and simplicity . um then we looked at the u

# Retriever - Building the retriever 🗂️

 ### 1. Specify an Embedding Model and Visualize Document Lengths


In [4]:
EMBEDDING_MODEL_NAME = "thenlper/gte-small"

from sentence_transformers import SentenceTransformer

print(
    f"Model's maximum sequence length: {SentenceTransformer(EMBEDDING_MODEL_NAME).max_seq_length}"
)

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(EMBEDDING_MODEL_NAME)
lengths = [len(tokenizer.encode(doc.page_content)) for doc in tqdm(docs)]

Model's maximum sequence length: 512


  0%|          | 0/230 [00:00<?, ?it/s]

In [5]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 768,
    chunk_overlap = 128,
)

doc_snippets = text_splitter.split_documents(docs)
print(f"Total {len(doc_snippets)} snippets to be stored in our vector store.")

lengths = [len(tokenizer.encode(doc.page_content)) for doc in tqdm(doc_snippets)]

Total 18070 snippets to be stored in our vector store.


  0%|          | 0/18070 [00:00<?, ?it/s]

### 3. Build the Vector Database

To enable retrieval, we need to compute embeddings for all chunks in our knowledge base. These embeddings will then be stored in a vector database.

#### How Retrieval Works

A query is embedded using an embedding model and a similarity search finds the closest matching chunks in the vector database.

The following cell builds the vector database consisting of  all chunks in our knowledge base.


In [6]:
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain_community.vectorstores.utils import DistanceStrategy

# Automatically set the device to 'cuda' if available, otherwise use 'cpu'
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Found device: {device}")


embedding_model = HuggingFaceEmbeddings(
    model_name=EMBEDDING_MODEL_NAME,
    multi_process=True,
    model_kwargs={"device": device},
    encode_kwargs={"normalize_embeddings": True},  # Set `True` for cosine similarity
)

start_time = time.time()

KNOWLEDGE_VECTOR_DATABASE = FAISS.from_documents(
    doc_snippets, embedding_model, distance_strategy=DistanceStrategy.COSINE
)

end_time = time.time()

elapsed_time = (end_time - start_time)/60
print(f"Time taken: {elapsed_time} minutes")


Found device: cuda
Time taken: 0.33181405464808145 minutes


In [7]:
## The function for ranking documents given a query:
def rank_documents_biencoder(user_query, top_k = 5):
    """
    Function for document ranking based on the query.

    :param query: The query to retrieve documents for.
    :return: A list of document IDs ranked based on the query (mocked).
    """
    retrieved_docs = KNOWLEDGE_VECTOR_DATABASE.similarity_search(query=user_query, k=top_k)
    ranked_list = []
    for i, doc in enumerate(retrieved_docs):
        ranked_list.append(retrieved_docs[i].metadata['source'])

    return ranked_list  # ranked document IDs.


user_query = "what did kirsty williams am say about her plan for quality assurance ?"
retrieved_docs = rank_documents_biencoder(user_query)

print("\n==================================Top-5 documents==================================")
print("\n\nRetrieved documents:", retrieved_docs)
print("\n====================================================================\n")




Retrieved documents: ['doc_211', 'doc_2', 'doc_43', 'doc_160', 'doc_43']




### Bi-Encoder Evaluation Pipeline

In [8]:

def load_questions_answers(qa_file):
    """
    Loads the questions and corresponding ground truth document IDs.

    :param qa_file: Path to the question-answer file (document ID <TAB> question <TAB> answer).
    :return: A list of tuples [(document_id, question, answer)].
    """
    qa_pairs = []
    with open(qa_file, 'r', encoding='utf-8') as f:
        reader = csv.reader(f, delimiter='\t')
        for row in reader:
            doc_id, question, answer = row
            qa_pairs.append((doc_id, question, answer))

    random.shuffle(qa_pairs)

    return qa_pairs

def precision_at_k(ground_truth, retrieved_docs, k):
    """
    Computes Precision at k for a single query.

    :param ground_truth: The name of the ground truth document.
    :param retrieved_docs: The list of document names returned by the model in ranked order.
    :param k: The cutoff for computing Precision.
    :return: Precision at k.
    """
    return 1 if ground_truth in retrieved_docs[:k] else 0

def evaluate(doc_file, qa_pairs, ranking_fuction = None, k= 5):
    """
    Evaluate the retrieval system based on the documents and question-answer pairs.

    :param doc_file: Path to the document file.
    :param qa_file: Path to the question-answer file.
    :param k: The cutoff for Precision@k.
    """
    # Load the QA pairs


    precision_scores = []


    for doc_id, question, _ in qa_pairs:

        retrieved_docs = ranking_fuction(question)
        precision_scores.append(precision_at_k(doc_id, retrieved_docs, k))

        avg_precision_at_k = sum(precision_scores) / len(precision_scores)

        if len(precision_scores) %10==0:
            print(f"After {len(precision_scores)} queries, Precision@{k}: {avg_precision_at_k}")

    # Compute average Precision@k
    avg_precision_at_k = sum(precision_scores) / len(precision_scores)

    print(f"Precision@{k}: {avg_precision_at_k}")

In [9]:
import torch
import numpy as np
from tqdm import tqdm
import time

def batch_encode_queries_v2(queries, embedding_model, batch_size=256):
    """
    Optimized batch encoding with larger batches and better GPU utilization
    """
    # Pre-allocate memory for all embeddings
    num_queries = len(queries)
    embedding_dim = 384  # We know this from the output
    all_embeddings = np.zeros((num_queries, embedding_dim), dtype=np.float32)
    
    # Process in larger batches
    for i in tqdm(range(0, num_queries, batch_size), desc="Encoding queries"):
        end_idx = min(i + batch_size, num_queries)
        batch = queries[i:end_idx]
        
        # Get embeddings for batch
        batch_embeddings = embedding_model.embed_documents(batch)
        all_embeddings[i:end_idx] = batch_embeddings
    
    return all_embeddings

def evaluate_gpu_optimized_v2(qa_pairs, ks=[1, 5, 10, 15, 20], batch_size=256, search_batch_size=512):
    """
    Optimized GPU evaluation with precision@k for multiple k values
    """
    questions = [q for _, q, _ in qa_pairs]
    ground_truths = [doc_id for doc_id, _, _ in qa_pairs]
    max_k = max(ks)  # Use maximum k for retrieval
    
    print(f"Starting evaluation with batch_size={batch_size}, search_batch_size={search_batch_size}")
    start_time = time.time()
    
    # 1. Batch encode queries
    print("Encoding queries...")
    query_embeddings = batch_encode_queries_v2(questions, embedding_model, batch_size)
    encoding_time = time.time() - start_time
    print(f"Encoding completed in {encoding_time:.1f} seconds")
    
    # 2. Batch similarity search
    print("Performing batch similarity search...")
    search_start = time.time()
    
    all_D = []
    all_I = []
    num_queries = len(questions)
    
    for i in tqdm(range(0, num_queries, search_batch_size), desc="Searching"):
        end_idx = min(i + search_batch_size, num_queries)
        batch_embeddings = query_embeddings[i:end_idx]
        
        # Retrieve max_k documents
        D, I = KNOWLEDGE_VECTOR_DATABASE.index.search(batch_embeddings, max_k)
        all_D.extend(D)
        all_I.extend(I)
    
    search_time = time.time() - search_start
    print(f"Search completed in {search_time:.1f} seconds")
    
    # 3. Process results
    doc_dict = {i: doc.metadata['source'] for i, doc in enumerate(doc_snippets)}
    retrieved_docs = [[doc_dict[idx] for idx in query_indices] for query_indices in all_I]
    
    # Calculate precision for each k
    precision_scores = {}
    for k in ks:
        scores = [
            1 if gt in retrieved[:k] else 0 
            for gt, retrieved in zip(ground_truths, retrieved_docs)
        ]
        precision_scores[k] = np.mean(scores)
    
    # Calculate timing metrics
    total_time = time.time() - start_time
    qps = num_queries / total_time
    
    # Print results
    print("\nPerformance Breakdown:")
    print(f"- Encoding time: {encoding_time:.1f}s ({num_queries/encoding_time:.1f} queries/s)")
    print(f"- Search time: {search_time:.1f}s ({num_queries/search_time:.1f} queries/s)")
    print(f"\nPrecision Results:")
    for k in ks:
        print(f"Precision@{k}: {precision_scores[k]:.3f}")
    print(f"\nTiming:")
    print(f"Total time: {total_time:.1f} seconds")
    print(f"Average speed: {qps:.1f} queries/second")
    
    return precision_scores


qa_file = 'questions_answers.tsv'
qa_pairs = load_questions_answers(qa_file)

start_time = time.time()
precision_scores = evaluate_gpu_optimized_v2(
    qa_pairs, 
    ks=[1,2,3,4,5],
    batch_size=256,
    search_batch_size=512
)
print(f"Total evaluation time: {(time.time() - start_time)/60:.2f} minutes")

Starting evaluation with batch_size=256, search_batch_size=512
Encoding queries...


Encoding queries: 100%|██████████| 5/5 [00:17<00:00,  3.43s/it]


Encoding completed in 17.1 seconds
Performing batch similarity search...


Searching: 100%|██████████| 3/3 [00:03<00:00,  1.24s/it]

Search completed in 3.7 seconds

Performance Breakdown:
- Encoding time: 17.1s (67.2 queries/s)
- Search time: 3.7s (310.0 queries/s)

Precision Results:
Precision@1: 0.278
Precision@2: 0.354
Precision@3: 0.400
Precision@4: 0.438
Precision@5: 0.467

Timing:
Total time: 20.9 seconds
Average speed: 55.2 queries/second
Total evaluation time: 0.35 minutes





# Reader

In [10]:
from dataclasses import dataclass
from typing import List, Dict
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from tqdm import tqdm
import numpy as np
from rouge_score import rouge_scorer
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from bert_score import score

@dataclass
class BaselineReaderConfig:
    """Simple configuration for the baseline T5 reader"""
    model_name: str = "google/flan-t5-base"  # Can also use small/large variants
    max_input_length: int = 512  # Keep shorter for faster inference
    max_output_length: int = 64   # Short answers for baseline
    device: str = "cuda" if torch.cuda.is_available() else "cpu"
    batch_size: int = 8  # For batch processing during evaluation

class BaselineReader:
    """Simple T5-based reader for RAG baseline"""
    
    def __init__(self, config: BaselineReaderConfig):
        self.config = config
        self.device = torch.device(config.device)
        
        # Initialize model and tokenizer
        self.tokenizer = AutoTokenizer.from_pretrained(config.model_name)
        self.model = AutoModelForSeq2SeqLM.from_pretrained(config.model_name)
        self.model.to(self.device)
        
        # Initialize ROUGE scorer
        self.rouge_scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
        
    def generate_answer(self, question: str, context: str) -> str:
        """Generate an answer for a single question-context pair"""
        # Create input text
        input_text = f"Question: {question}\nContext: {context}\nAnswer:"
        
        # Tokenize
        inputs = self.tokenizer(
            input_text,
            max_length=self.config.max_input_length,
            truncation=True,
            return_tensors="pt"
        ).to(self.device)
        
        # Generate
        with torch.no_grad():
            outputs = self.model.generate(
                inputs.input_ids,
                max_length=self.config.max_output_length,
                num_return_sequences=1,
            )
        
        # Decode
        answer = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
        return answer
    
    def evaluate_batch(self, questions: List[str], contexts: List[str], 
                      reference_answers: List[str]) -> Dict:
        """Evaluate the reader on a batch of questions"""
        assert len(questions) == len(contexts) == len(reference_answers)
        
        generated_answers = []
        rouge1_scores = []
        rouge2_scores = []
        rougeL_scores = []
        bleu_scores = []
        exact_matches = []
        f1_scores = []
        
        # Process in batches
        for i in tqdm(range(0, len(questions), self.config.batch_size)):
            batch_questions = questions[i:i + self.config.batch_size]
            batch_contexts = contexts[i:i + self.config.batch_size]
            
            # Generate answers for batch
            for question, context in zip(batch_questions, batch_contexts):
                answer = self.generate_answer(question, context)
                generated_answers.append(answer)
        
        for gen, ref in zip(generated_answers, reference_answers):
            scores = self.rouge_scorer.score(ref, gen)
            rouge1_scores.append(scores['rouge1'].fmeasure)
            rouge2_scores.append(scores['rouge2'].fmeasure)
            rougeL_scores.append(scores['rougeL'].fmeasure)

            # Exact Match
            exact_matches.append(compute_exact_match(gen, ref))

            f1_scores.append(compute_f1(gen, ref))

            # BLEU Score
            smoothie = SmoothingFunction().method1
            bleu_scores.append(sentence_bleu([ref.split()], 
                                       gen.split(), 
                                       smoothing_function=smoothie))
        
        # Aggregate metrics
        metrics = {
            'rouge1': np.mean(rouge1_scores),
            'rouge2': np.mean(rouge2_scores),
            'rougeL': np.mean(rougeL_scores),
            'exact_match': np.mean(exact_matches),
            'f1': np.mean(f1_scores),
            'bleu' : np.mean(bleu_scores),
            'num_samples': len(questions)
        }
        
        # Store some examples
        examples = list(zip(questions[:5], contexts[:5], 
                          generated_answers[:5], reference_answers[:5]))
        
        return {
            'metrics': metrics,
            'examples': examples
        }

def compute_exact_match(prediction, reference):
    return int(prediction.strip().lower() == reference.strip().lower())

def compute_f1(prediction, reference):
    pred_tokens = set(prediction.lower().split())
    ref_tokens = set(reference.lower().split())
    common = pred_tokens & ref_tokens
    if not common:
        return 0
    precision = len(common) / len(pred_tokens)
    recall = len(common) / len(ref_tokens)
    return 2 * precision * recall / (precision + recall)

def print_evaluation_summary(eval_results: Dict):
    """Print a readable summary of the evaluation results"""
    print("\n=== Baseline Reader Evaluation Summary ===")
    
    metrics = eval_results['metrics']
    print(f"\nScores (over {metrics['num_samples']} samples):")
    
    # Print all metrics in a compact format
    for metric_name, value in metrics.items():
        if metric_name != 'num_samples':
            print(f"{metric_name}: {value:.3f}")
    
    print("\nExample Predictions:")
    for i, (q, c, pred, ref) in enumerate(eval_results['examples'], 1):
        print(f"\nExample {i}:")
        print(f"Q: {q}")
        print(f"Pred: {pred}")
        print(f"Ref: {ref}")
        print("-" * 50)

In [11]:
# Initialize
config = BaselineReaderConfig()
reader = BaselineReader(config)

# Single prediction
user_query = "what did kirsty williams am say about her plan for quality assurance ?"
retrieved_docs = rank_documents_biencoder(user_query, top_k=1)
context = documents[retrieved_docs[0]]
answer = reader.generate_answer(user_query, context)
print(answer)

she said it was a collaborative effort


In [12]:
def batch_retrieve(queries, ks=1, batch_size=256, search_batch_size=512):
    """
    Optimized GPU evaluation with precision@k for multiple k values
    """
    max_k = ks  # Use maximum k for retrieval
    
    print(f"Starting evaluation with batch_size={batch_size}, search_batch_size={search_batch_size}")
    start_time = time.time()
    
    # 1. Batch encode queries
    print("Encoding queries...")
    query_embeddings = batch_encode_queries_v2(queries, embedding_model, batch_size)
    encoding_time = time.time() - start_time
    print(f"Encoding completed in {encoding_time:.1f} seconds")
    
    # 2. Batch similarity search
    print("Performing batch similarity search...")
    search_start = time.time()
    
    all_D = []
    all_I = []
    num_queries = len(queries)
    
    for i in tqdm(range(0, num_queries, search_batch_size), desc="Searching"):
        end_idx = min(i + search_batch_size, num_queries)
        batch_embeddings = query_embeddings[i:end_idx]
        
        # Retrieve max_k documents
        D, I = KNOWLEDGE_VECTOR_DATABASE.index.search(batch_embeddings, max_k)
        all_D.extend(D)
        all_I.extend(I)
    
    search_time = time.time() - search_start
    print(f"Search completed in {search_time:.1f} seconds")
    
    # 3. Process results
    doc_dict = {i: doc.metadata['source'] for i, doc in enumerate(doc_snippets)}
    retrieved_docs = [[doc_dict[idx] for idx in query_indices] for query_indices in all_I]
    return retrieved_docs

In [13]:
batch_split = 50

questions = [q for id, q, a in qa_pairs]
reference_answers = [a for id, q, a in qa_pairs]

retrieved_docs = batch_retrieve(questions)
contexts = [documents[d_id[0]] for d_id in retrieved_docs]

Starting evaluation with batch_size=256, search_batch_size=512
Encoding queries...


Encoding queries: 100%|██████████| 5/5 [00:16<00:00,  3.23s/it]


Encoding completed in 16.2 seconds
Performing batch similarity search...


Searching: 100%|██████████| 3/3 [00:03<00:00,  1.25s/it]

Search completed in 3.8 seconds





In [14]:
# Batch evaluation
eval_results = reader.evaluate_batch(questions, contexts, reference_answers)
print_evaluation_summary(eval_results)

100%|██████████| 144/144 [02:37<00:00,  1.09s/it]



=== Baseline Reader Evaluation Summary ===

Scores (over 1152 samples):
rouge1: 0.057
rouge2: 0.008
rougeL: 0.048
exact_match: 0.000
f1: 0.070
bleu: 0.001

Example Predictions:

Example 1:
Q: why did n't the team believe that the remote control could fully depend on speech recognition and have no buttons ?
Pred: they did n't think it was worth pursuing
Ref: age group data for remote control use was not available ; many people may not want to learn to use the new remote control ; some buttons are still needed , such as channel control , volume settings and on/off .
--------------------------------------------------

Example 2:
Q: what was agreed upon on sample transcripts ?
Pred: automatic speech recognition
Ref: to save time , speaker mn005 will only mark the sample of transcribed data for regions of overlapping speech , as opposed to marking all acoustic events . the digits extraction task will be delegated to whomever is working on acoustics for the meeting recorder project .
------