## Import libraries

In [13]:
from tqdm.notebook import tqdm
import pandas as pd
import os
import csv
import sys
import numpy as np
import time
import random
from typing import Optional, List, Tuple
import matplotlib.pyplot as plt
import textwrap
import torch

# One seed for every random number generator this notebook touches.
seed = 42
for _seed_fn in (
    random.seed,
    np.random.seed,
    torch.manual_seed,
    torch.cuda.manual_seed_all,
):
    _seed_fn(seed)

# cuDNN flags: request deterministic behaviour and disable benchmark mode.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True


## Prepare Data

In [None]:
import csv
import sys

def set_csv_field_limit():
    """
    Raises the csv module's field size limit as high as the platform accepts.

    The first candidate is sys.maxsize; whenever csv.field_size_limit rejects
    a candidate with OverflowError, the candidate is divided by ten and tried
    again.

    :return: The limit that was finally accepted.
    """
    limit = sys.maxsize
    accepted = False
    while not accepted:
        try:
            csv.field_size_limit(limit)
        except OverflowError:
            # Too large for this platform: shrink and retry.
            limit = int(limit / 10)
        else:
            accepted = True
    return limit

def load_documents(doc_file):
    """
    Loads the document contents from a tab-separated file.

    Blank lines are skipped. If the same document ID appears more than once,
    the last occurrence wins. As a side effect, the csv module's field size
    limit is raised via set_csv_field_limit().

    :param doc_file: Path to the document file (document ID <TAB> document contents).
    :return: A dictionary {document_id: document_contents}.
    :raises ValueError: If a non-empty row does not have exactly two fields.
    """
    # Raise the field size limit before reading anything.
    set_csv_field_limit()

    documents = {}
    # newline='' is what the csv module expects, so it can handle line
    # endings (including ones embedded in quoted fields) itself.
    with open(doc_file, 'r', encoding='utf-8', newline='') as f:
        reader = csv.reader(f, delimiter='\t')
        for row in reader:
            if not row:
                continue
            if len(row) != 2:
                # Same exception type as a failed tuple-unpack, but with
                # enough context to locate the offending line.
                raise ValueError(
                    f"{doc_file}, line {reader.line_num}: expected 2 tab-separated "
                    f"fields (document ID, contents), got {len(row)}"
                )
            doc_id, content = row
            documents[doc_id] = content
    return documents

# NOTE(review): `docs` is not referenced again in the visible cells — confirm
# it is unused before removing it.
docs = []
# Relative path: resolved against the kernel's working directory.
doc_file = 'meetings.tsv'
documents = load_documents(doc_file)
# Display one document as a preview of the loaded corpus.
documents['doc_0']

"project manager: yep . soon as i get this . okay . this is our last meeting . um i 'll go ahead and go through the minutes from the previous meeting . uh and then we 'll have a , the prototype presentation . um then we will um do an evaluation . uh or we 'll see what , what we need to have under the criteria for the evaluation . then we 'll go through the finance and see if we fall within the budget . um then we 'll do the evaluation , and then we can finish up after that with um any changes that we 'll need to make , or hopefully everything will fall right in line . um let 's see , minutes from the last meeting . um we looked at uh the the trends . we had uh the fashion trends that people want a fancy look-and-feel . it was twice as important as anything else . um they liked fruit and vegetables in the new styles . um and a spongy feel . so we were talking about trying to incorporate those into our prototype . um they wanted limited buttons and simplicity . um then we looked at the u

In [None]:
import random

def load_questions_answers(qa_file):
    """
    Loads the questions with their ground truth document IDs and reference answers.

    Rows are returned in file order. Blank lines are skipped.

    :param qa_file: Path to the question-answer file (document ID <TAB> question <TAB> answer).
    :return: A list of tuples [(document_id, question, answer)].
    :raises ValueError: If a non-empty row does not have exactly three fields.
    """
    qa_pairs = []
    # newline='' is what the csv module expects, so it can handle line
    # endings itself.
    with open(qa_file, 'r', encoding='utf-8', newline='') as f:
        reader = csv.reader(f, delimiter='\t')
        for row in reader:
            if not row:
                # A blank line (e.g. a trailing newline at end of file).
                continue
            if len(row) != 3:
                raise ValueError(
                    f"{qa_file}, line {reader.line_num}: expected 3 tab-separated "
                    f"fields (document ID, question, answer), got {len(row)}"
                )
            doc_id, question, answer = row
            qa_pairs.append((doc_id, question, answer))

    return qa_pairs

# Relative path: resolved against the kernel's working directory.
qa_file = 'questions_answers.tsv'  # document ID <TAB> question <TAB> answer
qa_pairs = load_questions_answers(qa_file)
# Display the first (document_id, question, answer) tuple as a preview.
qa_pairs[0]


('doc_184',
 "what was bethan owen 's comment towards the high-risk category ?",
 'bethan owen suggested that maintaining the attractiveness of welsh universities to students would be a key driven for solving the financial stress . also , the research portfolio should be heavily invested in , which would also bring economic benefits .')

In [7]:
# Index-aligned views of the corpus (IDs and texts share dict order),
# plus the bare question strings from the QA pairs.
doc_ids = list(documents)
my_collections = list(documents.values())
queries = [question for _doc_id, question, _answer in qa_pairs]

## Load Model

In [4]:
from ragatouille import RAGPretrainedModel

# Load an existing index from disk; no index-building step appears in the
# visible cells, so the index must already exist at this relative path
# (resolved against the kernel's working directory).
RAG = RAGPretrainedModel.from_index(".ragatouille/colbert/indexes/my_index")

[Dec 05, 15:53:27] Loading segmented_maxsim_cpp extension (set COLBERT_LOAD_TORCH_EXTENSION_VERBOSE=True for more info)...


  self.scaler = torch.cuda.amp.GradScaler()


## Evaluate Retriever

In [8]:
# Run one example query against the index and request five results.
query = 'what did kirsty williams am say about her plan for quality assurance ?'
result = RAG.search(query, k=5)

Loading searcher for index my_index for the first time... This may take a few seconds
[Dec 05, 15:53:33] #> Loading codec...
[Dec 05, 15:53:33] #> Loading IVF...
[Dec 05, 15:53:33] Loading segmented_lookup_cpp extension (set COLBERT_LOAD_TORCH_EXTENSION_VERBOSE=True for more info)...


  centroids = torch.load(centroids_path, map_location='cpu')
  avg_residual = torch.load(avgresidual_path, map_location='cpu')
  bucket_cutoffs, bucket_weights = torch.load(buckets_path, map_location='cpu')
  ivf, ivf_lengths = torch.load(os.path.join(self.index_path, "ivf.pid.pt"), map_location='cpu')


[Dec 05, 15:53:33] #> Loading doclens...


100%|██████████| 1/1 [00:00<00:00, 1267.54it/s]

[Dec 05, 15:53:33] #> Loading codes and residuals...



  return torch.load(codes_path, map_location='cpu')
  return torch.load(residuals_path, map_location='cpu')
100%|██████████| 1/1 [00:00<00:00, 19.73it/s]

[Dec 05, 15:53:33] Loading filter_pids_cpp extension (set COLBERT_LOAD_TORCH_EXTENSION_VERBOSE=True for more info)...





[Dec 05, 15:53:33] Loading decompress_residuals_cpp extension (set COLBERT_LOAD_TORCH_EXTENSION_VERBOSE=True for more info)...
Searcher loaded!

#> QueryTokenizer.tensorize(batch_text[0], batch_background[0], bsize) ==
#> Input: . what did kirsty williams am say about her plan for quality assurance ?, 		 True, 		 None
#> Output IDs: torch.Size([32]), tensor([  101,     1,  2054,  2106, 11382, 12096,  2100,  3766,  2572,  2360,
         2055,  2014,  2933,  2005,  3737, 16375,  1029,   102,   103,   103,
          103,   103,   103,   103,   103,   103,   103,   103,   103,   103,
          103,   103])
#> Output Mask: torch.Size([32]), tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0])



  return torch.cuda.amp.autocast() if self.activated else NullContextManager()


In [9]:
# Document IDs of the example results, in the order the search returned them;
# then display the first result.
retrieved_docs = [hit['document_id'] for hit in result]
result[0]

{'content': "kirsty williams am: as always . [ laughter . ] at least in public , huw . huw morris: there is the expectation that they will work together in concert . there 's a lot of joint operation . i think , going forward , we would expect that to continue . we 're looking to the new bill to try to make that clearer . that was a theme in the general and technical consultation exercises that we 've engaged in over the last couple of years . sian gwenllian am: so , you 're happy , therefore , that that partnership has worked . are you happy with that ? kirsty williams am: certainly , in our consultation for the upcoming act , we 've generally heard , certainly from our further education colleges , that they 've been quite content with the arrangements . no concerns about it , certainly from further education colleges . huw morris: there are differences in the systems of quality assurance as they 've historically applied to fe and he , but i understand that that has meant that , as fe

In [None]:
import time
def precision_at_k(ground_truth, retrieved_docs, k):
    """
    Hit indicator at cutoff k for a single query.

    With exactly one ground truth document per query, the score is binary:
    1 when that document is among the first k retrieved, 0 otherwise.

    :param ground_truth: The name of the ground truth document.
    :param retrieved_docs: The list of document names returned by the model in ranked order.
    :param k: The cutoff; only the first k entries of retrieved_docs are inspected.
    :return: 1 on a hit, 0 on a miss.
    """
    top_k = retrieved_docs[:k]
    return int(ground_truth in top_k)

def evaluate(model, qa_pairs, ranking_function=None, max_k=5):
    """
    Evaluate the retrieval system on the question-answer pairs.

    For every question, asks the model for max_k results once and scores them
    with precision_at_k for each k from 1 to max_k. A running summary is
    printed after every 10 queries and a final summary at the end.

    :param model: Object with a search(question, k=...) method returning a list
        of dicts that each carry a 'document_id' key.
    :param qa_pairs: List of (doc_id, question, answer) tuples
    :param ranking_function: Optional ranking function. Currently unused; kept
        so existing calls keep working.
    :param max_k: Maximum k value for precision@k calculation
    :return: Dictionary {k: mean precision@k over all queries}; an empty
        dictionary when qa_pairs is empty.
    """
    cutoffs = range(1, max_k + 1)
    precision_scores = {k: [] for k in cutoffs}

    def _averages():
        # Only called once at least one query has been scored.
        return {k: sum(scores) / len(scores) for k, scores in precision_scores.items()}

    def _report(averages):
        for k, avg_precision in averages.items():
            print(f"Precision@{k}: {avg_precision:.3f}")

    num_queries = 0
    for doc_id, question, _ in qa_pairs:
        # One search with the largest cutoff serves every smaller k.
        results = model.search(question, k=max_k)
        retrieved_docs = [hit['document_id'] for hit in results]

        for k in cutoffs:
            precision_scores[k].append(precision_at_k(doc_id, retrieved_docs, k))
        num_queries += 1

        # Print progress every 10 queries
        if num_queries % 10 == 0:
            print(f"\nAfter {num_queries} queries:")
            _report(_averages())

    if num_queries == 0:
        # Nothing to average; avoid dividing by zero.
        print("\nNo queries to evaluate.")
        return {}

    final_averages = _averages()
    print("\nFinal Results:")
    _report(final_averages)
    return final_averages

# Time the retrieval evaluation over all QA pairs (wall clock).
start_time = time.time()
evaluate(RAG, qa_pairs, max_k=5)
end_time = time.time()
elapsed_time = (end_time - start_time)/60  # seconds -> minutes
print(f"\nTime taken: {elapsed_time:.2f} minutes")

### Prepare retrieved documents for Reader

In [None]:
# One top-1 search per question, kept in qa_pairs order; then reduce every
# result list to its document IDs (one list per question).
results = [RAG.search(question, k=1) for _, question, _ in qa_pairs]
retrieved_docs = [[hit['document_id'] for hit in hits] for hits in results]

In [20]:
import json

# Save the per-question document ID lists to disk. The file stores IDs only,
# in qa_pairs order; it holds no questions, so it can only be matched back to
# them by position.
with open('ragatouille_retrieved_docs.json', 'w') as f:
    json.dump(retrieved_docs, f)

## Reader

In [62]:
from dataclasses import dataclass
from typing import List, Dict
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from tqdm import tqdm
import numpy as np
from rouge_score import rouge_scorer
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from bert_score import score

@dataclass
class BaselineReaderConfig:
    """Simple configuration for the baseline T5 reader.

    Every field has a default, so BaselineReaderConfig() can be used as-is.
    """
    model_name: str = "google/flan-t5-large"  # Model id handed to from_pretrained for tokenizer and model
    max_input_length: int = 512  # Prompt is truncated to this many tokens; keep short for faster inference
    max_output_length: int = 64   # Passed as max_length to generate; short answers for baseline
    device: str = "cuda" if torch.cuda.is_available() else "cpu"  # Evaluated once, when the class is defined
    batch_size: int = 8  # Chunk size of the evaluate_batch progress loop; answers are still generated one at a time

class BaselineReader:
    """Simple T5-based reader for RAG baseline.

    Holds a seq2seq model with its tokenizer and a ROUGE scorer. It generates
    an answer for one (question, context) pair at a time and can score a list
    of generated answers against reference answers.
    """

    def __init__(self, config: BaselineReaderConfig):
        """Load the tokenizer and model named by config.model_name onto config.device.

        :param config: Reader settings (model name, length limits, device, batch size).
        """
        self.config = config
        self.device = torch.device(config.device)

        # Initialize model and tokenizer
        self.tokenizer = AutoTokenizer.from_pretrained(config.model_name)
        self.model = AutoModelForSeq2SeqLM.from_pretrained(config.model_name)
        self.model.to(self.device)

        # Initialize ROUGE scorer
        self.rouge_scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

    @staticmethod
    def _mean(values) -> float:
        """Arithmetic mean as a plain float; 0.0 for an empty sequence."""
        return float(np.mean(values)) if len(values) else 0.0

    def generate_answer(self, question: str, context: str) -> str:
        """Generate an answer for a single question-context pair.

        Sampling is enabled, so the output depends on the torch RNG state.

        :param question: The question text.
        :param context: The passage the answer should be based on.
        :return: The decoded answer with special tokens removed.
        """
        # NOTE(review): the whole prompt is truncated to max_input_length
        # tokens, and the context sits before steps 3) and 4). A long context
        # therefore presumably pushes the trailing instructions (and part of
        # the context) out of the model input — confirm with the tokenizer's
        # truncation side.
        input_text = (
            f"Let's approach this step-by-step:\n\n1) First, understand the question: {question}\n\n2) Here's the relevant context: {context}\n\n3) Let's analyze the context and break down the key points\n\n4) Based on this analysis, provide a detailed answer.\n\nReasoning and answer:"
        )

        # Tokenize
        inputs = self.tokenizer(
            input_text,
            max_length=self.config.max_input_length,
            truncation=True,
            return_tensors="pt"
        ).to(self.device)

        # Generate
        with torch.no_grad():
            outputs = self.model.generate(
                inputs.input_ids,
                # Pass the mask explicitly instead of leaving it to be inferred.
                attention_mask=inputs.attention_mask,
                max_length=self.config.max_output_length,
                min_length=25,  # Prevent very short answers
                do_sample=True,  # Enable sampling
                temperature=0.7,  # Control randomness
                top_p=0.9,  # Nucleus sampling
                no_repeat_ngram_size=3,
                num_return_sequences=1
            )

        # Decode
        answer = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
        return answer

    def evaluate_batch(self, questions: List[str], contexts: List[str],
                      reference_answers: List[str]) -> Dict:
        """Generate answers for all questions and score them against the references.

        :param questions: Question texts.
        :param contexts: One context per question, same order.
        :param reference_answers: One reference answer per question, same order.
        :return: {'metrics': {...}, 'examples': [...]}. 'metrics' holds the mean
            rouge1/rouge2/rougeL F-measure, exact_match, f1 and bleu plus
            'num_samples' (all means are 0.0 when there are no samples).
            'examples' holds up to five
            (question, context, generated_answer, reference_answer) tuples.
        :raises AssertionError: If the three input lists differ in length.
        """
        assert len(questions) == len(contexts) == len(reference_answers), (
            "questions, contexts and reference_answers must have the same length: "
            f"{len(questions)}, {len(contexts)}, {len(reference_answers)}"
        )

        generated_answers = []
        rouge1_scores = []
        rouge2_scores = []
        rougeL_scores = []
        bleu_scores = []
        exact_matches = []
        f1_scores = []

        # Walk the data in chunks of batch_size. The chunks only set the
        # granularity of the progress bar; each answer is generated by its
        # own generate_answer call.
        for i in tqdm(range(0, len(questions), self.config.batch_size)):
            batch_questions = questions[i:i + self.config.batch_size]
            batch_contexts = contexts[i:i + self.config.batch_size]

            for question, context in zip(batch_questions, batch_contexts):
                answer = self.generate_answer(question, context)
                generated_answers.append(answer)

        # The smoothing function is the same for every sample: build it once.
        smoothie = SmoothingFunction().method1

        for gen, ref in zip(generated_answers, reference_answers):
            # ROUGE (scorer takes the reference first, then the prediction)
            scores = self.rouge_scorer.score(ref, gen)
            rouge1_scores.append(scores['rouge1'].fmeasure)
            rouge2_scores.append(scores['rouge2'].fmeasure)
            rougeL_scores.append(scores['rougeL'].fmeasure)

            # Exact Match
            exact_matches.append(compute_exact_match(gen, ref))

            # Token-overlap F1
            f1_scores.append(compute_f1(gen, ref))

            # BLEU Score on whitespace tokens
            bleu_scores.append(sentence_bleu([ref.split()],
                                       gen.split(),
                                       smoothing_function=smoothie))

        # Aggregate metrics
        metrics = {
            'rouge1': self._mean(rouge1_scores),
            'rouge2': self._mean(rouge2_scores),
            'rougeL': self._mean(rougeL_scores),
            'exact_match': self._mean(exact_matches),
            'f1': self._mean(f1_scores),
            'bleu' : self._mean(bleu_scores),
            'num_samples': len(questions)
        }

        # Store some examples
        examples = list(zip(questions[:5], contexts[:5],
                          generated_answers[:5], reference_answers[:5]))

        return {
            'metrics': metrics,
            'examples': examples
        }
    
def compute_exact_match(prediction, reference):
    """Return 1 if both strings are equal after stripping and lower-casing, else 0."""
    normalized_prediction = prediction.strip().lower()
    normalized_reference = reference.strip().lower()
    return 1 if normalized_prediction == normalized_reference else 0

def compute_f1(prediction, reference):
    """
    F1 over the sets of lower-cased whitespace tokens of both strings.

    Tokens are compared as sets, so repeated words count once.

    :param prediction: Generated answer text.
    :param reference: Reference answer text.
    :return: 0 when no token is shared, otherwise the harmonic mean of
        set precision and set recall.
    """
    predicted = {token for token in prediction.lower().split()}
    expected = {token for token in reference.lower().split()}
    num_shared = len(predicted.intersection(expected))
    if num_shared == 0:
        return 0
    precision = num_shared / len(predicted)
    recall = num_shared / len(expected)
    return 2 * precision * recall / (precision + recall)

def print_evaluation_summary(eval_results: Dict):
    """Print the metrics and example predictions held in an evaluation result.

    :param eval_results: Dictionary with a 'metrics' mapping (which includes
        'num_samples') and an 'examples' list of
        (question, context, prediction, reference) tuples.
    """
    separator = "-" * 50
    metrics = eval_results['metrics']

    print("\n=== Baseline Reader Evaluation Summary ===")
    print(f"\nScores (over {metrics['num_samples']} samples):")

    # Every metric except the sample count, three decimals each
    for metric_name in metrics:
        if metric_name == 'num_samples':
            continue
        print(f"{metric_name}: {metrics[metric_name]:.3f}")

    print("\nExample Predictions:")
    example_number = 0
    for q, _context, pred, ref in eval_results['examples']:
        example_number += 1
        # The context is left out of the printout on purpose: only Q/Pred/Ref.
        print(f"\nExample {example_number}:")
        print(f"Q: {q}")
        print(f"Pred: {pred}")
        print(f"Ref: {ref}")
        print(separator)

## Evaluate Reader

In [None]:
import json
def read_json(path='ragatouille_retrieved_docs.json'):
    """
    Loads the retrieved document IDs saved as JSON.

    :param path: JSON file to read. Defaults to the file name the retrieval
        section of this notebook writes.
    :return: The parsed JSON content.
    """
    with open(path, 'r', encoding='utf-8') as f:
        return json.load(f)

# Reload the saved retrieval output from disk, replacing any `retrieved_docs`
# left in the kernel by earlier cells.
retrieved_docs = read_json()

In [64]:
# Parallel lists for the reader. Contexts are matched to questions purely by
# position, so `retrieved_docs` must have been produced from `qa_pairs` in
# this same order.
questions = [question for _, question, _ in qa_pairs]
reference_answers = [answer for _, _, answer in qa_pairs]
# Each entry of retrieved_docs is a list of IDs; the first one supplies the context.
contexts = [documents[top_ids[0]] for top_ids in retrieved_docs]

In [65]:
# Initialize the reader with the default configuration
# (the constructor loads the tokenizer and model named in the config)
config = BaselineReaderConfig()
reader = BaselineReader(config)

# # Single prediction
# user_query = "what did kirsty williams am say about her plan for quality assurance ?"
# results = RAG.search(user_query, k=1)
# context= results[0]['content']
# Try a single prediction on the first question with its first retrieved document.
# NOTE(review): the recorded output of this cell shows a different first
# question than the recorded qa_pairs[0] preview, so the saved outputs come
# from runs with different ordering — re-run the notebook top to bottom.
answer = reader.generate_answer(questions[0], contexts[0])
print(f"generated: {answer}, reference: {reference_answers[0]}")

generated: higher education sector? 2: the financial indicators look like for higher education system .. and what does lynne neagle am think about the latency ?, reference: the professor said that people adamantly insist on going in with a brain damaged system , overlooking straight forward solutions . he thought that the lag should not be too long for the task .


In [66]:
# Batch evaluation: generate an answer for every question, score the answers
# against the references, then print the metrics and the stored examples.
eval_results = reader.evaluate_batch(questions, contexts, reference_answers)
print_evaluation_summary(eval_results)

100%|██████████| 144/144 [19:50<00:00,  8.27s/it]



=== Baseline Reader Evaluation Summary ===

Scores (over 1152 samples):
rouge1: 0.183
rouge2: 0.032
rougeL: 0.133
exact_match: 0.000
f1: 0.174
bleu: 0.010

Example Predictions:

Example 1:
Q: what did the professor think about the latency ?
Pred: higher education sector?br>The professor was talking about the latency .bl>The latency is the delay between the professor 's speaking and the actual time the professor speaks. br>As the professor spoke, the lateness of the professor was apparent .
Ref: the professor said that people adamantly insist on going in with a brain damaged system , overlooking straight forward solutions . he thought that the lag should not be too long for the task .
--------------------------------------------------

Example 2:
Q: what did the group discuss about marketing 's presentation ?
Pred: To answer this question, we should know that: The group discussed about marketing 's presentation. They discussed how to incorporate speech recognition into the device. They