In [6]:
# !pip -q install pytrec_eval faiss-cpu

### Evaluate BGE reranker

### Evaluate model (With LoRA)

In [25]:
import torch
import numpy as np
import random
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForSequenceClassification, BitsAndBytesConfig
from peft import PeftModel, LoraConfig
import torch.nn.functional as F
from typing import List, Tuple, Dict, Any
import json

def load_trained_model(base_model_id: str, adapter_path: str):
    """
    Load the base model with trained LoRA adapter.
    
    Args:
        base_model_id: Base model identifier (e.g., "BAAI/bge-m3")
        adapter_path: Path to saved LoRA adapter
    
    Returns:
        tuple: (tokenizer, model)
    """
    print(f"Loading base model: {base_model_id}")
    
    # 4-bit quantization config (same as training)
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.bfloat16,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=True,
        llm_int8_skip_modules=["classifier", "pre_classifier"]
    )
    
    # Load base model
    base_model = AutoModelForSequenceClassification.from_pretrained(
        base_model_id,
        quantization_config=bnb_config,
        torch_dtype=torch.bfloat16,
        device_map="auto",
    )
    
    # Load tokenizer
    tokenizer = AutoTokenizer.from_pretrained(base_model_id)
    
    # Load LoRA adapter
    print(f"Loading LoRA adapter from: {adapter_path}")
    model = PeftModel.from_pretrained(base_model, adapter_path)
    
    # Set to evaluation mode
    model.eval()
    
    return tokenizer, model




def rerank_predictions(query: str, documents: List[str], model, tokenizer, 
                      max_query_length: int = 128, max_passage_length: int = 1024) -> List[Tuple[str, float, int]]:
    """
    Rerank documents for a given query using the trained model.
    
    Args:
        query: The search query
        documents: List of documents to rerank
        model: The trained reranking model
        tokenizer: The tokenizer
        max_query_length: Maximum query token length
        max_passage_length: Maximum passage token length
    
    Returns:
        List of tuples: (document, score, rank) sorted by relevance score (descending)
    """
    if not documents:
        return []
    
    model.eval()
    scores = []
    
    # Add query prompt (same as training)
    query_prompt = "Represent this sentence for searching relevant passages:"
    formatted_query = query_prompt + query
    
    with torch.no_grad():
        for doc in documents:
            # Tokenize query-document pair exactly like training
            qry_inputs = tokenizer.encode(
                formatted_query,
                truncation=True,
                max_length=max_query_length,
                add_special_tokens=False
            )
            doc_inputs = tokenizer.encode(
                doc,
                truncation=True,
                max_length=max_passage_length,
                add_special_tokens=False
            )
            
            # Prepare input - same as training data preparation
            inputs = tokenizer.prepare_for_model(
                qry_inputs,
                doc_inputs,
                truncation="only_second",
                max_length=max_query_length + max_passage_length,
                padding=False
            )
            
            # Convert to tensors and ensure proper batch dimension
            input_ids = torch.tensor([inputs["input_ids"]], dtype=torch.long)
            attention_mask = torch.tensor([inputs["attention_mask"]], dtype=torch.long) if "attention_mask" in inputs else None
            token_type_ids = torch.tensor([inputs["token_type_ids"]], dtype=torch.long) if "token_type_ids" in inputs else None
            
            # Create final input dict
            model_inputs = {"input_ids": input_ids}
            if attention_mask is not None:
                model_inputs["attention_mask"] = attention_mask
            if token_type_ids is not None:
                model_inputs["token_type_ids"] = token_type_ids
            
            # Move to device
            model_inputs = {k: v.to(model.device) for k, v in model_inputs.items()}
            
            # Get prediction
            outputs = model(**model_inputs)
            logits = outputs.logits
            
            # Apply softmax and get relevance score
            probs = F.softmax(logits, dim=-1)
            
            # IMPORTANT: Based on your training code, class 0 is positive (labels = torch.zeros)
            # So we should use class 0 probability as relevance score
            if logits.shape[-1] == 2:
                relevance_score = probs[0, 0].item()  # Class 0 is positive in your training
            elif logits.shape[-1] == 1:
                relevance_score = torch.sigmoid(logits[0, 0]).item()  # Single output
            else:
                # If more than 2 classes, use the first one (class 0)
                relevance_score = probs[0, 0].item()
                
            scores.append(relevance_score)
    
    # Create ranked results
    doc_scores = list(zip(documents, scores))
    ranked_results = sorted(doc_scores, key=lambda x: x[1], reverse=True)
    
    # Add ranks (1-indexed)
    ranked_with_ranks = [(doc, score, rank + 1) for rank, (doc, score) in enumerate(ranked_results)]


    return ranked_with_ranks
    




def calculate_ndcg(relevance_scores: List[int], k: int = None) -> float:
    """
    Calculate Normalized Discounted Cumulative Gain (NDCG).
    
    Args:
        relevance_scores: List of relevance scores (1 for relevant, 0 for non-relevant)
        k: Calculate NDCG@k (if None, calculate for all documents)
    
    Returns:
        NDCG score
    """
    if k is not None:
        relevance_scores = relevance_scores[:k]
    
    if not relevance_scores:
        return 0.0
    
    # Calculate DCG
    dcg = relevance_scores[0]  # First document has no discount
    for i in range(1, len(relevance_scores)):
        dcg += relevance_scores[i] / np.log2(i + 1)
    
    # Calculate IDCG (ideal DCG)
    ideal_relevance = sorted(relevance_scores, reverse=True)
    idcg = ideal_relevance[0] if ideal_relevance else 0
    for i in range(1, len(ideal_relevance)):
        idcg += ideal_relevance[i] / np.log2(i + 1)
    
    # Return NDCG
    return dcg / idcg if idcg > 0 else 0.0

def evaluate_model(eval_dataset: List[Dict], model, tokenizer, shuffle_docs: bool = True) -> Dict[str, Any]:
    """
    Evaluate a reranking model on a dataset.
    
    Args:
        eval_dataset: List of dictionaries with 'query', 'pos', and 'neg' keys
        model: The model to evaluate
        tokenizer: The tokenizer
        shuffle_docs: Whether to shuffle documents to avoid position bias
    
    Returns:
        dict: Dictionary containing all evaluation metrics
    """
    # Eval metrics 
    correct_at_1 = 0
    mrr_sum = 0  # Mean Reciprocal Rank
    ndcg_sum = 0  # NDCG
    ndcg_at_3_sum = 0  # NDCG@3
    ndcg_at_5_sum = 0  # NDCG@5
    
    print("Evaluating model on dataset...")
    for entry in tqdm(eval_dataset, desc="Evaluating queries"):
        query = entry["query"]
        
        # Handle both single positive doc and list of positive docs
        if isinstance(entry["pos"], list):
            positive_doc = entry["pos"][0]  # Take first positive document
        else:
            positive_doc = entry["pos"]
            
        negative_docs = entry["neg"]
        
        # Combine positive and negative documents
        all_docs = [positive_doc] + negative_docs
        
        # Shuffle documents to avoid position bias
        if shuffle_docs:
            random.shuffle(all_docs)
        
        # Rerank the documents
        ranked_results = rerank_predictions(query, all_docs, model, tokenizer)
        
        # Find the rank of the positive document and create relevance list
        positive_rank = None
        relevance_scores = []
        
        for doc, score, rank in ranked_results:
            # 1 for relevant (positive) document, 0 for non-relevant
            relevance = 1 if doc == positive_doc else 0
            relevance_scores.append(relevance)
            
            if doc == positive_doc:
                positive_rank = rank
        
        if positive_rank == 1:
            correct_at_1 += 1
        
        mrr_sum += 1.0 / positive_rank if positive_rank else 0
        
        # Calculate NDCG metrics
        ndcg = calculate_ndcg(relevance_scores)
        ndcg_at_3 = calculate_ndcg(relevance_scores, k=3)
        ndcg_at_5 = calculate_ndcg(relevance_scores, k=5)
        
        ndcg_sum += ndcg
        ndcg_at_3_sum += ndcg_at_3
        ndcg_at_5_sum += ndcg_at_5
    
    total_queries = len(eval_dataset)
    accuracy_at_1 = correct_at_1 / total_queries if total_queries > 0 else 0
    mrr = mrr_sum / total_queries if total_queries > 0 else 0
    ndcg_avg = ndcg_sum / total_queries if total_queries > 0 else 0
    ndcg_at_3_avg = ndcg_at_3_sum / total_queries if total_queries > 0 else 0
    ndcg_at_5_avg = ndcg_at_5_sum / total_queries if total_queries > 0 else 0
    
    results = {
        "total_queries": total_queries,
        "accuracy_at_1": accuracy_at_1,
        "mrr": mrr,
        "ndcg": ndcg_avg,
        "ndcg_at_3": ndcg_at_3_avg,
        "ndcg_at_5": ndcg_at_5_avg
    }
    
    # Print results
    print(f"\nEvaluation Results:")
    print(f"Total queries evaluated: {results['total_queries']}")
    print(f"Accuracy@1: {results['accuracy_at_1']:.4f}")
    print(f"Mean Reciprocal Rank (MRR): {results['mrr']:.4f}")
    print(f"NDCG: {results['ndcg']:.4f}")
    print(f"NDCG@3: {results['ndcg_at_3']:.4f}")
    print(f"NDCG@5: {results['ndcg_at_5']:.4f}")
    
    return results

def load_evaluation_data(file_path: str) -> List[Dict]:
    """
    Load evaluation dataset from JSON/JSONL file.
    
    Args:
        file_path: Path to the evaluation dataset file
    
    Returns:
        List of evaluation examples
    """
    eval_data = []
    
    with open(file_path, 'r', encoding='utf-8') as f:
        if file_path.endswith('.jsonl'):
            for line in f:
                eval_data.append(json.loads(line.strip()))
        else:
            eval_data = json.load(f)
    
    return eval_data





In [26]:
test_data = load_evaluation_data("ft_data/testing_data.jsonl")

In [27]:
len(test_data), test_data[100]

(700,
 {'query': 'What types of companies form the origins of the described global technology leader?',
  'pos': ['Our over 50-year history of innovation dates back to our diverse origins from Hewlett-Packard Company, AT&T, LSI Corporation, Broadcom Corporation, Brocade Communications Systems LLC, CA, Inc., Symantec Enterprise Security, and VMware, Inc.'],
  'neg': ['Synthroid (levothyroxine sodium tablets, USP) is used in the treatment of hypothyroidism.',
   'Our Revolving Credit Agreement contains a net debt-to-EBITDA financial ratio covenant requiring AT&T to maintain, as of the last day of each fiscal quarter, a ratio of not more than 3.75-to-1.',
   "In corporate financial reporting, different sections of the Management's Discussion and Analysis include topics such as Executive Overview, Critical Accounting Policies and Estimates, Results of Operations, and Liquidity and Capital Resources.",
   'For a discussion of legal and other proceedings in which we are involved, see Note 13

In [29]:
BASE_MODEL_ID = "BAAI/bge-m3"
ADAPTER_PATH = "bge_m3_reranker_lora_adapter_200"  # Path to your saved adapter


tokenizer, model = load_trained_model(BASE_MODEL_ID, ADAPTER_PATH)

Loading base model: BAAI/bge-m3


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at BAAI/bge-m3 and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Loading LoRA adapter from: bge_m3_reranker_lora_adapter_200


In [30]:
# model

In [31]:
results = evaluate_model(test_data, model, tokenizer, shuffle_docs=True)
# bsz = 8, 200 steps 

Evaluating model on dataset...


Evaluating queries:   0%|          | 0/700 [00:00<?, ?it/s]You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
Evaluating queries: 100%|██████████| 700/700 [03:58<00:00,  2.94it/s]


Evaluation Results:
Total queries evaluated: 700
Accuracy@1: 0.9029
Mean Reciprocal Rank (MRR): 0.9486
NDCG: 0.9940
NDCG@3: 0.9911
NDCG@5: 0.9940





In [28]:
results = evaluate_model(test_data, model, tokenizer, shuffle_docs=True)
# bsz = 4, 600 steps 

Evaluating model on dataset...


Evaluating queries: 100%|██████████| 700/700 [03:55<00:00,  2.97it/s]


Evaluation Results:
Total queries evaluated: 700
Accuracy@1: 0.9343
Mean Reciprocal Rank (MRR): 0.9667
NDCG: 0.9989
NDCG@3: 0.9989
NDCG@5: 0.9989



