In [6]:
# !pip -q install pytrec_eval faiss-cpu

### Evaluate BGE reranker

In [1]:
from FlagEmbedding.abc.evaluation.utils import evaluate_metrics, evaluate_mrr
from FlagEmbedding import FlagModel, FlagReranker



### Base model

In [2]:
from FlagEmbedding.abc.evaluation.utils import evaluate_metrics, evaluate_mrr
from FlagEmbedding import FlagModel, FlagReranker


raw_model_id = "BAAI/bge-base-en-v1.5"

In [3]:
raw_model = FlagReranker(raw_model_id, use_fp16=False)

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/777 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at BAAI/bge-base-en-v1.5 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Load testing data

In [4]:
import json

test_data = []
with open("ft_data/testing_data.jsonl", "r") as file:
    for line in file:
        test_data.append(json.loads(line))


In [5]:
len(test_data)

700

In [6]:
test_data[0]

{'query': 'How does Starbucks recognize the interest and penalties related to income tax matters on their financial statements?',
 'pos': ['Starbucks recognizes interest and penalties related to income tax matters in income tax expense on our consolidated statements of earnings. Accrued interest and penalties are included within the related tax balances on our consolidated balance sheets.'],
 'neg': ['The Australian Securities and Investments Commission regulates corporations and has authority to investigate, prosecute, ban individuals and to seek civil penalties.',
  'Item 3 of the Annual Report on Form 10-K connects to information about legal proceedings by referring to Note 14 in the Notes to the Consolidated Financial Statements, included in Item 8.',
  'The accumulated benefit obligation for the USRIP and Supplemental Retirement Plans was $466.1 million at December 31, 2023, and was $500.6 million at December 31, 2022.',
  'Operating activities | $ | 2,296,164 |',
  'Net cash prov

### Create data for evals

In [10]:
import pandas as pd
import random

dataset = []

for item in test_data:
    query_text = item["query"]
    positive_docs = item["pos"]
    negative_docs = item["neg"]
    
    # For each positive document, create an entry
    for pos_doc_text in positive_docs:
        entry = {
            "query": query_text,
            "pos": pos_doc_text,
            "neg": negative_docs
        }
        dataset.append(entry)

df = pd.DataFrame(dataset)

# Print statistics
print(f"Total queries: {len(df)}")
print(f"Total negative examples: {sum(len(row['neg']) for _, row in df.iterrows())}")
print(f"Average negative examples per query: {sum(len(row['neg']) for _, row in df.iterrows()) / len(df):.2f}")

print("\nSample entry:")
sample_entry = dataset[0]
print(f"Query: {sample_entry['query']}")
print(f"Positive document: {sample_entry['pos'][:100]}...")
print(f"Number of negative documents: {len(sample_entry['neg'])}")
print(f"First negative document: {sample_entry['neg'][0][:100]}...")

# Convert to training format
training_data = []
for entry in dataset:
    query = entry["query"]
    pos_doc = entry["pos"]
    
    # Add positive example
    training_data.append({
        "query": query,
        "document": pos_doc,
        "label": 1
    })
    
    # Add negative examples
    for neg_doc in entry["neg"]:
        training_data.append({
            "query": query,
            "document": neg_doc,
            "label": 0
        })

# Convert to DataFrame
training_df = pd.DataFrame(training_data)

# Display training data statistics
print("\nTraining data:")
print(f"Total examples: {len(training_df)}")
print(f"Positive examples: {len(training_df[training_df['label'] == 1])}")
print(f"Negative examples: {len(training_df[training_df['label'] == 0])}")


# Optional: Save the dataset to files
# df.to_json("structured_dataset.json", orient="records")
# training_df.to_csv("training_dataset.csv", index=False)
# training_df.to_json("training_dataset.jsonl", orient="records", lines=True)

Total queries: 700
Total negative examples: 6300
Average negative examples per query: 9.00

Sample entry:
Query: How does Starbucks recognize the interest and penalties related to income tax matters on their financial statements?
Positive document: Starbucks recognizes interest and penalties related to income tax matters in income tax expense on o...
Number of negative documents: 9
First negative document: The Australian Securities and Investments Commission regulates corporations and has authority to inv...

Training data:
Total examples: 7000
Positive examples: 700
Negative examples: 6300


In [18]:

dataset[0]

{'query': 'How does Starbucks recognize the interest and penalties related to income tax matters on their financial statements?',
 'pos': 'Starbucks recognizes interest and penalties related to income tax matters in income tax expense on our consolidated statements of earnings. Accrued interest and penalties are included within the related tax balances on our consolidated balance sheets.',
 'neg': ['The Australian Securities and Investments Commission regulates corporations and has authority to investigate, prosecute, ban individuals and to seek civil penalties.',
  'Item 3 of the Annual Report on Form 10-K connects to information about legal proceedings by referring to Note 14 in the Notes to the Consolidated Financial Statements, included in Item 8.',
  'The accumulated benefit obligation for the USRIP and Supplemental Retirement Plans was $466.1 million at December 31, 2023, and was $500.6 million at December 31, 2022.',
  'Operating activities | $ | 2,296,164 |',
  'Net cash provid

In [15]:
eval_dataset = dataset

In [16]:
from tqdm import tqdm
import numpy as np

def rerank_predictions(query, docs, model=None):
    """
    Rerank documents based on their relevance to the query using a pre-trained model.
    
    Args:
        query (str): The query text
        docs (list): List of document texts to rerank
        model: The model to use for scoring. If None, uses the default model.
        
    Returns:
        list: List of (document, score, rank) tuples sorted by score in descending order
    """
    # If no model is provided, use the default model
    if model is None:
        # You would need to define or import your model here
        raise ValueError("Model must be provided")
    
    # Create pairs of query and document for scoring
    pairs = [[query, doc] for doc in docs]
    
    # Compute scores for each query-document pair
    scores = model.compute_score(pairs)
    
    # Combine documents with their scores
    doc_scores = list(zip(docs, scores))
    
    # Sort by score in descending order (highest score first)
    ranked_docs = sorted(doc_scores, key=lambda x: x[1], reverse=True)
    
    # Add rank information
    ranked_docs_with_rank = [(doc, score, i+1) for i, (doc, score) in enumerate(ranked_docs)]
    
    return ranked_docs_with_rank

# Function to calculate NDCG
def calculate_ndcg(relevance_scores, k=None):
    """
    Calculate NDCG (Normalized Discounted Cumulative Gain) at k.
    
    Args:
        relevance_scores (list): List of relevance scores (1 for relevant, 0 for non-relevant)
        k (int, optional): Calculate NDCG@k. If None, use all scores.
    
    Returns:
        float: NDCG value
    """
    if not relevance_scores:
        return 0.0
    
    if k is not None:
        relevance_scores = relevance_scores[:k]
    
    # Calculate DCG (Discounted Cumulative Gain)
    dcg = 0.0
    for i, rel in enumerate(relevance_scores):
        # Using log base 2 as is standard in NDCG
        dcg += (2**rel - 1) / np.log2(i + 2)  # +2 because i is 0-indexed and log(1) is 0
    
    # Calculate ideal DCG (IDCG)
    ideal_relevance = sorted(relevance_scores, reverse=True)
    idcg = 0.0
    for i, rel in enumerate(ideal_relevance):
        idcg += (2**rel - 1) / np.log2(i + 2)
    
    # Calculate NDCG
    if idcg > 0:
        return dcg / idcg
    else:
        return 0.0

In [21]:
def evaluate_model(eval_dataset, model, shuffle_docs=True):
    """
    Evaluate a reranking model on a dataset.
    
    Args:
        eval_dataset: List of dictionaries with 'query', 'pos', and 'neg' keys
        model: The model to evaluate
        shuffle_docs: Whether to shuffle documents to avoid position bias
    
    Returns:
        dict: Dictionary containing all evaluation metrics
    """
    # Eval metrics 
    correct_at_1 = 0
    mrr_sum = 0  # Mean Reciprocal Rank
    ndcg_sum = 0  # NDCG
    ndcg_at_3_sum = 0  # NDCG@3
    ndcg_at_5_sum = 0  # NDCG@5

    print("Evaluating model on dataset...")
    for entry in tqdm(eval_dataset, desc="Evaluating queries"):
        query = entry["query"]
        positive_doc = entry["pos"]
        negative_docs = entry["neg"]
        
        # Combine positive and negative documents
        all_docs = [positive_doc] + negative_docs
        
        # Shuffle documents to avoid position bias
        if shuffle_docs:
            random.shuffle(all_docs)
        
        # Rerank the documents
        ranked_results = rerank_predictions(query, all_docs, model)
        
        # Find the rank of the positive document and create relevance list
        positive_rank = None
        relevance_scores = []
        
        for doc, score, rank in ranked_results:
            # 1 for relevant (positive) document, 0 for non-relevant
            relevance = 1 if doc == positive_doc else 0
            relevance_scores.append(relevance)
            
            if doc == positive_doc:
                positive_rank = rank
        
        # Update metrics
        if positive_rank == 1:
            correct_at_1 += 1
        
        mrr_sum += 1.0 / positive_rank if positive_rank else 0
        
        # Calculate NDCG metrics
        ndcg = calculate_ndcg(relevance_scores)
        ndcg_at_3 = calculate_ndcg(relevance_scores, k=3)
        ndcg_at_5 = calculate_ndcg(relevance_scores, k=5)
        
        ndcg_sum += ndcg
        ndcg_at_3_sum += ndcg_at_3
        ndcg_at_5_sum += ndcg_at_5

    # Calculate final metrics
    total_queries = len(eval_dataset)
    accuracy_at_1 = correct_at_1 / total_queries if total_queries > 0 else 0
    mrr = mrr_sum / total_queries if total_queries > 0 else 0
    ndcg_avg = ndcg_sum / total_queries if total_queries > 0 else 0
    ndcg_at_3_avg = ndcg_at_3_sum / total_queries if total_queries > 0 else 0
    ndcg_at_5_avg = ndcg_at_5_sum / total_queries if total_queries > 0 else 0

    # Create results dictionary
    results = {
        "total_queries": total_queries,
        "accuracy_at_1": accuracy_at_1,
        "mrr": mrr,
        "ndcg": ndcg_avg,
        "ndcg_at_3": ndcg_at_3_avg,
        "ndcg_at_5": ndcg_at_5_avg
    }
    
    # Print results
    print(f"\nEvaluation Results:")
    print(f"Total queries evaluated: {results['total_queries']}")
    print(f"Accuracy@1: {results['accuracy_at_1']:.4f}")
    print(f"Mean Reciprocal Rank (MRR): {results['mrr']:.4f}")
    print(f"NDCG: {results['ndcg']:.4f}")
    print(f"NDCG@3: {results['ndcg_at_3']:.4f}")
    print(f"NDCG@5: {results['ndcg_at_5']:.4f}")
    
    return results



In [22]:
results = evaluate_model(eval_dataset, raw_model)

Evaluating model on dataset...


Evaluating queries: 100%|██████████| 700/700 [00:53<00:00, 12.98it/s]


Evaluation Results:
Total queries evaluated: 700
Accuracy@1: 0.0514
Mean Reciprocal Rank (MRR): 0.2439
NDCG: 0.4162
NDCG@3: 0.1445
NDCG@5: 0.2413





In [23]:
results

{'total_queries': 700,
 'accuracy_at_1': 0.05142857142857143,
 'mrr': 0.24392687074829883,
 'ndcg': np.float64(0.4161641625871046),
 'ndcg_at_3': np.float64(0.144539167760206),
 'ndcg_at_5': np.float64(0.24128697642883729)}

### Eval on finetuned model

In [24]:
ft_model_path = "results"

In [25]:
ft_model =  FlagReranker(ft_model_path, use_fp16=False)


In [26]:
ft_model

<FlagEmbedding.inference.reranker.encoder_only.base.BaseReranker at 0x784aba9414b0>

In [27]:
results = evaluate_model(eval_dataset, ft_model)

Evaluating model on dataset...


Evaluating queries:   0%|          | 0/700 [00:00<?, ?it/s]You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
Evaluating queries: 100%|██████████| 700/700 [00:27<00:00, 25.66it/s]


Evaluation Results:
Total queries evaluated: 700
Accuracy@1: 0.9943
Mean Reciprocal Rank (MRR): 0.9971
NDCG: 0.9979
NDCG@3: 0.9979
NDCG@5: 0.9979



