In [1]:
# !pip install faiss-cpu

### Evaluate BGE reranker

In [7]:
from FlagEmbedding.abc.evaluation.utils import evaluate_metrics, evaluate_mrr
from FlagEmbedding import FlagModel, FlagReranker

# Presumably the rank cut-offs (k) intended for evaluate_metrics / evaluate_mrr;
# NOTE(review): neither those helpers nor k_values is referenced in the cells shown here.
k_values = [10,100]

In [8]:
# !pip install pytrec_eval

### Base model

In [9]:
from FlagEmbedding.abc.evaluation.utils import evaluate_metrics, evaluate_mrr
from FlagEmbedding import FlagModel, FlagReranker


# Hub identifier of the base (not fine-tuned) reranker.
# Kept under its own name: `raw_model` is bound to the loaded FlagReranker in the
# next cell, and re-running this cell must not replace that model with a string.
raw_model_name = "BAAI/bge-reranker-v2-m3"

In [10]:
# Load the base reranker that serves as the baseline in the evaluation below;
# half precision is explicitly disabled (use_fp16=False).
raw_model = FlagReranker("BAAI/bge-reranker-v2-m3", use_fp16=False)

In [11]:
# Smoke test for the base reranker: score every combination of two questions and
# two passages. Matching question/passage pairs should score far higher than
# mismatched ones (see the recorded output).
demo_queries = (
    "What is the capital of France?",
    "What is the population of China?",
)
demo_passages = (
    "Paris is the capital of France.",
    "The population of China is over 1.4 billion people.",
)

# Question-major order: [q1, p1], [q1, p2], [q2, p1], [q2, p2]
pairs = [[question, passage] for question in demo_queries for passage in demo_passages]

scores = raw_model.compute_score(pairs)
scores

You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


[8.781817436218262, -10.811635971069336, -11.013592720031738, 8.00526237487793]

In [12]:
# NOTE(review): `queries` is first assigned further down, in the cell that calls
# load_dataset. On Restart & Run All this line raises NameError; the recorded
# output presumably came from leftover kernel state. Move this peek below the load.
queries[0]

{'id': '1289',
 'text': 'How does Starbucks recognize the interest and penalties related to income tax matters on their financial statements?'}

In [13]:
def construct_query_passage_pairs(queries, corpus, qrels):
    """Build the full cross product of queries and passages.

    Args:
        queries: Iterable of queries.
        corpus: Iterable of passages; it is walked once per query.
        qrels: Accepted for interface compatibility; not used by this function.

    Returns:
        list: ``[query, passage]`` two-element lists in query-major order.
    """
    return [[query, passage] for query in queries for passage in corpus]

In [14]:
from datasets import load_dataset
import random
import pandas as pd

# Read the three JSON-lines files; load_dataset exposes a single local file
# under the "train" split, hence the ["train"] lookup.
queries, corpus, qrels = [
    load_dataset("json", data_files=jsonl_path)["train"]
    for jsonl_path in (
        "ft_data/test_queries.jsonl",
        "ft_data/corpus.jsonl",
        "ft_data/test_qrels.jsonl",
    )
]

In [15]:
# Row counts of the three loaded tables: (queries, corpus, qrels).
len(queries), len(corpus), len(qrels)

(700, 7000, 700)

In [16]:
# queries = queries[:100]

### Data preparation for evals: query, pos, neg

In [17]:
# id -> text lookup tables. A corpus row whose "text" is a list contributes
# only its first element; any other value is stored as-is.
queries_dict = dict((row["id"], row["text"]) for row in queries)
corpus_dict = dict(
    (row["id"], row["text"][0] if isinstance(row["text"], list) else row["text"])
    for row in corpus
)


# Build one entry per relevance judgement: the query, its positive document and
# up to 5 randomly sampled negative documents.

# Fixed seed so that Restart & Run All samples the same negatives (and, further
# down, produces the same shuffles) on every run.
random.seed(42)

# Every judged-relevant docid per query. Excluding all of them (not just the one
# on the current qrel row) keeps a second positive for the same query from being
# sampled as a "negative".
positives_by_qid = {}
for qrel in qrels:
    positives_by_qid.setdefault(qrel["qid"], set()).add(qrel["docid"])

all_docids = list(corpus_dict)  # materialised once instead of once per query

dataset = []

for qrel in qrels:
    qid = qrel["qid"]
    pos_docid = qrel["docid"]

    # Skip judgements that point at a query or document we did not load.
    if qid not in queries_dict or pos_docid not in corpus_dict:
        continue

    excluded_docids = positives_by_qid[qid]
    negative_docids = [doc_id for doc_id in all_docids if doc_id not in excluded_docids]

    # 5 negatives per query, or fewer if the corpus is too small.
    num_negatives = min(5, len(negative_docids))
    if num_negatives == 0:
        continue

    sampled_negative_docids = random.sample(negative_docids, num_negatives)

    dataset.append({
        "query_id": qid,
        "query_text": queries_dict[qid],
        "positive_doc_id": pos_docid,
        "positive_doc_text": corpus_dict[pos_docid],
        "negative_docs": [
            {
                "doc_id": neg_docid,
                "doc_text": corpus_dict[neg_docid]
            } for neg_docid in sampled_negative_docids
        ]
    })

# One row per query entry; the nested negatives stay in the "negative_docs" column.
df = pd.DataFrame(dataset)

# Count negatives once, straight from the source list. This avoids the slow
# row-by-row DataFrame.iterrows() pass and also works when `dataset` is empty
# (an empty DataFrame has no "negative_docs" column).
total_negatives = sum(len(item["negative_docs"]) for item in dataset)
avg_negatives = total_negatives / len(df) if len(df) else 0.0

print(f"Total queries: {len(df)}")
print(f"Total negative examples: {total_negatives}")
print(f"Average negative examples per query: {avg_negatives:.2f}")

# Show the first entry as a spot check; skipped when nothing was built so the
# cell does not die with IndexError.
if dataset:
    print("\nSample entry:")
    sample_entry = dataset[0]
    print(f"Query ID: {sample_entry['query_id']}")
    print(f"Query text: {sample_entry['query_text']}")
    print(f"Positive document ID: {sample_entry['positive_doc_id']}")
    print(f"Positive document text: {sample_entry['positive_doc_text'][:100]}...")
    print(f"Number of negative documents: {len(sample_entry['negative_docs'])}")
    print(f"First negative document ID: {sample_entry['negative_docs'][0]['doc_id']}")
    print(f"First negative document text: {sample_entry['negative_docs'][0]['doc_text'][:100]}...")

# Optional: flatten the nested entries into (query, document, label) rows.
# Each entry yields one label-1 row for its positive document followed by one
# label-0 row per sampled negative. The rows come from the test_* files loaded
# above, even though the variables are named "training".
training_data = []
for entry in dataset:
    query = entry["query_text"]

    training_data.append(
        {"query": query, "document": entry["positive_doc_text"], "label": 1}
    )
    training_data.extend(
        {"query": query, "document": neg_doc["doc_text"], "label": 0}
        for neg_doc in entry["negative_docs"]
    )

training_df = pd.DataFrame(training_data)

# Label balance of the flattened table
is_positive = training_df["label"] == 1
is_negative = training_df["label"] == 0
print("\nTraining data:")
print(f"Total examples: {len(training_df)}")
print(f"Positive examples: {len(training_df[is_positive])}")
print(f"Negative examples: {len(training_df[is_negative])}")

# Last expression of the cell: rendered as the preview table
training_df.head()

# Optional: Save the dataset to a file
# df.to_json("structured_dataset.json", orient="records")
# training_df.to_csv("training_dataset.csv", index=False)
# training_df.to_json("training_dataset.jsonl", orient="records", lines=True)

Total queries: 700
Total negative examples: 3500
Average negative examples per query: 5.00

Sample entry:
Query ID: 1289
Query text: How does Starbucks recognize the interest and penalties related to income tax matters on their financial statements?
Positive document ID: 1289
Positive document text: Starbucks recognizes interest and penalties related to income tax matters in income tax expense on o...
Number of negative documents: 5
First negative document ID: 3135
First negative document text: Consumer Banking Results Net income for Consumer Banking decreased $923 million to $11.6 billion due...

Training data:
Total examples: 4200
Positive examples: 700
Negative examples: 3500


Unnamed: 0,query,document,label
0,How does Starbucks recognize the interest and ...,Starbucks recognizes interest and penalties re...,1
1,How does Starbucks recognize the interest and ...,Consumer Banking Results Net income for Consum...,0
2,How does Starbucks recognize the interest and ...,The report on the Consolidated Financial State...,0
3,How does Starbucks recognize the interest and ...,Iron Mountain expects to incur approximately $...,0
4,How does Starbucks recognize the interest and ...,Gross profit margin for the Dollar Tree segmen...,0


In [28]:
# Write the flattened query/document/label rows to disk as a JSON array of records.
# NOTE(review): the commented-out lines in the data-preparation cell pair this
# filename with `df` (the nested table), not `training_df` — confirm which frame
# is meant to live in structured_dataset.json.
# NOTE(review): this cell's execution count (28) is higher than the cells that
# follow it, i.e. it was run out of order.
training_df.to_json("structured_dataset.json", orient="records")


In [25]:
# Number of query entries built by the data-preparation cell.
len(dataset)

700

In [26]:
# Evaluate on the first 100 entries only; both evaluation loops below iterate
# over this same slice.
eval_dataset = dataset[:100]

In [None]:
from tqdm import tqdm
import numpy as np

def rerank_predictions(query, docs, model=None):
    """
    Rerank documents by their relevance to a query.

    Args:
        query (str): The query text.
        docs (iterable): Document texts to rerank.
        model: Object exposing ``compute_score(pairs)`` where ``pairs`` is a list
            of ``[query, doc]`` lists. Required despite the ``None`` default.

    Returns:
        list: ``(document, score, rank)`` tuples sorted by score in descending
        order, with ``rank`` starting at 1. Documents with equal scores keep
        their input order. An empty ``docs`` yields ``[]`` without calling the
        model.

    Raises:
        ValueError: If ``model`` is None.
    """
    if model is None:
        raise ValueError("Model must be provided")

    # Materialise once: docs is walked twice below (pair building and zipping).
    docs = list(docs)
    if not docs:
        return []

    # Create pairs of query and document for scoring
    pairs = [[query, doc] for doc in docs]
    scores = model.compute_score(pairs)

    # A one-pair batch may come back as a bare scalar rather than a list
    # (behaviour differs between FlagEmbedding releases — TODO confirm for the
    # installed version). Normalise so zip() below always gets a sequence.
    if np.ndim(scores) == 0:
        scores = [scores]

    # Highest score first; sorted() is stable, so ties keep input order.
    ranked_docs = sorted(zip(docs, scores), key=lambda item: item[1], reverse=True)

    return [(doc, score, position) for position, (doc, score) in enumerate(ranked_docs, start=1)]

# Function to calculate NDCG
def calculate_ndcg(relevance_scores, k=None):
    """
    Calculate NDCG (Normalized Discounted Cumulative Gain) at k.

    Gain is ``2**rel - 1`` and the discount is ``log2(position + 1)`` with
    1-based positions.

    Args:
        relevance_scores (list): Relevance of each result in ranked order
            (1 for relevant, 0 for non-relevant; graded values also work).
        k (int, optional): Calculate NDCG@k. If None, use all scores.

    Returns:
        float: NDCG value in [0, 1]; 0.0 when the list is empty or contains no
        relevant item.
    """
    if not relevance_scores:
        return 0.0

    ranked = list(relevance_scores)
    # The ideal ordering must be derived from the FULL list and only then cut to
    # k. Sorting the already-truncated list would ignore relevant items ranked
    # below k and overstate NDCG@k.
    ideal = sorted(ranked, reverse=True)
    if k is not None:
        ranked = ranked[:k]
        ideal = ideal[:k]

    def _dcg(gains):
        # position is 0-indexed, hence +2 so the first item is divided by log2(2) = 1
        return sum((2**rel - 1) / np.log2(position + 2) for position, rel in enumerate(gains))

    idcg = _dcg(ideal)
    if idcg > 0:
        return _dcg(ranked) / idcg
    return 0.0

In [29]:
# Evaluate the base reranker (raw_model) on eval_dataset.
# For each entry the positive passage is mixed with its sampled negatives, the
# candidates are reranked, and Accuracy@1, MRR and NDCG (full list, @3, @5) are
# accumulated and then averaged over the number of entries.

correct_at_1 = 0
mrr_sum = 0  # running sum of reciprocal ranks
ndcg_sum = 0  # NDCG over the full candidate list
ndcg_at_3_sum = 0  # NDCG@3
ndcg_at_5_sum = 0  # NDCG@5

print("Evaluating model on dataset...")
for entry in tqdm(eval_dataset, desc="Evaluating queries"):
    query = entry["query_text"]
    positive_doc = entry["positive_doc_text"]

    # Candidate list: the positive plus this entry's negatives
    all_docs = [positive_doc] + [neg_doc["doc_text"] for neg_doc in entry["negative_docs"]]

    # Shuffle so the positive is not always first in the input
    random.shuffle(all_docs)

    ranked_results = rerank_predictions(query, all_docs, raw_model)

    # 1 for the relevant (positive) document, 0 otherwise, in ranked order
    relevance_scores = [1 if doc == positive_doc else 0 for doc, _score, _rank in ranked_results]

    # ranked_results is ordered best-first, so the FIRST text match is the best
    # rank. Taking the last match would understate Accuracy@1 and MRR whenever
    # a negative happens to have the same text as the positive.
    positive_rank = next(
        (rank for doc, _score, rank in ranked_results if doc == positive_doc),
        None,
    )

    if positive_rank == 1:
        correct_at_1 += 1

    if positive_rank:
        mrr_sum += 1.0 / positive_rank

    ndcg = calculate_ndcg(relevance_scores)
    ndcg_at_3 = calculate_ndcg(relevance_scores, k=3)
    ndcg_at_5 = calculate_ndcg(relevance_scores, k=5)

    ndcg_sum += ndcg
    ndcg_at_3_sum += ndcg_at_3
    ndcg_at_5_sum += ndcg_at_5

# Average over the evaluated entries; the guards keep an empty eval_dataset from
# dividing by zero.
total_queries = len(eval_dataset)
accuracy_at_1 = correct_at_1 / total_queries if total_queries > 0 else 0
mrr = mrr_sum / total_queries if total_queries > 0 else 0
ndcg_avg = ndcg_sum / total_queries if total_queries > 0 else 0
ndcg_at_3_avg = ndcg_at_3_sum / total_queries if total_queries > 0 else 0
ndcg_at_5_avg = ndcg_at_5_sum / total_queries if total_queries > 0 else 0

print("\nEvaluation Results:")
print(f"Total queries evaluated: {total_queries}")
print(f"Accuracy@1: {accuracy_at_1:.4f}")
print(f"Mean Reciprocal Rank (MRR): {mrr:.4f}")
print(f"NDCG: {ndcg_avg:.4f}")
print(f"NDCG@3: {ndcg_at_3_avg:.4f}")
print(f"NDCG@5: {ndcg_at_5_avg:.4f}")

Evaluating model on dataset...


Evaluating queries: 100%|██████████| 100/100 [00:07<00:00, 13.01it/s]


Evaluation Results:
Total queries evaluated: 100
Accuracy@1: 0.9900
Mean Reciprocal Rank (MRR): 0.9950
NDCG: 0.9963
NDCG@3: 0.9963
NDCG@5: 0.9963





In [31]:

# # Example of reranking for a single query
# if eval_dataset:
#     sample_entry = eval_dataset[0]
#     sample_query = sample_entry["query_text"]
#     sample_pos_doc = sample_entry["positive_doc_text"]
#     sample_neg_docs = [neg_doc["doc_text"] for neg_doc in sample_entry["negative_docs"]]
    
#     all_sample_docs = [sample_pos_doc] + sample_neg_docs
    
#     print("\nExample reranking for query:")
#     print(f"Query: {sample_query}")
    
#     ranked_sample = rerank_predictions(sample_query, all_sample_docs, raw_model)
    
#     print("\nRanked results:")
#     for doc, score, rank in ranked_sample:
#         doc_type = "POSITIVE" if doc == sample_pos_doc else "NEGATIVE"
#         print(f"Rank {rank} | Score: {score:.4f} | Type: {doc_type}")
#         print(f"Document: {doc[:100]}...\n")

### Eval on finetuned model

In [34]:
# Path handed to FlagReranker in the next cell; relative, so it resolves against
# the notebook's working directory. Presumably the fine-tuning output folder — confirm.
ft_model_path = "results"

In [35]:
# Load the fine-tuned reranker from ft_model_path, with the same use_fp16=False
# setting used for the base model.
ft_model =  FlagReranker(ft_model_path, use_fp16=False)


In [36]:
# Display the object's repr to confirm the fine-tuned reranker was instantiated.
ft_model

<FlagEmbedding.inference.reranker.encoder_only.base.BaseReranker at 0x76d1675fafb0>

In [37]:

# Evaluate the fine-tuned reranker (ft_model) on eval_dataset.
# For each entry the positive passage is mixed with its sampled negatives, the
# candidates are reranked, and Accuracy@1, MRR and NDCG (full list, @3, @5) are
# accumulated and then averaged over the number of entries.
# NOTE: the result variables below reuse the names written by the base-model
# evaluation, so those earlier values are overwritten.

correct_at_1 = 0
mrr_sum = 0  # running sum of reciprocal ranks
ndcg_sum = 0  # NDCG over the full candidate list
ndcg_at_3_sum = 0  # NDCG@3
ndcg_at_5_sum = 0  # NDCG@5

print("Evaluating model on dataset...")
for entry in tqdm(eval_dataset, desc="Evaluating queries"):
    query = entry["query_text"]
    positive_doc = entry["positive_doc_text"]

    # Candidate list: the positive plus this entry's negatives
    all_docs = [positive_doc] + [neg_doc["doc_text"] for neg_doc in entry["negative_docs"]]

    # Shuffle so the positive is not always first in the input
    random.shuffle(all_docs)

    ranked_results = rerank_predictions(query, all_docs, ft_model)

    # 1 for the relevant (positive) document, 0 otherwise, in ranked order
    relevance_scores = [1 if doc == positive_doc else 0 for doc, _score, _rank in ranked_results]

    # ranked_results is ordered best-first, so the FIRST text match is the best
    # rank. Taking the last match would understate Accuracy@1 and MRR whenever
    # a negative happens to have the same text as the positive.
    positive_rank = next(
        (rank for doc, _score, rank in ranked_results if doc == positive_doc),
        None,
    )

    if positive_rank == 1:
        correct_at_1 += 1

    if positive_rank:
        mrr_sum += 1.0 / positive_rank

    ndcg = calculate_ndcg(relevance_scores)
    ndcg_at_3 = calculate_ndcg(relevance_scores, k=3)
    ndcg_at_5 = calculate_ndcg(relevance_scores, k=5)

    ndcg_sum += ndcg
    ndcg_at_3_sum += ndcg_at_3
    ndcg_at_5_sum += ndcg_at_5

# Average over the evaluated entries; the guards keep an empty eval_dataset from
# dividing by zero.
total_queries = len(eval_dataset)
accuracy_at_1 = correct_at_1 / total_queries if total_queries > 0 else 0
mrr = mrr_sum / total_queries if total_queries > 0 else 0
ndcg_avg = ndcg_sum / total_queries if total_queries > 0 else 0
ndcg_at_3_avg = ndcg_at_3_sum / total_queries if total_queries > 0 else 0
ndcg_at_5_avg = ndcg_at_5_sum / total_queries if total_queries > 0 else 0

print("\nEvaluation Results:")
print(f"Total queries evaluated: {total_queries}")
print(f"Accuracy@1: {accuracy_at_1:.4f}")
print(f"Mean Reciprocal Rank (MRR): {mrr:.4f}")
print(f"NDCG: {ndcg_avg:.4f}")
print(f"NDCG@3: {ndcg_at_3_avg:.4f}")
print(f"NDCG@5: {ndcg_at_5_avg:.4f}")

Evaluating model on dataset...


Evaluating queries:   0%|          | 0/100 [00:00<?, ?it/s]You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
Evaluating queries: 100%|██████████| 100/100 [00:21<00:00,  4.68it/s]


Evaluation Results:
Total queries evaluated: 100
Accuracy@1: 1.0000
Mean Reciprocal Rank (MRR): 1.0000
NDCG: 1.0000
NDCG@3: 1.0000
NDCG@5: 1.0000



