In [1]:
import os
import pandas as pd
import torch
import torch.nn.functional as F
from tqdm import tqdm
import numpy as np
from transformers import BertTokenizer, BertModel
from sentence_transformers import SentenceTransformer
from rouge_score import rouge_scorer

In [2]:
# Initialize models
print("Initializing models...")

# Token-level BERT encoder: tokenizer + model are handed to bert_score().
# nn.Module.eval() returns the module itself, so it can be chained onto the load.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased').eval()

# Sentence-level encoder used for the query/reference/candidate similarities.
encoder = SentenceTransformer('sentence-transformers/multi-qa-MiniLM-L6-cos-v1')

# Shared ROUGE scorer (unigram, bigram and longest-common-subsequence variants).
rouge_calculator = rouge_scorer.RougeScorer(
    ['rouge1', 'rouge2', 'rougeL'],
    use_stemmer=True,
)

Initializing models...


In [3]:
def bert_encode(text, tokenizer, model):
    """Return the final-layer hidden states of `model` for `text`.

    Args:
        text: Input passed straight to `tokenizer`.
        tokenizer: Callable invoked with return_tensors='pt', truncation=True
            and padding=True; must return a mapping usable as `model(**mapping)`.
        model: Callable whose result exposes `last_hidden_state`.

    Returns:
        `last_hidden_state` with dimension 0 squeezed, i.e. (seq_len, hidden_dim)
        when a single text is encoded.
    """
    encoded = tokenizer(text, return_tensors='pt', truncation=True, padding=True)
    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        hidden_states = model(**encoded).last_hidden_state
    return hidden_states.squeeze(0)

def bert_similarity_matrix(ref_emb, cand_emb):
    """Cosine similarity between every reference row and every candidate row.

    Args:
        ref_emb: 2-D tensor, one embedding per row.
        cand_emb: 2-D tensor, one embedding per row, same width as `ref_emb`.

    Returns:
        Tensor of shape (len_ref, len_cand); entry [i, j] is the cosine
        similarity of ref_emb[i] and cand_emb[j].
    """
    # Unit-length rows turn the plain matrix product into cosine similarity.
    unit_ref = F.normalize(ref_emb, dim=1)
    unit_cand = F.normalize(cand_emb, dim=1)
    return torch.matmul(unit_ref, unit_cand.T)

def bert_score(text_ref, text_cand, tokenizer, model):
    """Greedy-matching BERTScore between a reference and a candidate text.

    Both texts are embedded with `bert_encode`; every token is matched to its
    most similar token on the other side via `bert_similarity_matrix`.

    Args:
        text_ref: Reference text.
        text_cand: Candidate text.
        tokenizer: Tokenizer forwarded to `bert_encode`.
        model: Model forwarded to `bert_encode`.

    Returns:
        dict with float entries 'precision', 'recall' and 'f1'.
    """
    similarities = bert_similarity_matrix(
        bert_encode(text_ref, tokenizer, model),
        bert_encode(text_cand, tokenizer, model),
    )

    # Rows are reference tokens, columns are candidate tokens.
    best_for_each_ref, _ = similarities.max(dim=1)
    best_for_each_cand, _ = similarities.max(dim=0)

    recall = best_for_each_ref.mean().item()
    precision = best_for_each_cand.mean().item()

    # The small epsilon keeps the harmonic mean defined when both terms are zero.
    harmonic = 2 * precision * recall / (precision + recall + 1e-8)

    return {'precision': precision, 'recall': recall, 'f1': harmonic}

def pairwise_sim(ref_emb, cand_emb, scale=False):
    """Row-by-row cosine similarity of two equally shaped 2-D tensors.

    Args:
        ref_emb: Tensor with one embedding per row.
        cand_emb: Tensor with one embedding per row, aligned with `ref_emb`.
        scale: When truthy, map the result from [-1, 1] to [0, 1].

    Returns:
        1-D tensor with one similarity per row pair.
    """
    unit_ref, unit_cand = (F.normalize(emb, dim=1) for emb in (ref_emb, cand_emb))
    # Dot product of matching unit rows == cosine similarity, shape (n,).
    cosine = torch.sum(unit_ref * unit_cand, dim=1)
    return (cosine + 1) / 2 if scale else cosine

def calculate_rouge_metrics(references, candidates):
    """Average ROUGE-1/2/L precision, recall and F1 over aligned text pairs.

    Each (reference, candidate) pair is scored with the module-level
    `rouge_calculator`; the per-pair values are then averaged with `np.mean`.

    Args:
        references: Iterable of reference texts.
        candidates: Iterable of candidate texts, paired with `references` via zip.

    Returns:
        dict mapping '<variant>_<measure>' (e.g. 'rouge1_precision', 'rougeL_f1')
        to the mean of that measure across all pairs.
    """
    # (result key, ROUGE variant, attribute on the score object), listed in the
    # order the keys appear in the returned dict.
    fields = (
        ('rouge1_precision', 'rouge1', 'precision'),
        ('rouge1_recall', 'rouge1', 'recall'),
        ('rouge1_f1', 'rouge1', 'fmeasure'),
        ('rouge2_precision', 'rouge2', 'precision'),
        ('rouge2_recall', 'rouge2', 'recall'),
        ('rouge2_f1', 'rouge2', 'fmeasure'),
        ('rougeL_precision', 'rougeL', 'precision'),
        ('rougeL_recall', 'rougeL', 'recall'),
        ('rougeL_f1', 'rougeL', 'fmeasure'),
    )
    collected = {key: [] for key, _, _ in fields}

    for reference, candidate in zip(references, candidates):
        pair_scores = rouge_calculator.score(reference, candidate)
        for key, variant, attribute in fields:
            collected[key].append(getattr(pair_scores[variant], attribute))

    return {key: np.mean(values) for key, values in collected.items()}

def process_file(file_path):
    """Score one predictions CSV and return its averaged evaluation metrics.

    The CSV must provide the columns 'query', 'ground_truth_answer' and
    'predicted_answer'. Scoring uses the module-level `tokenizer`, `model`
    and `encoder` objects together with `bert_score`, `pairwise_sim` and
    `calculate_rouge_metrics`.

    Args:
        file_path: Path of the predictions CSV to evaluate.

    Returns:
        dict with 'file_name', the mean BERTScore precision/recall/F1
        ('bertscore_*'), the mean sentence-embedding similarities
        ('ref_cand_sim', 'query_cand_relevance', 'aggregated_metric') and
        every key returned by `calculate_rouge_metrics`.

    Raises:
        KeyError: If a required column is missing from the CSV.
        ValueError: If the CSV contains no rows.
    """
    print(f"Processing {file_path}...")

    # Read everything as text and switch off pandas' default NA parsing.
    # Otherwise an empty answer, or one that is literally "NA" or "null", is
    # loaded as float NaN, which the tokenizer rejects and which would make the
    # whole file fail.
    df = pd.read_csv(file_path, dtype=str, keep_default_na=False)

    required_columns = ('query', 'ground_truth_answer', 'predicted_answer')
    missing_columns = [col for col in required_columns if col not in df.columns]
    if missing_columns:
        raise KeyError(f"{file_path} is missing required column(s): {missing_columns}")
    if df.empty:
        raise ValueError(f"{file_path} contains no rows to evaluate")

    # Extract data. fillna('') is a last safety net for any value that still
    # ends up missing, so every downstream scorer receives a string.
    queries = df['query'].fillna('').tolist()
    references = df['ground_truth_answer'].fillna('').tolist()
    candidates = df['predicted_answer'].fillna('').tolist()

    # Calculate BERTScore: collect one record per row and build the frame once,
    # instead of adding columns to `df` while iterating over it.
    bert_records = [
        bert_score(ref, cand, tokenizer, model)
        for ref, cand in tqdm(
            zip(references, candidates), total=len(references), desc="Computing BERTScore"
        )
    ]
    bert_scores = pd.DataFrame(bert_records)

    # Compute encodings
    print("Computing sentence embeddings...")
    q_embs = encoder.encode(queries, convert_to_tensor=True)
    ref_embs = encoder.encode(references, convert_to_tensor=True)
    cand_embs = encoder.encode(candidates, convert_to_tensor=True)

    # Calculate similarities
    q_ref_sim = pairwise_sim(q_embs, ref_embs)
    q_cand_sim = pairwise_sim(q_embs, cand_embs)
    ref_cand_sim = pairwise_sim(ref_embs, cand_embs)

    # Calculate metrics
    metric1 = ref_cand_sim   # reference-candidate similarity
    metric2 = q_cand_sim     # query-candidate relevance

    # Guard against division by an exact zero only; other values of q_ref_sim
    # (including tiny or negative ones) are used as the divisor unchanged.
    safe_divisor = torch.where(q_ref_sim != 0, q_ref_sim, torch.full_like(q_ref_sim, 1e-8))
    # Reference-candidate similarity scaled by query-candidate relevance
    # relative to query-reference relevance.
    aggregated_metric = metric1 * q_cand_sim / safe_divisor

    # Convert to numpy for calculations
    metric1_np = metric1.cpu().numpy()
    metric2_np = metric2.cpu().numpy()
    aggregated_metric_np = aggregated_metric.cpu().numpy()

    # Calculate ROUGE metrics
    print("Computing ROUGE metrics...")
    rouge_metrics = calculate_rouge_metrics(references, candidates)

    # Return all metrics
    return {
        'file_name': os.path.basename(file_path),
        'bertscore_precision': bert_scores['precision'].mean(),
        'bertscore_recall': bert_scores['recall'].mean(),
        'bertscore_f1': bert_scores['f1'].mean(),
        'ref_cand_sim': metric1_np.mean(),
        'query_cand_relevance': metric2_np.mean(),
        'aggregated_metric': aggregated_metric_np.mean(),
        **rouge_metrics
    }

In [4]:
# Find all CSV files in the predictions directory. os.listdir() returns names
# in arbitrary order, so sort them to keep the report order stable across runs.
prediction_dir = './predictions_end_to_end/'
csv_files = sorted(
    os.path.join(prediction_dir, f) for f in os.listdir(prediction_dir) if f.endswith('.csv')
)

# Process each file and collect results. A failure in one file is reported and
# skipped so the remaining files are still evaluated.
results = []
failed_files = []
for file_path in csv_files:
    try:
        result = process_file(file_path)
        results.append(result)
    except Exception as e:
        failed_files.append(file_path)
        print(f"Error processing {file_path}: {e}")

if failed_files:
    print(f"Warning: {len(failed_files)} of {len(csv_files)} file(s) failed and are missing from the report.")

# Generate summary report
report_path = "evaluation_end_to_end_summary_report.txt"
csv_report_path = "evaluation_end_to_end_summary_report.csv"
with open(report_path, 'w', encoding='utf-8') as f:
    f.write("EVALUATION SUMMARY REPORT\n")
    f.write("=======================\n\n")

    for result in results:
        # Drop the leading "<prefix>_" from the file name; fall back to the
        # full name when it contains no underscore.
        model_name = '_'.join(result['file_name'].split('_')[1:]) or result['file_name']
        heading = f"MODEL: {model_name}"
        f.write(heading + "\n")
        # Underline exactly as wide as the heading that was written.
        f.write("=" * len(heading) + "\n")

        f.write("   BERTScore:\n")
        f.write(f"      Precision: {result['bertscore_precision']:.4f}\n")
        f.write(f"      Recall: {result['bertscore_recall']:.4f}\n")
        f.write(f"      F1: {result['bertscore_f1']:.4f}\n\n")

        f.write("   Reference-Candidate Similarity:\n")
        f.write(f"      Mean: {result['ref_cand_sim']:.4f}\n\n")

        f.write("   Query-Candidate Relevance:\n")
        f.write(f"      Mean: {result['query_cand_relevance']:.4f}\n\n")

        f.write("   Aggregated Metric:\n")
        f.write(f"      Mean: {result['aggregated_metric']:.4f}\n\n")

        f.write("   ROUGE Metrics:\n")
        for label, key in (("ROUGE-1", "rouge1"), ("ROUGE-2", "rouge2"), ("ROUGE-L", "rougeL")):
            precision = result[key + '_precision']
            recall = result[key + '_recall']
            f1 = result[key + '_f1']
            f.write(f"      {label} Precision: {precision:.4f}\n")
            f.write(f"      {label} Recall: {recall:.4f}\n")
            f.write(f"      {label} F1: {f1:.4f}\n\n")

        f.write("-" * 50 + "\n\n")

# Also save as CSV for easier data analysis
pd.DataFrame(results).to_csv(csv_report_path, index=False)

print(f"Evaluation summary report written to {report_path}")
print(f"CSV report written to {csv_report_path}")

Processing ./predictions_end_to_end/test_end_to_end_gemma3:12b.csv...


Computing BERTScore: 100%|██████████| 244/244 [00:18<00:00, 13.11it/s]


Computing sentence embeddings...
Computing ROUGE metrics...
Evaluation summary report written to evaluation_end_to_end_summary_report.txt
CSV report written to evaluation_end_to_end_summary_report.csv
