In [1]:
# 2. Imports
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForSequenceClassification
from datasets import load_dataset
import evaluate
from tqdm import tqdm
import numpy as np
import pandas as pd
from bert_score import score

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

  from .autonotebook import tqdm as notebook_tqdm


Using device: cuda


In [10]:
# 3. Configuration
# Location of the checkpoint that is handed to BestOfNModel as the policy model.
model_path = "./unbiased_summarizer_dpo_final_1000"
print("Model path set to: {}".format(model_path))

Model path set to: ./unbiased_summarizer_dpo_final_1000


In [11]:
# 4. Initialize BestOfNModel
import os
import sys

# The bestofn module is imported from the working directory; add that directory
# to the module search path (only once) so the import below can resolve.
working_dir = os.getcwd()
if working_dir not in sys.path:
    sys.path.append(working_dir)

from bestofn import BestOfNModel

# Build the Best-of-N wrapper around the checkpoint at `model_path`.
# Per the original notes, the wrapper takes care of loading the policy model,
# the tokenizer and the reward model.
print("Initializing BestOfNModel...")
target_device = "cuda" if torch.cuda.is_available() else "cpu"
best_of_n_model = BestOfNModel(model_path, n=4, device=target_device)

Initializing BestOfNModel...
Loading Policy Model from ./unbiased_summarizer_dpo_final_1000...




Loading Reward Model: maximuspowers/bias-type-classifier...


Device set to use cuda:0


In [12]:
# 5. Load Dataset
print("Loading Multi-News Test Set...")
dataset = load_dataset("Awesome075/multi_news_parquet", split="test")

# Evaluate on only the first `num_samples` rows to keep the run short. The cap
# is clamped to the split size, so a smaller test split is used in full.
num_samples = 100
subset_size = min(num_samples, len(dataset))
test_dataset = dataset.select(range(subset_size))
print(f"Evaluating on {len(test_dataset)} samples.")

Loading Multi-News Test Set...
Evaluating on 100 samples.


In [13]:
# 6. Evaluation Loop (Best-of-N)
# Parallel lists, one entry per evaluated sample, consumed by the metric and
# results cells further down. (Neutrality scores are computed in the metrics
# cell, so they are not initialised here.)
generated_summaries = []
reference_summaries = []
documents = []

# Best-of-N presumably samples its candidates stochastically (TODO confirm in
# bestofn.py). Seed PyTorch right before generation so that re-running this
# cell reproduces the same summaries; harmless if decoding is deterministic.
SEED = 42
torch.manual_seed(SEED)

print(f"Generating summaries using Best-of-{best_of_n_model.n} strategy...")

# One document at a time: the wrapper's forward() is called with a single input.
for sample in tqdm(test_dataset):
    doc = sample["document"]
    ref = sample["summary"]

    # forward() returns a dict; only its 'best_summary' entry is kept here.
    result = best_of_n_model.forward(doc)

    generated_summaries.append(result["best_summary"])
    reference_summaries.append(ref)
    documents.append(doc)

Generating summaries using Best-of-4 strategy...


100%|██████████| 100/100 [01:46<00:00,  1.07s/it]


In [14]:
from transformers import pipeline

# 7. Compute Metrics
# Text-classification pipeline used to score the generated summaries for bias.
# top_k=None requests the score of every label rather than only the best one,
# and device=-1 keeps the classifier on the CPU.
# NOTE(review): the CPU placement looks deliberate (perhaps to leave GPU memory
# to the generator) — confirm before moving it to `device`.
bias_classifier_options = {
    "model": "cirimus/modernbert-large-bias-type-classifier",
    "top_k": None,
    "device": -1,
}
bias_classifier = pipeline("text-classification", **bias_classifier_options)

def compute_neutrality(texts):
    """Score each text for neutrality using the notebook-level ``bias_classifier``.

    For every text, the classifier's per-label scores are sorted in descending
    order, the (up to) three highest are averaged, and the neutrality is
    ``(1 - average) ** 2``. Higher top bias scores therefore give a lower
    neutrality value.

    Args:
        texts: List of strings to score. Each text is truncated to 512 tokens
            by the classifier call.

    Returns:
        A list with one neutrality score per input text, in input order.
        An empty list/tuple yields ``[]`` without invoking the classifier.
    """
    if isinstance(texts, (list, tuple)) and not texts:
        # Nothing to classify: skip the model call (and an empty np.mean).
        return []

    bias_outputs = bias_classifier(texts, batch_size=8, truncation=True, max_length=512)

    per_text_neutrality = []
    for label_scores in bias_outputs:
        # Slicing copes with fewer than three labels, so no length check is needed.
        top_scores = sorted(
            (float(item["score"]) for item in label_scores), reverse=True
        )[:3]
        per_text_neutrality.append((1.0 - np.mean(top_scores)) ** 2)
    return per_text_neutrality

# Neutrality of each generated summary, plus the mean over the evaluated set.
neutrality_scores = compute_neutrality(generated_summaries)
mean_neutrality = np.mean(neutrality_scores)
print(f"Average Neutrality Score: {mean_neutrality:.4f}")

# BERTScore of the generated summaries against the reference summaries.
# F1 stays a notebook-level name because the results table is built from it.
print("Computing BERTScore...")
P, R, F1 = score(
    generated_summaries,
    reference_summaries,
    lang="en",
    verbose=True,
    device=device,
    batch_size=16,
)
mean_f1 = F1.mean().item()
print(f"BERTScore F1: {mean_f1:.4f}")

Device set to use cpu


Average Neutrality Score: 0.5509
Computing BERTScore...


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


100%|██████████| 13/13 [00:01<00:00,  7.57it/s]


computing greedy matching.


100%|██████████| 7/7 [00:00<00:00, 191.43it/s]

done in 1.77 seconds, 56.63 sentences/sec
BERTScore F1: 0.8503





In [None]:
# 8. Save Results
# One row per evaluated sample: source document, reference summary, generated
# summary, and the two per-sample metrics computed above in this notebook.
output_file = "online_inference_results.csv"
result_columns = {
    "document": documents,
    "reference": reference_summaries,
    "generated": generated_summaries,
    "neutrality_score": neutrality_scores,
    "bert_score_f1": F1.tolist(),
}
df = pd.DataFrame(result_columns)
df.to_csv(output_file, index=False)
print(f"Detailed results saved to {output_file}")
df.head()