In [1]:
# 1. Install Evaluation Dependencies
!pip install evaluate rouge_score bert_score transformers datasets torch pandas numpy



In [1]:
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
from datasets import load_dataset
import evaluate
import pandas as pd
import numpy as np
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


## 2. Load Model and Tokenizer
Load the model saved from the previous training step.

In [2]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
import torch
import traceback

# Path to your saved model
model_path = "./unbiased_summarizer_dpo_final_1000"

print(f"Checking model at: {model_path}")

try:
    # Determine device
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"Using device: {device}")

    # Load Tokenizer
    print("Loading tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    print(f"Special Tokens: BOS={tokenizer.bos_token_id}, EOS={tokenizer.eos_token_id}, PAD={tokenizer.pad_token_id}")
    
    # Load Model
    print("Loading model...")
    model = AutoModelForSeq2SeqLM.from_pretrained(model_path).to(device)
    print("Model loaded successfully!")
    
    # --- FIX CONFIGURATION ISSUES GLOBALLY ---
    # Explicitly set config values to prevent "NoneType" and "early_stopping" errors
    model.config.early_stopping = True
    model.config.forced_bos_token_id = 0
    if model.config.decoder_start_token_id is None:
        model.config.decoder_start_token_id = model.config.eos_token_id
    
    # Check for NaN weights (sign of corrupted training)
    print("Checking model weights...")
    for name, param in list(model.named_parameters())[:5]:
        if torch.isnan(param).any():
            print(f"WARNING: Parameter {name} contains NaNs!")
        if param.sum() == 0:
             print(f"WARNING: Parameter {name} is all zeros!")

except Exception as e:
    print("\n!!! Model Verification Failed !!!")
    print(f"Error: {e}")
    traceback.print_exc()

Checking model at: ./unbiased_summarizer_dpo_final_1000
Using device: cuda
Loading tokenizer...
Special Tokens: BOS=0, EOS=2, PAD=1
Loading model...




Model loaded successfully!
Checking model weights...


## 3. Load Test Data
We use the test split of the Multi-News dataset.

In [3]:
dataset = load_dataset("Awesome075/multi_news_parquet", split="test")
# Optional: Take a subset for faster evaluation during development
test_dataset = dataset.select(range(100)) 
print(f"Evaluating on {len(test_dataset)} samples.")

Evaluating on 100 samples.


## 4. Define Evaluation Metrics
We will use:
- **ROUGE**: Standard summarization metric (n-gram overlap).
- **BERTScore**: Semantic similarity metric.
- **Neutrality Score**: Our custom bias metric.

In [7]:
rouge = evaluate.load("rouge")
bertscore = evaluate.load("bertscore")

# Load Bias Classifier for Neutrality Score
bias_classifier = pipeline(
    "text-classification",
    model="cirimus/modernbert-large-bias-type-classifier",
    top_k=None,
    device=-1
)

def compute_neutrality(texts):
    bias_outputs = bias_classifier(texts, batch_size=8, truncation=True, max_length=512)
    neutrality_scores = []
    for output in bias_outputs:
        scores = sorted([float(item["score"]) for item in output], reverse=True)
        top3 = scores[:3] if len(scores) >= 3 else scores
        avg_top3 = np.mean(top3)
        neutrality = (1.0 - avg_top3) ** 2
        neutrality_scores.append(neutrality)
    return neutrality_scores

Device set to use cpu


## 5. Generate Summaries and Evaluate

In [8]:
generated_summaries = []
reference_summaries = []
documents = []

batch_size = 8


print("Generating summaries...")
for i in tqdm(range(0, len(test_dataset), batch_size)):
    batch = test_dataset[i : i + batch_size]
    inputs = tokenizer(
        batch["document"], 
        max_length=1024, 
        truncation=True, 
        padding=True, 
        return_tensors="pt"
    ).to(device)
    
    with torch.no_grad():
        summary_ids = model.generate(
        inputs["input_ids"], 
        max_length=100, 
        min_length=30, 
        do_sample=True,              # Enable Sampling
        num_beams=1,                 # Explicitly set to 1 for sampling
        top_p=0.9,                   # Nucleus sampling
        no_repeat_ngram_size=3,      # Prevent repetition
        early_stopping=False         # Disable early_stopping for sampling
    )
    
    # Debugging: Print the first generation of the first batch
    if i == 0:
        print(f"DEBUG: Input text sample: {batch['document'][0][:50]}...")
        print(f"DEBUG: Raw generated IDs: {summary_ids[0].tolist()}")
    
    decoded_summaries = tokenizer.batch_decode(summary_ids, skip_special_tokens=True)
    
    generated_summaries.extend(decoded_summaries)
    reference_summaries.extend(batch["summary"])
    documents.extend(batch["document"])

#Compute BERTScore
bertscore_results = bertscore.compute(predictions=generated_summaries, references=reference_summaries, lang="en")
print(f"BERTScore F1 Mean: {np.mean(bertscore_results['f1']):.4f}")

# Compute Neutrality
neutrality_scores = compute_neutrality(generated_summaries)
print(f"Average Neutrality Score: {np.mean(neutrality_scores):.4f}")

Generating summaries...


  8%|▊         | 1/13 [00:05<01:00,  5.05s/it]

DEBUG: Input text sample: GOP Eyes Gains As Voters In 11 States Pick Governo...
DEBUG: Raw generated IDs: [2, 0, 2383, 17646, 11, 365, 982, 40, 2807, 49, 12066, 3422, 6, 8, 10872, 690, 14, 1858, 2082, 15, 1349, 7, 712, 49, 1530, 30, 23, 513, 65, 6, 19, 5, 801, 7, 4442, 49, 946, 7, 55, 87, 80, 12, 10224, 9, 5, 1226, 18, 299, 194, 4088, 4, 10099, 9, 5, 18932, 3202, 62, 13, 17565, 32, 122, 547, 30, 1574, 131, 130, 32, 11, 1172, 1420, 4, 1858, 855, 946, 1132, 12066, 7903, 6, 1574, 33, 291, 4, 10872, 2775, 14, 129, 130, 9, 5, 11997, 32, 1687, 2695, 6, 70, 11, 982, 147, 9861, 1557, 12066, 2025, 75, 878, 456, 35, 8920, 6, 188, 10372, 6, 8, 663, 4, 616, 167, 194, 4694, 1091, 350, 593, 7, 486, 6, 1858, 32, 421, 2]


100%|██████████| 13/13 [01:04<00:00,  4.97s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Compiling the model with `torch.compile` and using a `torch.cpu` device is not supported. Falling back to non-compiled mode.


BERTScore F1 Mean: 0.8622
Average Neutrality Score: 0.5281


## 6. Inspect Results

In [9]:
df_results = pd.DataFrame({
    "Generated Summary": generated_summaries,
    "Reference Summary": reference_summaries,
    "Neutrality Score": neutrality_scores
})

# Show top 5 most neutral summaries
print("Top 5 Most Neutral Summaries:")
display(df_results.sort_values(by="Neutrality Score", ascending=False).head(5))

# Show top 5 least neutral summaries
print("\nTop 5 Least Neutral Summaries:")
display(df_results.sort_values(by="Neutrality Score", ascending=True).head(5))

Top 5 Most Neutral Summaries:


Unnamed: 0,Generated Summary,Reference Summary,Neutrality Score
37,– A 100-foot-tall eucalyptus tree fell on a we...,"– ""It’s just again showing that the vegetation...",0.926167
86,"– The idea of a ""sober January"" has been aroun...",– Thinking of giving up booze for the month af...,0.917344
3,– Microsoft's acquisition of Nokia is the late...,– Why did Microsoft buy Nokia's phone business...,0.911424
26,– A 90-year-old man in California has been cha...,– Police in San Jose believe a 90-year-old man...,0.86966
41,– A US Army sergeant severely injured in an IE...,– A US serviceman has become the world's first...,0.843159



Top 5 Least Neutral Summaries:


Unnamed: 0,Generated Summary,Reference Summary,Neutrality Score
6,– Israel says it hit Hamas targets in the Gaza...,– Israel launched a round of airstrikes on Gaz...,0.076773
15,– The number of children in the US on food sta...,– There are roughly 73.7 million kids running ...,0.190477
36,– President Obama's re-election campaign has r...,"– President Obama's big-money ""bundlers"" are o...",0.191155
21,– Kenya's Ministry of Tourism and Wildlife say...,– Eight endangered rhinos have died after a mo...,0.237932
28,– David Plouffe called Rick Santorum and Newt ...,– David Plouffe is making the talk-show rounds...,0.2452
