In [1]:
# 1. Install Dependencies
!pip install datasets bert_score transformers torch pandas numpy



In [2]:
import torch
from transformers import pipeline
from datasets import load_dataset
from bert_score import score as bertscore_score
import pandas as pd
import numpy as np
from tqdm import tqdm

# Dataset column holding the source documents that are fed to the summarizer.
DOC_COL = "document"
# Dataset column holding the human-written reference summaries.
REF_COL = "summary"
# Seed constant; NOTE(review): not referenced by any cell visible in this notebook.
RANDOM_SEED = 42

  from .autonotebook import tqdm as notebook_tqdm


## 2. Load Data
We use the test split of the Multi-News dataset.

In [None]:
# Number of test examples to evaluate; a small subset keeps the run quick.
# The recorded outputs of this notebook were produced with 50 samples.
N_SAMPLES = 50

dataset = load_dataset("Awesome075/multi_news_parquet", split="test")
# Take the first N_SAMPLES examples, capped at the split size so that
# select() is never asked for indices that do not exist.
test_dataset = dataset.select(range(min(N_SAMPLES, len(dataset))))
print(f"Evaluating on {len(test_dataset)} samples.")

Evaluating on 50 samples.


## 3. Load Baseline Model
We use the generic `facebook/bart-large-cnn` summarization pipeline.

In [4]:
# Check for CUDA availability: GPU 0 when present, otherwise CPU (-1).
if torch.cuda.is_available():
    device = 0
    print(f"Using GPU: {torch.cuda.get_device_name(0)}")
else:
    device = -1
    print("Using CPU")

try:
    summarizer = pipeline("summarization", model="facebook/bart-large-cnn", device=device)
except Exception as e:
    if device == -1:
        # Already on CPU: retrying the identical call cannot help, so surface the error.
        raise
    # Report why the GPU load failed instead of discarding the exception.
    print(f"Failed to load on device {device} ({e}). Falling back to CPU.")
    # Keep `device` in sync with where the pipeline actually lives.
    device = -1
    summarizer = pipeline("summarization", model="facebook/bart-large-cnn", device=device)

Using GPU: NVIDIA A40


Device set to use cuda:0


## 4. Generate Summaries

In [5]:
from transformers import AutoTokenizer

# Token limits: the model accepts at most MODEL_MAX_TOKENS; documents are
# pre-truncated to INPUT_TOKEN_BUDGET to leave room for special tokens (BOS/EOS).
MODEL_MAX_TOKENS = 1024
INPUT_TOKEN_BUDGET = 1020

# Load tokenizer to ensure proper truncation
tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")
# Explicitly set max length to silence warnings and ensure correct behavior
tokenizer.model_max_length = MODEL_MAX_TOKENS
# Safety net: decoding the truncated ids and letting the pipeline re-tokenize the
# text is not guaranteed to reproduce the same token count. Giving the pipeline's
# own tokenizer the same limit (and enabling truncation below) keeps an input
# that drifts past the limit from failing and being recorded as an empty summary.
summarizer.tokenizer.model_max_length = MODEL_MAX_TOKENS

generated_summaries = []
failed_indices = []  # positions whose generation raised; their summary is ""
reference_summaries = test_dataset[REF_COL]

print("Generating summaries...")
for idx, doc in enumerate(tqdm(test_dataset[DOC_COL])):
    try:
        # Explicitly tokenize and truncate to the input budget
        inputs = tokenizer(doc, truncation=True, max_length=INPUT_TOKEN_BUDGET, return_tensors="pt")

        # Decode back to string to pass to pipeline
        input_text = tokenizer.decode(inputs["input_ids"][0], skip_special_tokens=True)

        # The text is already pre-truncated; truncation=True only acts if the
        # re-tokenized text still exceeds the model limit.
        output = summarizer(input_text, max_length=128, min_length=30, do_sample=False, truncation=True)
        generated_summaries.append(output[0]['summary_text'])
    except Exception as e:
        # Best effort: keep list positions aligned with the references, but
        # remember which samples failed so the gap is visible.
        failed_indices.append(idx)
        print(f"Error generating summary for sample {idx}: {e}")
        generated_summaries.append("")

if failed_indices:
    print(
        f"WARNING: {len(failed_indices)} of {len(generated_summaries)} summaries failed "
        f"and were recorded as empty strings (indices: {failed_indices})."
    )

Generating summaries...


 20%|██        | 10/50 [00:07<00:29,  1.34it/s]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
100%|██████████| 50/50 [00:40<00:00,  1.25it/s]


In [6]:
# Spot-check: show the first generated summary.
first_summary = generated_summaries[0]
print(first_summary)

Voters in 11 states will pick their governors tonight. Eight of the gubernatorial seats up for grabs are now held by Democrats. Republicans are expected to wrest the North Carolina governorship from Democratic control.


## 5. Evaluation Metrics
We define the exact same metric functions as in `MultiNews.ipynb`.

In [7]:
def compute_bertscore(cands, refs, model_type="roberta-large", lang="en"):
    """Return the BERTScore F1 of each candidate against its reference.

    Args:
        cands: Iterable of candidate (generated) texts.
        refs: Iterable of reference texts, aligned with ``cands``.
        model_type: Model name forwarded to ``bert_score.score``.
        lang: Language code forwarded to ``bert_score.score``.

    Returns:
        1-D ``numpy.ndarray`` of float64 F1 values, one per candidate. An empty
        array is returned when both inputs are empty.
    """
    # Materialise the inputs so any iterable (not only a list) is accepted.
    cands, refs = list(cands), list(refs)
    if not cands and not refs:
        # Nothing to score: skip loading the scoring model altogether.
        return np.array([], dtype=np.float64)

    # Only F1 is used; precision and recall are discarded.
    _, _, f1 = bertscore_score(cands, refs, lang=lang, model_type=model_type)
    return np.asarray(f1.tolist(), dtype=np.float64)

# Load Bias Classifier
# `top_k=None` asks the pipeline for a score for every label; it replaces the
# deprecated `return_all_scores=True` flag. The classifier is placed on CPU.
bias_classifier = pipeline(
    "text-classification",
    model="cirimus/modernbert-large-bias-type-classifier",
    top_k=None,
    device=-1,
)

def compute_neutrality(texts, top_n=3, batch_size=8, max_length=1024):
    """Score how neutral each text is according to the bias classifier.

    For every text, the ``top_n`` highest label scores returned by
    ``bias_classifier`` are averaged and the neutrality is computed as
    ``(1 - mean_top_scores) ** 2``.

    Args:
        texts: A single string or an iterable of strings to score.
        top_n: How many of the highest label scores to average (default 3).
            If the classifier returns fewer labels, all of them are used.
        batch_size: Batch size forwarded to the classifier pipeline.
        max_length: Truncation length forwarded to the classifier pipeline.

    Returns:
        1-D ``numpy.ndarray`` with one neutrality score per input text; an
        empty array when no texts are given.
    """
    # Normalise to a list so the classifier output is always one entry per text.
    texts = [texts] if isinstance(texts, str) else list(texts)
    if not texts:
        return np.array([], dtype=np.float64)

    # Process in batches to avoid OOM
    bias_outputs = bias_classifier(
        texts, batch_size=batch_size, truncation=True, max_length=max_length
    )

    neutrality_scores = []
    for output in bias_outputs:
        scores = sorted((float(item["score"]) for item in output), reverse=True)
        # Slicing already copes with fewer than `top_n` labels.
        avg_top = np.mean(scores[:top_n])
        neutrality_scores.append((1.0 - avg_top) ** 2)

    return np.array(neutrality_scores)

Device set to use cpu


## 6. Compute Scores

In [8]:
# --- Metric 1: BERTScore of each generated summary against its reference ---
print("Computing BERTScore...")
references = list(reference_summaries)
bert_scores = compute_bertscore(cands=generated_summaries, refs=references)
print(f"Mean BERTScore F1: {bert_scores.mean():.4f}")

# --- Metric 2: neutrality of each generated summary ---
print("Computing Neutrality Score...")
neutrality_scores = compute_neutrality(texts=generated_summaries)
print(f"Mean Neutrality Score: {neutrality_scores.mean():.4f}")

Computing BERTScore...


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Compiling the model with `torch.compile` and using a `torch.cpu` device is not supported. Falling back to non-compiled mode.


Mean BERTScore F1: 0.8523
Computing Neutrality Score...
Mean Neutrality Score: 0.4571


In [9]:
# Assemble one row per evaluated sample: both summaries next to their two scores.
result_columns = {
    "Generated Summary": generated_summaries,
    "Reference Summary": reference_summaries,
    "BERTScore": bert_scores,
    "Neutrality Score": neutrality_scores,
}
df_results = pd.DataFrame(result_columns)

# Show only the first rows rather than the whole table.
results_preview = df_results.head()
display(results_preview)

Unnamed: 0,Generated Summary,Reference Summary,BERTScore,Neutrality Score
0,Voters in 11 states will pick their governors ...,– It's a race for the governor's mansion in 11...,0.867532,0.441393
1,A photo of two men kissing was posted on a Fac...,– It turns out Facebook is only guilty of abou...,0.861831,0.354238
2,The Siskiyou County Board of Supervisors voted...,– Not a big fan of Southern California? Neithe...,0.879094,0.251091
3,Microsoft's acquisition of Nokia is aimed at b...,– Why did Microsoft buy Nokia's phone business...,0.824597,0.950444
4,The Supreme Court's new term kicks off Monday....,– The Supreme Court is facing a docket of high...,0.832153,0.287343
