# 📚 NoteBook 9 allenai/PRIMERA Evaluation

# 🚀 PROJECT PLAN

MKEM Implementation – Transformer-Based Abstractive Text Summarization

# 🎯 Problem Statement Recap

# 🔍 Objective:
    
To build and compare transformer-based summarization models (T5, BART, Pegasus,BARTScore,ProphetNet,BigBird,LED,mTS,FLAN-T5,GPT 3.5 Turbo) and then enhance them using MKEM (Multi-Knowledge-Enhanced Model) on curated English news datasets.

# 📌 Phase-1 Objective

✅ Implement the following 3 summarization models:

PEGASUS (Google)---NoteBook(2)

BART (Facebook)---NoteBook(3)

T5 (Google)---NoteBook(1)

Final Comparison + MKEM---NoteBook(4)

NewsSum(Indian Newspaper)---NoteBook(5)

BARTScore---NoteBook(6)

ProphetNet---NoteBook(7)

BigBird-Pegasus---NoteBook(8)

LED(Longformer)---NoteBook(9)

allenai/PRIMERA ---NoteBook(10)

FLAN-T5---NoteBook(11)

GPT-3.5 Turbo---NoteBook(12)

# ✅ Evaluate on 3 benchmark datasets:
    
CNN/DailyMail

Newssum (IndianNewsPaper)

✅ Evaluation Metrics:
    
ROUGE-1

ROUGE-2

ROUGE-L

BERTScore

# 📊 Final Output (Per Model × Dataset):
    
You must submit structured results:

Dataset name

Model used

ROUGE-1, ROUGE-2, ROUGE-L, BERTScore

Inference Time

GPU used

Short analysis/observations

# 1.🚀 allenai/PRIMERA on CNN Dataset

**✏️Step 1: Install & Import Libraries**

In [1]:
# Install PRIMERA dependencies
!pip install transformers sentencepiece datasets evaluate bert-score --quiet

In [2]:
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import pandas as pd
import time
import evaluate
import bert_score



**✏️ Step 2: Load Model & Tokenizer**

In [3]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch

# ✅ Device setup
device = "cuda" if torch.cuda.is_available() else "cpu"

# ✅ Load PRIMERA model & tokenizer
tokenizer_primera = AutoTokenizer.from_pretrained("allenai/PRIMERA")
model_primera = AutoModelForSeq2SeqLM.from_pretrained("allenai/PRIMERA").to(device)

print(f"✅ PRIMERA model loaded on {device.upper()}")

✅ PRIMERA model loaded on CPU


**✏️ Step 3: Load CNN Dataset**

In [4]:
# Load cleaned CNN dataset
df_cnn = pd.read_csv("cnn_dailymail.csv")

# Drop missing or empty articles/highlights
df_cnn = df_cnn.dropna(subset=["article", "highlights"])
df_cnn = df_cnn[df_cnn["article"].str.strip().astype(bool)]

# Optional: Limit for quick testing
# df_cnn = df_cnn[:5]

print("✅ CNN Dataset Loaded. Shape:", df_cnn.shape)
df_cnn.head()

✅ CNN Dataset Loaded. Shape: (5, 3)


Unnamed: 0,article,highlights,id
0,"LONDON, England (Reuters) -- Harry Potter star...",Harry Potter star Daniel Radcliffe gets £20M f...,42c027e4ff9730fbb3de84c1af0d2c506e41c3e4
1,Editor's note: In our Behind the Scenes series...,Mentally ill inmates in Miami are housed on th...,ee8871b15c50d0db17b0179a6d2beab35065f1e9
2,"MINNEAPOLIS, Minnesota (CNN) -- Drivers who we...","NEW: ""I thought I was going to die,"" driver sa...",06352019a19ae31e527f37f7571c6dd7f0c5da37
3,WASHINGTON (CNN) -- Doctors removed five small...,"Five small polyps found during procedure; ""non...",24521a2abb2e1f5e34e6824e0f9e56904a2b0e88
4,(CNN) -- The National Football League has ind...,"NEW: NFL chief, Atlanta Falcons owner critical...",7fe70cc8b12fab2d0a258fababf7d9c6b5e1262a


**✏️ Step 4: Define Summarization Function**

In [5]:
def summarize_with_primera(text, max_input_length=4096, min_summary_length=40, max_summary_length=150):
    # Tokenize input
    inputs = tokenizer_primera(
        text,
        return_tensors="pt",
        truncation=True,
        padding="longest",
        max_length=max_input_length
    ).to(device)

    # Generate summary
    summary_ids = model_primera.generate(
        inputs["input_ids"],
        min_length=min_summary_length,
        max_length=max_summary_length,
        length_penalty=2.0,
        num_beams=4
    )

    # Decode and return
    return tokenizer_primera.decode(summary_ids[0], skip_special_tokens=True)

**✏️ Step 5: Generate Predictions**

In [6]:
import time

# ✅ Generate predictions for CNN dataset using PRIMERA
start_time = time.time()

primera_cnn_preds = [summarize_with_primera(article) for article in df_cnn["article"]]
primera_cnn_refs = df_cnn["highlights"].tolist()

inference_time = round(time.time() - start_time, 2)
print(f"⏱ Inference Time: {inference_time} seconds")

Input ids are automatically padded from 565 to 1024 to be a multiple of `config.attention_window`: 512
`cache.key_cache[idx]` is deprecated and will be removed in v4.56.0. Use `cache.layers[idx].keys` instead.
`cache.value_cache[idx]` is deprecated and will be removed in v4.56.0. Use `cache.layers[idx].values` instead.
Input ids are automatically padded from 888 to 1024 to be a multiple of `config.attention_window`: 512
Input ids are automatically padded from 919 to 1024 to be a multiple of `config.attention_window`: 512
Input ids are automatically padded from 531 to 1024 to be a multiple of `config.attention_window`: 512
Input ids are automatically padded from 1196 to 1536 to be a multiple of `config.attention_window`: 512


⏱ Inference Time: 248.78 seconds


**✏️ Step 6: Evaluate with ROUGE & BERTScore**

In [7]:
import evaluate
import pandas as pd
import torch

# ✅ Load evaluation metrics
rouge = evaluate.load("rouge")
bertscore = evaluate.load("bertscore")

# ✅ ROUGE scores
rouge_results = rouge.compute(predictions=primera_cnn_preds, references=primera_cnn_refs)

# ✅ BERTScore (average F1 score)
bertscore_results = bertscore.compute(predictions=primera_cnn_preds,
                                      references=primera_cnn_refs,
                                      lang="en")

# ✅ Prepare results dictionary
primera_cnn_scores = {
    "Dataset": ["CNN"],
    "Model": ["PRIMERA"],
    "ROUGE-1": [rouge_results["rouge1"]],
    "ROUGE-2": [rouge_results["rouge2"]],
    "ROUGE-L": [rouge_results["rougeL"]],
    "BERTScore": [round(sum(bertscore_results["f1"]) / len(bertscore_results["f1"]), 4)],
    "Inference Time (s)": [inference_time],
    "GPU Used": [torch.cuda.get_device_name(0) if torch.cuda.is_available() else "CPU"]
}

# ✅ Save to CSV
primera_cnn_scores_df = pd.DataFrame(primera_cnn_scores)
primera_cnn_scores_df.to_csv("primera_cnn_scores.csv", index=False)

print("✅ PRIMERA CNN scores saved to primera_cnn_scores.csv")
primera_cnn_scores_df

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  return forward_call(*args, **kwargs)


✅ PRIMERA CNN scores saved to primera_cnn_scores.csv


Unnamed: 0,Dataset,Model,ROUGE-1,ROUGE-2,ROUGE-L,BERTScore,Inference Time (s),GPU Used
0,CNN,PRIMERA,0.271003,0.10834,0.16965,0.8513,248.78,CPU


**💾 Save the Scores to .CSV Files**

**So that we can use to comapair models in different NoteBooks**

In [8]:
primera_cnn_scores_df = pd.DataFrame(primera_cnn_scores)
primera_cnn_scores_df.to_csv("primera_cnn_scores.csv", index=False)

print("✅ PRIMERA CNN scores saved to primera_cnn_scores.csv")

✅ PRIMERA CNN scores saved to primera_cnn_scores.csv


# 2.🚀 allenai/PRIMERA on NewsSum Dataset

**✏️ Step 1: Load NewsSum Dataset**

In [9]:
import pandas as pd

# Load the cleaned NewsSum dataset
df_newsum = pd.read_csv("newsum_cleaned.csv")

# Optional: sample a smaller batch for testing
# df_newsum = df_newsum[:5]

df_newsum.head()

Unnamed: 0,Headline,Article,Category,Summary
0,Elephant death brings to fore man-animal confl...,The death of a pregnant elephant in the buffer...,Local News,Thousands of farmers in Kerala have either aba...
1,Cases filed after two â€˜commit suicideâ€™ in ...,Two suicides were reported from Vadodara and D...,Crime and Justice,"In the first incident, a 30-year-old woman all..."
2,Woman alleges father tied to MP hospital bed o...,A day after a woman alleged that her father ha...,Health and Wellness,"The hospital denied the allegation, saying the..."
3,"Sena member, author, app designer â€“ the many...","Assistant police inspector Sachin Vaze, who wa...",Defense,"On Saturday, Vaze along with police constables..."
4,"Manager, owner of resort where Gujarat Congres...","The manager and owner of a resort in Rajkot, w...",Politics,The resort is reportedly owned by Indranil Raj...


**✏️ Step 2: Define Summarization Function**

In [11]:
import time
import evaluate
import pandas as pd
import torch

# Take only first 5 rows
df_newsum_small = df_newsum.iloc[:5].copy()

# Summarization function (fast settings)
def summarize_with_primera_fast(text):
    inputs = tokenizer_primera(
        text,
        return_tensors="pt",
        truncation=True,
        padding="longest",
        max_length=2048
    ).to(device)

    summary_ids = model_primera.generate(
        inputs["input_ids"],
        min_length=30,
        max_length=120,
        length_penalty=2.0,
        num_beams=4
    )

    return tokenizer_primera.decode(summary_ids[0], skip_special_tokens=True)

**✏️Step 3: Generate Summaries with allenai/PRIMERA**

In [13]:
# Generate predictions testing purpose
start_time = time.time()
primera_newsum_preds = [summarize_with_primera_fast(article) for article in df_newsum_small["Article"]]
primera_newsum_refs = df_newsum_small["Summary"].tolist()
inference_time = round(time.time() - start_time, 2)
print(f"⏱ Inference Time (NewsSum - PRIMERA, 5 rows): {inference_time} seconds")

⏱ Inference Time (NewsSum - PRIMERA, 5 rows): 289.82 seconds


**✏️Step 3: Evaluate with ROUGE and BERTScore**

In [14]:
#  Evaluate
rouge = evaluate.load("rouge")
bertscore = evaluate.load("bertscore")

rouge_results = rouge.compute(predictions=primera_newsum_preds, references=primera_newsum_refs)
bertscore_results = bertscore.compute(predictions=primera_newsum_preds, references=primera_newsum_refs, lang="en")
bert_f1 = round(sum(bertscore_results["f1"]) / len(bertscore_results["f1"]), 4)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  return forward_call(*args, **kwargs)


**💾 Step 4: Save Evaluation Scores to CSV**

In [15]:
# Save scores
primera_newsum_scores = {
    "Dataset": ["NewsSum"],
    "Model": ["PRIMERA"],
    "ROUGE-1": [rouge_results["rouge1"]],
    "ROUGE-2": [rouge_results["rouge2"]],
    "ROUGE-L": [rouge_results["rougeL"]],
    "BERTScore": [bert_f1],
    "Inference Time (s)": [inference_time],
    "GPU Used": [torch.cuda.get_device_name(0) if torch.cuda.is_available() else "CPU"]
}

primera_newsum_scores_df = pd.DataFrame(primera_newsum_scores)
primera_newsum_scores_df.to_csv("primera_newsum_scores.csv", index=False)

print("✅ PRIMERA NewsSum scores saved to primera_newsum_scores.csv")
primera_newsum_scores_df

✅ PRIMERA NewsSum scores saved to primera_newsum_scores.csv


Unnamed: 0,Dataset,Model,ROUGE-1,ROUGE-2,ROUGE-L,BERTScore,Inference Time (s),GPU Used
0,NewsSum,PRIMERA,0.376837,0.342666,0.356498,0.8777,289.82,CPU
