# 📚Notebook 5: NewsSum (Indian Nesws Paper)  Evaluation 

# 🤖ON T5, PEGASUS & BART

**✏️ Step 1: Setup & Load Pretrained Models**

In [53]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from evaluate import load
import torch
import pandas as pd

# Load evaluation metrics
rouge = load("rouge")
bertscore = load("bertscore")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

**✏️ Step 2: Load NewsSum Dataset**

In [54]:
import pandas as pd

# Load the Excel file
df_newsum = pd.read_excel("NewsSum.xlsx")

#Inspect the first few rows
df_newsum.head()

Unnamed: 0,Index,Article_id,Headline,Human Summerry,Published Date,URL of News Article,Article,Category
0,1,INDEXP19988,Elephant death brings to fore man-animal confl...,Thousands of farmers in Kerala have either aba...,"June 8, 2020 7:28:42 am",https://indianexpress.com/article/india/elepha...,The death of a pregnant elephant in the buffer...,Local News
1,2,INDEXP19987,Cases filed after two â€˜commit suicideâ€™ in ...,"In the first incident, a 30-year-old woman all...","June 8, 2020 2:06:22 am",https://indianexpress.com/article/india/cases-...,Two suicides were reported from Vadodara and D...,Crime and Justice
2,3,INDEXP19986,Woman alleges father tied to MP hospital bed o...,"The hospital denied the allegation, saying the...","June 8, 2020 2:05:16 am",https://indianexpress.com/article/india/woman-...,A day after a woman alleged that her father ha...,Health and Wellness
3,4,INDEXP19985,"Sena member, author, app designer â€“ the many...","On Saturday, Vaze along with police constables...","June 8, 2020 7:30:15 am",https://indianexpress.com/article/india/sena-m...,"Assistant police inspector Sachin Vaze, who wa...",Defense
4,5,INDEXP19984,"Manager, owner of resort where Gujarat Congres...",The resort is reportedly owned by Indranil Raj...,"June 8, 2020 7:19:53 am",https://indianexpress.com/article/india/manage...,"The manager and owner of a resort in Rajkot, w...",Politics


In [55]:
df_newsum.columns

Index(['Index', 'Article_id', 'Headline', 'Human Summerry', 'Published Date',
       'URL of News Article', 'Article', 'Category'],
      dtype='object')

In [56]:
print("📊 NewsSum Dataset Shape:", df_newsum.shape)

📊 NewsSum Dataset Shape: (1003, 8)


**✏️ Step 3: Clean, Rename & Save**

In [57]:
# Step 1: Copy the selected columns safely
df_newsum = df_newsum[['Headline', 'Article', 'Category', 'Human Summerry']].copy()

# Step 2: Rename column
df_newsum.rename(columns={'Human Summerry': 'Summary'}, inplace=True)

# Step 3: Drop rows with missing values
df_newsum.dropna(subset=['Headline', 'Article', 'Category', 'Summary'], inplace=True)

In [58]:
df_newsum.head()

Unnamed: 0,Headline,Article,Category,Summary
0,Elephant death brings to fore man-animal confl...,The death of a pregnant elephant in the buffer...,Local News,Thousands of farmers in Kerala have either aba...
1,Cases filed after two â€˜commit suicideâ€™ in ...,Two suicides were reported from Vadodara and D...,Crime and Justice,"In the first incident, a 30-year-old woman all..."
2,Woman alleges father tied to MP hospital bed o...,A day after a woman alleged that her father ha...,Health and Wellness,"The hospital denied the allegation, saying the..."
3,"Sena member, author, app designer â€“ the many...","Assistant police inspector Sachin Vaze, who wa...",Defense,"On Saturday, Vaze along with police constables..."
4,"Manager, owner of resort where Gujarat Congres...","The manager and owner of a resort in Rajkot, w...",Politics,The resort is reportedly owned by Indranil Raj...


**✏️ Step 4: Clean Category Values**

In [59]:
# Count how many times each category appears
df_newsum['Category'].value_counts()

Health and Wellness     303
Politics                192
Crime and Justice       132
 Local News             101
 National News           56
Education                48
Defense                  43
International News       41
National News            39
Business and Finance     27
 Environment             14
Sports                    7
Name: Category, dtype: int64

In [60]:
# Step: Combine rare categories into "Other"
category_counts = df_newsum['Category'].value_counts()
rare_categories = category_counts[category_counts < 30].index

df_newsum['Category'] = df_newsum['Category'].replace(rare_categories, 'Other')

In [61]:
print("✅ Updated Category Counts:")
print(df_newsum['Category'].value_counts())

✅ Updated Category Counts:
Health and Wellness    303
Politics               192
Crime and Justice      132
 Local News            101
 National News          56
Other                   48
Education               48
Defense                 43
International News      41
National News           39
Name: Category, dtype: int64


**✏️Step 5: Save Cleaned Dataset**

In [62]:
# Save cleaned and simplified NewsSum dataset
df_newsum.to_csv("newsum_cleaned.csv", index=False)
print("✅ Cleaned NewsSum dataset saved as 'newsum_cleaned.csv'")

✅ Cleaned NewsSum dataset saved as 'newsum_cleaned.csv'


In [63]:
import pandas as pd

# Load the cleaned NewsSum dataset
df_newsum = pd.read_csv("newsum_cleaned.csv")

# Display shape and first few rows
print("✅ NewsSum Shape:", df_newsum.shape)
df_newsum.head()

✅ NewsSum Shape: (1003, 4)


Unnamed: 0,Headline,Article,Category,Summary
0,Elephant death brings to fore man-animal confl...,The death of a pregnant elephant in the buffer...,Local News,Thousands of farmers in Kerala have either aba...
1,Cases filed after two â€˜commit suicideâ€™ in ...,Two suicides were reported from Vadodara and D...,Crime and Justice,"In the first incident, a 30-year-old woman all..."
2,Woman alleges father tied to MP hospital bed o...,A day after a woman alleged that her father ha...,Health and Wellness,"The hospital denied the allegation, saying the..."
3,"Sena member, author, app designer â€“ the many...","Assistant police inspector Sachin Vaze, who wa...",Defense,"On Saturday, Vaze along with police constables..."
4,"Manager, owner of resort where Gujarat Congres...","The manager and owner of a resort in Rajkot, w...",Politics,The resort is reportedly owned by Indranil Raj...


# 🚀  Evaluate T5, PEGASUS, and BART on NewsSum

In [64]:
from transformers import (
    T5Tokenizer, T5ForConditionalGeneration,
    PegasusTokenizer, PegasusForConditionalGeneration,
    BartTokenizer, BartForConditionalGeneration
)
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# T5
t5_tokenizer = T5Tokenizer.from_pretrained("t5-base")
t5_model = T5ForConditionalGeneration.from_pretrained("t5-base").to(device)

# PEGASUS
pegasus_tokenizer = PegasusTokenizer.from_pretrained("google/pegasus-xsum")
pegasus_model = PegasusForConditionalGeneration.from_pretrained("google/pegasus-xsum").to(device)

# BART
bart_tokenizer = BartTokenizer.from_pretrained("facebook/bart-large-cnn")
bart_model = BartForConditionalGeneration.from_pretrained("facebook/bart-large-cnn").to(device)


Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-xsum and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


**✅ Preparation**

**✅ Define Summary Functions:T5**

In [65]:
def generate_summary_with_t5(text):
    input_text = "summarize: " + text
    inputs = t5_tokenizer(input_text, return_tensors="pt", max_length=512, truncation=True).to(device)

    summary_ids = t5_model.generate(
        inputs["input_ids"],
        max_length=60,
        min_length=10,
        length_penalty=2.0,
        num_beams=4,
        early_stopping=True
    )

    return t5_tokenizer.decode(summary_ids[0], skip_special_tokens=True)

**✅ Define Summary Functions:PEGASUS**

In [66]:
def generate_summary_with_pegasus(text):
    inputs = pegasus_tokenizer(text, return_tensors="pt", max_length=1024, truncation=True).to(device)

    summary_ids = pegasus_model.generate(
        inputs["input_ids"],
        max_length=60,
        min_length=10,
        length_penalty=2.0,
        num_beams=4,
        early_stopping=True
    )

    return pegasus_tokenizer.decode(summary_ids[0], skip_special_tokens=True)

**✅ Define Summary Functions:BART**

In [67]:
def generate_summary_with_bart(text):
    inputs = bart_tokenizer(text, return_tensors="pt", max_length=1024, truncation=True).to(device)

    summary_ids = bart_model.generate(
        inputs["input_ids"],
        max_length=60,
        min_length=10,
        length_penalty=2.0,
        num_beams=4,
        early_stopping=True
    )

    return bart_tokenizer.decode(summary_ids[0], skip_special_tokens=True)

**✅ Summaries Generation with T5**

In [68]:
sample_articles = df_newsum["Article"][:3]

In [69]:
def summarize_with_t5(text):
    inputs = t5_tokenizer(text, return_tensors="pt", max_length=512, truncation=True).to(device)
    summary_ids = t5_model.generate(inputs["input_ids"], max_length=60, min_length=10, length_penalty=2.0, num_beams=4)
    return t5_tokenizer.decode(summary_ids[0], skip_special_tokens=True)


In [70]:
sample_articles = df_newsum["Article"][:3]

for i, article in enumerate(sample_articles):
    print(f"\n📰 NewsSum Article #{i+1}:\n", article[:400], "...\n")
    
    t5_summary = summarize_with_t5(article)
    print(f"🧠 T5 Summary #{i+1}:\n", t5_summary)

    print("-" * 100)


📰 NewsSum Article #1:
 The death of a pregnant elephant in the buffer zone of Silent Valley National Park in Keralaâ€™s Palakkad district, after the pachyderm allegedly bit into a coconut filled with firecrackers, has brought to the forefront the stateâ€™s growing, unresolved challenge of managing man-animal conflicts. Thousands of farmers in Kerala have either abandoned cultivation or have stopped nursing their farm la ...

🧠 T5 Summary #1:
 7,229 in 2017-18.â€ Read | Death of an elephant in Silent Valley National Park in Kerala . During the same period, 416 wild elephants died in Kerala, with 24 deaths attributed to â€unnatural causesâ€
----------------------------------------------------------------------------------------------------

📰 NewsSum Article #2:
 Two suicides were reported from Vadodara and Dahod each on the intervening night of Saturday and Sunday. Police have registered a case of accidental death in both cases. In the first incident, a 30-year-old woman allegedly hang

**✅ Summaries Generation with PEGASUS**

In [71]:
# PEGASUS Summary Function
def summarize_with_pegasus(text):
    inputs = pegasus_tokenizer(text, truncation=True, padding="longest", return_tensors="pt").to(device)
    summary_ids = pegasus_model.generate(inputs["input_ids"], max_length=60, min_length=10, length_penalty=2.0, num_beams=4)
    return pegasus_tokenizer.decode(summary_ids[0], skip_special_tokens=True)

In [72]:
sample_articles = df_newsum["Article"][:3]

for i, article in enumerate(sample_articles):
    print(f"\n📰 NewsSum Article #{i+1}:\n", article[:400], "...\n")
    
    pegasus_summary = summarize_with_pegasus(article)
    print(f"🦅 PEGASUS Summary #{i+1}:\n", pegasus_summary)

    
    print("-" * 120)


📰 NewsSum Article #1:
 The death of a pregnant elephant in the buffer zone of Silent Valley National Park in Keralaâ€™s Palakkad district, after the pachyderm allegedly bit into a coconut filled with firecrackers, has brought to the forefront the stateâ€™s growing, unresolved challenge of managing man-animal conflicts. Thousands of farmers in Kerala have either abandoned cultivation or have stopped nursing their farm la ...

🦅 PEGASUS Summary #1:
 In our series of letters from African journalists, film-maker and columnist M Ilyas Kashmiri looks at the growing menace of man-animal conflicts in Kerala.
------------------------------------------------------------------------------------------------------------------------

📰 NewsSum Article #2:
 Two suicides were reported from Vadodara and Dahod each on the intervening night of Saturday and Sunday. Police have registered a case of accidental death in both cases. In the first incident, a 30-year-old woman allegedly hanged herself from a t

**✅ Summaries Generation with BART**

In [28]:
# BART Summary Function
def summarize_with_bart(text):
    inputs = bart_tokenizer(
        text,
        truncation=True,             # Prevent going beyond limit
        max_length=1024,             # BART's max input length
        padding="longest",
        return_tensors="pt"
    ).to(device)

    summary_ids = bart_model.generate(
        inputs["input_ids"],
        max_length=60,
        min_length=10,
        length_penalty=2.0,
        num_beams=4,
        early_stopping=True
    )
    return bart_tokenizer.decode(summary_ids[0], skip_special_tokens=True)

In [29]:
sample_articles = df_newsum["Article"][:3]

for i, article in enumerate(sample_articles):
    print(f"\n📰 NewsSum Article #{i+1}:\n", article[:400], "...\n")

    
    bart_summary = summarize_with_bart(article)
    print(f"🧠 BART Summary #{i+1}:\n", bart_summary)
   

    print("-" * 120)


📰 NewsSum Article #1:
 The death of a pregnant elephant in the buffer zone of Silent Valley National Park in Keralaâ€™s Palakkad district, after the pachyderm allegedly bit into a coconut filled with firecrackers, has brought to the forefront the stateâ€™s growing, unresolved challenge of managing man-animal conflicts. Thousands of farmers in Kerala have either abandoned cultivation or have stopped nursing their farm la ...

🧠 BART Summary #1:
 Thousands of farmers in Kerala have either abandoned cultivation or have stopped nursing their farm lands. The number of incidents of human-animal conflict is increasing year by year. In 2018-19, as many as 7,890 incidents were reported, whereas it was 7,229 in 2017-
------------------------------------------------------------------------------------------------------------------------

📰 NewsSum Article #2:
 Two suicides were reported from Vadodara and Dahod each on the intervening night of Saturday and Sunday. Police have registered a case of

# 🎯Evaluate Model Summaries (ROUGE + BERTScore)

**🦅Evaluate_function**

In [73]:
from evaluate import load

# Load evaluation metrics
rouge = load("rouge")
bertscore = load("bertscore")

# Global result list (define only once)
summary_results = []

# ✅ Define evaluation function
def evaluate_metrics(dataset_name, predictions, references):
    # ROUGE
    rouge_scores = rouge.compute(predictions=predictions, references=references, use_stemmer=True)
    
    # BERTScore
    bert_scores = bertscore.compute(predictions=predictions, references=references, lang="en")
    avg_bertscore = sum(bert_scores["f1"]) / len(bert_scores["f1"])
    
    # Save results
    summary_results.append({
        "Dataset": dataset_name,
        "ROUGE-1": round(rouge_scores["rouge1"], 4),
        "ROUGE-2": round(rouge_scores["rouge2"], 4),
        "ROUGE-L": round(rouge_scores["rougeL"], 4),
        "BERTScore": round(avg_bertscore, 4)
    })


**🦅Evaluation_T5**

In [74]:
t5_preds = [summarize_with_t5(article) for article in df_newsum["Article"][:3]]
t5_refs = df_newsum["Summary"][:3].tolist()

In [75]:
evaluate_metrics("NewsSum-T5", t5_preds, t5_refs)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  return forward_call(*args, **kwargs)


**🦅Evaluation_PEGASUS**

In [76]:
pegasus_preds = [summarize_with_t5(article) for article in df_newsum["Article"][:3]]
pegasus_refs = df_newsum["Summary"][:3].tolist()

In [77]:
evaluate_metrics("NewsSum-PEGASUS", pegasus_preds, pegasus_refs)

**🦅Evaluation_BART**

In [78]:
bart_preds = [summarize_with_t5(article) for article in df_newsum["Article"][:3]]
bart_refs = df_newsum["Summary"][:3].tolist()

In [79]:
evaluate_metrics("NewsSum-BART", bart_preds, bart_refs)

**📊 Final Evaluation Summary**

In [80]:
import pandas as pd

# Create a DataFrame from the collected results
df_summary = pd.DataFrame(summary_results)

print("📊 Final Evaluation Summary on NewsSum Dataset")
print(df_summary.to_string(index=False))

📊 Final Evaluation Summary on NewsSum Dataset
        Dataset  ROUGE-1  ROUGE-2  ROUGE-L  BERTScore
     NewsSum-T5   0.3317   0.1949   0.2949     0.8533
NewsSum-PEGASUS   0.3317   0.1949   0.2949     0.8533
   NewsSum-BART   0.3317   0.1949   0.2949     0.8533


In [81]:
df_newsum.to_csv("newsum_cleaned.csv", index=False)

💾 Save the Scores to .CSV Files

**So that we can use to comapair models in different NoteBooks**

In [82]:
df_summary = pd.DataFrame(summary_results)
df_summary.to_csv("newsum_model_scores.csv", index=False)