In [26]:
import pandas as pd
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
from datasets import Dataset
import evaluate
from bs4 import BeautifulSoup
import nltk
import re

# Prefer the GPU whenever PyTorch can see one; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print("Device:", device)

# Fetch the NLTK "punkt" data; the call's return value is the cell's output.
nltk.download("punkt")

Device: cuda


[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [28]:
# Directory that holds the three CNN/DailyMail CSV splits; edit here to relocate the data.
DATA_DIR = "/kaggle/input/newspaper-text-summarization-cnn-dailymail/cnn_dailymail"
# Rows missing either of these columns are dropped when each split is loaded.
REQUIRED_COLUMNS = ["article", "highlights"]

train_df = pd.read_csv(f"{DATA_DIR}/train.csv").dropna(subset=REQUIRED_COLUMNS)
val_df   = pd.read_csv(f"{DATA_DIR}/validation.csv").dropna(subset=REQUIRED_COLUMNS)
test_df  = pd.read_csv(f"{DATA_DIR}/test.csv").dropna(subset=REQUIRED_COLUMNS)

# Optional: quick preview of the first rows.
# dropna() keeps the original row labels, so a label lookup such as
# train_df["article"][0] raises KeyError whenever that row was dropped.
# Positional access via .iloc always addresses the rows that actually remain,
# and the range is bounded so a tiny frame does not raise IndexError.
for i in range(min(3, len(train_df))):
    row = train_df.iloc[i]
    print(f"\n--- Article {i} ---\nArticle length: {len(row['article'])}")
    print(f"Highlights length: {len(row['highlights'])}")


--- Article 0 ---
Article length: 1211
Highlights length: 220

--- Article 1 ---
Article length: 2544
Highlights length: 223

--- Article 2 ---
Article length: 4743
Highlights length: 390


In [29]:
def clean_text(text):
    """Return ``text`` with markup, links and redundant whitespace removed.

    The text is parsed with BeautifulSoup's "html.parser" and reduced to its
    plain text, every run starting with ``http`` or ``www`` (up to the next
    whitespace) is deleted, and each whitespace run is collapsed to a single
    space before the ends are trimmed.
    """
    without_markup = BeautifulSoup(text, "html.parser").get_text()
    without_links = re.sub(r"http\S+|www\S+", "", without_markup)
    collapsed = re.sub(r"\s+", " ", without_links)
    return collapsed.strip()

# Run clean_text over both text columns of every split, writing the result
# back into the same frame.
for df in (train_df, val_df, test_df):
    for column in ("article", "highlights"):
        df[column] = df[column].map(clean_text)

In [36]:
from transformers import pipeline
from evaluate import load
import torch

# Wrap the model and tokenizer in a summarization pipeline. No device argument
# is passed to the pipeline.
# NOTE(review): `model` and `tokenizer` are not defined in any cell visible
# here — they must come from an earlier cell; confirm on a fresh kernel.
summarizer = pipeline(task="summarization", model=model, tokenizer=tokenizer)

# ROUGE scorer obtained through `evaluate.load`.
rouge_metric = load("rouge")

def compute_rouge(dataset, n_samples=100, max_input_length=512):
    """Score generated summaries against reference highlights with ROUGE.

    Summarises the first ``n_samples`` rows of ``dataset`` one at a time using
    the notebook-level ``model`` and ``tokenizer`` and scores them with
    ``rouge_metric``. Rows whose generated summary is blank are reported and
    left out of the score.

    Note: a later cell defines another ``compute_rouge(df, n_samples)`` that
    replaces this one once it has been run.

    Args:
        dataset: Indexable collection where ``dataset[i]`` yields a mapping
            with string entries under "article" and "highlights".
        n_samples: Upper bound on the number of rows evaluated.
        max_input_length: Token limit applied (with truncation) when the
            article is tokenized.

    Returns:
        dict mapping each metric name returned by ``rouge_metric.compute`` to
        its value multiplied by 100, or ``{}`` when no non-empty summary was
        produced.
    """
    preds, refs = [], []

    for i in range(min(n_samples, len(dataset))):
        # Fetch the row once instead of materialising it for every field.
        row = dataset[i]
        text = row["article"]
        ref = row["highlights"]

        # Truncate to the requested input length and place the tensors on the
        # same device as the model: the tokenizer always returns CPU tensors,
        # and generate() fails if the model lives on the GPU.
        inputs = tokenizer(
            text,
            max_length=max_input_length,
            truncation=True,
            return_tensors="pt"
        ).to(model.device)

        summary_ids = model.generate(
            **inputs,
            max_length=150,
            min_length=40,
            length_penalty=2.0,
            num_beams=4,
            early_stopping=True
        )
        summary_text = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

        if summary_text.strip():  # only keep non-empty
            preds.append(summary_text.strip())
            refs.append(ref.strip())
        else:
            print(f"Skipping article {i} due to empty summary")

    if not preds:
        print("No valid summaries to compute ROUGE.")
        return {}

    # Compute ROUGE
    results = rouge_metric.compute(predictions=preds, references=refs)

    # Convert to % for readability
    results_percent = {k: v * 100 for k, v in results.items()}
    return results_percent

# Run evaluation on at most the first 10 rows as a quick smoke test.
# NOTE(review): `test_data` is not defined in any cell visible here —
# presumably built from test_df in an earlier cell; confirm it exists on a
# fresh kernel.
rouge_scores = compute_rouge(test_data, n_samples=10)  # small test set
print("ROUGE Scores:", rouge_scores)

Device set to use cpu


ROUGE Scores: {'rouge1': 39.03897245511171, 'rouge2': 16.677957020145957, 'rougeL': 26.647338042074885, 'rougeLsum': 32.938090097749914}


In [41]:
# Limits read by safe_summarize.
# Truncation length handed to the tokenizer for the input article.
# NOTE(review): the earlier compute_rouge cell truncates at 512 tokens —
# confirm 1024 does not exceed the loaded model's maximum input length.
MAX_INPUT_TOKENS = 1024
# Passed to model.generate as max_length and min_length respectively.
MAX_SUMMARY_TOKENS = 150
MIN_SUMMARY_TOKENS = 40

def safe_summarize(text):
    """Generate a summary for ``text``, returning "" instead of raising.

    The article is tokenized with truncation at ``MAX_INPUT_TOKENS``, moved to
    the device the model lives on, and summarised with beam search bounded by
    ``MIN_SUMMARY_TOKENS`` and ``MAX_SUMMARY_TOKENS``.

    Args:
        text: Article text to summarise.

    Returns:
        The decoded summary string. An empty string is returned when ``text``
        is blank, or when tokenization or generation raises; in the latter
        case the error is printed so the caller can keep going.
    """
    # A blank article has nothing to summarise; skip the model call rather
    # than let it generate text from an empty prompt.
    if isinstance(text, str) and not text.strip():
        return ""

    try:
        # The tokenizer returns CPU tensors; generate() needs them on the same
        # device as the model, which may be the GPU.
        inputs = tokenizer(
            text,
            max_length=MAX_INPUT_TOKENS,
            truncation=True,
            return_tensors="pt"
        ).to(model.device)

        summary_ids = model.generate(
            **inputs,
            max_length=MAX_SUMMARY_TOKENS,
            min_length=MIN_SUMMARY_TOKENS,
            length_penalty=2.0,
            num_beams=4,
            early_stopping=True
        )
        return tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    except Exception as e:
        # Deliberate best effort: one bad article must not abort a whole
        # evaluation run, so report it and hand back an empty summary.
        print(f"Skipping article due to error: {e}")
        return ""

In [42]:
# BONUS: ROUGE evaluation built on safe_summarize.

# Load the ROUGE metric through the `evaluate` package.
rouge = evaluate.load("rouge")

def compute_rouge(df, n_samples=100):
    """Compute ROUGE for summaries of the leading rows of ``df``.

    Each of the first ``n_samples`` rows (fewer if ``df`` is shorter) has its
    "article" summarised with ``safe_summarize``. Rows that come back with a
    blank summary are left out; the remaining summaries are scored against
    the matching "highlights" with the notebook-level ``rouge`` metric.

    Args:
        df: DataFrame with "article" and "highlights" columns.
        n_samples: Upper bound on the number of rows evaluated.

    Returns:
        dict of metric name -> score multiplied by 100, or ``{}`` when no
        usable summary was produced.
    """
    row_count = min(n_samples, len(df))
    generated, targets = [], []

    for position in range(row_count):
        record = df.iloc[position]
        source_text, target_text = record["article"], record["highlights"]
        candidate = safe_summarize(source_text)
        if not candidate.strip():
            # blank summary: leave this row out of the score
            continue
        generated.append(candidate)
        targets.append(target_text)

    if not generated:
        print("No valid summaries to compute ROUGE.")
        return {}

    scores = rouge.compute(predictions=generated, references=targets)
    return dict((metric, value * 100) for metric, value in scores.items())

# Run evaluation on the test set: compute_rouge looks at no more than the
# first 100 rows of test_df.
rouge_scores = compute_rouge(test_df, n_samples=100)
print("ROUGE Scores:", rouge_scores)

ROUGE Scores: {'rouge1': 35.27787474858648, 'rouge2': 15.045180398223785, 'rougeL': 24.398642095880675, 'rougeLsum': 24.51258623335181}


In [43]:
# Show the opening 400 characters of the first three test articles, each
# followed by the summary safe_summarize produces for it.
test_articles = test_df["article"]
for i in range(3):
    article = test_articles.iloc[i]
    summary = safe_summarize(article)
    print(
        f"\n--- ARTICLE {i} ---\n{article[:400]}...\n",
        f"--- SUMMARY ---\n{summary}\n",
        sep="\n",
    )


--- ARTICLE 0 ---
Ever noticed how plane seats appear to be getting smaller and smaller? With increasing numbers of people taking to the skies, some experts are questioning if having such packed out planes is putting passengers at risk. They say that the shrinking space on aeroplanes is not only uncomfortable - it's putting our health and safety in danger. More than squabbling over the arm rest, shrinking space on ...

--- SUMMARY ---
, while some airlines offer as little as 28 inches of space. some experts are questioning if shrinking space on planes is putting our health and safety in danger. experts say that shrinking space on planes is not only uncomfortable - it's putting our health and safety in danger.


--- ARTICLE 1 ---
A drunk teenage boy had to be rescued by security after jumping into a lions' enclosure at a zoo in western India. Rahul Kumar, 17, clambered over the enclosure fence at the Kamla Nehru Zoological Park in Ahmedabad, and began running towards the animals, shout