In [None]:
!pip install datasets transformers evaluate rouge_score

In [None]:
import os
import pandas as pd
import torch
from datasets import load_dataset, Dataset
from transformers import PegasusTokenizer, PegasusForConditionalGeneration, Trainer, TrainingArguments
import evaluate

In [None]:
# Switch off the Weights & Biases integration through its environment flag.
os.environ.update({"WANDB_DISABLED": "true"})

In [None]:
# Load the training CSV and build the Hugging Face dataset used for fine-tuning.
df = pd.read_csv("/content/train.csv")  # Replace with your actual file name

# preprocess_data reads these two text columns; fail early with a clear message
# if the CSV does not provide them instead of erroring deep inside tokenization.
required_columns = ["article", "highlights"]
missing_columns = [col for col in required_columns if col not in df.columns]
if missing_columns:
    raise KeyError(f"Input CSV is missing required column(s): {missing_columns}")

# Drop rows with a missing article or summary before taking the subset: pandas
# represents missing text as NaN (a float), which the tokenizer cannot encode.
df_subset = df.dropna(subset=required_columns).head(10000)  # Use a subset if needed for memory constraints

# preserve_index=False keeps the (possibly non-contiguous after dropna) pandas
# index from being added to the dataset as an extra column.
dataset = Dataset.from_pandas(df_subset, preserve_index=False)

In [None]:
# Checkpoint to start from; the tokenizer and the model weights are both
# loaded from this same identifier.
model_name = "google/pegasus-cnn_dailymail"
tokenizer = PegasusTokenizer.from_pretrained(pretrained_model_name_or_path=model_name)
model = PegasusForConditionalGeneration.from_pretrained(pretrained_model_name_or_path=model_name)

In [None]:
def preprocess_data(examples):
    """Tokenize a batch of articles and summaries for seq2seq training.

    Args:
        examples: A batch mapping with an 'article' list (source texts) and a
            'highlights' list (target summaries) of equal length.

    Returns:
        The tokenized article batch (truncated/padded to 512 tokens) with an
        added "labels" entry holding the summary token ids (truncated/padded
        to 128 positions). Padding positions in "labels" are set to -100.
    """
    model_inputs = tokenizer(examples['article'], max_length=512, truncation=True, padding="max_length")
    targets = tokenizer(examples['highlights'], max_length=128, truncation=True, padding="max_length")

    # Replace pad ids in the labels with -100, the index the cross-entropy loss
    # ignores. Without this the model is also trained to predict padding.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [token_id if token_id != pad_id else -100 for token_id in label_ids]
        for label_ids in targets["input_ids"]
    ]
    return model_inputs

In [None]:
# Run preprocess_data over the dataset in batches to add the tokenized inputs
# and labels.
tokenized_dataset = dataset.map(function=preprocess_data, batched=True)

In [None]:
# `evaluation_strategy` was renamed to `eval_strategy` in newer transformers
# releases. Use whichever name the installed TrainingArguments declares so this
# cell works with an unpinned transformers install.
eval_strategy_key = (
    "eval_strategy"
    if "eval_strategy" in TrainingArguments.__dataclass_fields__
    else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=1,           # Adjust for memory
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=4,           # 1 x 4 = effective train batch of 4 per device
    learning_rate=2e-5,
    num_train_epochs=3,
    weight_decay=0.01,
    fp16=torch.cuda.is_available(),          # Mixed precision only when a CUDA GPU is present
    report_to="none",
    **{eval_strategy_key: "epoch"},          # Evaluate at the end of every epoch
)

In [None]:
# Hold out 10% of the examples so the per-epoch evaluation is measured on data
# the model is not trained on. The fixed seed keeps the split reproducible.
dataset_splits = tokenized_dataset.train_test_split(test_size=0.1, seed=42)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset_splits["train"],
    eval_dataset=dataset_splits["test"],
)

In [None]:
# Run the fine-tuning loop of the Trainer built in the previous cell.
trainer.train()

In [None]:
# Persist the fine-tuned weights and the tokenizer files into the same
# directory so the pair can be reloaded together.
model.save_pretrained(save_directory="./my_cnn_dailymail_pegasus_model")
tokenizer.save_pretrained(save_directory="./my_cnn_dailymail_pegasus_model")

In [None]:
def summarize(text):
    """Generate a summary of ``text`` with the notebook's model and tokenizer.

    Args:
        text: The document to summarize, as a single string. Input beyond
            512 tokens is truncated.

    Returns:
        The decoded summary string with special tokens removed. Generation
        uses beam search (4 beams) and is capped at ``max_length=128``.
    """
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model.to(device)
    # Inference must run with dropout disabled; the model can still be in
    # training mode depending on how the Trainer last left it.
    model.eval()

    inputs = tokenizer([text], max_length=512, return_tensors="pt", truncation=True).to(device)
    # Forward the attention mask explicitly instead of letting generate() infer it.
    summary_ids = model.generate(
        inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_length=128,
        num_beams=4,
        early_stopping=True,
    )
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

In [None]:
# Quick smoke test of summarize() on a short hand-written passage.
sample_text = "The Mars rover Perseverance has been exploring the Red Planet for almost a year..."
print(f"Summary: {summarize(sample_text)}")

In [None]:
# Load the ROUGE metric once; evaluate_rouge() uses this module-level object.
rouge = evaluate.load(path="rouge")

In [None]:
def evaluate_rouge(model, tokenizer, dataset, num_samples=100):
    """Generate summaries for the first ``num_samples`` examples and score them with ROUGE.

    Args:
        model: Seq2seq model exposing ``generate``; it is used on whatever
            device its parameters already live on.
        tokenizer: Tokenizer matching ``model``.
        dataset: Iterable of examples, each with an "article" (source text)
            and a "highlights" (reference summary) entry.
        num_samples: Maximum number of examples to evaluate.

    Returns:
        The result of ``rouge.compute`` for the generated summaries against
        the reference summaries.
    """
    # Send inputs to the model's own device rather than guessing "cuda if
    # available", which breaks when a CPU model is evaluated on a GPU host.
    device = next(model.parameters()).device

    # Evaluate with dropout off, then restore the caller's mode afterwards.
    was_training = model.training
    model.eval()

    predictions = []
    references = []
    try:
        for i, sample in enumerate(dataset):
            if i >= num_samples:
                break

            inputs = tokenizer(
                [sample["article"]], max_length=512, return_tensors="pt", truncation=True
            ).to(device)
            summary_ids = model.generate(
                inputs["input_ids"],
                attention_mask=inputs["attention_mask"],
                max_length=128,
                num_beams=4,
                early_stopping=True,
            )

            predictions.append(tokenizer.decode(summary_ids[0], skip_special_tokens=True))
            references.append(sample["highlights"])
    finally:
        if was_training:
            model.train()

    return rouge.compute(predictions=predictions, references=references)

In [None]:
# Score generated summaries for the first 100 examples.
# NOTE(review): tokenized_dataset is also the data the Trainer cell takes its
# training examples from, so these scores may include examples seen during
# fine-tuning and should not be read as held-out performance.
rouge_results = evaluate_rouge(
    model=model,
    tokenizer=tokenizer,
    dataset=tokenized_dataset,
    num_samples=100,
)
print(f"ROUGE Scores: {rouge_results}")