<a href="https://colab.research.google.com/github/Songram-Biswas/bart-text-summarization/blob/main/main.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers datasets accelerate
# 'evaluate' and 'scikit-learn' are not installed: this notebook only fine-tunes the model and prints one sample summary; it does not compute ROUGE or any other metric.

from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    TrainingArguments,
    Trainer,
    DataCollatorForSeq2Seq # Import the correct collator for Seq2Seq tasks
)

# 1. Fetch the CNN/DailyMail summarization corpus (configuration "3.0.0").
print("Loading CNN/DailyMail dataset...")
dataset = load_dataset("cnn_dailymail", name="3.0.0")

# 2. Build the tokenizer and the seq2seq model from one shared checkpoint.
checkpoint = "facebook/bart-base"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

# --- Seq2Seq batch collator ---
# Pads both the encoder inputs and the label sequences when batching.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)


# 3. Preprocess (Article to Highlights)
def preprocess_function(examples):
    """Tokenize a batch of articles and attach tokenized highlights as labels.

    Args:
        examples: Batch mapping that provides "article" (source text) and
            "highlights" (reference summary) entries.

    Returns:
        The tokenizer output for the articles, truncated to 512 tokens, with
        an extra "labels" entry holding the highlight token ids, truncated
        to 128 tokens.
    """
    # Encoder side: the article text.
    encoded = tokenizer(examples["article"], max_length=512, truncation=True)

    # Decoder side: the summary is passed through `text_target`.
    target_encoding = tokenizer(
        text_target=examples["highlights"],
        max_length=128,
        truncation=True,
    )

    encoded["labels"] = target_encoding["input_ids"]
    return encoded

# Only the first 1000 training examples are tokenized and used for training.
print("Tokenizing and preprocessing 1000 training examples...")
train_split = dataset["train"]
train_subset = train_split.select(range(1000))
tokenized_datasets = train_subset.map(
    preprocess_function,
    batched=True,
    # Drop the raw text columns so only the tokenized fields remain.
    remove_columns=train_split.column_names,
)

# 4. Training
import torch  # local import: only needed to detect whether a CUDA GPU is present

# Request mixed precision only when a CUDA device is available. Hard-coding
# fp16=True makes TrainingArguments fail on a CPU-only runtime before any
# training step runs.
use_fp16 = torch.cuda.is_available()

args = TrainingArguments(
    output_dir="results-sum",
    eval_strategy="no",  # no evaluation dataset is supplied, so skip evaluation
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    num_train_epochs=1,
    weight_decay=0.01,
    save_total_limit=2,
    fp16=use_fp16,
    report_to="none",
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_datasets,
    # NOTE(review): newer transformers releases rename this argument to
    # `processing_class` — confirm against the installed version.
    tokenizer=tokenizer,
    data_collator=data_collator,  # Seq2Seq collator built above
)

print("\n--- Starting Fine-Tuning ---\n")
trainer.train()

# 5. Test summarization on the first example of the test split
sample = dataset["test"][0]
text = sample["article"]

# Put the model in inference mode so dropout is disabled while generating.
model.eval()

# Tokenize with the same 512-token limit used for training inputs and move
# the tensors to the device the model lives on.
inputs = tokenizer(text, return_tensors="pt", max_length=512, truncation=True).to(model.device)
summary_ids = model.generate(
    **inputs,  # forwards the attention mask together with the input ids
    max_length=80,
    min_length=30,
    length_penalty=2.0,
    num_beams=4,  # beam search instead of greedy decoding
)
print("\n--- Generated Summary ---\n")
print("Article (First 400 chars):", text[:400])
print("\nReference Summary:", sample["highlights"])
print("\nGenerated Summary:", tokenizer.decode(summary_ids[0], skip_special_tokens=True))