In [1]:
# Step 1: Install necessary libraries
!pip install datasets transformers evaluate nltk

Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Downloading evaluate-0.4.6-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.6


In [2]:
# Step 1: Install additional library for ROUGE score calculation
!pip install rouge_score

Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=501e8777ce81176f11dd8e53a260012d2f4a01a8bf5d11f903cd78fb1d762234
  Stored in directory: /root/.cache/pip/wheels/85/9d/af/01feefbe7d55ef5468796f0c68225b6788e85d9d0a281e7a70
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2


In [3]:
# Step 2: Import libraries and download NLTK data
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq
import evaluate
import numpy as np
import nltk

nltk.download("punkt")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [4]:
# Step 3: Load the CNN/DailyMail dataset
ds_train = load_dataset("abisee/cnn_dailymail", "3.0.0", split="train[:50]")
ds_val = load_dataset("abisee/cnn_dailymail", "3.0.0", split="validation[:5]")
ds_test = load_dataset("abisee/cnn_dailymail", "3.0.0", split="test[:5]")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

3.0.0/train-00000-of-00003.parquet:   0%|          | 0.00/257M [00:00<?, ?B/s]

3.0.0/train-00001-of-00003.parquet:   0%|          | 0.00/257M [00:00<?, ?B/s]

3.0.0/train-00002-of-00003.parquet:   0%|          | 0.00/259M [00:00<?, ?B/s]

3.0.0/validation-00000-of-00001.parquet:   0%|          | 0.00/34.7M [00:00<?, ?B/s]

3.0.0/test-00000-of-00001.parquet:   0%|          | 0.00/30.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/287113 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/13368 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/11490 [00:00<?, ? examples/s]

In [5]:
# Step 4: Load tokenizer and model
model_checkpoint = "google/flan-t5-small"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/308M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [6]:
# Step 5: Define preprocessing function and tokenize datasets
def preprocess(examples):
    inputs = tokenizer(
        examples["article"], max_length=512, truncation=True, padding="max_length"
    )
    targets = tokenizer(
        examples["highlights"], max_length=128, truncation=True, padding="max_length"
    )

    # Replace padding token ID with -100 in labels
    labels = []
    for label in targets["input_ids"]:
        new_label = [(l if l != tokenizer.pad_token_id else -100) for l in label]
        labels.append(new_label)

    inputs["labels"] = labels
    return inputs

tokenized_train = ds_train.map(preprocess, batched=True)
tokenized_val = ds_val.map(preprocess, batched=True)

Map:   0%|          | 0/50 [00:00<?, ? examples/s]

Map:   0%|          | 0/5 [00:00<?, ? examples/s]

In [7]:
# Step 6: Load evaluation metric and define compute_metrics function
rouge = evaluate.load("rouge")

def compute_metrics(eval_pred):
    preds, labels = eval_pred

    preds = np.array(preds)
    labels = np.array(labels)
    preds = np.clip(preds, 0, tokenizer.vocab_size - 1)
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    labels = np.where(labels == -100, tokenizer.pad_token_id, labels)
    labels = np.clip(labels, 0, tokenizer.vocab_size - 1)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    result = rouge.compute(predictions=decoded_preds, references=decoded_labels)
    return {k: v.mid.fmeasure for k, v in result.items()}

Downloading builder script: 0.00B [00:00, ?B/s]

In [8]:
# Step 7: Disable Weights & Biases logging (optional)
import os
os.environ["WANDB_DISABLED"] = "true"

In [9]:
# Step 8: Configure and initialize Seq2SeqTrainer, then train the model
training_args = Seq2SeqTrainingArguments(
    output_dir="./cnn_summarization",
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=3,
    save_total_limit=1,
    predict_with_generate=True,
    logging_dir="./logs"
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    tokenizer=tokenizer,
    data_collator=DataCollatorForSeq2Seq(tokenizer, model=model),
    compute_metrics=compute_metrics,
)

trainer.train()

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Seq2SeqTrainer(


Step,Training Loss


TrainOutput(global_step=75, training_loss=2.576441243489583, metrics={'train_runtime': 496.0126, 'train_samples_per_second': 0.302, 'train_steps_per_second': 0.151, 'total_flos': 27883575705600.0, 'train_loss': 2.576441243489583, 'epoch': 3.0})

In [10]:
# Step 9: Evaluate the fine-tuned model on the test set and compute ROUGE scores
tokenized_test = ds_test.map(preprocess, batched=True)

preds = []
labels = []

for sample in ds_test:
    input_ids = tokenizer(sample["article"], return_tensors="pt", truncation=True, max_length=512).input_ids
    label_text = sample["highlights"]
    labels.append(label_text)

    try:
        output = model.generate(input_ids=input_ids, max_new_tokens=128)
        pred_text = tokenizer.decode(output[0], skip_special_tokens=True)
    except OverflowError:
        pred_text = ""

    preds.append(pred_text)

# Compute ROUGE
rouge = evaluate.load("rouge")
results = rouge.compute(predictions=preds, references=labels)
print("Manual ROUGE scores:", results)

Map:   0%|          | 0/5 [00:00<?, ? examples/s]

Manual ROUGE scores: {'rouge1': np.float64(0.2995903933010851), 'rouge2': np.float64(0.12122904972186706), 'rougeL': np.float64(0.23265633174336764), 'rougeLsum': np.float64(0.2700073511394266)}


In [11]:
# Step 10: Compare summaries from the original and fine-tuned models
sample = ds_test[0]
input_ids = tokenizer(sample["article"], return_tensors="pt", truncation=True, max_length=512).input_ids

# Fine-tuned model
fine_tuned_output = model.generate(input_ids=input_ids, max_new_tokens=128)
fine_summary = tokenizer.decode(fine_tuned_output[0], skip_special_tokens=True)

# Original model
original_model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)
original_output = original_model.generate(input_ids=input_ids, max_new_tokens=128)
original_summary = tokenizer.decode(original_output[0], skip_special_tokens=True)

print("Original Model Summary:-\n", original_summary)
print("\nFine-Tuned Model Summary:-\n", fine_summary)

Original Model Summary:-
 Israel and the United States opposed the Palestinians' efforts to join the ICC, a move that would allow the Palestinians to join the ICC.

Fine-Tuned Model Summary:-
 Palestinian Authority officially becomes the 123rd member of the International Criminal Court. Palestinians signed the Rome Statute in January, when they also accepted its jurisdiction over alleged crimes in Palestine. Palestinian Foreign Minister says it is a move toward greater justice.
