In [1]:
pip install transformers datasets evaluate sentencepiece torch accelerate

Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Downloading evaluate-0.4.6-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.6


In [2]:
import torch
from datasets import load_dataset
from transformers import (
    T5Tokenizer,
    T5ForConditionalGeneration,
    DataCollatorForSeq2Seq,
    Trainer,
    TrainingArguments
)

In [3]:
model_name = "t5-small"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [4]:
#dataset = load_dataset("cnn_dailymail", "3.0.0", split="train[:1%]")
dataset = load_dataset("cnn_dailymail", "3.0.0", split="train[:1000]")
dataset = dataset.train_test_split(test_size=0.05)

train_data = dataset["train"]
val_data = dataset["test"]

README.md: 0.00B [00:00, ?B/s]

3.0.0/train-00000-of-00003.parquet:   0%|          | 0.00/257M [00:00<?, ?B/s]

3.0.0/train-00001-of-00003.parquet:   0%|          | 0.00/257M [00:00<?, ?B/s]

3.0.0/train-00002-of-00003.parquet:   0%|          | 0.00/259M [00:00<?, ?B/s]

3.0.0/validation-00000-of-00001.parquet:   0%|          | 0.00/34.7M [00:00<?, ?B/s]

3.0.0/test-00000-of-00001.parquet:   0%|          | 0.00/30.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/287113 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/13368 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/11490 [00:00<?, ? examples/s]

In [5]:
train_data[0]

{'article': 'LOS ANGELES, California (CNN)  -- Misha Di Bono zips around town in her Infiniti sport-utility vehicle, breezy and unconcerned about the price of gas. Misha Di Bono says people used to make fun of her rolling billboard. She gets $500 a month and free gas. That\'s because she gets $500 a month -- plus free gas -- for turning her car into a rolling billboard for Jobing.com, the online recruiting company she works for. "People used to tease me about the \'Jobing\' mobile, and now they\'re like, \'Oh, we\'ll get Misha to drive,\' " she said, standing next to her decal-covered car. Jobing.com might be the most extreme example of how companies are helping employees during the current gas crunch. But with gas averaging more than $4 a gallon, more and more companies are trying to figure out incentives to help ease the pain at the pump for their employees.  Watch a rolling billboard for your company » . "There\'s no question companies are feeling the pinch," said John Challenger, t

In [6]:
max_input_length = 384 #512
max_target_length = 96 #128

def preprocess(batch):
    inputs = ["summarize: " + doc for doc in batch["article"]]
    model_inputs = tokenizer(
        inputs,
        max_length=max_input_length,
        truncation=True,
        #padding="max_length"
    )

    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            batch["highlights"],
            max_length=max_target_length,
            truncation=True,
            #padding="max_length"
        )

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs


In [7]:
tokenized_train = train_data.map(
    preprocess,
    batched=True,
    remove_columns=train_data.column_names
)

tokenized_val = val_data.map(
    preprocess,
    batched=True,
    remove_columns=val_data.column_names
)


Map:   0%|          | 0/950 [00:00<?, ? examples/s]



Map:   0%|          | 0/50 [00:00<?, ? examples/s]

In [8]:
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model
)

In [9]:
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

In [10]:
training_args = Seq2SeqTrainingArguments(
    output_dir="./t5_summarizer",
    eval_strategy="steps",
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=4,
    fp16=True,
    num_train_epochs=1,
    logging_steps=50,
    eval_steps=200,
    save_steps=200,
    report_to="none",

    predict_with_generate=True,
    generation_max_length=80,
    generation_num_beams=2,
    eval_accumulation_steps=4
)


In [11]:
training_args = Seq2SeqTrainingArguments(
    output_dir="./t5_summarizer",
    eval_strategy="steps",
    learning_rate=2e-5,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=4,
    weight_decay=0.01,
    save_total_limit=2,
    num_train_epochs=2,
    logging_steps=50,
    save_steps=500,
    eval_steps=500,
    fp16=torch.cuda.is_available(),  # mixed precision if GPU
    push_to_hub=False,
    report_to="none",

    predict_with_generate=True,      # IMPORTANT
    generation_max_length=80,        # reduce memory
    generation_num_beams=2,          # reduce compute
    eval_accumulation_steps=4        # flush tensors frequently
)


In [12]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

  trainer = Trainer(


In [13]:
trainer.train()

Step,Training Loss,Validation Loss


TrainOutput(global_step=476, training_loss=2.3605755397251675, metrics={'train_runtime': 152.993, 'train_samples_per_second': 12.419, 'train_steps_per_second': 3.111, 'total_flos': 188482490990592.0, 'train_loss': 2.3605755397251675, 'epoch': 2.0})

In [14]:
model.save_pretrained("t5_finetuned_summarizer")
tokenizer.save_pretrained("t5_finetuned_summarizer")

('t5_finetuned_summarizer/tokenizer_config.json',
 't5_finetuned_summarizer/special_tokens_map.json',
 't5_finetuned_summarizer/spiece.model',
 't5_finetuned_summarizer/added_tokens.json')

In [15]:
from transformers import pipeline

summarizer = pipeline(
    "summarization",
    model="t5_finetuned_summarizer",
    tokenizer="t5_finetuned_summarizer"
)

text = """
The Transformer architecture has transformed NLP by replacing recurrence with
self-attention, enabling better parallelization and performance on long sequences.
"""

print(summarizer(text, max_length=50, min_length=20, do_sample=False))

You set `add_prefix_space`. The tokenizer needs to be converted from the slow tokenizers
Device set to use cuda:0
Your max_length is set to 50, but your input_length is only 34. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=17)
Both `max_new_tokens` (=256) and `max_length`(=50) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


[{'summary_text': 'The Transformer architecture has transformed NLP by replacing recurrence with self-attention . The architecture enables better parallelization and performance .'}]


In [16]:
!pip install evaluate rouge_score

Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=af39c5777ad73a9d648963ae8e3a9c5d9e585a5f34d5e3f5875ba0128a544386
  Stored in directory: /root/.cache/pip/wheels/85/9d/af/01feefbe7d55ef5468796f0c68225b6788e85d9d0a281e7a70
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2


In [17]:
import evaluate
rouge = evaluate.load("rouge")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)

    labels = [[(l if l != -100 else tokenizer.pad_token_id) for l in label] for label in labels]
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    result = rouge.compute(predictions=decoded_preds, references=decoded_labels)
    return {k: round(v, 4) for k, v in result.items()}


Downloading builder script: 0.00B [00:00, ?B/s]

In [30]:
import numpy as np
import evaluate

rouge = evaluate.load("rouge")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred

    # Sometimes predictions come as a tuple
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    # Convert to numpy
    predictions = np.array(predictions)
    labels = np.array(labels)

    # Replace -100 in labels with pad token id
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    # Clip values to valid token range (safety)
    vocab_size = tokenizer.vocab_size
    predictions = np.clip(predictions, 0, vocab_size - 1)
    labels = np.clip(labels, 0, vocab_size - 1)

    # Decode
    decoded_preds = tokenizer.batch_decode(
        predictions, skip_special_tokens=True
    )
    decoded_labels = tokenizer.batch_decode(
        labels, skip_special_tokens=True
    )

    # Compute ROUGE
    result = rouge.compute(
        predictions=decoded_preds,
        references=decoded_labels
    )

    return {
        "rouge1": round(result["rouge1"], 4),
        "rouge2": round(result["rouge2"], 4),
        "rougeL": round(result["rougeL"], 4),
    }


In [31]:
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

  trainer = Seq2SeqTrainer(


In [32]:
trainer.train()

Step,Training Loss,Validation Loss


TrainOutput(global_step=476, training_loss=2.088161845167144, metrics={'train_runtime': 161.5077, 'train_samples_per_second': 11.764, 'train_steps_per_second': 2.947, 'total_flos': 188482490990592.0, 'train_loss': 2.088161845167144, 'epoch': 2.0})

In [33]:
model.save_pretrained("t5_finetuned_summarizer_with_rouge")
tokenizer.save_pretrained("t5_finetuned_summarizer_with_rouge")

('t5_finetuned_summarizer_with_rouge/tokenizer_config.json',
 't5_finetuned_summarizer_with_rouge/special_tokens_map.json',
 't5_finetuned_summarizer_with_rouge/spiece.model',
 't5_finetuned_summarizer_with_rouge/added_tokens.json')

In [34]:
import torch, gc
torch.cuda.empty_cache()
gc.collect()

701

In [35]:
results = trainer.evaluate()
print(results)

{'eval_loss': 1.9946001768112183, 'eval_rouge1': 0.3315, 'eval_rouge2': 0.1407, 'eval_rougeL': 0.2484, 'eval_runtime': 45.1604, 'eval_samples_per_second': 1.107, 'eval_steps_per_second': 1.107, 'epoch': 2.0}


In [36]:
print("\nROUGE Scores:")
print(f"ROUGE-1 : {results['eval_rouge1']:.4f}")
print(f"ROUGE-2 : {results['eval_rouge2']:.4f}")
print(f"ROUGE-L : {results['eval_rougeL']:.4f}")



ROUGE Scores:
ROUGE-1 : 0.3315
ROUGE-2 : 0.1407
ROUGE-L : 0.2484
