In [8]:
!pip -q install transformers datasets evaluate rouge_score

In [9]:
from datasets import load_dataset, Dataset, DatasetDict
from transformers import AutoTokenizer, EarlyStoppingCallback

t5_base = "google-t5/t5-base"
data = load_dataset("abisee/cnn_dailymail", "3.0.0")
tokenizer = AutoTokenizer.from_pretrained(t5_base)
data

DatasetDict({
    train: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 287113
    })
    validation: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 13368
    })
    test: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 11490
    })
})

In [10]:
import pandas as pd
train_percent, val_percent, test_percent = 0.8, 0.1, 0.1


def load_data_sampled(max_samples=10000):
  train_df = pd.DataFrame(data['train'])
  test_df = pd.DataFrame(data['test'])
  val_df = pd.DataFrame(data['validation'])
  train_df = train_df.sample(int(max_samples * train_percent)).reset_index(drop=True)
  test_df = test_df.sample(int(max_samples * test_percent)).reset_index(drop=True)
  val_df = val_df.sample(int(max_samples * val_percent)).reset_index(drop=True)
  return train_df, test_df, val_df

train_df, test_df, val_df = load_data_sampled(max_samples=10000)

train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)
val_dataset = Dataset.from_pandas(val_df)

sampled_data = DatasetDict({
    'train': train_dataset,
    'test': test_dataset,
    'validation': val_dataset
})

print("New DatasetDict rows and features below:\n")
sampled_data

New DatasetDict rows and features below:



DatasetDict({
    train: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 8000
    })
    test: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 1000
    })
    validation: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 1000
    })
})

In [11]:
prefix = "summarize: "
def preprocess_function(examples):
    inputs = [prefix + str(article) for article in examples["article"]]
    model_inputs = tokenizer(inputs, max_length=1024, truncation=True, padding=True)

    labels = tokenizer(examples["highlights"], max_length=150, truncation=True, padding=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_data = sampled_data.map(preprocess_function, batched=True, remove_columns=sampled_data['train'].column_names)
tokenized_data

Map:   0%|          | 0/8000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 8000
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 1000
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 1000
    })
})

In [12]:
from transformers import DataCollatorForSeq2Seq
import evaluate

data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=t5_base, label_pad_token_id=-100)
rouge_metric = evaluate.load("rouge")

In [13]:
import numpy as np

def compute_metrics(eval_preds):
  predictions, labels = eval_preds
  decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
  labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
  decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

  result = rouge_metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
  prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
  result["gen_len"] = np.mean(prediction_lens)
  return {k: round(v, 4) for k, v in result.items()}

In [None]:
import torch
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer

model = AutoModelForSeq2SeqLM.from_pretrained(
    t5_base,
    device_map="auto",
    torch_dtype=torch.bfloat16,
)
model.to("cuda")

model.gradient_checkpointing_enable(
    gradient_checkpointing_kwargs={"use_reentrant": False}
)
model.zero_grad()




In [None]:
training_arguments = Seq2SeqTrainingArguments(
    output_dir="fine-tuned-t5-cnn-dailymail-10000",
    num_train_epochs=6,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    warmup_steps=200,
    logging_steps=100,
    weight_decay=0.03,
    learning_rate = 8e-5,
    logging_dir='./logs',
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="rouge2",
    save_total_limit=3,
    predict_with_generate=True,
    generation_max_length=150,
    generation_num_beams=5,
    bf16=True,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_arguments,
    train_dataset=tokenized_data["train"],
    eval_dataset=tokenized_data["validation"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
)

trainer.train()

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,0.7708,0.830198,0.2347,0.1077,0.1633,0.1636,53.63
2,0.7236,0.823444,0.2551,0.1184,0.1771,0.1775,57.932
3,0.727,0.822413,0.2617,0.1208,0.1819,0.1824,60.128
4,0.7301,0.821479,0.2614,0.1208,0.1821,0.1823,59.024
5,0.7308,0.821524,0.2638,0.1228,0.1835,0.1838,59.633
6,0.7304,0.821548,0.2639,0.1221,0.1835,0.1837,59.929


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.
There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


TrainOutput(global_step=3000, training_loss=1.0296797841389973, metrics={'train_runtime': 32091.2395, 'train_samples_per_second': 1.496, 'train_steps_per_second': 0.093, 'total_flos': 5.845995749376e+16, 'train_loss': 1.0296797841389973, 'epoch': 6.0})

In [None]:
output_dir = "fine-tuned-t5-cnn-dailymail-model"
trainer.save_model(output_dir)
tokenizer.save_pretrained(output_dir)

('fine-tuned-t5-cnn-dailymail-model\\tokenizer_config.json',
 'fine-tuned-t5-cnn-dailymail-model\\special_tokens_map.json',
 'fine-tuned-t5-cnn-dailymail-model\\tokenizer.json')

In [14]:
# load my now trained model
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

model_path = "ramyj/fine-tuned-t5-cnn-dailymail"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSeq2SeqLM.from_pretrained(model_path)

In [15]:
import textwrap
sample_articles = sampled_data["test"]["article"][:5]
sample_highlights = sampled_data["test"]["highlights"][:5]
prefix = "summarize: "


for i, (article, reference) in enumerate(zip(sample_articles, sample_highlights)):
    print(f"Article {i+1}")
    print("=" * 80)

    # Prepare input for the model (single article)
    input_text = prefix + article
    inputs = tokenizer(input_text, return_tensors="pt", max_length=1024, truncation=True)
    inputs = {k: v.to(model.device) for k, v in inputs.items()}

    # Generate summary
    output = model.generate(
        **inputs,
        max_length=150,
        min_length=30,
        num_beams=5,
        length_penalty=1.0,
        early_stopping=True
    )

    # Decode the generated output
    decoded_output = tokenizer.batch_decode(output, skip_special_tokens=True)[0]

    print("\nGENERATED SUMMARY:")
    # Format the generated summary with line breaks every ~80 characters
    formatted_summary = textwrap.fill(decoded_output, width=80).replace(" .", ".")
    print(formatted_summary)

    print("\nREFERENCE SUMMARY:")
    # Format the reference summary with line breaks every ~80 characters
    formatted_reference = textwrap.fill(reference, width=80).replace(" .", ".") + "."
    print(formatted_reference)

    print("\n" + "-" * 80 + "\n")

Article 1

GENERATED SUMMARY:
A 1921 application form to join the racist Ku Klux Klan organisation has emerged
. The form asks a further 19 questions ranging from the mundane, such as ‘what
is your age?’, to the overtly sinister – ‘Do you believe in White Supremacy?’
The form was purchased at an ephemera show by Delaware rare book dealer Ian
Brabner.

REFERENCE SUMMARY:
Delaware rare book dealer Ian Brabner bought the form at ephemera show. It's
dated August 21, 1921, and lists 20 questions for potential KKK recruits.
Questions include 'are you a Jew?' and 'do you want white supremacy?'.

--------------------------------------------------------------------------------

Article 2

GENERATED SUMMARY:
Paul Scholes does not believe there is a British manager who can turn fortunes
around. Ryan Giggs could lead Manchester United into the latter stages of the
competition. Scholes believes outgoing Borussia Dortmund manager Jurgen Klopp
would be a welcome addition to the Premier League.

REFER