In [None]:
!pip install datasets==3.1.0
!pip install accelerate==1.0.1
!pip install transformers==4.46.3
!pip install sentence-transformers==3.2.1
!pip install rouge-score==0.1.2
!pip install evaluate==0.4.3
!pip install numpy==1.24.0
!pip install scipy==1.9.3
!pip install torch==2.0.1

In [None]:
from itertools import product
import numpy as np
from datasets import load_dataset, Dataset
from transformers import (AutoTokenizer,
                          DataCollatorForSeq2Seq,
                          AutoModelForSeq2SeqLM,
                          Seq2SeqTrainingArguments,
                          Seq2SeqTrainer,
                          EarlyStoppingCallback
                          )
from evaluate import load

  from .autonotebook import tqdm as notebook_tqdm


## Hyperparameters

In [2]:
T5 = "google-t5/t5-small"
TOKENIZER = AutoTokenizer.from_pretrained(T5)
TEXT_N_TOKENS = 4000#5000#10000
SUMMARY_N_TOKENS = 1000

## Used functions

In [3]:
def extract_summaries(dataset):
    prefix = 'summarize: '
    documents = []
    summaries = []
    ids = []
    for i, sample in enumerate(zip(dataset['text'], dataset['summary'])):
        for text in sample[1]:
            documents.append(prefix + sample[0])
            summaries.append(text['text'])
            ids.append(i)
    return {'ids': ids, 'text': documents, 'summary': summaries}

In [4]:
def preprocess_function(examples, text_n_tokens, summary_n_tokens):
    model_inputs = TOKENIZER(examples['text'], max_length=text_n_tokens, padding='max_length')
    labels = TOKENIZER(examples['summary'], max_length=summary_n_tokens, padding='max_length')

    model_inputs['labels'] = labels['input_ids']
    model_inputs['ids'] = examples['ids']
    return model_inputs

In [5]:
def filter_text_by_k(text, kind, k):
    return len(text[kind]) <= k

In [6]:
def truncate_text(text, kind, k):
    text[kind] = text[kind][:k]
    return text

In [7]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    rouge = load("rouge")
    
    labels = np.where(labels != -100, labels, TOKENIZER.pad_token_id)
    predictions = np.where(predictions != -100, predictions, TOKENIZER.pad_token_id)

    decoded_preds = TOKENIZER.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = TOKENIZER.batch_decode(labels, skip_special_tokens=True)

    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

    prediction_lens = [np.count_nonzero(pred != TOKENIZER.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

## Prepare datasets

We are using the booksum dataset at a chapter level:

In [8]:
book_data = load_dataset("ubaada/booksum-complete-cleaned", "chapters")

train_ds = Dataset.from_dict(extract_summaries(book_data['train']))
test_ds = Dataset.from_dict(extract_summaries(book_data['test']))
val_ds = Dataset.from_dict(extract_summaries(book_data['validation']))

del book_data

We first tokenize it:

In [9]:
tokenized_train = train_ds.map(preprocess_function, batched=True, fn_kwargs={'text_n_tokens':TEXT_N_TOKENS, 'summary_n_tokens':SUMMARY_N_TOKENS})
tokenized_validation = val_ds.map(preprocess_function, batched=True, fn_kwargs={'text_n_tokens':TEXT_N_TOKENS, 'summary_n_tokens':SUMMARY_N_TOKENS})
tokenized_test = test_ds.map(preprocess_function, batched=True, fn_kwargs={'text_n_tokens':TEXT_N_TOKENS, 'summary_n_tokens':SUMMARY_N_TOKENS})

del train_ds, val_ds, test_ds

Map: 100%|██████████| 9534/9534 [00:27<00:00, 342.29 examples/s]
Map: 100%|██████████| 1485/1485 [00:03<00:00, 402.97 examples/s]
Map: 100%|██████████| 1432/1432 [00:04<00:00, 329.14 examples/s]


And drop the tests over 1500 tokens because we don't want to use incomplete targets:

In [10]:
usable_train = tokenized_train.filter(filter_text_by_k, fn_kwargs={'kind':'labels', 'k':SUMMARY_N_TOKENS})
usable_validation = tokenized_validation.filter(filter_text_by_k, fn_kwargs={'kind':'labels', 'k':SUMMARY_N_TOKENS})
usable_test = tokenized_test.filter(filter_text_by_k, fn_kwargs={'kind':'labels', 'k':SUMMARY_N_TOKENS})

Filter: 100%|██████████| 9534/9534 [00:27<00:00, 352.89 examples/s]
Filter: 100%|██████████| 1485/1485 [00:03<00:00, 398.06 examples/s]
Filter: 100%|██████████| 1432/1432 [00:04<00:00, 357.29 examples/s]


We can see we do not loose may points:

In [11]:
print('Ratio of kept train entries: ', len(usable_train) / len(tokenized_train))
print('Ratio of kept test entries: ', len(usable_test) / len(tokenized_test))
print('Ratio of kept validation entries: ', len(usable_validation) / len(tokenized_validation))

del tokenized_train, tokenized_test, tokenized_validation

Ratio of kept train entries:  0.8794839521711768
Ratio of kept test entries:  0.9050279329608939
Ratio of kept validation entries:  0.9218855218855219


Now we want to create two different datasets: One with the texts under the previously fixed number of tokens (under_k) and other with all the texts (complete).

In [12]:
train_complete = usable_train.map(truncate_text, fn_kwargs={'kind':'input_ids', 'k':TEXT_N_TOKENS})
test_complete = usable_validation.map(truncate_text, fn_kwargs={'kind':'input_ids', 'k':TEXT_N_TOKENS})
val_complete = usable_test.map(truncate_text, fn_kwargs={'kind':'input_ids', 'k':TEXT_N_TOKENS})

train_under_k = usable_train.filter(filter_text_by_k, fn_kwargs={'kind':'input_ids', 'k':TEXT_N_TOKENS})
test_under_k = usable_validation.filter(filter_text_by_k, fn_kwargs={'kind':'input_ids', 'k':TEXT_N_TOKENS})
val_under_k = usable_test.filter(filter_text_by_k, fn_kwargs={'kind':'input_ids', 'k':TEXT_N_TOKENS})

Map: 100%|██████████| 8385/8385 [00:12<00:00, 680.61 examples/s]
Map: 100%|██████████| 1369/1369 [00:01<00:00, 708.02 examples/s]
Map: 100%|██████████| 1296/1296 [00:01<00:00, 735.16 examples/s]
Filter: 100%|██████████| 8385/8385 [00:20<00:00, 402.70 examples/s]
Filter: 100%|██████████| 1369/1369 [00:03<00:00, 415.65 examples/s]
Filter: 100%|██████████| 1296/1296 [00:03<00:00, 426.05 examples/s]


Next, let us have a look to the ratios of the new datasets:

In [13]:
print(f'Ratio of train entries under {TEXT_N_TOKENS} tokens: ', len(train_under_k) / len(usable_train))
print(f'Ratio of test entries under {TEXT_N_TOKENS} tokens: ', len(test_under_k) / len(usable_test))
print(f'Ratio of validation entries under {TEXT_N_TOKENS} tokens: ', len(val_under_k) / len(usable_validation))

del usable_train, usable_test, usable_validation

Ratio of train entries under 4000 tokens:  0.4894454382826476
Ratio of test entries under 4000 tokens:  0.49382716049382713
Ratio of validation entries under 4000 tokens:  0.5178962746530315


In the cropped approach, we only keep about the 60% of each dataset. In the complete option we still have the same ratio, but with truncated texts.

In [14]:
datasets_under_k = {'train':train_under_k, 'test':test_under_k, 'val':val_under_k}
datasets_complete = {'train':train_complete, 'test':test_complete, 'val':val_complete}

## Model

Now, we define the pretrained models and arguments that are going to be used for training:

In [30]:
data_collator = DataCollatorForSeq2Seq(tokenizer=TOKENIZER, model=T5)
model = AutoModelForSeq2SeqLM.from_pretrained(T5)

Let us set the args:

In [31]:
learning_rate = 2e-05
weight_decay = 0.01
acc_steps = 4

training_args = Seq2SeqTrainingArguments(
        output_dir="t5_model",
        eval_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end = True,
        learning_rate=learning_rate,
        per_device_train_batch_size=1,
        per_device_eval_batch_size=1,
        weight_decay=weight_decay,
        save_total_limit=2,
        num_train_epochs=14,
        predict_with_generate=True,
        fp16=True,
        gradient_accumulation_steps=acc_steps,
        generation_max_length=128,
    )

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Choose the approach that is going to be used:

In [32]:
datasets = datasets_complete # in {datasets_under_k, datasets_complete}
patience = 2

And create the trainer:

In [33]:
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=datasets["train"],
    eval_dataset=datasets["val"],
    tokenizer=TOKENIZER,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=patience)]
)

  trainer = Seq2SeqTrainer(


Let us first evaluate in order to check the performance of the model before fine tunning it:

In [None]:
trainer.evaluate()

As we can see, it performs very poorly. We are going to train it now:

In [34]:
trainer.train()

Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
0,1.4801,1.246521,0.1872,0.0263,0.1231,0.1231,118.9074
1,1.4307,1.22791,0.1983,0.0277,0.1266,0.1266,121.3904
2,1.4453,1.215685,0.2041,0.0295,0.1302,0.1302,123.1335
4,1.4195,1.203647,0.2187,0.0315,0.1355,0.1355,126.1474
5,1.4276,1.199493,0.2166,0.0314,0.1356,0.1357,126.1844
6,1.4044,1.193996,0.2144,0.0309,0.1361,0.1362,126.2014
8,1.4148,1.190361,0.2142,0.0303,0.1357,0.1358,126.2971
9,1.3986,1.188405,0.2133,0.0306,0.1357,0.1357,126.2654
10,1.3968,1.187252,0.217,0.0315,0.1364,0.1364,126.2762
12,1.3905,1.186253,0.2164,0.0315,0.1362,0.1363,126.6497


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


TrainOutput(global_step=29344, training_loss=1.4213041286822103, metrics={'train_runtime': 48075.5999, 'train_samples_per_second': 2.442, 'train_steps_per_second': 0.61, 'total_flos': 1.24108431949824e+17, 'train_loss': 1.4213041286822103, 'epoch': 13.998330351818725})

In [35]:
trainer.evaluate()

{'eval_loss': 1.1859015226364136,
 'eval_rouge1': 0.2156,
 'eval_rouge2': 0.0312,
 'eval_rougeL': 0.1359,
 'eval_rougeLsum': 0.1359,
 'eval_gen_len': 126.6265,
 'eval_runtime': 1009.8394,
 'eval_samples_per_second': 1.283,
 'eval_steps_per_second': 1.283,
 'epoch': 13.998330351818725}

The training did not really help with the scores, so we are going to perform a grid search in order to find the optimal parameters for our goal.

In [36]:
trainer.save_model('t5_over_4k')

### Grid search

In [17]:
param_grid = {
    "learning_rate": [1e-5, 2e-5, 5e-5],
    "weight_decay": [0.01, 0.05, 0.1],
    "gradient_accumulation_steps": [4, 8, 12]
}

# Generate all combinations of hyperparameters
param_combinations = list(product(*param_grid.values()))

param_names = list(param_grid.keys())

In [None]:
best_rouge = 0
best_params = None

for params in param_combinations:
    model = AutoModelForSeq2SeqLM.from_pretrained(T5)
    hyperparams = dict(zip(param_names, params))
    print(f"Testing with parameters: {hyperparams}")
    
    training_args = Seq2SeqTrainingArguments(
        output_dir="t5_model",
        eval_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end = True,
        learning_rate=hyperparams["learning_rate"],
        per_device_train_batch_size=1,
        per_device_eval_batch_size=1,
        weight_decay=hyperparams["weight_decay"],
        save_total_limit=2,
        num_train_epochs=14,
        predict_with_generate=True,
        fp16=True,
        gradient_accumulation_steps=hyperparams["gradient_accumulation_steps"],
        generation_max_length=128,
    )

    trainer = Seq2SeqTrainer(
        model=model,
        args=training_args,
        train_dataset=datasets["train"],
        eval_dataset=datasets["val"],
        tokenizer=TOKENIZER,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=patience)]
    )
    
    trainer.train()
    metrics = trainer.evaluate()
    
    # Save the best performing parameters using rouge1 as main metric
    if metrics["eval_rouge1"] > best_rouge:
        best_rouge = metrics["eval_rouge1"]
        best_params = hyperparams
        print(f"New best parameters found: {best_params} with ROUGE-1: {best_rouge}")
        trainer.save_model('t5_best_model')

print(f"Best parameters: {best_params}")


After doing the param grid, the best parameters found are:

* 'learning_rate': 2e-05
* 'weight_decay': 0.01
* 'gradient_accumulation_steps': 4

## Evaluate

Now, let us have a look to the performance of the different models we have trained:

In [37]:
datasets = datasets_complete # in {datasets_under_k, datasets_complete}
model_name = 't5_over_4k' # in {T5, 't5_overfitted', 't5_best_model', 't5_over_4k'}

In [38]:
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
    
training_args = Seq2SeqTrainingArguments(
    output_dir=model_name,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end = True,
    learning_rate=2e-05,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    weight_decay=0.01,
    save_total_limit=2,
    num_train_epochs=14,
    predict_with_generate=True,
    fp16=True,
    gradient_accumulation_steps=4,
    generation_max_length=128,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=datasets["train"],
    eval_dataset=datasets["test"],
    tokenizer=TOKENIZER,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=patience)]
)
    
trainer.evaluate()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
  trainer = Seq2SeqTrainer(


{'eval_loss': 1.214807391166687,
 'eval_model_preparation_time': 0.0015,
 'eval_rouge1': 0.2082,
 'eval_rouge2': 0.0304,
 'eval_rougeL': 0.133,
 'eval_rougeLsum': 0.1329,
 'eval_gen_len': 125.5405,
 'eval_runtime': 1062.4515,
 'eval_samples_per_second': 1.289,
 'eval_steps_per_second': 1.289}

In [50]:
import torch

In [52]:
inputs = torch.tensor(datasets['test']['labels'][0], dtype=torch.int)

In [63]:
inputs = TOKENIZER(datasets['test']['text'][10], return_tensors="pt", truncation=True).input_ids

In [64]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
model.to(device)
inputs = inputs.to(device)

In [65]:
outputs = model.generate(inputs, max_new_tokens=100, do_sample=False)

In [68]:
TOKENIZER.decode(outputs[0], skip_special_tokens=True)

"Esther's story is about the happiest of the happy years she has been the mistress of Bleak House. The couple gave her darling into her arms, and through many weeks I never left her. The little child who was to have done so much was born before the turf was planted on its father's grave. The help that my dear counted on did come to her, though it came in the eternal wisdom, for another purpose. The help that my"