# Text-to-Text Generation
[Model] T5-Efficient-BASE-DL2 (Deep-Narrow version)

[Paper]
[Scale Efficiently: Insights from Pre-training and Fine-tuning Transformers](https://arxiv.org/abs/2109.10686)

In [None]:
!pip install -q transformers datasets rouge_score evaluate accelerate scikit-learn

## Importing necessary libraries and modules

In [None]:
import os

# Restrict this process to physical GPU index 4. The variable is exported
# before torch is imported so the device mask is guaranteed to be in place
# before anything can initialise CUDA.
os.environ["CUDA_VISIBLE_DEVICES"] = "4"

import torch
import torch.nn as nn

# Sanity check: number of CUDA devices visible to this process after masking.
print(torch.cuda.device_count())

1


In [None]:
import time

# Run identifier: the current Unix time truncated to whole seconds.
# It is used further down as the prefix of the training output directory.
_now_seconds = time.time()
check_point = int(_now_seconds)
check_point

1701489459

In [None]:
from transformers import (
    AutoTokenizer, AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq, Seq2SeqTrainer, Seq2SeqTrainingArguments, GenerationConfig
)
from datasets import load_dataset, DatasetDict, Dataset, load_from_disk, concatenate_datasets
import numpy as np
from evaluate import load
from tqdm import tqdm
import pandas as pd
import numpy as np
import re
import json
import zipfile
import nltk

# Download the sentence-tokenizer data that nltk.sent_tokenize relies on.
# Older NLTK releases read the 'punkt' package, newer ones look for 'punkt_tab';
# requesting both keeps the notebook working regardless of the installed version.
# An unknown package name is reported by nltk.download and does not raise.
for _nltk_resource in ("punkt", "punkt_tab"):
    nltk.download(_nltk_resource)

[2023-12-02 11:57:41,094] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect)


[nltk_data] Downloading package punkt to /home/lbrico/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
# Hyperparameters: every tunable value for this run lives in this one mapping.
config = dict(
    # Data / model selection
    try_small_dataset=False,  # True for test
    model_checkpoint="google/t5-efficient-base-dl2",
    # Tokenisation limits (in tokens)
    max_input_length=512,
    max_target_length=100,
    # Optimisation schedule
    batch_size=4,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    gradient_accumulation_steps=1,
    learning_rate=2e-5,
    weight_decay=0.05,
    epochs=10,
    lr_scheduler_type="linear",
    # Model selection and generation
    metric="exact_match",
    generation_max_length=50,
)

In [None]:
# Loading dataset
file_path = '/mnt/nas/HYZ/AICUP/'
dataset = load_from_disk(f"{file_path}dataset_dict_v2")

if config["try_small_dataset"] is True:
    # Row caps per split for a quick smoke run.
    small_split_sizes = {"train": 500, "validation": 10, "test": 10}

    # Only subset the splits that are actually stored on disk (the saved
    # DatasetDict may have no "test" split), and never ask for more rows than a
    # split contains -- either would otherwise raise before training starts.
    dataset = DatasetDict({
        split_name: dataset[split_name].select(
            range(min(row_cap, len(dataset[split_name])))
        )
        for split_name, row_cap in small_split_sizes.items()
        if split_name in dataset
    })

dataset

DatasetDict({
    train: Dataset({
        features: ['prompt', 'completion'],
        num_rows: 92933
    })
    validation: Dataset({
        features: ['prompt', 'completion'],
        num_rows: 10326
    })
})

In [None]:
# Load the tokenizer first, then the seq2seq model, both from the checkpoint
# named in config["model_checkpoint"].
tokenizer, model = (
    AutoTokenizer.from_pretrained(config["model_checkpoint"]),
    AutoModelForSeq2SeqLM.from_pretrained(config["model_checkpoint"]),
)

In [None]:
def preprocess_function(examples):
    """
    Tokenize a batch of examples into model inputs (and labels when available).

    Parameters:
    examples (dict): A batch with a 'prompt' entry and, optionally, a 'completion'
                     entry. Each entry is passed to the tokenizer as-is.

    Returns:
    The tokenizer output for 'prompt', padded/truncated to config["max_input_length"].
    When 'completion' is present, a 'labels' entry is added: the token IDs of the
    completion, padded/truncated to config["max_target_length"], with every padding
    token ID replaced by -100 so those positions are ignored by the loss.
    """
    pad_id = tokenizer.pad_token_id

    def _encode(texts, limit):
        # NOTE(review): padding every example to the full limit here means the
        # DataCollatorForSeq2Seq used at training time has nothing left to pad.
        return tokenizer(texts, padding="max_length", max_length=limit, truncation=True)

    model_inputs = _encode(examples["prompt"], config["max_input_length"])

    # Inference-style batches carry no targets: return the inputs alone.
    if "completion" not in examples:
        return model_inputs

    target_ids = _encode(examples["completion"], config["max_target_length"])["input_ids"]

    # Mask padding positions in the targets with -100.
    model_inputs["labels"] = [
        [-100 if token == pad_id else token for token in sequence]
        for sequence in target_ids
    ]

    return model_inputs

# Run the preprocessing over the dataset in batched mode
tokenized_dataset = dataset.map(preprocess_function, batched=True)

# Show which columns the training split ends up with
train_feature_names = list(tokenized_dataset['train'].features)
print(f"Keys of tokenized dataset: {train_feature_names}")

Map:   0%|          | 0/10326 [00:00<?, ? examples/s]

Keys of tokenized dataset: ['prompt', 'completion', 'input_ids', 'attention_mask', 'labels']


In [None]:
# Loading the Rouge metric for evaluation
metric = load("rouge")

def compute_metrics(eval_pred):
    """
    Compute Rouge scores, exact-match rate and average generation length.

    Parameters:
    eval_pred (tuple): A pair (predictions, labels) of token-ID arrays. If
                       predictions is itself a tuple, its first element is used.
                       Positions equal to -100 are treated as fill values on both sides.

    Returns:
    dict: 'rouge_<name>' scores and 'exact_match' (all scaled to 0-100) plus
          'gen_len' (mean number of generated tokens), each rounded to 4 decimals.

    Predictions are stripped of IDs outside [0, tokenizer.vocab_size) before decoding;
    labels have -100 mapped back to the pad token. Both sides are sentence-split and
    newline-joined before scoring, and exact match compares those formatted strings.
    """

    predictions, labels = eval_pred

    # Some models hand back (token_ids, extras...); only the token IDs are scored.
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    pad_id = tokenizer.pad_token_id
    vocab_size = tokenizer.vocab_size

    # Drop IDs the tokenizer cannot decode (this also removes the -100 fill value)
    filtered_predictions = [
        [token_id for token_id in pred if 0 <= token_id < vocab_size]
        for pred in predictions
    ]

    # Decode predictions and labels for comparison
    decoded_preds = tokenizer.batch_decode(filtered_predictions, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, pad_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Format predictions and labels for Rouge metric (one sentence per line)
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]

    # Compute the Rouge metric
    rouge_result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True, use_aggregator=True)
    rouge_result = {f'rouge_{key}': value * 100 for key, value in rouge_result.items()}

    # Exact match rate; an empty evaluation set scores 0 instead of dividing by zero
    total = len(decoded_labels)
    exact_match_count = sum(pred.strip() == label.strip() for pred, label in zip(decoded_preds, decoded_labels))
    exact_match_rate = exact_match_count / total * 100 if total else 0.0

    # Average generated length: neither the pad token nor the -100 fill value counts
    # as a generated token, matching what the decoding step above ignores.
    prediction_lens = []
    for pred in predictions:
        pred_ids = np.asarray(pred)
        prediction_lens.append(int(np.count_nonzero((pred_ids != pad_id) & (pred_ids != -100))))
    gen_len = np.mean(prediction_lens) if prediction_lens else 0.0

    # Merge results
    result = {**rouge_result, "exact_match": exact_match_rate, "gen_len": gen_len}

    return {k: round(v, 4) for k, v in result.items()}


In [None]:
# Training arguments for fine-tuning.
# The output directory is tagged with the run timestamp and the checkpoint's
# short name (the part of the checkpoint id after the last "/").
model_name = config["model_checkpoint"].rsplit("/", 1)[-1]

training_kwargs = {
    "output_dir": f"{check_point}-{model_name}-finetuned-extracted-PHI",
    # Evaluate and checkpoint on the schedule chosen in config
    "evaluation_strategy": config["evaluation_strategy"],
    "save_strategy": config["save_strategy"],
    # Evaluation uses twice the training batch size
    "per_device_train_batch_size": config["batch_size"],
    "per_device_eval_batch_size": config["batch_size"] * 2,
    "gradient_accumulation_steps": config["gradient_accumulation_steps"],
    "learning_rate": config["learning_rate"],
    "weight_decay": config["weight_decay"],
    "num_train_epochs": config["epochs"],
    "lr_scheduler_type": config["lr_scheduler_type"],
    # Best checkpoint is selected by config["metric"], where higher is better
    "load_best_model_at_end": True,
    "metric_for_best_model": config["metric"],
    "greater_is_better": True,
    "save_total_limit": 3,
    "fp16": True,
    # Evaluate with generated sequences, capped at the configured length
    "predict_with_generate": True,
    "generation_max_length": config["generation_max_length"],
}

args = Seq2SeqTrainingArguments(**training_kwargs)

# Batch collator for seq2seq examples; pad_to_multiple_of=8 is passed through to it
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, pad_to_multiple_of=8)

# Gather everything the trainer needs: model, arguments, tokenized splits,
# collator, tokenizer and the metric callback
trainer_kwargs = dict(
    model=model,
    args=args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)
trainer = Seq2SeqTrainer(**trainer_kwargs)

# Run fine-tuning; the call's return value is this cell's displayed output
trainer.train()

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Rouge Rouge1,Rouge Rouge2,Rouge Rougel,Rouge Rougelsum,Exact Match,Gen Len
1,0.1192,0.08923,97.7527,96.7278,97.6806,97.6857,91.5359,10.6944
2,0.0806,0.067349,98.34,97.4359,98.2682,98.2761,93.1919,10.7065
3,0.0705,0.055258,98.5121,97.6177,98.4289,98.4367,93.589,10.7057
4,0.0507,0.051592,98.583,97.7573,98.5118,98.5174,93.9473,10.7621
5,0.054,0.046349,98.7291,97.8993,98.6454,98.6504,94.2475,10.7225
6,0.0427,0.044927,98.7699,97.9894,98.6866,98.6921,94.48,10.7187
7,0.0449,0.043117,98.7726,97.9906,98.6944,98.6966,94.4703,10.7249
8,0.041,0.041733,98.794,98.01,98.7077,98.7126,94.6155,10.7582
9,0.0429,0.041787,98.8099,98.0135,98.7177,98.7278,94.5381,10.7593
10,0.0408,0.041058,98.8145,98.0258,98.727,98.7359,94.5671,10.7532


TrainOutput(global_step=232340, training_loss=0.07572851886619561, metrics={'train_runtime': 28373.2342, 'train_samples_per_second': 32.754, 'train_steps_per_second': 8.189, 'total_flos': 2.96434727387136e+17, 'train_loss': 0.07572851886619561, 'epoch': 10.0})

In [None]:
# Persist the trainer's current model. No path is passed, so the destination is
# the trainer's default -- presumably args.output_dir; confirm.
# NOTE(review): the training arguments set load_best_model_at_end=True, so this is
# presumably the best checkpoint by exact match rather than the last epoch -- verify.
trainer.save_model()