# Training FLAN-T5 Notebook
### Purpose is to train off of deciphering task data for the caesar cipher

In [2]:
# Imports
from datasets import load_from_disk
import torch
from evaluate import load
import numpy as np
from transformers import T5Tokenizer, DataCollatorForSeq2Seq
from transformers import T5ForConditionalGeneration, Seq2SeqTrainingArguments, Seq2SeqTrainer

In [3]:
# Check CUDA working
torch.cuda.is_available()

True

In [4]:
# Load the tokenizer, model, and data collator
MODEL_NAME = "google/flan-t5-base"

tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME, model_max_length=128)
model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME)
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [5]:
# Load saved dataset
dataset = load_from_disk('/home/as6734/langgen_class_project/data/caesar')

  table = cls._concat_blocks(blocks, axis=0)


In [6]:
dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 3803957
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 189651
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 1951
    })
})

In [7]:
# Reduce gigaword to 1% of size to expedite training
small_dataset = dataset.filter(lambda example, idx: idx < 190200, with_indices=True)

In [8]:
small_dataset['validation'] = small_dataset['validation'].filter(lambda example, idx: idx < 10000, with_indices=True)

In [9]:
small_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 190200
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 10000
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 1951
    })
})

In [10]:
# Define evaluation metric wrapper function using character error rate (CER)
cer = load("cer")
def compute_metrics(eval_preds):
    preds, labels = eval_preds
    # decode preds and labels
    preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    return {'cer': cer.compute(predictions=decoded_preds, references=decoded_labels)}

In [11]:
# Global Parameters
L_RATE = 3e-4
BATCH_SIZE = 8
PER_DEVICE_EVAL_BATCH = 8
WEIGHT_DECAY = 0.01
SAVE_TOTAL_LIM = 3
NUM_EPOCHS = 3

# Set up training arguments
training_args = Seq2SeqTrainingArguments(
   output_dir='/home/as6734/langgen_class_project/results/caesar_long',
   evaluation_strategy="epoch",
   learning_rate=L_RATE,
   per_device_train_batch_size=BATCH_SIZE,
   per_device_eval_batch_size=PER_DEVICE_EVAL_BATCH,
   weight_decay=WEIGHT_DECAY,
   save_total_limit=SAVE_TOTAL_LIM,
   num_train_epochs=NUM_EPOCHS,
   predict_with_generate=True,
   push_to_hub=False
)

In [12]:
trainer = Seq2SeqTrainer(
   model=model,
   args=training_args,
   train_dataset=small_dataset["train"],
   eval_dataset=small_dataset["validation"],
   tokenizer=tokenizer,
   data_collator=data_collator,
   compute_metrics=compute_metrics
)

In [13]:
# trainer.train(resume_from_checkpoint=True)
trainer.train(resume_from_checkpoint=True)

There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight'].


Epoch,Training Loss,Validation Loss,Cer
2,0.6234,0.633596,0.581517
3,0.6109,0.613047,0.580716




TrainOutput(global_step=71325, training_loss=0.2082718645343854, metrics={'train_runtime': 12356.2379, 'train_samples_per_second': 46.179, 'train_steps_per_second': 5.772, 'total_flos': 9.76806363267072e+16, 'train_loss': 0.2082718645343854, 'epoch': 3.0})

In [18]:
small_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 190200
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 10000
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 1951
    })
})

In [19]:
small_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 190200
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 10000
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 1951
    })
})