In [1]:
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
from datasets import Dataset
import pandas as pd

2024-12-23 02:12:32.603751: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
df = pd.read_csv('data/pre-training/pre-train.tsv', sep='\t', names=['text'])
dataset = Dataset.from_pandas(df)

In [8]:
# Split the dataset into train and eval sets
data = dataset.train_test_split(test_size=0.2)

# Now you have train and test splits
train_dataset = data['train']
eval_dataset = data['test']

In [9]:
# Load Pretrained Tokenizer
tokenizer = T5Tokenizer.from_pretrained('t5-small')

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [10]:
def preprocess_function(examples):
    # Tokenize the text and prepare the input_ids and labels
    model_inputs = tokenizer(examples["text"], padding="max_length", truncation=True, max_length=512)

    # T5 requires decoder_input_ids as well, and for most cases, it's the same as input_ids
    decoder_input_ids = model_inputs["input_ids"]

    # Labels should be the same as input_ids in this case for text-to-text tasks
    labels = model_inputs["input_ids"]

    return {
        "input_ids": model_inputs["input_ids"],
        "attention_mask": model_inputs["attention_mask"],
        "decoder_input_ids": decoder_input_ids,  # Add decoder_input_ids explicitly
        "labels": labels,  # Labels are the same as input_ids for this task
    }

In [11]:
# Tokenize the dataset
tokenized_train_dataset = train_dataset.map(preprocess_function, batched=True)
tokenized_eval_dataset = eval_dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/4000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [12]:
# Step 3: Load Pretrained T5 Model for Conditional Generation
model = T5ForConditionalGeneration.from_pretrained("t5-small")

In [13]:
# Step 4: Set Training Arguments
training_args = TrainingArguments(
    output_dir='./results',           # Output directory to store model checkpoints
    overwrite_output_dir=True,        # Overwrite output_dir
    num_train_epochs=2,               # Number of training epochs
    per_device_train_batch_size=2,    # Batch size per GPU/CPU
    per_device_eval_batch_size=2,     # Evaluation batch size
    warmup_steps=200,                 # Number of warmup steps for learning rate scheduler
    weight_decay=0.01,                # Strength of weight decay
    logging_dir='./logs',             # Directory for storing logs
    logging_steps=50,
)

In [14]:
# Step 5: Initialize the Trainer
trainer = Trainer(
    model=model,                       # The model to train
    args=training_args,                # Training arguments
    train_dataset=tokenized_train_dataset,   # The training dataset
    eval_dataset=tokenized_eval_dataset,  # The evaluation dataset (if available)
)

In [31]:
PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0

In [15]:
# Step 6: Pretrain the Model
trainer.train()

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Step,Training Loss
50,13.5979
100,8.8196
150,3.9075
200,1.7509
250,0.8592
300,0.4376
350,0.2596
400,0.183
450,0.1099
500,0.0894


TrainOutput(global_step=4000, training_loss=0.3863173695988953, metrics={'train_runtime': 13888.8658, 'train_samples_per_second': 0.576, 'train_steps_per_second': 0.288, 'total_flos': 1082734411776000.0, 'train_loss': 0.3863173695988953, 'epoch': 2.0})

In [16]:
# Step 7: Save the model
model.save_pretrained("./pretrained_t5_model")
tokenizer.save_pretrained("./pretrained_t5_model")

('./pretrained_t5_model/tokenizer_config.json',
 './pretrained_t5_model/special_tokens_map.json',
 './pretrained_t5_model/spiece.model',
 './pretrained_t5_model/added_tokens.json')

In [17]:
# Optional: Evaluate the model (if you have an evaluation set)
eval_results = trainer.evaluate()
print(f"Evaluation Results: {eval_results}")

Evaluation Results: {'eval_loss': 0.0006152440328150988, 'eval_runtime': 752.4049, 'eval_samples_per_second': 1.329, 'eval_steps_per_second': 0.665, 'epoch': 2.0}
