# Full fine-tuning

### 1. Select LLM

In [None]:
from transformers import T5ForConditionalGeneration, T5Tokenizer

# Load the pretrained T5-small checkpoint; this is the model object that the
# Trainer further down fine-tunes.
model = T5ForConditionalGeneration.from_pretrained(
    'google-t5/t5-small',
)

### 2. Select Tokenizer

In [None]:
# Load the tokenizer from the same Hub checkpoint id as the model
# ('google-t5/t5-small') so the two cannot drift apart.
tokenizer = T5Tokenizer.from_pretrained("google-t5/t5-small")

### 3. Choose & Load the Dataset

In [3]:
from datasets import load_dataset

# Load a dataset (this example uses a simple summarization dataset)
dataset = load_dataset("cnn_dailymail", "3.0.0", split='train[:1%]')  # Only using 1% for low-resource setting
train_size = 0.8  # Use 80% for training and 20% for evaluation

# Split exactly once, with a fixed seed. Calling train_test_split() twice
# shuffles independently each time, so the resulting 'train' and 'test' sets
# would overlap and the evaluation would be contaminated by training examples.
split_dataset = dataset.train_test_split(train_size=train_size, seed=42)
train_dataset = split_dataset['train']
eval_dataset = split_dataset['test']

### 4. Tokenize the input

In [4]:
def preprocess_function(examples):
    """Tokenize a batch of articles and their reference summaries.

    Args:
        examples: A batch mapping with an 'article' list (source documents)
            and a 'highlights' list (target summaries).

    Returns:
        The tokenizer output for the prefixed articles, with an added
        "labels" entry holding the tokenized summaries. Padded label
        positions are set to -100 so they are ignored by the loss.
    """
    # Same "summarize: " task prefix as the inference cell uses.
    inputs = ["summarize: " + doc for doc in examples['article']]
    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding="max_length")
    label_ids = tokenizer(examples['highlights'], max_length=150, truncation=True, padding="max_length")["input_ids"]

    # Replace padding in the labels with -100 so the model is not trained to
    # predict pad tokens.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [token if token != pad_id else -100 for token in sequence]
        for sequence in label_ids
    ]
    return model_inputs

### 5. Tokenize the dataset

In [5]:
# Tokenize only the training split. Mapping the full `dataset` here would put
# the held-out evaluation examples into the training data.
tokenized_dataset = train_dataset.map(preprocess_function, batched=True)
eval_dataset = eval_dataset.map(preprocess_function, batched=True)

### 6. Define training arguments

In [None]:
from transformers import TrainingArguments

# Evaluation and checkpointing both happen once per epoch. `eval_steps` and
# `save_steps` only take effect with the "steps" strategy, so they are not set.
training_args = TrainingArguments(
    output_dir="./results",
    # NOTE(review): newer transformers releases name this `eval_strategy` —
    # confirm against the installed version.
    evaluation_strategy="epoch",
    save_strategy="epoch",  # kept identical to the evaluation strategy, as load_best_model_at_end needs
    per_device_train_batch_size=4,
    num_train_epochs=3,
    save_total_limit=2,
    load_best_model_at_end=True,  # Required when using EarlyStoppingCallback
    # Spell out the metric that best-model selection and early stopping track
    # (lower evaluation loss is better).
    metric_for_best_model="eval_loss",
    greater_is_better=False,
)

### 7. Initialize the Trainer

In [7]:
from transformers import Trainer, EarlyStoppingCallback

# Stops after 3 epochs without improvement.
# NOTE(review): the training arguments request only 3 epochs, so a patience
# of 3 looks like it can never trigger — confirm the intended values.
early_stopping = EarlyStoppingCallback(early_stopping_patience=3)

# Wire the model, training configuration and both tokenized splits together.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    eval_dataset=eval_dataset,
    callbacks=[early_stopping],
)

### 8. Train the model

In [None]:
# Run fine-tuning with the Trainer configured above; the returned training
# summary is shown as the cell output.
trainer.train()

### 9. Evaluate the model

In [None]:
# Evaluate with the trainer (using the evaluation dataset it was given) and
# keep the returned metrics so they can be printed below.
eval_results = trainer.evaluate()

print(f"Evaluation results: {eval_results}")

### 10. Save the trained model

In [None]:
# Save the model and the tokenizer into the same directory so both can be
# reloaded together from "./fine_tuned_t5".
trainer.save_model("./fine_tuned_t5")
tokenizer.save_pretrained("./fine_tuned_t5")

### 11. Test the model

In [None]:
# Load the fine-tuned model and tokenizer from the directory they were saved to
model = T5ForConditionalGeneration.from_pretrained("./fine_tuned_t5")
tokenizer = T5Tokenizer.from_pretrained("./fine_tuned_t5")

# Example input, using the "summarize: " task prefix
input_text = "summarize: The stock market crashed today due to high inflation."

# Truncate at 512 tokens to match the training preprocessing. A single input
# needs no padding, so the encoder does not process hundreds of pad tokens.
inputs = tokenizer(input_text, return_tensors="pt", max_length=512, truncation=True)

# Generate prediction with beam search, capped at 64 output tokens
output = model.generate(**inputs, max_length=64, num_beams=5, early_stopping=True)

# Decode the prediction
decoded_output = tokenizer.decode(output[0], skip_special_tokens=True)
print(f"Generated summary: {decoded_output}")