In [7]:
%pip install --upgrade torch

Collecting torch
  Downloading torch-2.7.1-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (29 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.6.77 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.6.77-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.6.77 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.6.77-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.6.80 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.6.80-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.5.1.17 (from torch)
  Downloading nvidia_cudnn_cu12-9.5.1.17-py3-none-manylinux_2_28_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.6.4.1 (from torch)
  Downloading nvidia_cublas_cu12-12.6.4.1-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.3.0.4 (from torch)
  Downloading nvidia_cufft_c

In [2]:
%pip install --upgrade transformers torch torchvision accelerate bitsandbytes

Collecting torchvision
  Downloading torchvision-0.22.1-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (6.1 kB)
Downloading torchvision-0.22.1-cp310-cp310-manylinux_2_28_x86_64.whl (7.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.5/7.5 MB[0m [31m41.2 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: torchvision
  Attempting uninstall: torchvision
    Found existing installation: torchvision 0.17.0
    Uninstalling torchvision-0.17.0:
      Successfully uninstalled torchvision-0.17.0
Successfully installed torchvision-0.22.1
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [1]:
# =============================================================================
# 1. SETUP AND CONFIGURATION
# =============================================================================
import os

# Set environment variables
os.environ['HF_HOME'] = '/workspace/huggingface_cache'
os.environ['TOKENIZERS_PARALLELISM'] = 'false'

import torch
import transformers
from datasets import load_dataset
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    DataCollatorForSeq2Seq,
    Trainer,
    EarlyStoppingCallback,
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, TaskType
import wandb

# Enable tf32 for better performance on Ampere GPUs
if torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 8:
    torch.backends.cuda.matmul.allow_tf32 = True

class TrainingConfig:
    """
    Configuration settings for the fine-tuning job.
    """
    # Model and dataset configuration
    MODEL_ID = "google/mt5-xl"
    DATASET_NAME = "RobbedoesHF/dutch-definitions"
    DATASET_TRAIN_SPLIT = "train"
    DATASET_EVAL_SPLIT = "validation"

    # LoRA configuration
    LORA_R = 32
    LORA_ALPHA = 64
    LORA_DROPOUT = 0.1
    # Target modules for mT5 models
    LORA_TARGET_MODULES = ["q", "k", "v", "o", "wi", "wo"] # Common for T5 architectures

    # Training configuration
    OUTPUT_DIR = "./results_mt5_xl"
    BATCH_SIZE = 2  # Adjusted for mT5-XL on A100
    GRAD_ACCUMULATION_STEPS = 8  # Effective batch size = 16
    LEARNING_RATE = 5e-5 # A common starting point for T5 fine-tuning
    NUM_TRAIN_EPOCHS = 5
    MAX_SEQ_LENGTH = 512
    LOGGING_STEPS = 10
    SAVE_STEPS = 100
    EVAL_STEPS = 100
    OPTIMIZER = "paged_adamw_8bit"
    LR_SCHEDULER_TYPE = "cosine"
    WARMUP_RATIO = 0.1
    EARLY_STOPPING_PATIENCE = 5
    SEED = 42

    # W&B configuration
    WANDB_PROJECT = "dutch-definition-modeling"
    WANDB_RUN_NAME = "mt5-xl-qlora-dutch-definitions"

    # Test run configuration
    IS_TEST_RUN = False
    TEST_RUN_SIZE = 100

config = TrainingConfig()

# --- Weights & Biases Login ---
try:
    wandb.login()
    os.environ["WANDB_PROJECT"] = config.WANDB_PROJECT
except Exception as e:
    print(f"Could not log in to W&B: {e}. Disabling monitoring.")
    os.environ["WANDB_DISABLED"] = "true"

[34m[1mwandb[0m: Currently logged in as: [33mrobbe-meersman[0m ([33mrobbe-meersman-ku-leuven[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [2]:
# =============================================================================
# 2. MODEL AND TOKENIZER LOADING
# =============================================================================
print(f"Loading base model: {config.MODEL_ID}")

# Configure quantization for 4-bit loading (QLoRA)
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16, # Use bfloat16 for A100
    bnb_4bit_use_double_quant=True,
)

# Load the Seq2Seq LM model with quantization
model = AutoModelForSeq2SeqLM.from_pretrained(
    config.MODEL_ID,
    quantization_config=bnb_config,
    device_map="auto",
)

# Prepare model for k-bit training
model = prepare_model_for_kbit_training(model)

tokenizer = AutoTokenizer.from_pretrained(config.MODEL_ID)

Loading base model: google/mt5-xl


generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/376 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/4.31M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [3]:
# =============================================================================
# 3. DATASET PREPARATION (REVISED)
# =============================================================================

def is_valid_example(example):
    """
    Checks if an example row has all the required non-empty fields.
    """
    return (
        example.get("Lemma") and isinstance(example["Lemma"], str) and len(example["Lemma"].strip()) > 0 and
        example.get("DefinitionShort") and isinstance(example["DefinitionShort"], str) and len(example["DefinitionShort"].strip()) > 0 and
        example.get("DefinitionFull") and isinstance(example["DefinitionFull"], str) and len(example["DefinitionFull"].strip()) > 0
    )

def create_prompts_for_mt5(examples):
    """
    Creates source and target text prompts from a batch of examples.
    The source text is now only the direct instruction.
    """
    source_texts = []
    target_texts = []

    for i in range(len(examples['Lemma'])):
        lemma = examples['Lemma'][i]
        definition_short = examples['DefinitionShort'][i]
        definition_full = examples['DefinitionFull'][i]

        # The source text is now only the direct user instruction.
        source_text = f"Breid de volgende korte definitie voor het woord '{lemma}' uit tot een volledige definitie: '{definition_short}'"
        source_texts.append(source_text)

        # The target text remains the desired full definition for the decoder.
        target_texts.append(definition_full)

    return {"source_text": source_texts, "target_text": target_texts}


def tokenize_function(examples):
    """
    Tokenizes the source and target texts for the model.
    """
    model_inputs = tokenizer(
        examples["source_text"],
        max_length=config.MAX_SEQ_LENGTH,
        padding="max_length",
        truncation=True
    )
    labels = tokenizer(
        text_target=examples["target_text"],
        max_length=config.MAX_SEQ_LENGTH,
        padding="max_length",
        truncation=True
    )
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# --- Load and Clean the Dataset ---
dataset = load_dataset(config.DATASET_NAME)

print("Filtering datasets for invalid rows...")
train_dataset = dataset[config.DATASET_TRAIN_SPLIT].filter(is_valid_example, num_proc=4)
eval_dataset = dataset[config.DATASET_EVAL_SPLIT].filter(is_valid_example, num_proc=4)

# --- Create Prompts and Tokenize ---
print("Creating prompts and tokenizing datasets...")
train_dataset = train_dataset.map(create_prompts_for_mt5, batched=True, num_proc=4, remove_columns=train_dataset.column_names)
eval_dataset = eval_dataset.map(create_prompts_for_mt5, batched=True, num_proc=4, remove_columns=eval_dataset.column_names)

train_dataset = train_dataset.map(tokenize_function, batched=True, num_proc=4, remove_columns=["source_text", "target_text"])
eval_dataset = eval_dataset.map(tokenize_function, batched=True, num_proc=4, remove_columns=["source_text", "target_text"])

if config.IS_TEST_RUN:
    train_dataset = train_dataset.select(range(config.TEST_RUN_SIZE))
    eval_dataset = eval_dataset.select(range(config.TEST_RUN_SIZE))

# --- Print Final Dataset Sizes ---
print("\n" + "="*50)
print(f"Final training dataset size: {len(train_dataset)}")
print(f"Final evaluation dataset size: {len(eval_dataset)}")
print("="*50 + "\n")

Filtering datasets for invalid rows...
Creating prompts and tokenizing datasets...


Map (num_proc=4):   0%|          | 0/27880 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/3494 [00:00<?, ? examples/s]


Final training dataset size: 27880
Final evaluation dataset size: 3494



In [4]:
# =============================================================================
# 4. TRAINER SETUP
# =============================================================================
print("Configuring LoRA and PEFT model...")

peft_config = LoraConfig(
    r=config.LORA_R,
    lora_alpha=config.LORA_ALPHA,
    lora_dropout=config.LORA_DROPOUT,
    target_modules=config.LORA_TARGET_MODULES,
    bias="none",
    task_type=TaskType.SEQ_2_SEQ_LM, # Specify task type for encoder-decoder
)

model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

print("Setting up training arguments...")
training_arguments = TrainingArguments(
    output_dir=config.OUTPUT_DIR,
    per_device_train_batch_size=config.BATCH_SIZE,
    per_device_eval_batch_size=config.BATCH_SIZE,
    gradient_accumulation_steps=config.GRAD_ACCUMULATION_STEPS,
    gradient_checkpointing=True,
    optim=config.OPTIMIZER,
    learning_rate=config.LEARNING_RATE,
    num_train_epochs=config.NUM_TRAIN_EPOCHS,
    fp16=False,
    bf16=True, # Use bfloat16 for A100 performance
    max_grad_norm=0.3,
    lr_scheduler_type=config.LR_SCHEDULER_TYPE,
    warmup_ratio=config.WARMUP_RATIO,
    logging_dir=f"{config.OUTPUT_DIR}/logs",
    evaluation_strategy="steps",
    save_strategy="steps",
    logging_strategy="steps",
    logging_steps=config.LOGGING_STEPS,
    save_steps=config.SAVE_STEPS,
    eval_steps=config.EVAL_STEPS,
    save_total_limit=2,
    load_best_model_at_end=True,
    report_to="wandb",
    run_name=config.WANDB_RUN_NAME,
    seed=config.SEED,
    group_by_length=True,
)

# Data collator for sequence-to-sequence tasks
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    label_pad_token_id=tokenizer.pad_token_id,
    padding=True,
    pad_to_multiple_of=8
)

trainer = Trainer(
    model=model,
    args=training_arguments,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=config.EARLY_STOPPING_PATIENCE)],
)

Configuring LoRA and PEFT model...
trainable params: 48,758,784 || all params: 3,791,378,432 || trainable%: 1.2860
Setting up training arguments...


TypeError: TrainingArguments.__init__() got an unexpected keyword argument 'evaluation_strategy'

In [None]:
# =============================================================================
# 5. TRAINING
# =============================================================================

print("Starting model training...")
trainer.train()

print("\nSaving final LoRA adapter...")
final_model_path = os.path.join(config.OUTPUT_DIR, "final_checkpoint")
trainer.save_model(final_model_path)
print(f"Final model saved to {final_model_path}")

wandb.finish()