In [None]:
from transformers import AutoModelForSeq2SeqLM, TrainingArguments, Trainer
from peft import get_peft_config, get_peft_model, LoraConfig, TaskType
model_name_or_path = "bigscience/mt0-large"
tokenizer_name_or_path = "bigscience/mt0-large"

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
from datasets import load_dataset
raw_datasets = load_dataset("super_glue", "rte",trust_remote_code=True)


Downloading data: 100%|██████████| 751k/751k [00:00<00:00, 794kB/s] 
Generating train split: 100%|██████████| 2490/2490 [00:00<00:00, 35896.19 examples/s]
Generating validation split: 100%|██████████| 277/277 [00:00<00:00, 41576.80 examples/s]
Generating test split: 100%|██████████| 3000/3000 [00:00<00:00, 53824.19 examples/s]


In [7]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bigscience/mt0-large")


In [16]:
def tokenize_function(examples):
    input_texts = ["premise: " + p + " hypothesis: " + h for p, h in zip(examples["premise"], examples["hypothesis"])]
    model_inputs = tokenizer(input_texts, max_length=512, padding="max_length", truncation=True)
    
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples["label"], max_length=64, padding="max_length", truncation=True)
    
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs


In [17]:
print(raw_datasets['train'])
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
tokenized_datasets.shape

Dataset({
    features: ['premise', 'hypothesis', 'idx', 'label'],
    num_rows: 2490
})


Map:   0%|          | 0/2490 [00:00<?, ? examples/s]


ValueError: text input must be of type `str` (single example), `List[str]` (batch or single pretokenized example) or `List[List[str]]` (batch of pretokenized examples).

In [2]:
peft_config = LoraConfig(
    task_type=TaskType.SEQ_2_SEQ_LM, inference_mode=False, r=8, lora_alpha=32, lora_dropout=0.1
)

model = AutoModelForSeq2SeqLM.from_pretrained(model_name_or_path)
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

trainable params: 2,359,296 || all params: 1,231,940,608 || trainable%: 0.1915


In [None]:
training_args = TrainingArguments(
    output_dir="finetuned/mt0-large-lora",
    learning_rate=1e-3,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=2,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

In [None]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()