In [2]:
%pip install transformers datasets accelerate peft


Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.3.1[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip3 install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


# RoBERTa Base

In [3]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset, concatenate_datasets
import torch
import random
import numpy as np
from peft import LoraModel, LoraConfig
from evaluate import load
import math

# Choose the compute backend: Apple MPS when present, otherwise CUDA, otherwise the CPU.
device = torch.device(
    "mps" if torch.backends.mps.is_available()
    else 'cuda' if torch.cuda.is_available()
    else 'cpu'
)
print(device)

mps


In [4]:
seed = 0


def set_seed(seed: int):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    (including the CUDA generators when CUDA is available), then puts cuDNN
    into deterministic mode with benchmarking turned off.

    Args:
        seed: Integer seed handed to each library.
    """
    # Python, NumPy and PyTorch each expose a one-argument seeding call.
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)

    # CUDA generators: the current device first, then all devices (multi-GPU).
    if torch.cuda.is_available():
        for cuda_seed_fn in (torch.cuda.manual_seed, torch.cuda.manual_seed_all):
            cuda_seed_fn(seed)

    # Deterministic cuDNN (this can slow down the computation).
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False


# Seed everything once with the notebook-wide seed.
set_seed(seed)


In [7]:
# Load the "rte" configuration of the "glue" dataset; the result holds the train / validation / test splits.
dataset = load_dataset("glue", "rte")

In [8]:
# Display the DatasetDict to inspect the available splits, their columns and row counts.
dataset

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 2490
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 277
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 3000
    })
})

In [9]:
# Collect the distinct label values of every split, then report them.
train_label_set = set(dataset['train']['label'])
validation_label_set = set(dataset['validation']['label'])
test_label_set = set(dataset['test']['label'])

print(f"Labels in the training set: {train_label_set}")
print(f"Labels in the evaluation set: {validation_label_set}")
print(f"Labels in the test set: {test_label_set}")

Labels in the training set: {0, 1}
Labels in the evaluation set: {0, 1}
Labels in the test set: {-1}


<b>Test set is not labeled. We will combine training / evaluation set and re-split the dataset</b>

In [17]:
# Pool the two labeled splits: validation rows first, then the training rows.
merged_dataset = concatenate_datasets([dataset['validation'], dataset['train']])

# First cut: 80% stays as training data, 20% is held back to be divided below.
temp_split = merged_dataset.train_test_split(test_size=0.2, seed=seed)
train_dataset, temp_dataset = temp_split["train"], temp_split["test"]

# Second cut: halve the held-back part into a test half and a validation half.
test_eval_split = temp_dataset.train_test_split(test_size=0.5, seed=seed)
test_dataset, eval_dataset = test_eval_split["train"], test_eval_split["test"]

# Sanity check: the new test split must contain real labels.
test_label_values = set(test_dataset['label'])
print(f"Labels in the test set after processing: {test_label_values}")

Labels in the test set after processing: {0, 1}


In [18]:
# Load RoBERTa base: the tokenizer plus a sequence-classification model whose
# head is sized to the number of distinct labels in the original training split.
model_name = "roberta-base"
train_label_values = set(dataset['train']['label'])
num_labels = len(train_label_values)
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_labels,
)


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [19]:
## For MRPC, the study authors loaded the best LoRA checkpoint of MNLI and used it as the starting point for further adaptation.
## NOTE(review): this notebook fine-tunes on RTE, not MRPC -- presumably the same MNLI warm start is intended here; confirm.

from peft import PeftConfig
# Path of the LoRA checkpoint directory (presumably written by the MNLI notebook -- verify).
# NOTE(review): "checkpoint-3" looks like a very early/debug checkpoint rather than the best one -- confirm.
checkpoint_path = "./mnli/results/checkpoint-3"
# Reads only the adapter *configuration* stored at that path; no adapter weights are loaded by this call.
config = PeftConfig.from_pretrained(checkpoint_path)

In [20]:
# Hyper-parameters for this run, collected in one place so later cells read them by key.
parameters = dict(
    # Where results and the final model are written
    output_folder='rte',
    # Tokenization
    sequence_length=512,
    # Optimisation schedule
    epochs=30,
    batch_size=16,
    learning_rate=5e-4,
    weight_decay=0.00,
    warmup_ratio=0.06,
    optimizer='adamw_torch',
    # LoRA settings
    lora_alpha=8,
    lora_rank=8,
    lora_target_modules=["query", "key", "value"],
    lora_drop_out=0.0,
)


In [21]:
# Get "Lora-fied" model
#
# Warm-start from the LoRA checkpoint at `checkpoint_path`.
#
# `PeftConfig.from_pretrained` only reads the adapter configuration, and
# `get_peft_model(model, config)` then builds brand-new, freshly initialised
# LoRA matrices -- none of the checkpoint's adapter weights would be used.
# A config read back from a saved adapter is also normally flagged for
# inference, which leaves the new adapter frozen. `PeftModel.from_pretrained`
# loads the stored adapter weights, and `is_trainable=True` keeps them
# trainable for further fine-tuning.
#
# `ignore_mismatched_sizes=True` skips tensors whose shape differs from this
# model. NOTE(review): this presumably matters for the classification head
# (MNLI has 3 classes, RTE has 2) and needs a PEFT release that supports the
# flag -- confirm against the installed version.
from peft import PeftModel, get_peft_model

target_modules = ["query", "key", "value"]

peft_model = PeftModel.from_pretrained(
    model,
    checkpoint_path,
    is_trainable=True,
    ignore_mismatched_sizes=True,
)

# Sanity check: the LoRA parameters must show up as trainable.
peft_model.print_trainable_parameters()

# Alternative: train LoRA from scratch instead of warm-starting.
# config = LoraConfig(
#     task_type="SEQ_CLS",
#     r=parameters["lora_rank"],
#     lora_alpha=parameters["lora_alpha"],
#     target_modules=parameters["lora_target_modules"],
#     lora_dropout=parameters["lora_drop_out"],
#     init_lora_weights=True
# )
# peft_model = get_peft_model(model, config)

In [22]:
# NOTE: `dataset` is still the original GLUE DatasetDict; the re-split data lives in
# train_dataset / eval_dataset / test_dataset, which are what the training cells use.
dataset

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 2490
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 277
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 3000
    })
})

### We report the overall (matched and mismatched) accuracy for MNLI, Matthew’s correlation for CoLA, Pearson correlation for STS-B, and accuracy for other tasks. RTE is one of the "other tasks", so accuracy is the metric used in this notebook.

In [24]:
# Tokenize the dataset
def preprocess_function(examples):
    """Tokenize a batch of sentence pairs.

    Each (sentence1, sentence2) pair is encoded together, truncated and padded
    to exactly ``parameters['sequence_length']`` tokens. No rows are dropped.
    """
    max_len = parameters['sequence_length']
    return tokenizer(
        examples["sentence1"],
        examples["sentence2"],
        truncation=True,
        padding="max_length",
        max_length=max_len,
    )


def _tokenize_and_shuffle(split):
    """Apply ``preprocess_function`` batch-wise to ``split``, then shuffle it with the notebook seed."""
    encoded = split.map(preprocess_function, batched=True)
    return encoded.shuffle(seed=seed)


# Prepare datasets for training: each re-split dataset is replaced by its tokenized, shuffled version.
train_dataset = _tokenize_and_shuffle(train_dataset)
eval_dataset = _tokenize_and_shuffle(eval_dataset)
test_dataset = _tokenize_and_shuffle(test_dataset)


Map:   0%|          | 0/2490 [00:00<?, ? examples/s]

Map:   0%|          | 0/277 [00:00<?, ? examples/s]

Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

In [25]:

# Define training arguments
#
# Evaluation and checkpointing both happen once per epoch, and the best
# checkpoint is reloaded when training ends. With no explicit
# `metric_for_best_model`, "best" would mean lowest evaluation loss; this
# notebook reports accuracy, so the checkpoint is selected by the "accuracy"
# value returned from `compute_metrics` (higher is better).
training_args = TrainingArguments(
    output_dir=f"./{parameters['output_folder']}/results",  # Directory to save model checkpoints
    evaluation_strategy="epoch",
    learning_rate=parameters['learning_rate'],
    per_device_train_batch_size=parameters['batch_size'],
    per_device_eval_batch_size=parameters['batch_size'],
    num_train_epochs=parameters['epochs'],
    weight_decay=parameters['weight_decay'],
    save_strategy="epoch",  # kept in step with the evaluation schedule so every evaluated epoch has a checkpoint
    logging_dir="./logs",
    logging_steps=10,
    optim=parameters['optimizer'],
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",  # select the best checkpoint by the reported metric
    greater_is_better=True,
    report_to="none",
    warmup_ratio=parameters['warmup_ratio'],
)

# Accuracy metric
metric = load('accuracy')


def compute_accuracy(p):
    """Compute accuracy for one evaluation pass.

    Args:
        p: Object exposing ``predictions`` (reduced with argmax along axis 1)
           and ``label_ids`` (the reference labels).

    Returns:
        Whatever ``metric.compute`` returns for those predictions and references.
    """
    predicted_classes = p.predictions.argmax(axis=1)
    return metric.compute(predictions=predicted_classes, references=p.label_ids)

# Define the trainer: collect its inputs first, then build it in one call.
trainer_kwargs = dict(
    model=peft_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_accuracy,
)
trainer = Trainer(**trainer_kwargs)

# Train the model from the start.
# (To continue an interrupted run instead: trainer.train(resume_from_checkpoint=True))
trainer.train()

# Save model
final_model_dir = f"./{parameters['output_folder']}/model"
trainer.save_model(final_model_dir)

  trainer = Trainer(


  0%|          | 0/4680 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [22]:
# Evaluate the model
# Called without arguments, so the trainer uses the eval_dataset it was constructed with.
results = trainer.evaluate()
print(results)

  0%|          | 0/26 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
# Test set
# Run the trained model on the held-out test split and display the resulting metrics.
test_set_results = trainer.predict(test_dataset)
test_set_results.metrics

In [None]:
# Index of the largest prediction value along axis 1 for each test example (presumably the predicted class -- confirm).
test_set_results.predictions.argmax(axis=1)

In [None]:
# Reference labels of the test examples, for comparison with the predictions shown above.
test_set_results.label_ids