In [1]:
%pip install transformers datasets accelerate peft


Note: you may need to restart the kernel to use updated packages.


# RoBERTa Base

In [2]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset, concatenate_datasets
import torch
import random
import numpy as np
from peft import LoraModel, LoraConfig
from evaluate import load
import math

if torch.backends.mps.is_available():
    device = torch.device("mps")
elif torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(device)

2025-01-07 02:03:15.575417: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1736215395.590081    1071 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1736215395.594597    1071 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-01-07 02:03:15.613087: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


cuda


In [3]:
seed = 0
def set_seed(seed: int):
    # Set seed for Python's random module
    random.seed(seed)

    # Set seed for NumPy
    np.random.seed(seed)

    # Set seed for PyTorch
    torch.manual_seed(seed)

    # Set seed for CUDA (if using)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)  # For multi-GPU setups

    # Make PyTorch deterministic (this can slow down the computation)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

# Example of usage
set_seed(seed)


In [4]:
dataset = load_dataset("glue", "mrpc")

In [4]:
dataset

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 1725
    })
})

In [5]:
print(f"Labels in the training set: {set(dataset['train']['label'])}")
print(f"Labels in the evaluation set: {set(dataset['validation']['label'])}")
print(f"Labels in the test set: {set(dataset['test']['label'])}")

Labels in the training set: {0, 1}
Labels in the evaluation set: {0, 1}
Labels in the test set: {0, 1}


<b>Test set is labeled. No further processing required</b>

In [6]:
# Load Roberta Base
model_name = "roberta-base"
num_labels = len(set(dataset['train']['label']))
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
## For MRPC, the study authors loaded the best LORA checkpoint of MNLI, and used it as starting point for further finetuning.

from peft import PeftConfig
# Load LORA checkpoint
#checkpoint_path = "./mnli/results/checkpoint-82472"
#config = PeftConfig.from_pretrained(checkpoint_path)

In [10]:
parameters = {
    "output_folder": 'mrpc_no_lora_init',
    "sequence_length": 512,
    "epochs": 30,
    "batch_size": 16,
    "learning_rate": 4e-4,
    "weight_decay": 0.00,
    "warmup_ratio": 0.06,
    "optimizer": 'adamw_torch',
    "lora_alpha": 8,
    "lora_rank": 8,
    "lora_target_modules": ["query", "key", "value"],
    "lora_drop_out": 0.0
}


In [11]:
target_modules = ["query", "key", "value"]
config = LoraConfig(
    task_type="SEQ_CLS",
    r=parameters["lora_rank"],
    lora_alpha=parameters["lora_alpha"],
    target_modules=parameters["lora_target_modules"],
    lora_dropout=parameters["lora_drop_out"],
    init_lora_weights=True
)
#lora_model = LoraModel(model, config, "default")

from peft import  get_peft_model
peft_model = get_peft_model(model, config)

In [12]:
# Tokenize the dataset
def preprocess_function(examples):
    # Remove entries with -1
    return tokenizer(examples["sentence1"], examples["sentence2"], truncation=True, padding="max_length", max_length=parameters['sequence_length'])

# Prepare datasets for training
train_dataset = dataset['train'].map(preprocess_function, batched=True).shuffle(seed=seed)
eval_dataset = dataset['validation'].map(preprocess_function, batched=True).shuffle(seed=seed)
test_dataset = dataset['test'].map(preprocess_function, batched=True)

#train_dataset = train_dataset.map(preprocess_function, batched=True).shuffle(seed=seed)
#eval_dataset = eval_dataset.map(preprocess_function, batched=True).shuffle(seed=seed)
#test_dataset = test_dataset.map(preprocess_function, batched=True).shuffle(seed=seed)


Map:   0%|          | 0/3668 [00:00<?, ? examples/s]

Map:   0%|          | 0/408 [00:00<?, ? examples/s]

Map:   0%|          | 0/1725 [00:00<?, ? examples/s]

In [13]:

# Define training arguments
training_args = TrainingArguments(
    output_dir=f"./{parameters['output_folder']}/results",  # Directory to save model checkpoints
    evaluation_strategy="epoch",
    learning_rate=parameters['learning_rate'],
    per_device_train_batch_size=parameters['batch_size'],
    per_device_eval_batch_size=parameters['batch_size'],
    num_train_epochs=parameters['epochs'],
    weight_decay=parameters['weight_decay'],
    save_strategy="epoch",
    logging_dir="./logs",
    logging_steps=10,
    optim=parameters['optimizer'],
    load_best_model_at_end=True,
    report_to="none",
    warmup_ratio=parameters['warmup_ratio'],
)

# Accuracy metric
metric = load('accuracy')

def compute_accuracy(p):
    preds = p.predictions.argmax(axis=1)
    labels = p.label_ids
    return metric.compute(predictions=preds, references=labels)

# Define the trainer
trainer = Trainer(
    model=peft_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_accuracy
)

# Train the model
#trainer.train(resume_from_checkpoint=True)
trainer.train()


# Save model
trainer.save_model(f"./{parameters['output_folder']}/model")



Epoch,Training Loss,Validation Loss,Accuracy
1,0.5764,0.488542,0.75
2,0.3944,0.409609,0.830882
3,0.2573,0.351252,0.857843
4,0.2414,0.305413,0.865196
5,0.2617,0.403203,0.857843
6,0.2492,0.446119,0.872549
7,0.2682,0.492312,0.857843
8,0.1532,0.405361,0.872549
9,0.0711,0.643304,0.862745
10,0.1516,0.649685,0.862745


In [15]:
# Evaluate the model
results = trainer.evaluate()
print(results)

{'eval_loss': 0.3054126799106598, 'eval_accuracy': 0.8651960784313726, 'eval_runtime': 3.5346, 'eval_samples_per_second': 115.431, 'eval_steps_per_second': 7.356, 'epoch': 30.0}


In [14]:
# Test set
test_set_results = trainer.predict(test_dataset)
test_set_results.metrics

{'test_loss': 0.3288087844848633,
 'test_accuracy': 0.8655072463768116,
 'test_runtime': 14.899,
 'test_samples_per_second': 115.78,
 'test_steps_per_second': 7.249}

In [None]:
test_set_results.predictions.argmax(axis=1)

In [None]:
test_set_results.label_ids