In [1]:
%pip install transformers datasets accelerate peft


Note: you may need to restart the kernel to use updated packages.


# RoBERTa Base / STS-B

In [1]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset, concatenate_datasets
import torch
import random
import numpy as np
from peft import LoraModel, LoraConfig
from evaluate import load
import math

if torch.backends.mps.is_available():
    device = torch.device("mps")
elif torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(device)

2025-01-07 04:30:22.624643: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1736224222.639216    1948 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1736224222.643619    1948 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-01-07 04:30:22.660888: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


cuda


In [2]:
seed = 0
def set_seed(seed: int):
    # Set seed for Python's random module
    random.seed(seed)

    # Set seed for NumPy
    np.random.seed(seed)

    # Set seed for PyTorch
    torch.manual_seed(seed)

    # Set seed for CUDA (if using)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)  # For multi-GPU setups

    # Make PyTorch deterministic (this can slow down the computation)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

# Example of usage
set_seed(seed)


In [3]:
dataset = load_dataset("glue", "stsb")

In [6]:
dataset

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 5749
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 1500
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 1379
    })
})

In [4]:
print(f"Labels in the training set: {set(dataset['train']['label'])}")
print(f"Labels in the evaluation set: {set(dataset['validation']['label'])}")
print(f"Labels in the test set: {set(dataset['test']['label'])}")

Labels in the training set: {0.5, 1.600000023841858, 2.5999999046325684, 3.799999952316284, 4.25, 5.0, 2.200000047683716, 4.199999809265137, 4.599999904632568, 3.867000102996826, 4.666999816894531, 1.6670000553131104, 3.75, 2.799999952316284, 2.4000000953674316, 2.75, 3.200000047683716, 3.0, 3.4000000953674316, 3.5999999046325684, 2.375, 4.800000190734863, 4.0, 4.908999919891357, 4.400000095367432, 4.75, 4.85699987411499, 4.132999897003174, 4.333000183105469, 4.5, 4.875, 0.6669999957084656, 1.2000000476837158, 0.25, 1.75, 1.0, 1.555999994277954, 1.399999976158142, 0.6000000238418579, 0.800000011920929, 1.6430000066757202, 2.9170000553131104, 2.0, 2.25, 2.5329999923706055, 2.5, 2.6670000553131104, 2.818000078201294, 2.3329999446868896, 2.875, 2.7690000534057617, 0.8999999761581421, 3.937999963760376, 3.5, 3.8329999446868896, 3.765000104904175, 3.940999984741211, 3.25, 3.1110000610351562, 3.9230000972747803, 3.3329999446868896, 3.5329999923706055, 4.056000232696533, 4.308000087738037, 4.

<b>Test set is not labeled. We will combine training / evaluation set and re-split the dataset</b>

In [6]:
merged_dataset = concatenate_datasets([dataset['validation'], dataset['train']])

# Split into 80% train and 20% temp (for test + validation)
temp_split = merged_dataset.train_test_split(test_size=0.2, seed=seed)
train_dataset = temp_split["train"]
temp_dataset = temp_split["test"]

# Further split temp into 50% test and 50% validation
test_eval_split = temp_dataset.train_test_split(test_size=0.5, seed=seed)
test_dataset = test_eval_split["train"]
eval_dataset = test_eval_split["test"]


print(f"Labels in the test set after processing: {set(test_dataset['label'])}")

Labels in the test set after processing: {0.0, 1.2000000476837158, 2.4000000953674316, 3.4000000953674316, 1.0, 3.799999952316284, 2.799999952316284, 3.75, 1.25, 4.599999904632568, 4.25, 5.0, 3.3329999446868896, 0.20000000298023224, 2.0, 1.7999999523162842, 2.25, 3.0, 3.5999999046325684, 2.3329999446868896, 2.200000047683716, 4.0, 4.400000095367432, 4.5, 3.25, 3.200000047683716, 4.800000190734863, 4.199999809265137, 4.132999897003174, 4.75, 4.666999816894531, 0.4000000059604645, 0.6000000238418579, 0.800000011920929, 1.600000023841858, 1.399999976158142, 1.75, 0.5, 0.8999999761581421, 0.25, 0.23100000619888306, 2.5999999046325684, 2.5, 2.75, 2.3299999237060547, 2.812000036239624, 1.5, 0.75, 0.3330000042915344, 3.5, 3.875, 3.3333332538604736, 4.571000099182129}


In [7]:
# Load Roberta Base
model_name = "roberta-base"

# Since sts-b is a regression dataset, we need to set label = 1
# See: RobertaForSequenceClassification
# https://huggingface.co/docs/transformers/model_doc/roberta
num_labels = 1
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
## For MRPC, the study authors loaded the best LORA checkpoint of MNLI, and used that as starting point for further adaptation.

from peft import PeftConfig
# Load LORA checkpoint
#checkpoint_path = "./mnli/results/checkpoint-82472"
#config = PeftConfig.from_pretrained(checkpoint_path)

In [13]:
parameters = {
    "output_folder": 'sts_b_no_lora_init',
    "sequence_length": 512,
    "epochs": 40,
    "batch_size": 16,
    "learning_rate": 4e-4,
    "weight_decay": 0.00,
    "warmup_ratio": 0.06,
    "optimizer": 'adamw_torch',
    "lora_alpha": 8,
    "lora_rank": 8,
    "lora_target_modules": ["query", "key", "value"],
    "lora_drop_out": 0.0
}


In [14]:
# Get "Lora-fied" model
target_modules = ["query", "key", "value"]
config = LoraConfig(
    task_type="SEQ_CLS",
    r=parameters["lora_rank"],
    lora_alpha=parameters["lora_alpha"],
    target_modules=parameters["lora_target_modules"],
    lora_dropout=parameters["lora_drop_out"],
    init_lora_weights=True
)
#lora_model = LoraModel(model, config, "default")

from peft import  get_peft_model
peft_model = get_peft_model(model, config)

In [22]:
dataset

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 2490
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 277
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 3000
    })
})

In [15]:
# Tokenize the dataset
def preprocess_function(examples):
    # Remove entries with -1
    return tokenizer(examples["sentence1"], examples["sentence2"], truncation=True, padding="max_length", max_length=parameters['sequence_length'])

#train_encoded_dataset = dataset['train'].map(preprocess_function, batched=True)
#test_encoded_dataset = test_set.map(preprocess_function, batched=True)
#val_encoded_dataset = validation_set.map(preprocess_function, batched=True)

# Prepare datasets for training

# For debug purposes: Load only 1 % of the dataset 
#train_dataset = dataset['train'].map(preprocess_function, batched=True).shuffle(seed=seed)
#eval_dataset = dataset['validation'].map(preprocess_function, batched=True).shuffle(seed=seed)
#test_dataset = dataset['test'].map(preprocess_function, batched=True)

train_dataset = train_dataset.map(preprocess_function, batched=True).shuffle(seed=seed)
eval_dataset = eval_dataset.map(preprocess_function, batched=True).shuffle(seed=seed)
test_dataset = test_dataset.map(preprocess_function, batched=True).shuffle(seed=seed)


Map:   0%|          | 0/5799 [00:00<?, ? examples/s]

Map:   0%|          | 0/725 [00:00<?, ? examples/s]

Map:   0%|          | 0/725 [00:00<?, ? examples/s]

In [24]:
dataset['train'].data

MemoryMappedTable
sentence1: string
sentence2: string
label: float
idx: int32
----
sentence1: [["A plane is taking off.","A man is playing a large flute.","A man is spreading shreded cheese on a pizza.","Three men are playing chess.","A man is playing the cello.",...,"A young woman is playing a guitar.","The men are playing soccer.","A woman is running on the beach.","A man is straining pasta.","Panda's play on a swing."],["Two green and white trains sitting on the tracks.","A small white cat with glowing eyes standing underneath a chair.","A large boat in the water at the marina.","a bus driving in a street.","A passenger train waiting in a station.",...,"Dog running towards camera with a ball in its mouth.","A young girl running on the beach.","A baseball player throws the ball.","A man is swinging on a rope over water.","A young girl dressed in a Minnie mouse outfit and an older woman walking down the sidewalk."],...,["Aust stocks open lower","WikiLeaks begins publishing two million

In [16]:

# Define training arguments
training_args = TrainingArguments(
    output_dir=f"./{parameters['output_folder']}/results",  # Directory to save model checkpoints
    evaluation_strategy="epoch",
    learning_rate=parameters['learning_rate'],
    per_device_train_batch_size=parameters['batch_size'],
    per_device_eval_batch_size=parameters['batch_size'],
    num_train_epochs=parameters['epochs'],
    weight_decay=parameters['weight_decay'],
    save_strategy="epoch",
    logging_dir="./logs",
    logging_steps=10,
    optim=parameters['optimizer'],
    load_best_model_at_end=True,
    report_to="none",
    warmup_ratio=parameters['warmup_ratio'],
)

# Accuracy metric
metric = load('pearsonr')

def compute_metric(p):
    preds = p.predictions
    labels = p.label_ids
    return metric.compute(predictions=preds, references=labels)

# Define the trainer
trainer = Trainer(
    model=peft_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metric
)

# Train the model
#trainer.train(resume_from_checkpoint=True)
trainer.train()


# Save model
trainer.save_model(f"./{parameters['output_folder']}/model")



Epoch,Training Loss,Validation Loss,Pearsonr
1,0.7921,0.621271,0.85806
2,0.5946,0.496039,0.903894
3,0.5004,0.445352,0.907606
4,0.3828,0.392858,0.911693
5,0.6273,0.41876,0.908981
6,0.3,0.429684,0.915923
7,0.3124,0.500494,0.916262
8,0.3375,0.358964,0.917712
9,0.2871,0.394387,0.918848
10,0.2823,0.444584,0.913848


In [19]:
# Evaluate the model
results = trainer.evaluate()
print(results)

{'eval_loss': 0.3589639961719513, 'eval_pearsonr': 0.9177121449162163, 'eval_runtime': 6.4806, 'eval_samples_per_second': 111.873, 'eval_steps_per_second': 7.098, 'epoch': 40.0}


In [14]:
eval_dataset.data

MemoryMappedTable
sentence1: string
sentence2: string
label: float
idx: int32
input_ids: list<item: int32>
  child 0, item: int32
attention_mask: list<item: int8>
  child 0, item: int8
----
sentence1: [["Older man wearing beret with mountains in background.","Tan cow with ear tags on a grassy field.","A slow loris bites a persons fingers.","A black dog is running through the snow.","X'mas cruise passengers hit by Norovirus",...,"Glover spoke at a news conference that included about 20 relatives of the victims.","Syrian envoy to Cyprus defects to Qatar","Yes, dull dialog should be removed completely (or transformed to interesting dialog).","Like Chris said above, stars rotate to conserve their angular momentum.","Lakers Fire Coach Mike Brown After 1-4 Start"]]
sentence2: [["A sheep in the morning mist with trees in the background.","Many sheep standing on a green hill.","A small animal is chewing on a finger.","The black and brown dog is running through the snow.","Missing cruise passen

In [20]:
# Test set
test_set_results = trainer.predict(test_dataset)
test_set_results.metrics

{'test_loss': 0.3933914601802826,
 'test_pearsonr': 0.908201448121808,
 'test_runtime': 6.1971,
 'test_samples_per_second': 116.99,
 'test_steps_per_second': 7.423}

In [None]:
test_set_results.predictions.argmax(axis=1)

In [None]:
test_set_results.label_ids