In [2]:
%pip install transformers datasets evaluate accelerate peft tf-keras


Collecting datasets
  Using cached datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting evaluate
  Using cached evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting accelerate
  Using cached accelerate-1.2.1-py3-none-any.whl.metadata (19 kB)
Collecting peft
  Using cached peft-0.14.0-py3-none-any.whl.metadata (13 kB)
Collecting tf-keras
  Using cached tf_keras-2.18.0-py3-none-any.whl.metadata (1.6 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Using cached dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Using cached xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Using cached multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting tensorflow<2.19,>=2.18 (from tf-keras)
  Using cached tensorflow-2.18.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.1 kB)
Collecting tensorboard<2.19,>=2.18 (from tensorflow<2.1

# RoBERTa Base /COLA

In [3]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from datasets import load_dataset, concatenate_datasets
import torch
import random
import numpy as np
from peft import LoraModel, LoraConfig
from evaluate import load
import math

if torch.backends.mps.is_available():
    device = torch.device("mps")
elif torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(device)

2025-01-05 19:11:53.125134: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1736104313.140187     251 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1736104313.144691     251 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-01-05 19:11:53.164599: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


cuda


In [4]:
seed = 0
def set_seed(seed: int):
    # Set seed for Python's random module
    random.seed(seed)

    # Set seed for NumPy
    np.random.seed(seed)

    # Set seed for PyTorch
    torch.manual_seed(seed)

    # Set seed for CUDA (if using)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)  # For multi-GPU setups

    # Make PyTorch deterministic (this can slow down the computation)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

# Example of usage
set_seed(seed)


In [5]:
dataset = load_dataset("glue", "cola")

In [21]:
dataset['train']

Dataset({
    features: ['sentence', 'label', 'idx'],
    num_rows: 8551
})

### Test set contains unlabeled data (-1). Hence, we will need to redo the trainig/evaluation/test split

In [6]:
print(f"Labels in the training set: {set(dataset['train']['label'])}")
print(f"Labels in the evaluation set: {set(dataset['validation']['label'])}")
print(f"Labels in the test set: {set(dataset['test']['label'])}")

Labels in the training set: {0, 1}
Labels in the evaluation set: {0, 1}
Labels in the test set: {-1}


### Combine validation / test sets

In [7]:
#combined_test_set = concatenate_datasets([dataset['test_matched'], dataset['test_mismatched']])

merged_dataset = concatenate_datasets([dataset['validation'], dataset['train']])

# Split into 80% train and 20% temp (for test + validation)
temp_split = merged_dataset.train_test_split(test_size=0.2, seed=seed)
train_dataset = temp_split["train"]
temp_dataset = temp_split["test"]

# Further split temp into 50% test and 50% validation
test_eval_split = temp_dataset.train_test_split(test_size=0.5, seed=seed)
test_dataset = test_eval_split["train"]
eval_dataset = test_eval_split["test"]


print(f"Labels in the test set after processing: {set(test_dataset['label'])}")

Labels in the test set after processing: {0, 1}


### Split: 80% Training data / 10% Evaluation / 10% Test

In [8]:
# Load Roberta Base
model_name = "roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
num_labels = len(set(dataset['train']['label']))
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)
print(num_labels)


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


2


In [9]:
parameters = {
    "output_folder": 'cola',
    "sequence_length": 512,
    "epochs": 80,
    "batch_size": 32,
    "learning_rate": 4e-4,
    "weight_decay": 0.00,
    "warmup_ratio": 0.06,
    "optimizer": 'adamw_torch',
    "lora_alpha": 8,
    "lora_rank": 8,
    "lora_target_modules": ["query", "key", "value"],
    "lora_drop_out": 0.0
}


In [10]:
# Load "lora-fied" model
target_modules = ["query", "key", "value"]
config = LoraConfig(
    task_type="SEQ_CLS",
    r=parameters["lora_rank"],
    lora_alpha=parameters["lora_alpha"],
    target_modules=parameters["lora_target_modules"],
    lora_dropout=parameters["lora_drop_out"],
    init_lora_weights=True
)
#lora_model = LoraModel(model, config, "default")

from peft import  get_peft_model
peft_model = get_peft_model(model, config)

### We report the overall (matched and mismatched) accuracy for MNLI, Matthew’s correlation for CoLA, Pearson correlation for STS-B, and accuracy for other tasks.

In [12]:
dataset

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 8551
    })
    validation: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 1043
    })
    test: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 1063
    })
})

In [13]:
dataset['validation'].data

MemoryMappedTable
sentence: string
label: int64
idx: int32
----
sentence: [["The sailors rode the breeze clear of the rocks.","The weights made the rope stretch over the pulley.","The mechanical doll wriggled itself loose.","If you had eaten more, you would want less.","As you eat the most, you want the least.",...,"John counted on Bill to get there on time.","I bought Bill a book to read.","John told Mary that it would be important to leave early.","John told Mary that it was important to Fred to leave early.","John, told Mary that it would be appropriate to leave together."],["the election of John president surprised me.","John's arriving dead surprised me.","the attempt by John to leave surprised me.","John left orders to follow Pete.","John left us orders to follow Pete.",...,"John considers Bill silly.","John considers Bill to be silly.","John bought a dog for himself to play with.","John arranged for himself to get the prize.","John talked to Bill about himself."]]
label: [[1,1,1

In [11]:
# Tokenize the dataset
def preprocess_function(examples):
    return tokenizer(examples["sentence"], truncation=True, max_length=parameters['sequence_length'])

# Prepare datasets for training

# For debug purposes: Load only 1 % of the dataset
#train_dataset_tokenized = dataset['train'].map(preprocess_function, batched=True).shuffle(seed=seed)
#eval_dataset_tokenized = dataset['validation'].map(preprocess_function, batched=True).shuffle(seed=seed)
#test_dataset_tokenized = dataset['test'].map(preprocess_function, batched=True)


train_dataset_tokenized = train_dataset.map(preprocess_function, batched=True).shuffle(seed=seed)
eval_dataset_tokenized = eval_dataset.map(preprocess_function, batched=True).shuffle(seed=seed)
test_dataset_tokenized = test_dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/960 [00:00<?, ? examples/s]

In [15]:
train_dataset_tokenized[0]

{'sentence': 'The captain sank the boat.',
 'label': 1,
 'idx': 6145,
 'input_ids': [0, 133, 3449, 14501, 5, 4293, 4, 2],
 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1]}

In [12]:
# Define training arguments
training_args = TrainingArguments(
    output_dir=f"./{parameters['output_folder']}/results",  # Directory to save model checkpoints
    evaluation_strategy="epoch",
    learning_rate=parameters['learning_rate'],
    per_device_train_batch_size=parameters['batch_size'],
    per_device_eval_batch_size=parameters['batch_size'],
    num_train_epochs=parameters['epochs'],
    weight_decay=parameters['weight_decay'],
    save_strategy="epoch",
    logging_dir=f"./{parameters['output_folder']}/logs",
    logging_steps=10,
    optim=parameters['optimizer'],
    load_best_model_at_end=True,
    report_to="none",
    warmup_ratio=parameters['warmup_ratio'],
)

# Accuracy metric
#accuracy_metric = load('accuracy')

# Matthews correlation metric
metric = load("matthews_correlation")

def compute_metric(p):
    preds = p.predictions.argmax(axis=1)
    labels = p.label_ids
    return metric.compute(predictions=preds, references=labels)

# Define the trainer
trainer = Trainer(
    model=peft_model,
    args=training_args,
    train_dataset=train_dataset_tokenized,
    eval_dataset=eval_dataset_tokenized,
    tokenizer=tokenizer,
    compute_metrics=compute_metric
)

# Train the model
#trainer.train(resume_from_checkpoint=True)
trainer.train()


# Save model
trainer.save_model(f"./{parameters['output_folder']}/model")



Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.6177,0.590456,0.0
2,0.424,0.443403,0.526008
3,0.4129,0.435298,0.56307
4,0.4052,0.417291,0.551943
5,0.3547,0.476537,0.522092
6,0.2656,0.446542,0.592966
7,0.332,0.431125,0.619447
8,0.232,0.471423,0.600295
9,0.2309,0.450983,0.568879
10,0.2103,0.551051,0.596224


In [13]:
# Evaluate model
results = trainer.evaluate()
print(results)

{'eval_loss': 0.4172908067703247, 'eval_matthews_correlation': 0.55194268929432, 'eval_runtime': 0.4282, 'eval_samples_per_second': 2241.782, 'eval_steps_per_second': 70.056, 'epoch': 80.0}


In [14]:
# Test set
test_set_results = trainer.predict(test_dataset_tokenized)
test_set_results.metrics

{'test_loss': 0.40766894817352295,
 'test_matthews_correlation': 0.6024931941418374,
 'test_runtime': 0.4415,
 'test_samples_per_second': 2172.178,
 'test_steps_per_second': 67.951}

In [15]:
test_dataset_tokenized[0]

{'sentence': 'I shaped a loaf from the dough.',
 'label': 0,
 'idx': 2952,
 'input_ids': [0, 100, 14216, 10, 34998, 31, 5, 14397, 4, 2],
 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}