In [1]:
%pip install transformers datasets evaluate accelerate peft tf-keras


Collecting datasets
  Using cached datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting evaluate
  Using cached evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting accelerate
  Using cached accelerate-1.2.1-py3-none-any.whl.metadata (19 kB)
Collecting peft
  Using cached peft-0.14.0-py3-none-any.whl.metadata (13 kB)
Collecting tf-keras
  Using cached tf_keras-2.18.0-py3-none-any.whl.metadata (1.6 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Using cached dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Using cached xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Using cached multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting tensorflow<2.19,>=2.18 (from tf-keras)
  Using cached tensorflow-2.18.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.1 kB)
Collecting tensorboard<2.19,>=2.18 (from tensorflow<2.1

# RoBERTa Base

In [2]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from datasets import load_dataset, concatenate_datasets
import torch
import random
import numpy as np
from peft import LoraModel, LoraConfig
from evaluate import load
import math

if torch.backends.mps.is_available():
    device = torch.device("mps")
elif torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(device)

2025-01-06 17:57:32.174053: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1736186252.187997     434 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1736186252.192184     434 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-01-06 17:57:32.209200: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


cuda


In [8]:
seed = 40
def set_seed(seed: int):
    # Set seed for Python's random module
    random.seed(seed)

    # Set seed for NumPy
    np.random.seed(seed)

    # Set seed for PyTorch
    torch.manual_seed(seed)

    # Set seed for CUDA (if using)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)  # For multi-GPU setups

    # Make PyTorch deterministic (this can slow down the computation)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

# Example of usage
set_seed(seed)


In [4]:
dataset = load_dataset("glue", "qnli")

In [8]:
dataset['train']

Dataset({
    features: ['sentence', 'label', 'idx'],
    num_rows: 8551
})

### Test set contains unlabeled data (-1). Hence, we will need to redo the trainig/evaluation/test split

In [9]:
print(f"Labels in the training set: {set(dataset['train']['label'])}")
print(f"Labels in the evaluation set: {set(dataset['validation']['label'])}")
print(f"Labels in the test set: {set(dataset['test']['label'])}")

Labels in the training set: {0, 1}
Labels in the evaluation set: {0, 1}
Labels in the test set: {-1}


### Combine validation / test sets

In [10]:
#combined_test_set = concatenate_datasets([dataset['test_matched'], dataset['test_mismatched']])

merged_dataset = concatenate_datasets([dataset['validation'], dataset['train']])

# Split into 80% train and 20% temp (for test + validation)
temp_split = merged_dataset.train_test_split(test_size=0.2, seed=seed)
train_dataset = temp_split["train"]
temp_dataset = temp_split["test"]

# Further split temp into 50% test and 50% validation
test_eval_split = temp_dataset.train_test_split(test_size=0.5, seed=seed)
test_dataset = test_eval_split["train"]
eval_dataset = test_eval_split["test"]


print(f"Labels in the test set after processing: {set(test_dataset['label'])}")

Labels in the test set after processing: {0, 1}


### Split: 80% Training data / 10% Evaluation / 10% Test

In [13]:
# Load Roberta Base
model_name = "roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
num_labels = len(set(dataset['train']['label']))
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)
print(num_labels)


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


2


In [14]:
parameters = {
    "output_folder": 'qnli',
    "sequence_length": 512,
    "epochs": 25,
    "batch_size": 32,
    "learning_rate": 4e-4,
    "weight_decay": 0.00,
    "warmup_ratio": 0.06,
    "optimizer": 'adamw_torch',
    "lora_alpha": 8,
    "lora_rank": 8,
    "lora_target_modules": ["query", "key", "value"],
    "lora_drop_out": 0.0
}


In [15]:
# Load "lora-fied" model
target_modules = ["query", "key", "value"]
config = LoraConfig(
    task_type="SEQ_CLS",
    r=parameters["lora_rank"],
    lora_alpha=parameters["lora_alpha"],
    target_modules=parameters["lora_target_modules"],
    lora_dropout=parameters["lora_drop_out"],
    init_lora_weights=True
)
#lora_model = LoraModel(model, config, "default")

from peft import  get_peft_model
peft_model = get_peft_model(model, config)

In [11]:
dataset

DatasetDict({
    train: Dataset({
        features: ['question', 'sentence', 'label', 'idx'],
        num_rows: 104743
    })
    validation: Dataset({
        features: ['question', 'sentence', 'label', 'idx'],
        num_rows: 5463
    })
    test: Dataset({
        features: ['question', 'sentence', 'label', 'idx'],
        num_rows: 5463
    })
})

In [18]:
dataset['validation'].data

MemoryMappedTable
idx: int32
sentence: string
label: int64
----
idx: [[0,1,2,3,4,...,867,868,869,870,871]]
sentence: [["it 's a charming and often affecting journey . ","unflinchingly bleak and desperate ","allows us to hope that nolan is poised to embark a major career as a commercial yet inventive filmmaker . ","the acting , costumes , music , cinematography and sound are all astounding given the production 's austere locales . ","it 's slow -- very , very slow . ",...,"has all the depth of a wading pool . ","a movie with a real anarchic flair . ","a subject like this should inspire reaction in its audience ; the pianist does not . ","... is an arthritic attempt at directing by callie khouri . ","looking aristocratic , luminous yet careworn in jane hamilton 's exemplary costumes , rampling gives a performance that could not be improved upon . ' "]]
label: [[1,0,1,1,0,...,0,1,0,0,1]]

In [16]:
# Tokenize the dataset
def preprocess_function(examples):
    return tokenizer(examples["sentence"], truncation=True, max_length=parameters['sequence_length'])

# Prepare datasets for training

# For debug purposes: Load only 1 % of the dataset
#train_dataset_tokenized = dataset['train'].map(preprocess_function, batched=True).shuffle(seed=seed)
#eval_dataset_tokenized = dataset['validation'].map(preprocess_function, batched=True).shuffle(seed=seed)
#test_dataset_tokenized = dataset['test'].map(preprocess_function, batched=True)


train_dataset_tokenized = train_dataset.map(preprocess_function, batched=True).shuffle(seed=seed)
eval_dataset_tokenized = eval_dataset.map(preprocess_function, batched=True).shuffle(seed=seed)
test_dataset_tokenized = test_dataset.map(preprocess_function, batched=True)


Map:   0%|          | 0/88164 [00:00<?, ? examples/s]

Map:   0%|          | 0/11021 [00:00<?, ? examples/s]

Map:   0%|          | 0/11021 [00:00<?, ? examples/s]

In [None]:
# Define training arguments
training_args = TrainingArguments(
    output_dir=f"./{parameters['output_folder']}/results",  # Directory to save model checkpoints
    evaluation_strategy="epoch",
    learning_rate=parameters['learning_rate'],
    per_device_train_batch_size=parameters['batch_size'],
    per_device_eval_batch_size=parameters['batch_size'],
    num_train_epochs=parameters['epochs'],
    weight_decay=parameters['weight_decay'],
    save_strategy="epoch",
    logging_dir=f"./{parameters['output_folder']}/logs",
    logging_steps=10,
    optim=parameters['optimizer'],
    load_best_model_at_end=True,
    report_to="none",
    warmup_ratio=parameters['warmup_ratio'],
)

# Accuracy metric
metric = load('accuracy')

# Matthews correlation metric
#metric = load("matthews_correlation")

def compute_metric(p):
    preds = p.predictions.argmax(axis=1)
    labels = p.label_ids
    return metric.compute(predictions=preds, references=labels)

# Define the trainer
trainer = Trainer(
    model=peft_model,
    args=training_args,
    train_dataset=train_dataset_tokenized,
    eval_dataset=eval_dataset_tokenized,
    tokenizer=tokenizer,
    compute_metrics=compute_metric
)

# Train the model
#trainer.train(resume_from_checkpoint=True)
trainer.train()


# Save model
trainer.save_model(f"./{parameters['output_folder']}/model")



Epoch,Training Loss,Validation Loss,Accuracy
1,0.6402,0.645038,0.624898
2,0.6381,0.651646,0.626985
3,0.6457,0.63768,0.633609
4,0.6574,0.655583,0.624989
5,0.5998,0.649361,0.624716
6,0.5954,0.667135,0.615643


IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



In [None]:
# Evaluate model
results = trainer.evaluate()
print(results)

In [None]:
# Test set
test_set_results = trainer.predict(test_dataset_tokenized)
test_set_results.metrics

In [None]:
test_set_results.predictions.argmax(axis=1)

array([1, 1, 1, ..., 1, 1, 1])

In [None]:
test_set_results.label_ids

array([2, 0, 1, ..., 2, 2, 0])