In [8]:
%pip install transformers datasets evaluate accelerate peft tf-keras


Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.3.1[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip3 install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


# RoBERTa Base

In [9]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from datasets import load_dataset, concatenate_datasets
import torch
import random
import numpy as np
from peft import LoraModel, LoraConfig
from evaluate import load
import math

if torch.backends.mps.is_available():
    device = torch.device("mps")
elif torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(device)

mps


In [10]:
seed = 0
def set_seed(seed: int):
    # Set seed for Python's random module
    random.seed(seed)

    # Set seed for NumPy
    np.random.seed(seed)

    # Set seed for PyTorch
    torch.manual_seed(seed)

    # Set seed for CUDA (if using)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)  # For multi-GPU setups

    # Make PyTorch deterministic (this can slow down the computation)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

# Example of usage
set_seed(seed)


In [11]:
dataset = load_dataset("glue", "qnli")

In [12]:
dataset['train']

Dataset({
    features: ['question', 'sentence', 'label', 'idx'],
    num_rows: 104743
})

### Test set contains unlabeled data (-1). Hence, we will need to redo the trainig/evaluation/test split

In [13]:
print(f"Labels in the training set: {set(dataset['train']['label'])}")
print(f"Labels in the evaluation set: {set(dataset['validation']['label'])}")
print(f"Labels in the test set: {set(dataset['test']['label'])}")

Labels in the training set: {0, 1}
Labels in the evaluation set: {0, 1}
Labels in the test set: {-1}


### Combine validation / test sets

In [14]:
#combined_test_set = concatenate_datasets([dataset['test_matched'], dataset['test_mismatched']])

merged_dataset = concatenate_datasets([dataset['validation'], dataset['train']])

# Split into 80% train and 20% temp (for test + validation)
temp_split = merged_dataset.train_test_split(test_size=0.2, seed=seed)
train_dataset = temp_split["train"]
temp_dataset = temp_split["test"]

# Further split temp into 50% test and 50% validation
test_eval_split = temp_dataset.train_test_split(test_size=0.5, seed=seed)
test_dataset = test_eval_split["train"]
eval_dataset = test_eval_split["test"]


print(f"Labels in the test set after processing: {set(test_dataset['label'])}")

Labels in the test set after processing: {0, 1}


### Split: 80% Training data / 10% Evaluation / 10% Test

In [15]:
# Load Roberta Base
model_name = "roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
num_labels = len(set(dataset['train']['label']))
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)
print(num_labels)


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


2


In [16]:
parameters = {
    "output_folder": 'qnli',
    "sequence_length": 512,
    "epochs": 30,
    "batch_size": 16,
    "learning_rate": 5e-4,
    "weight_decay": 0.00,
    "warmup_ratio": 0.06,
    "optimizer": 'adamw_torch',
    "lora_alpha": 8,
    "lora_rank": 8,
    "lora_target_modules": ["query", "key", "value"],
    "lora_drop_out": 0.0
}


In [17]:
# Load "lora-fied" model
target_modules = ["query", "key", "value"]
config = LoraConfig(
    task_type="SEQ_CLS",
    r=parameters["lora_rank"],
    lora_alpha=parameters["lora_alpha"],
    target_modules=parameters["lora_target_modules"],
    lora_dropout=parameters["lora_drop_out"],
    init_lora_weights=True
)
#lora_model = LoraModel(model, config, "default")

from peft import  get_peft_model
peft_model = get_peft_model(model, config)

### We report the overall (matched and mismatched) accuracy for MNLI, Matthew’s correlation for CoLA, Pearson correlation for STS-B, and accuracy for other tasks.

In [18]:
dataset

DatasetDict({
    train: Dataset({
        features: ['question', 'sentence', 'label', 'idx'],
        num_rows: 104743
    })
    validation: Dataset({
        features: ['question', 'sentence', 'label', 'idx'],
        num_rows: 5463
    })
    test: Dataset({
        features: ['question', 'sentence', 'label', 'idx'],
        num_rows: 5463
    })
})

In [19]:
dataset['validation'].data

MemoryMappedTable
question: string
sentence: string
label: int64
idx: int32
----
question: [["What came into force after the new constitution was herald?","What is the first major city in the stream of the Rhine?","What is the minimum required if you want to teach in Canada?","How was Temüjin kept imprisoned by the Tayichi'ud?","What did Herr Gott, dich loben wir become known as ?",...,"Which former referee served as an analyst for CBS?","The papers of which famous English Victorian author are collected in the library?","When was the Old Truman Brewery founded?","What did Tesla work on in 1888?","How high do plague fevers run?"],["How many more landing sites for the Apollo missions did NASA have planned?","What is issued once construction is complete and a final inspection has been passed?","Why did Tesla want funds from Morgan?","Where does the largest part of Kenya's power come from?","In the mid 18th century, who did not concur that 1 should be the first prime number?",...,"Who was 

In [21]:
# Tokenize the dataset
def preprocess_function(examples):
    return tokenizer(examples['question'],examples["sentence"], truncation=True, max_length=parameters['sequence_length'])

# Prepare datasets for training

# For debug purposes: Load only 1 % of the dataset
#train_dataset_tokenized = dataset['train'].map(preprocess_function, batched=True).shuffle(seed=seed)
#eval_dataset_tokenized = dataset['validation'].map(preprocess_function, batched=True).shuffle(seed=seed)
#test_dataset_tokenized = dataset['test'].map(preprocess_function, batched=True)


train_dataset_tokenized = train_dataset.map(preprocess_function, batched=True).shuffle(seed=seed)
eval_dataset_tokenized = eval_dataset.map(preprocess_function, batched=True).shuffle(seed=seed)
test_dataset_tokenized = test_dataset.map(preprocess_function, batched=True)


Map:   0%|          | 0/88164 [00:00<?, ? examples/s]

Map:   0%|          | 0/11021 [00:00<?, ? examples/s]

Map:   0%|          | 0/11021 [00:00<?, ? examples/s]

In [24]:
# Define training arguments
training_args = TrainingArguments(
    output_dir=f"./{parameters['output_folder']}/results",  # Directory to save model checkpoints
    evaluation_strategy="epoch",
    learning_rate=parameters['learning_rate'],
    per_device_train_batch_size=parameters['batch_size'],
    per_device_eval_batch_size=parameters['batch_size'],
    num_train_epochs=parameters['epochs'],
    weight_decay=parameters['weight_decay'],
    save_strategy="epoch",
    logging_dir=f"./{parameters['output_folder']}/logs",
    logging_steps=10,
    optim=parameters['optimizer'],
    load_best_model_at_end=True,
    report_to="none",
    warmup_ratio=parameters['warmup_ratio'],
)

# Accuracy metric
metric = load('accuracy')

# Matthews correlation metric
#metric = load("matthews_correlation")

def compute_metric(p):
    preds = p.predictions.argmax(axis=1)
    labels = p.label_ids
    return metric.compute(predictions=preds, references=labels)

# Define the trainer
trainer = Trainer(
    model=peft_model,
    args=training_args,
    train_dataset=train_dataset_tokenized,
    eval_dataset=eval_dataset_tokenized,
    tokenizer=tokenizer,
    compute_metrics=compute_metric
)

# Train the model
#trainer.train(resume_from_checkpoint=True)
trainer.train()


# Save model
trainer.save_model(f"./{parameters['output_folder']}/model")

  trainer = Trainer(


  0%|          | 0/165330 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [28]:
# Evaluate model
results = trainer.evaluate()
print(results)

  0%|          | 0/66 [00:00<?, ?it/s]

{'eval_loss': 0.45087283849716187, 'eval_matthews_correlation': 0.5293023756875906, 'eval_runtime': 13.4779, 'eval_samples_per_second': 77.386, 'eval_steps_per_second': 4.897, 'epoch': 1.0}


In [29]:
# Test set
test_set_results = trainer.predict(test_dataset_tokenized)
test_set_results.metrics

  0%|          | 0/67 [00:00<?, ?it/s]

{'test_loss': 0.0,
 'test_matthews_correlation': 0.0,
 'test_runtime': 15.2682,
 'test_samples_per_second': 69.622,
 'test_steps_per_second': 4.388}

In [None]:
test_set_results.predictions.argmax(axis=1)

array([1, 1, 1, ..., 1, 1, 1])

In [None]:
test_set_results.label_ids

array([2, 0, 1, ..., 2, 2, 0])