In [12]:
%pip install transformers datasets evaluate accelerate peft tf-keras


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Note: you may need to restart the kernel to use updated packages.


# RoBERTa Base

In [2]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from datasets import load_dataset, concatenate_datasets
import torch
import random
import numpy as np
from peft import LoraModel, LoraConfig
from evaluate import load
import math

if torch.backends.mps.is_available():
    device = torch.device("mps")
elif torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(device)

mps


In [3]:
seed = 0
def set_seed(seed: int):
    # Set seed for Python's random module
    random.seed(seed)

    # Set seed for NumPy
    np.random.seed(seed)

    # Set seed for PyTorch
    torch.manual_seed(seed)

    # Set seed for CUDA (if using)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)  # For multi-GPU setups

    # Make PyTorch deterministic (this can slow down the computation)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

# Example of usage
set_seed(seed)


In [4]:
dataset = load_dataset("stanfordnlp/sst2")

Downloading readme:   0%|          | 0.00/5.27k [00:00<?, ?B/s]

Downloading data: 100%|██████████| 3.11M/3.11M [00:01<00:00, 2.59MB/s]
Downloading data: 100%|██████████| 72.8k/72.8k [00:00<00:00, 153kB/s]
Downloading data: 100%|██████████| 148k/148k [00:00<00:00, 368kB/s]


Generating train split:   0%|          | 0/67349 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/872 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1821 [00:00<?, ? examples/s]

In [5]:
dataset['train']

Dataset({
    features: ['idx', 'sentence', 'label'],
    num_rows: 67349
})

### Test set contains unlabeled data (-1). Hence, we will need to redo the trainig/evaluation/test split

In [8]:
print(f"Labels in the training set: {set(dataset['train']['label'])}")
print(f"Labels in the evaluation set: {set(dataset['validation']['label'])}")
print(f"Labels in the test set: {set(dataset['test']['label'])}")

Labels in the training set: {0, 1}
Labels in the evaluation set: {0, 1}
Labels in the test set: {-1}


### Combine validation / test sets

In [10]:
#combined_test_set = concatenate_datasets([dataset['test_matched'], dataset['test_mismatched']])

merged_dataset = concatenate_datasets([dataset['validation'], dataset['train']])

# Split into 80% train and 20% temp (for test + validation)
temp_split = merged_dataset.train_test_split(test_size=0.2, seed=seed)
train_dataset = temp_split["train"]
temp_dataset = temp_split["test"]

# Further split temp into 50% test and 50% validation
test_eval_split = temp_dataset.train_test_split(test_size=0.5, seed=seed)
test_dataset = test_eval_split["train"]
eval_dataset = test_eval_split["test"]


print(f"Labels in the test set after processing: {set(test_dataset['label'])}")

Labels in the test set after processing: {0, 1}


### Split: 80% Training data / 10% Evaluation / 10% Test

In [11]:
# Load Roberta Base
model_name = "roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
num_labels = len(set(dataset['train']['label']))
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)
print(num_labels)


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


2


In [12]:
parameters = {
    "output_folder": 'sst2',
    "sequence_length": 512,
    "epochs": 30,
    "batch_size": 16,
    "learning_rate": 5e-4,
    "weight_decay": 0.00,
    "warmup_ratio": 0.06,
    "optimizer": 'adamw_torch',
    "lora_alpha": 8,
    "lora_rank": 8,
    "lora_target_modules": ["query", "key", "value"],
    "lora_drop_out": 0.0
}


In [13]:
# Load "lora-fied" model
target_modules = ["query", "key", "value"]
config = LoraConfig(
    task_type="SEQ_CLS",
    r=parameters["lora_rank"],
    lora_alpha=parameters["lora_alpha"],
    target_modules=parameters["lora_target_modules"],
    lora_dropout=parameters["lora_drop_out"],
    init_lora_weights=True
)
#lora_model = LoraModel(model, config, "default")

from peft import  get_peft_model
peft_model = get_peft_model(model, config)

### We report the overall (matched and mismatched) accuracy for MNLI, Matthew’s correlation for CoLA, Pearson correlation for STS-B, and accuracy for other tasks.

In [14]:
dataset

DatasetDict({
    train: Dataset({
        features: ['idx', 'sentence', 'label'],
        num_rows: 67349
    })
    validation: Dataset({
        features: ['idx', 'sentence', 'label'],
        num_rows: 872
    })
    test: Dataset({
        features: ['idx', 'sentence', 'label'],
        num_rows: 1821
    })
})

In [18]:
dataset['validation'].data

MemoryMappedTable
idx: int32
sentence: string
label: int64
----
idx: [[0,1,2,3,4,...,867,868,869,870,871]]
sentence: [["it 's a charming and often affecting journey . ","unflinchingly bleak and desperate ","allows us to hope that nolan is poised to embark a major career as a commercial yet inventive filmmaker . ","the acting , costumes , music , cinematography and sound are all astounding given the production 's austere locales . ","it 's slow -- very , very slow . ",...,"has all the depth of a wading pool . ","a movie with a real anarchic flair . ","a subject like this should inspire reaction in its audience ; the pianist does not . ","... is an arthritic attempt at directing by callie khouri . ","looking aristocratic , luminous yet careworn in jane hamilton 's exemplary costumes , rampling gives a performance that could not be improved upon . ' "]]
label: [[1,0,1,1,0,...,0,1,0,0,1]]

In [19]:
# Tokenize the dataset
def preprocess_function(examples):
    return tokenizer(examples["sentence"], truncation=True, max_length=parameters['sequence_length'])

# Prepare datasets for training

# For debug purposes: Load only 1 % of the dataset
train_dataset_tokenized = dataset['train'].map(preprocess_function, batched=True).shuffle(seed=seed)
eval_dataset_tokenized = dataset['validation'].map(preprocess_function, batched=True).shuffle(seed=seed)
test_dataset_tokenized = dataset['test'].map(preprocess_function, batched=True)


#train_dataset_tokenized = train_dataset.map(preprocess_function, batched=True).shuffle(seed=seed)
#eval_dataset_tokenized = eval_dataset.map(preprocess_function, batched=True).shuffle(seed=seed)
#test_dataset_tokenized = test_dataset.map(preprocess_function, batched=True)


Map:   0%|          | 0/67349 [00:00<?, ? examples/s]

Map:   0%|          | 0/872 [00:00<?, ? examples/s]

Map:   0%|          | 0/1821 [00:00<?, ? examples/s]

In [21]:
# Define training arguments
training_args = TrainingArguments(
    output_dir=f"./{parameters['output_folder']}/results",  # Directory to save model checkpoints
    evaluation_strategy="epoch",
    learning_rate=parameters['learning_rate'],
    per_device_train_batch_size=parameters['batch_size'],
    per_device_eval_batch_size=parameters['batch_size'],
    num_train_epochs=parameters['epochs'],
    weight_decay=parameters['weight_decay'],
    save_strategy="epoch",
    logging_dir=f"./{parameters['output_folder']}/logs",
    logging_steps=10,
    optim=parameters['optimizer'],
    load_best_model_at_end=True,
    report_to="none",
    warmup_ratio=parameters['warmup_ratio'],
)

# Accuracy metric
accuracy_metric = load('accuracy')

def compute_accuracy(p):
    preds = p.predictions.argmax(axis=1)
    labels = p.label_ids
    return accuracy_metric.compute(predictions=preds, references=labels)

# Define the trainer
trainer = Trainer(
    model=peft_model,
    args=training_args,
    train_dataset=train_dataset_tokenized,
    eval_dataset=eval_dataset_tokenized,
    tokenizer=tokenizer,
    compute_metrics=compute_accuracy
)

# Train the model
#trainer.train(resume_from_checkpoint=True)
trainer.train()


# Save model
trainer.save_model(f"./{parameters['output_folder']}/model")

  trainer = Trainer(


  0%|          | 0/126300 [00:00<?, ?it/s]

{'loss': 0.6931, 'grad_norm': 2.2929728031158447, 'learning_rate': 6.598046978094484e-07, 'epoch': 0.0}
{'loss': 0.6759, 'grad_norm': 1.9103926420211792, 'learning_rate': 1.3196093956188968e-06, 'epoch': 0.0}


KeyboardInterrupt: 

In [None]:
# Evaluate model
results = trainer.evaluate()
print(results)

{'eval_loss': 0.45516806840896606,
 'eval_accuracy': 0.2,
 'eval_runtime': 0.1428,
 'eval_samples_per_second': 35.012,
 'eval_steps_per_second': 7.002,
 'epoch': 1.0}

In [None]:
# Test set
test_set_results = trainer.predict(test_dataset)
test_set_results.metrics

  0%|          | 0/2700 [00:00<?, ?it/s]

{'test_loss': 0.4476810693740845,
 'test_accuracy': 0.321025949674761,
 'test_runtime': 905.1736,
 'test_samples_per_second': 47.725,
 'test_steps_per_second': 2.983}

In [None]:
test_set_results.predictions.argmax(axis=1)

array([1, 1, 1, ..., 1, 1, 1])

In [None]:
test_set_results.label_ids

array([2, 0, 1, ..., 2, 2, 0])