In [1]:
%pip install transformers datasets accelerate peft


Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.3.1[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip3 install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


# RoBERTa Base

In [2]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset
import torch
import random
import numpy as np
from peft import LoraModel, LoraConfig

if torch.backends.mps.is_available():
    device = torch.device("mps")
elif torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(device)

mps


In [3]:
seed = 0
def set_seed(seed: int):
    # Set seed for Python's random module
    random.seed(seed)

    # Set seed for NumPy
    np.random.seed(seed)

    # Set seed for PyTorch
    torch.manual_seed(seed)

    # Set seed for CUDA (if using)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)  # For multi-GPU setups

    # Make PyTorch deterministic (this can slow down the computation)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

# Example of usage
set_seed(seed)


In [4]:
dataset = load_dataset("glue", "qnli")

In [None]:
dataset

DatasetDict({
    train: Dataset({
        features: ['question', 'sentence', 'label', 'idx'],
        num_rows: 104743
    })
    validation: Dataset({
        features: ['question', 'sentence', 'label', 'idx'],
        num_rows: 5463
    })
    test: Dataset({
        features: ['question', 'sentence', 'label', 'idx'],
        num_rows: 5463
    })
})

In [7]:
# Load Roberta Base
model_name = "roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
model

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
         

In [97]:
target_modules = ["query", "key", "value"]
config = LoraConfig(
    task_type="SEQ_CLS",
    r=8,
    lora_alpha=8,
    target_modules=target_modules,
    lora_dropout=0.01,
    init_lora_weights=True
)
#lora_model = LoraModel(model, config, "default")

In [98]:
from peft import  get_peft_model
peft_model = get_peft_model(model, config)

In [99]:
### To be removed. Initialize weights to 0.
for layer in peft_model.base_model.roberta.encoder.layer:
    lora_A = layer.attention.self.query.lora_A["default"]
    lora_B = layer.attention.self.query.lora_B["default"]

    torch.nn.init.constant_(lora_A.weight, 0.1)
    torch.nn.init.constant_(lora_B.weight, 0.1)


### To be removed. Just for checking if lora matrices are filled after training
peft_model.base_model.roberta.encoder.layer[0].attention.self.query.lora_A.default.weight[0]

tensor([0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000,
        0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000,
        0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000,
        0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000,
        0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000,
        0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000,
        0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000,
        0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000,
        0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000,
        0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000,
        0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000,
        0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000,
        0.1000, 0.1000, 0.1000, 0.1000, 

In [48]:
peft_model

PeftModelForSequenceClassification(
  (base_model): LoraModel(
    (model): RobertaForSequenceClassification(
      (roberta): RobertaModel(
        (embeddings): RobertaEmbeddings(
          (word_embeddings): Embedding(50265, 768, padding_idx=1)
          (position_embeddings): Embedding(514, 768, padding_idx=1)
          (token_type_embeddings): Embedding(1, 768)
          (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (encoder): RobertaEncoder(
          (layer): ModuleList(
            (0-11): 12 x RobertaLayer(
              (attention): RobertaAttention(
                (self): RobertaSdpaSelfAttention(
                  (query): lora.Linear(
                    (base_layer): Linear(in_features=768, out_features=768, bias=True)
                    (lora_dropout): ModuleDict(
                      (default): Dropout(p=0.01, inplace=False)
                    )
                    (lora_A): Mo

In [None]:
# Check that attention module has been substituded by lora
peft_model.print_trainable_parameters

<bound method PeftModel.print_trainable_parameters of PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): RobertaForSequenceClassification(
      (roberta): RobertaModel(
        (embeddings): RobertaEmbeddings(
          (word_embeddings): Embedding(50265, 768, padding_idx=1)
          (position_embeddings): Embedding(514, 768, padding_idx=1)
          (token_type_embeddings): Embedding(1, 768)
          (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (encoder): RobertaEncoder(
          (layer): ModuleList(
            (0-11): 12 x RobertaLayer(
              (attention): RobertaAttention(
                (self): RobertaSdpaSelfAttention(
                  (query): lora.Linear(
                    (base_layer): Linear(in_features=768, out_features=768, bias=True)
                    (lora_dropout): ModuleDict(
                      (default): Dropout(p=0.01, inplace=False)
              

In [12]:
from evaluate import load
import math

In [None]:
# Tokenize the dataset
def preprocess_function(examples):
    return tokenizer(examples["question"], examples["sentence"], truncation=True, padding="max_length", max_length=128)

encoded_dataset = dataset.map(preprocess_function, batched=True)

# Prepare datasets for training
train_dataset = encoded_dataset["train"].select((range(math.ceil(len(encoded_dataset['train']) * 0.01))))
val_dataset = encoded_dataset["validation"].select((range(math.ceil(len(encoded_dataset['train']) * 0.01))))

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",  # Directory to save model checkpoints
    evaluation_strategy="epoch",
   # learning_rate=2e-5,
    learning_rate=1, # Set high learning rate such that lora matrices are updated
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=1,
    weight_decay=0.01,
    save_strategy="epoch",
    logging_dir="./logs",
    logging_steps=10,
    load_best_model_at_end=True,
    report_to="none"
)

# Accuracy metric
accuracy_metric = load('accuracy')

def compute_accuracy(p):
    preds = p.predictions.argmax(axis=1)
    labels = p.label_ids
    return accuracy_metric.compute(predictions=preds, references=labels)

# Define the trainer
trainer = Trainer(
    model=peft_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_accuracy
)

# Train the model
trainer.train()

# Evaluate the model
results = trainer.evaluate()
print(results)

  trainer = Trainer(


  0%|          | 0/66 [00:00<?, ?it/s]

{'loss': 236.225, 'grad_norm': 27.045738220214844, 'learning_rate': 0.8484848484848485, 'epoch': 0.15}
{'loss': 252.7661, 'grad_norm': 20.04033088684082, 'learning_rate': 0.696969696969697, 'epoch': 0.3}
{'loss': 149.3725, 'grad_norm': 22.198265075683594, 'learning_rate': 0.5454545454545454, 'epoch': 0.45}
{'loss': 111.466, 'grad_norm': 13.189260482788086, 'learning_rate': 0.3939393939393939, 'epoch': 0.61}
{'loss': 42.4818, 'grad_norm': 4.23345422744751, 'learning_rate': 0.24242424242424243, 'epoch': 0.76}
{'loss': 24.9342, 'grad_norm': 8.656357765197754, 'learning_rate': 0.09090909090909091, 'epoch': 0.91}


  0%|          | 0/66 [00:00<?, ?it/s]

{'eval_loss': 6.260356426239014, 'eval_accuracy': 0.4818702290076336, 'eval_runtime': 43.976, 'eval_samples_per_second': 23.831, 'eval_steps_per_second': 1.501, 'epoch': 1.0}
{'train_runtime': 158.0842, 'train_samples_per_second': 6.629, 'train_steps_per_second': 0.417, 'train_loss': 125.51159078424627, 'epoch': 1.0}


  0%|          | 0/66 [00:00<?, ?it/s]

{'eval_loss': 6.260356426239014, 'eval_accuracy': 0.4818702290076336, 'eval_runtime': 42.859, 'eval_samples_per_second': 24.452, 'eval_steps_per_second': 1.54, 'epoch': 1.0}


In [64]:
peft_model.base_model.roberta.encoder

RobertaEncoder(
  (layer): ModuleList(
    (0-11): 12 x RobertaLayer(
      (attention): RobertaAttention(
        (self): RobertaSdpaSelfAttention(
          (query): lora.Linear(
            (base_layer): Linear(in_features=768, out_features=768, bias=True)
            (lora_dropout): ModuleDict(
              (default): Dropout(p=0.01, inplace=False)
            )
            (lora_A): ModuleDict(
              (default): Linear(in_features=768, out_features=8, bias=False)
            )
            (lora_B): ModuleDict(
              (default): Linear(in_features=8, out_features=768, bias=False)
            )
            (lora_embedding_A): ParameterDict()
            (lora_embedding_B): ParameterDict()
            (lora_magnitude_vector): ModuleDict()
          )
          (key): lora.Linear(
            (base_layer): Linear(in_features=768, out_features=768, bias=True)
            (lora_dropout): ModuleDict(
              (default): Dropout(p=0.01, inplace=False)
            )
   

In [None]:
trainer.model.base

PeftModelForSequenceClassification(
  (base_model): LoraModel(
    (model): RobertaForSequenceClassification(
      (roberta): RobertaModel(
        (embeddings): RobertaEmbeddings(
          (word_embeddings): Embedding(50265, 768, padding_idx=1)
          (position_embeddings): Embedding(514, 768, padding_idx=1)
          (token_type_embeddings): Embedding(1, 768)
          (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (encoder): RobertaEncoder(
          (layer): ModuleList(
            (0-11): 12 x RobertaLayer(
              (attention): RobertaAttention(
                (self): RobertaSdpaSelfAttention(
                  (query): lora.Linear(
                    (base_layer): Linear(in_features=768, out_features=768, bias=True)
                    (lora_dropout): ModuleDict(
                      (default): Dropout(p=0.01, inplace=False)
                    )
                    (lora_A): Mo

In [101]:
peft_model.base_model.roberta.encoder.layer[0].attention.self.query.lora_A.default.weight

Parameter containing:
tensor([[-3.0613, -4.4935, -0.2736,  ..., -5.5904, -3.7744,  4.0617],
        [-3.0613, -4.4935, -0.2736,  ..., -5.5904, -3.7744,  4.0617],
        [-3.0613, -4.4935, -0.2736,  ..., -5.5904, -3.7744,  4.0617],
        ...,
        [-3.0613, -4.4935, -0.2736,  ..., -5.5904, -3.7744,  4.0617],
        [-3.0613, -4.4935, -0.2736,  ..., -5.5904, -3.7744,  4.0617],
        [-3.0613, -4.4935, -0.2736,  ..., -5.5904, -3.7744,  4.0617]],
       device='mps:0', requires_grad=True)

In [85]:
peft_model.base_model

LoraModel(
  (model): RobertaForSequenceClassification(
    (roberta): RobertaModel(
      (embeddings): RobertaEmbeddings(
        (word_embeddings): Embedding(50265, 768, padding_idx=1)
        (position_embeddings): Embedding(514, 768, padding_idx=1)
        (token_type_embeddings): Embedding(1, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): RobertaEncoder(
        (layer): ModuleList(
          (0-11): 12 x RobertaLayer(
            (attention): RobertaAttention(
              (self): RobertaSdpaSelfAttention(
                (query): lora.Linear(
                  (base_layer): Linear(in_features=768, out_features=768, bias=True)
                  (lora_dropout): ModuleDict(
                    (default): Dropout(p=0.01, inplace=False)
                  )
                  (lora_A): ModuleDict(
                    (default): Linear(in_features=768, out_features=8, bias=False)

In [105]:
peft_model.base_model.roberta.encoder.layer[0].attention.self.key.lora_B.default.weight

Parameter containing:
tensor([[ 1.2680,  4.1198,  4.6769,  ...,  1.6440,  2.3128, -0.1488],
        [ 3.9331,  5.6310,  5.4588,  ..., -4.5411, -5.3368, -5.5312],
        [ 4.6125,  0.6717,  2.3467,  ..., -5.6399,  1.5065, -5.2378],
        ...,
        [-0.9901, -1.6925,  1.3907,  ...,  2.6036,  1.8322, -2.7836],
        [ 2.3692,  3.2960,  0.1934,  ..., -5.5047, -1.6996,  5.5590],
        [ 2.6138,  0.5827, -1.0653,  ..., -4.4279,  1.5064,  1.6354]],
       device='mps:0', requires_grad=True)

In [None]:
print(torch.sum(layer.attention.self.query.lora_A["default"].weight))  # Should print 0


tensor(0., device='mps:0', grad_fn=<SumBackward0>)
