In [1]:
import argparse
import os

import torch
from torch.optim import AdamW
from torch.utils.data import DataLoader
import peft

import evaluate
from datasets import load_dataset
from transformers import AutoModelForSequenceClassification, AutoTokenizer, get_linear_schedule_with_warmup, set_seed
from tqdm import tqdm


Welcome to bitsandbytes. For bug reports, please run

python -m bitsandbytes

 and submit this information together with your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
bin /home/hakimo/anaconda3/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda116.so
CUDA SETUP: CUDA runtime path found: /home/hakimo/anaconda3/lib/libcudart.so.11.0
CUDA SETUP: Highest compute capability among GPUs detected: 8.6
CUDA SETUP: Detected CUDA version 116
CUDA SETUP: Loading binary /home/hakimo/anaconda3/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda116.so...


Either way, this might cause trouble in the future:
If you get `CUDA error: invalid device function` errors, the above might be the cause and the solution is to make sure only one ['libcudart.so', 'libcudart.so.11.0', 'libcudart.so.12.0'] in the paths that we search based on your env.
  warn(msg)


In [2]:
# Reload the already-imported `peft` module in place; the cell's displayed value
# shows which file the module was loaded from.
# NOTE(review): presumably a development aid for a local source checkout of peft —
# confirm whether this cell is still needed.
import importlib
importlib.reload(peft)

<module 'peft' from '/home/hakimo/Documents/peft/src/peft/__init__.py'>

In [3]:
# Experiment configuration: every value a reader might want to tune lives in this cell.
batch_size = 8  # batch size shared by the train and validation dataloaders
model_name_or_path = "roberta-large"  # checkpoint used for the tokenizer and the base model
task = "mrpc"  # GLUE task name used for both the dataset and the metric
peft_type = peft.PeftType.IA3
# Use the GPU when one is visible; otherwise fall back to CPU instead of failing
# at the first `.to(device)` call.
device = "cuda" if torch.cuda.is_available() else "cpu"
num_epochs = 20
seed = 42

# Seed the random number generators before the model is created and before any data is
# shuffled, so that a Restart & Run All reproduces the same training run.
set_seed(seed)

In [4]:
# Adapter configuration: IA3 for sequence classification, created in training mode
# (inference_mode=False) with adapter dropout set to 0.0.
# LoRA alternative kept for reference:
#   LoraConfig(task_type="SEQ_CLS", inference_mode=False, r=8, lora_alpha=16, lora_dropout=0.1)
peft_config = peft.IA3Config(
    task_type="SEQ_CLS",
    inference_mode=False,
    ia3_dropout=0.0,
)

# Learning rate handed to the optimizer.
lr = 3e-4

In [5]:
# Checkpoints whose name contains "gpt", "opt" or "bloom" are padded on the left;
# every other checkpoint is padded on the right.
left_padded_families = ("gpt", "opt", "bloom")
padding_side = (
    "left" if any(family in model_name_or_path for family in left_padded_families) else "right"
)

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, padding_side=padding_side)
# Reuse the EOS token id for padding when the tokenizer defines no pad token.
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id

# GLUE dataset and the matching GLUE metric for the configured task.
datasets = load_dataset("glue", task)
metric = evaluate.load("glue", task)


def tokenize_function(examples):
    """Tokenize the "sentence1"/"sentence2" pairs of `examples` with the notebook's tokenizer.

    Truncation is enabled; `max_length=None` means the model's maximum length is used
    (which is also the default). Returns whatever the tokenizer call returns.
    """
    return tokenizer(
        examples["sentence1"],
        examples["sentence2"],
        truncation=True,
        max_length=None,
    )


# Tokenize the dataset in batches, dropping the raw "idx", "sentence1" and "sentence2"
# columns, then rename "label" to "labels" — the column name the transformers models
# expect for labels.
tokenized_datasets = datasets.map(
    tokenize_function,
    batched=True,
    remove_columns=["idx", "sentence1", "sentence2"],
).rename_column("label", "labels")


def collate_fn(examples):
    """Pad a list of tokenized examples to the longest one and return PyTorch tensors."""
    padded_batch = tokenizer.pad(examples, padding="longest", return_tensors="pt")
    return padded_batch


# Build the dataloaders: training batches are shuffled, validation batches are not.
# Both use the same batch size and the padding collate function.
train_dataloader = DataLoader(
    tokenized_datasets["train"],
    batch_size=batch_size,
    shuffle=True,
    collate_fn=collate_fn,
)
eval_dataloader = DataLoader(
    tokenized_datasets["validation"],
    batch_size=batch_size,
    shuffle=False,
    collate_fn=collate_fn,
)

Found cached dataset glue (/home/hakimo/.cache/huggingface/datasets/glue/mrpc/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)


  0%|          | 0/3 [00:00<?, ?it/s]

Loading cached processed dataset at /home/hakimo/.cache/huggingface/datasets/glue/mrpc/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad/cache-686f2c2419aa9ed3.arrow
Loading cached processed dataset at /home/hakimo/.cache/huggingface/datasets/glue/mrpc/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad/cache-5c3527a139d01a80.arrow
Loading cached processed dataset at /home/hakimo/.cache/huggingface/datasets/glue/mrpc/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad/cache-1085d9090b2bd349.arrow


In [6]:
# Load the pretrained sequence-classification model, wrap it with the adapter described
# by `peft_config`, report the trainable-parameter count, and display the wrapped model.
base_model = AutoModelForSequenceClassification.from_pretrained(model_name_or_path, return_dict=True)
model = peft.get_peft_model(base_model, peft_config)
model.print_trainable_parameters()
model

Some weights of the model checkpoint at roberta-large were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'lm_head.dense.weight', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.dense.weight', 'classif

trainable params: 2201604 || all params: 356511748 || trainable%: 0.6175403790620667


PeftModelForSequenceClassification(
  (base_model): IA3Model(
    (model): RobertaForSequenceClassification(
      (roberta): RobertaModel(
        (embeddings): RobertaEmbeddings(
          (word_embeddings): Embedding(50265, 1024, padding_idx=1)
          (position_embeddings): Embedding(514, 1024, padding_idx=1)
          (token_type_embeddings): Embedding(1, 1024)
          (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (encoder): RobertaEncoder(
          (layer): ModuleList(
            (0): RobertaLayer(
              (attention): RobertaAttention(
                (self): RobertaSelfAttention(
                  (query): Linear(in_features=1024, out_features=1024, bias=True)
                  (key): Linear(
                    in_features=1024, out_features=1024, bias=True
                    (ia3_dropout): ModuleDict(
                      (default): Identity()
                    )
       

In [7]:
# Display the adapter configuration attached to the wrapped model
# (the displayed value is a dict keyed by adapter name).
model.peft_config

{'default': IA3Config(peft_type=<PeftType.IA3: 'IA3'>, base_model_name_or_path='roberta-large', task_type='SEQ_CLS', inference_mode=False, target_modules=['key', 'value', 'output.dense'], fan_in_fan_out=False, ia3_dropout=0.0, modules_to_save=None, init_ia3_weights=True)}

In [19]:
# Collect the dotted name of every submodule under `model.model` and print the list,
# so it is visible which layers carry adapter submodules.
key_list = [named_module[0] for named_module in model.model.named_modules()]
print("Key list: ", key_list)

Key list:  ['', 'roberta', 'roberta.embeddings', 'roberta.embeddings.word_embeddings', 'roberta.embeddings.position_embeddings', 'roberta.embeddings.token_type_embeddings', 'roberta.embeddings.LayerNorm', 'roberta.embeddings.dropout', 'roberta.encoder', 'roberta.encoder.layer', 'roberta.encoder.layer.0', 'roberta.encoder.layer.0.attention', 'roberta.encoder.layer.0.attention.self', 'roberta.encoder.layer.0.attention.self.query', 'roberta.encoder.layer.0.attention.self.query.ia3_dropout', 'roberta.encoder.layer.0.attention.self.query.ia3_dropout.default', 'roberta.encoder.layer.0.attention.self.query.ia3_l', 'roberta.encoder.layer.0.attention.self.key', 'roberta.encoder.layer.0.attention.self.value', 'roberta.encoder.layer.0.attention.self.value.ia3_dropout', 'roberta.encoder.layer.0.attention.self.value.ia3_dropout.default', 'roberta.encoder.layer.0.attention.self.value.ia3_l', 'roberta.encoder.layer.0.attention.self.dropout', 'roberta.encoder.layer.0.attention.output', 'roberta.encode

In [8]:
# AdamW over the model's parameters, using the learning rate from the config cell.
optimizer = AdamW(params=model.parameters(), lr=lr)

# Linear schedule with warmup: one scheduler step is taken per training batch, so the
# total number of steps is batches-per-epoch times epochs.
num_training_steps = len(train_dataloader) * num_epochs
# Warm up over the first 6% of the steps. The scheduler documents integer step counts,
# so the fractional product is truncated instead of being passed through as a float.
num_warmup_steps = int(0.06 * num_training_steps)

lr_scheduler = get_linear_schedule_with_warmup(
    optimizer=optimizer,
    num_warmup_steps=num_warmup_steps,
    num_training_steps=num_training_steps,
)

In [9]:
# Fine-tune for `num_epochs` epochs and score the validation split after every epoch.
model.to(device)
for epoch in range(num_epochs):
    # Training pass: one optimizer step and one scheduler step per batch.
    model.train()
    for batch in tqdm(train_dataloader):
        batch.to(device)
        loss = model(**batch).loss
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

    # Evaluation pass: forward passes run without gradient tracking, and the arg-max
    # class of each example is accumulated into the metric.
    model.eval()
    for batch in tqdm(eval_dataloader):
        batch.to(device)
        with torch.no_grad():
            logits = model(**batch).logits
        metric.add_batch(
            predictions=logits.argmax(dim=-1),
            references=batch["labels"],
        )

    eval_metric = metric.compute()
    print(f"epoch {epoch}:", eval_metric)

  0%|                                                   | 0/459 [00:00<?, ?it/s]You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
100%|█████████████████████████████████████████| 459/459 [01:16<00:00,  6.04it/s]
100%|███████████████████████████████████████████| 51/51 [00:04<00:00, 12.70it/s]


epoch 0: {'accuracy': 0.6887254901960784, 'f1': 0.8095952023988007}


100%|█████████████████████████████████████████| 459/459 [01:16<00:00,  6.04it/s]
100%|███████████████████████████████████████████| 51/51 [00:04<00:00, 12.63it/s]


epoch 1: {'accuracy': 0.7083333333333334, 'f1': 0.8226527570789866}


100%|█████████████████████████████████████████| 459/459 [01:16<00:00,  6.02it/s]
100%|███████████████████████████████████████████| 51/51 [00:04<00:00, 12.60it/s]


epoch 2: {'accuracy': 0.7279411764705882, 'f1': 0.8042328042328043}


100%|█████████████████████████████████████████| 459/459 [01:16<00:00,  5.99it/s]
100%|███████████████████████████████████████████| 51/51 [00:04<00:00, 12.57it/s]


epoch 3: {'accuracy': 0.7769607843137255, 'f1': 0.852988691437803}


100%|█████████████████████████████████████████| 459/459 [01:16<00:00,  6.00it/s]
100%|███████████████████████████████████████████| 51/51 [00:04<00:00, 12.58it/s]


epoch 4: {'accuracy': 0.7843137254901961, 'f1': 0.8562091503267973}


100%|█████████████████████████████████████████| 459/459 [01:16<00:00,  6.01it/s]
100%|███████████████████████████████████████████| 51/51 [00:04<00:00, 12.57it/s]


epoch 5: {'accuracy': 0.8014705882352942, 'f1': 0.867430441898527}


100%|█████████████████████████████████████████| 459/459 [01:16<00:00,  6.00it/s]
100%|███████████████████████████████████████████| 51/51 [00:04<00:00, 12.55it/s]


epoch 6: {'accuracy': 0.8088235294117647, 'f1': 0.8717105263157895}


100%|█████████████████████████████████████████| 459/459 [01:16<00:00,  6.01it/s]
100%|███████████████████████████████████████████| 51/51 [00:04<00:00, 12.56it/s]


epoch 7: {'accuracy': 0.8088235294117647, 'f1': 0.8592057761732851}


100%|█████████████████████████████████████████| 459/459 [01:15<00:00,  6.05it/s]
100%|███████████████████████████████████████████| 51/51 [00:04<00:00, 12.56it/s]


epoch 8: {'accuracy': 0.8308823529411765, 'f1': 0.8836424957841484}


100%|█████████████████████████████████████████| 459/459 [01:16<00:00,  5.98it/s]
100%|███████████████████████████████████████████| 51/51 [00:04<00:00, 12.54it/s]


epoch 9: {'accuracy': 0.8186274509803921, 'f1': 0.875}


100%|█████████████████████████████████████████| 459/459 [01:16<00:00,  5.98it/s]
100%|███████████████████████████████████████████| 51/51 [00:04<00:00, 12.55it/s]


epoch 10: {'accuracy': 0.8357843137254902, 'f1': 0.8885191347753744}


100%|█████████████████████████████████████████| 459/459 [01:16<00:00,  5.99it/s]
100%|███████████████████████████████████████████| 51/51 [00:04<00:00, 12.53it/s]


epoch 11: {'accuracy': 0.8382352941176471, 'f1': 0.8892617449664431}


100%|█████████████████████████████████████████| 459/459 [01:16<00:00,  5.99it/s]
100%|███████████████████████████████████████████| 51/51 [00:04<00:00, 12.54it/s]


epoch 12: {'accuracy': 0.8333333333333334, 'f1': 0.8823529411764706}


100%|█████████████████████████████████████████| 459/459 [01:16<00:00,  6.02it/s]
100%|███████████████████████████████████████████| 51/51 [00:04<00:00, 12.56it/s]


epoch 13: {'accuracy': 0.8382352941176471, 'f1': 0.8885135135135136}


100%|█████████████████████████████████████████| 459/459 [01:16<00:00,  5.99it/s]
100%|███████████████████████████████████████████| 51/51 [00:04<00:00, 12.52it/s]


epoch 14: {'accuracy': 0.8431372549019608, 'f1': 0.891156462585034}


100%|█████████████████████████████████████████| 459/459 [01:16<00:00,  5.98it/s]
100%|███████████████████████████████████████████| 51/51 [00:04<00:00, 12.47it/s]


epoch 15: {'accuracy': 0.8357843137254902, 'f1': 0.8846815834767642}


100%|█████████████████████████████████████████| 459/459 [01:16<00:00,  6.01it/s]
100%|███████████████████████████████████████████| 51/51 [00:04<00:00, 12.55it/s]


epoch 16: {'accuracy': 0.8357843137254902, 'f1': 0.8850771869639793}


100%|█████████████████████████████████████████| 459/459 [01:16<00:00,  5.97it/s]
100%|███████████████████████████████████████████| 51/51 [00:04<00:00, 12.53it/s]


epoch 17: {'accuracy': 0.8382352941176471, 'f1': 0.8877551020408163}


100%|█████████████████████████████████████████| 459/459 [01:16<00:00,  5.99it/s]
100%|███████████████████████████████████████████| 51/51 [00:04<00:00, 12.47it/s]


epoch 18: {'accuracy': 0.8357843137254902, 'f1': 0.8858603066439524}


100%|█████████████████████████████████████████| 459/459 [01:16<00:00,  5.96it/s]
100%|███████████████████████████████████████████| 51/51 [00:04<00:00, 12.48it/s]

epoch 19: {'accuracy': 0.8357843137254902, 'f1': 0.8858603066439524}





## Share adapters on the 🤗 Hub

In [8]:
# Uncomment to upload the trained adapter to the Hub.
# NOTE(review): the repo id below ends in "-lora" although this notebook configures an
# IA3 adapter — confirm the target repository before pushing.
# model.push_to_hub("smangrul/roberta-large-peft-lora", use_auth_token=True)

CommitInfo(commit_url='https://huggingface.co/smangrul/roberta-large-peft-lora/commit/c2c661898b8b6a0c68ecd068931e598d0a79686b', commit_message='Upload model', commit_description='', oid='c2c661898b8b6a0c68ecd068931e598d0a79686b', pr_url=None, pr_revision=None, pr_num=None)

## Load adapters from the Hub

You can also directly load adapters from the Hub using the commands below:

In [11]:
import torch
from peft import PeftModel, PeftConfig
# AutoModelForSequenceClassification is the class this cell actually instantiates, so it
# is imported here rather than relying on an earlier cell having imported it.
from transformers import AutoModelForCausalLM, AutoModelForSequenceClassification, AutoTokenizer

# NOTE(review): this repo id names a LoRA adapter, while the training cells above
# configure IA3 — the score printed below is therefore presumably not for the adapter
# trained in this notebook. Confirm which adapter is meant to be evaluated.
peft_model_id = "smangrul/roberta-large-peft-lora"

# The adapter config records which base checkpoint the adapter was trained on.
config = PeftConfig.from_pretrained(peft_model_id)
inference_model = AutoModelForSequenceClassification.from_pretrained(config.base_model_name_or_path)
# Rebinding `tokenizer` also affects `collate_fn`, which looks the name up at call time.
tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)

# Load the adapter weights from the Hub on top of the base model.
inference_model = PeftModel.from_pretrained(inference_model, peft_model_id)

# Score the validation split with the loaded adapter, without gradient tracking.
inference_model.to(device)
inference_model.eval()
for batch in tqdm(eval_dataloader):
    batch.to(device)
    with torch.no_grad():
        outputs = inference_model(**batch)
    predictions = outputs.logits.argmax(dim=-1)
    metric.add_batch(
        predictions=predictions,
        references=batch["labels"],
    )

eval_metric = metric.compute()
print(eval_metric)

Some weights of the model checkpoint at roberta-large were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'roberta.pooler.dense.weight', 'roberta.pooler.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.dense.bias', 'classifie

{'accuracy': 0.8946078431372549, 'f1': 0.924693520140105}



