## Fine-tune large models using 🤗 `peft` adapters, `transformers` & `bitsandbytes`

In this tutorial we will cover how we can fine-tune large language models using the very recent `peft` library and `bitsandbytes` for loading large models in 8-bit.
The fine-tuning relies on a recent method called "Low Rank Adapters" (LoRA): instead of fine-tuning the entire model, you only have to fine-tune these adapters and load them properly inside the model.
After fine-tuning the model you can also share your adapters on the 🤗 Hub and load them very easily. Let's get started!

In [None]:
# Mount Google Drive at /content/drive (uses the Colab-provided google.colab package).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os
import numpy as np
import torch
# Switch to the project folder on the mounted Drive so that the relative paths
# used in later cells (e.g. 'data/OPT/' and 'outputs') resolve inside it.
os.chdir('/content/drive/My Drive/Colab Notebooks/compositional-reasoning-finetuning-2/')
# Dataset variant to train on; the data-loading cell recognises 'self-ask' and 'direct'.
data_set = 'direct'

### Install requirements

First, run the cells below to install the requirements:

In [None]:
# Install the supporting packages from PyPI (-q keeps pip's output short).
!pip install -q bitsandbytes datasets accelerate loralib
# NOTE(review): transformers and peft are installed from their git repositories
# without a pinned release, so the installed code can differ between runs;
# consider pinning versions for reproducibility.
!pip install -q git+https://github.com/huggingface/transformers.git@main git+https://github.com/huggingface/peft.git

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.6/92.6 MB[0m [31m14.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m486.2/486.2 kB[0m [31m43.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.2/244.2 kB[0m [31m27.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.5/110.5 kB[0m [31m13.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m212.5/212.5 kB[0m [31m21.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m15.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m29.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.3/134.3 kB[0m [31m13.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build depende

### Model loading

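Here let's load an OPT model in 8-bit. As a reference point, the weights of `opt-6.7b` in half-precision (float16) are about 13GB on the Hub; if we load them in 8-bit we would require around 7GB of memory instead. Note that the cell below sets `model_size = '125m'`, so it actually loads the much smaller `facebook/opt-125m`; change `model_size` to load a different OPT checkpoint.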

In [None]:
import os
# Restrict this process to GPU 0.
# NOTE(review): this is set ahead of the imports below, presumably so that it is
# in place before CUDA is initialised - confirm, since torch is also imported
# in an earlier cell.
os.environ["CUDA_VISIBLE_DEVICES"]="0"
import torch
import torch.nn as nn
import bitsandbytes as bnb
from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM
# Size suffix of the OPT checkpoint; it is also reused in the Hub repo name later on.
model_size = '125m'
# Load the causal-LM checkpoint with 8-bit weights and automatic device placement.
model = AutoModelForCausalLM.from_pretrained(
    f"facebook/opt-{model_size}",
    load_in_8bit=True,
    device_map='auto',
)

# Tokenizer for the same checkpoint, created with its default settings.
tokenizer = AutoTokenizer.from_pretrained(f"facebook/opt-{model_size}")
# Optional tokenizer overrides, left disabled:
#     model_max_length=300,
#     padding_side="left"

### Post-processing on the model

Finally, we need to apply some post-processing on the 8-bit model to enable training: let's freeze all our layers and cast the small 1-D parameters (e.g. the layer-norm weights) to `float32` for stability. We also cast the output of the last layer to `float32` for the same reason.

In [None]:
# Prepare the base model for adapter training.
for param in model.parameters():
    # Base weights stay fixed; only the adapters added later are trained.
    param.requires_grad_(False)
    # 1-D tensors (e.g. layer-norm weights) are kept in fp32 for stability.
    if param.ndim == 1:
        param.data = param.data.float()

# Store fewer activations during the forward pass.
model.gradient_checkpointing_enable()
model.enable_input_require_grads()

class CastOutputToFloat(nn.Sequential):
    """Sequential container whose result is always returned as float32."""

    def forward(self, x):
        """Run the wrapped modules on ``x`` and cast the result to float32."""
        result = super().forward(x)
        return result.to(torch.float32)
# Wrap the existing LM head so that its output is returned in float32.
model.lm_head = CastOutputToFloat(model.lm_head)

### Apply LoRA

Here comes the magic with `peft`! Let's load a `PeftModel` and specify that we are going to use low-rank adapters (LoRA) using `get_peft_model` utility function from `peft`.

In [None]:
def print_trainable_parameters(model):
    """
    Print how many of the model's parameters are trainable.

    Sums the element counts of every tensor yielded by
    ``model.named_parameters()`` and, separately, of those whose
    ``requires_grad`` flag is set, then prints both totals together with the
    trainable share as a percentage.

    Args:
        model: Object exposing ``named_parameters()`` that yields
            ``(name, parameter)`` pairs, e.g. a ``torch.nn.Module``.

    Returns:
        None. The summary line is written to standard output.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        count = param.numel()
        all_param += count
        if param.requires_grad:
            trainable_params += count
    # A model without any parameters would otherwise raise ZeroDivisionError.
    percentage = 100 * trainable_params / all_param if all_param else 0.0
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {percentage}"
    )

In [None]:
from peft import LoraConfig, get_peft_model

# LoRA settings for a causal language model: adapters are attached to the
# modules named q_proj and v_proj, and no bias terms are trained.
config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "v_proj"],
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
)

# Wrap the base model with the adapters, then report the trainable share.
model = get_peft_model(model, config)
print_trainable_parameters(model)

trainable params: 589824 || all params: 125829120 || trainable%: 0.46875


### Training

In [None]:
import transformers
from datasets import load_dataset


def _tokenize_quotes(batch):
    """Tokenize the 'quote' column of a batch of rows."""
    return tokenizer(batch['quote'])


# Download the quotes dataset and tokenize it batch by batch.
# NOTE: `data` is reassigned by the next data-loading cell.
data = load_dataset("Abirate/english_quotes").map(_tokenize_quotes, batched=True)
data['train'][0]

In [None]:
# Train/validation JSON files for each supported dataset variant.
_DATA_FILES = {
    'self-ask': {"train": "self_ask_train_300.json", "validation": "self_ask_dev_300.json"},
    'direct': {"train": "direct_train_130.json", "validation": "direct_dev_130.json"},
}
if data_set not in _DATA_FILES:
    # Fail early with a clear message instead of a NameError on `data_files` below.
    raise ValueError(
        f"Unknown data_set {data_set!r}; expected one of {sorted(_DATA_FILES)}"
    )
data_files = _DATA_FILES[data_set]
data = load_dataset('data/OPT/', data_files=data_files)

# The training text is the prompt immediately followed by the target, so the
# target is already part of what gets tokenized below.
# NOTE(review): no separator is inserted between the two; the printed example's
# prompt ends with "Answer:" - confirm that "Answer:<target>" without a space
# is intended.
data = data.map(lambda samples: {'input': samples['prompt']+samples['target']})
print(data)
print(data['train'][0])

# Tokenize the combined text in batches, truncating over-long examples.
# NOTE(review): padding='max_length' presumably pads every example to the
# tokenizer's maximum length - verify this is wanted.
data = data.map(lambda samples: tokenizer(samples['input'], truncation=True, padding='max_length'), batched=True)



  0%|          | 0/2 [00:00<?, ?it/s]



DatasetDict({
    train: Dataset({
        features: ['num_target_tokens', 'num_tokens', 'prompt', 'num_prompt_tokens', 'target', 'input'],
        num_rows: 105479
    })
    validation: Dataset({
        features: ['num_target_tokens', 'num_tokens', 'prompt', 'num_prompt_tokens', 'target', 'input'],
        num_rows: 8657
    })
})
{'num_target_tokens': 3, 'num_tokens': 84, 'prompt': "Facts:\nFact #0: Egan was the wife of the state of Alaska's first governor, William Allen Egan, and the mother of former Juneau Mayor and Alaska State Senator Dennis Egan.\nFact #1: William Allen Egan (October 8, 1914 – May 6, 1984) was an American Democratic politician.\n\nQuestion: When was Neva Egan's husband born?\nAnswer:", 'num_prompt_tokens': 81, 'target': 'October 8, 1914', 'input': "Facts:\nFact #0: Egan was the wife of the state of Alaska's first governor, William Allen Egan, and the mother of former Juneau Mayor and Alaska State Senator Dennis Egan.\nFact #1: William Allen Egan (October 8, 19

Map:   0%|          | 0/105479 [00:00<?, ? examples/s]

Map:   0%|          | 0/8657 [00:00<?, ? examples/s]

In [None]:
# Show the token ids of the first ten training rows, then their lengths.
_first_ids = [data['train'][row]['input_ids'] for row in range(10)]
print(_first_ids)
print([len(ids) for ids in _first_ids])

[[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 597, 19170, 35, 50118, 37724, 849, 288, 35, 381, 3494, 21, 5, 1141, 9, 5, 194, 9, 7227, 18, 78, 2318, 6, 2897, 3823, 381, 3494, 6, 8, 5, 985, 9, 320, 502, 1180, 3129, 8, 7227, 331, 4336, 8093, 381, 3494, 4, 50118, 37724, 849, 134, 35, 2897, 3823, 381, 3494, 36, 20144, 290, 6, 34720, 126, 392, 231, 6, 11724, 43, 21, 41, 470, 1557, 8676, 4, 50118, 50118, 45641, 35, 520, 21, 17423, 102, 381, 3494, 18, 1623, 2421, 116, 50118, 33683, 35, 20144, 290, 6, 34720], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 

In [None]:
# NOTE(review): the model has already been loaded by this point; confirm that
# the allocator setting still takes effect when set here.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:512"

# Run configuration: two epochs, evaluation every 100 steps, logging every 50.
training_args = transformers.TrainingArguments(
    output_dir='outputs',
    num_train_epochs=2,
    learning_rate=2e-4,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    fp16=True,
    evaluation_strategy="steps",
    eval_steps=100,
    logging_steps=50,
)

# mlm=False selects the non-masked language-modeling mode of the collator.
collator = transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False)

trainer = transformers.Trainer(
    model=model,
    args=training_args,
    train_dataset=data['train'],
    eval_dataset=data['validation'],
    data_collator=collator,
)

# Disabling the cache silences warnings during training; turn it back on for inference.
model.config.use_cache = False
with torch.autocast("cuda"):
    trainer.train()

You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss
100,2.2852,2.190836
200,2.1579,2.123133
300,2.108,2.055159
400,2.0918,2.045769
500,2.0526,2.05096
600,2.0485,2.038487
700,2.0455,1.966473
800,2.0458,2.022928
900,2.0126,1.944468
1000,2.0103,1.931143




Step,Training Loss,Validation Loss
100,2.2852,2.190836
200,2.1579,2.123133
300,2.108,2.055159
400,2.0918,2.045769
500,2.0526,2.05096
600,2.0485,2.038487
700,2.0455,1.966473
800,2.0458,2.022928
900,2.0126,1.944468
1000,2.0103,1.931143




## Share adapters on the 🤗 Hub

In [None]:
from huggingface_hub import notebook_login

# Show the interactive Hub login widget; the upload cell below asks for an auth token.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
# Upload the model to a Hub repo named after the model size and the dataset variant.
# NOTE(review): `use_auth_token` may be deprecated in favour of `token` in
# recent library versions - confirm against the installed version.
model.push_to_hub(f"adam-wein/opt-{model_size}-{data_set}", use_auth_token=True)

CommitInfo(commit_url='https://huggingface.co/adam-wein/opt-125m-direct/commit/7bbf0965541b444137f62ce6aea404ef56c79f8f', commit_message='Upload model', commit_description='', oid='7bbf0965541b444137f62ce6aea404ef56c79f8f', pr_url=None, pr_revision=None, pr_num=None)

## Load adapters from the Hub

You can also directly load adapters from the Hub using the commands below:

In [None]:
import torch
from peft import PeftModel, PeftConfig
from transformers import AutoModelForCausalLM, AutoTokenizer

# The adapter repo records which base checkpoint it was trained on.
peft_model_id = f"adam-wein/opt-{model_size}-{data_set}"
config = PeftConfig.from_pretrained(peft_model_id)
base_checkpoint = config.base_model_name_or_path

# Tokenizer for the base checkpoint, padding on the left.
tokenizer = AutoTokenizer.from_pretrained(base_checkpoint, padding_side='left', model_max_length=500)

# Base model in 8-bit, then the LoRA adapter weights on top of it.
model = AutoModelForCausalLM.from_pretrained(
    base_checkpoint,
    return_dict=True,
    load_in_8bit=True,
    device_map='auto',
    max_length=50,
)
model = PeftModel.from_pretrained(model, peft_model_id)

## Inference

You can then directly use the trained model or the model that you have loaded from the 🤗 Hub for inference as you would do it usually in `transformers`.

In [None]:
# Show the first validation example: its prompt, then its target.
_example = data['validation'][0]
for _field in ('prompt', 'target'):
    print(_example[_field])

Facts:
Fact #0: Éric Deflandre( born 2 August 1973 in Rocourt) is a former Belgian football right fullback.
Fact #1: Polly Swann( born 5 June 1988) is a British rower and a member of the Great Britain Rowing Team.

Question: Who was born earlier, Polly Swann or Éric Deflandre?
Answer:
Éric Deflandre


In [None]:
# Tokenize one hand-written prompt (facts, question and the "Answer:" cue) and
# move the tensors to the model's device so generation does not mix devices.
batch = tokenizer("Facts:\nFact #0: Éric Deflandre( born 2 August 1973 in Rocourt) is a former Belgian football right fullback.\nFact #1: Polly Swann( born 5 June 1988) is a British rower and a member of the Great Britain Rowing Team.\n\nQuestion: Who was born earlier, Polly Swann or Éric Deflandre?\nAnswer:", return_tensors='pt').to(model.device)

# Same autocast entry point as the training cell.
with torch.autocast("cuda"):
    output_tokens = model.generate(**batch, max_new_tokens=50)

# The decoded text contains the prompt followed by the generated continuation.
print('\n\n', tokenizer.decode(output_tokens[0], skip_special_tokens=True))



 Facts:
Fact #0: Éric Deflandre( born 2 August 1973 in Rocourt) is a former Belgian football right fullback.
Fact #1: Polly Swann( born 5 June 1988) is a British rower and a member of the Great Britain Rowing Team.

Question: Who was born earlier, Polly Swann or Éric Deflandre?
Answer:Éric Def, who is a British, British, British and American rower.

Question: Who, who, in the United, was born, born, born, born, born, is a British, British and born, is a


In [None]:
# Tokenizer options shared by the preview below and the dataset mapping.
tokenizer_config_ = dict(truncation=True, return_tensors="pt")


def _encode_prompt(example):
    """Tokenize the 'prompt' field of a single example."""
    return tokenizer(example['prompt'], **tokenizer_config_)


# Preview the encoding of the first validation prompt.
print(_encode_prompt(data['validation'][0]))
# Encode every validation prompt, one example at a time, into a new split.
data['validation_for_inference'] = data['validation'].map(_encode_prompt, batched=False)



{'input_ids': tensor([[    2,   597, 19170,    35, 50118, 37724,   849,   288,    35, 24986,
          4063,  7858,  1245,   241,  1640,  2421,   132,   830, 14757,    11,
         16809, 14103,    43,    16,    10,   320, 11731,  1037,   235, 24907,
             4, 50118, 37724,   849,   134,    35, 36189,  3323,  2279,  1640,
          2421,   195,   502, 11151,    43,    16,    10,  1089,  3236,   254,
             8,    10,   919,     9,     5,  2860,  1444,   248,  6932,  2711,
             4, 50118, 50118, 45641,    35,  3394,    21,  2421,   656,     6,
         36189,  3323,  2279,    50, 24986,  4063,  7858,  1245,   241,   116,
         50118, 33683,    35]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}


In [None]:
def _generate_continuation(samples):
    """Generate up to 10 new tokens for one tokenized prompt.

    Args:
        samples: A single row whose 'input_ids' holds the tokenized prompt.

    Returns:
        A dict whose 'input_ids' entry is the output of ``model.generate``.
    """
    # Put the prompt on the model's device so generation does not mix devices.
    prompt_ids = torch.tensor(np.array(samples['input_ids'])).to(model.device)
    return {'input_ids': model.generate(inputs=prompt_ids, max_new_tokens=10)}


# Same autocast entry point as the training cell.
with torch.autocast("cuda"):
    # `map` returns a new dataset, so keep the result instead of discarding it.
    # NOTE: this generates one example at a time and is slow on the full split.
    data['validation_generated'] = data['validation_for_inference'].map(_generate_continuation)

Map:   0%|          | 0/8657 [00:00<?, ? examples/s]



KeyboardInterrupt: ignored