# Using 🤗 PEFT & bitsandbytes to finetune a LoRa checkpoint




In [1]:
!pip install -q bitsandbytes datasets accelerate loralib
!pip install -q git+https://github.com/huggingface/transformers.git@main git+https://github.com/huggingface/peft.git

In [2]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [3]:
!nvidia-smi -L

GPU 0: Tesla P40 (UUID: GPU-b574f6d8-095e-1ff6-54a6-8fc5bfa6b7c3)


### Setup the model

In [4]:
import os
os.environ["CUDA_VISIBLE_DEVICES"]="0"
import torch
import torch.nn as nn
import bitsandbytes as bnb
from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Meta-Llama-3-8B",
    load_in_4bit=True,
    device_map='auto',
)

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B")
tokenizer.pad_token = tokenizer.eos_token

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


### Freezing the original weights


In [5]:
for param in model.parameters():
  param.requires_grad = False  # freeze the model - train adapters later
  if param.ndim == 1:
    # cast the small parameters (e.g. layernorm) to fp32 for stability
    param.data = param.data.to(torch.float32)

model.gradient_checkpointing_enable()  # reduce number of stored activations
model.enable_input_require_grads()

class CastOutputToFloat(nn.Sequential):
  def forward(self, x): return super().forward(x).to(torch.float32)
model.lm_head = CastOutputToFloat(model.lm_head)

### Setting up the LoRa Adapters

In [6]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

In [7]:
from peft import LoraConfig, get_peft_model

config = LoraConfig(
    r=8, #attention heads
    lora_alpha=16, #alpha scaling
    # target_modules=["q_proj", "v_proj"], #if you know the
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM" # set this for CLM or Seq2Seq
)

model = get_peft_model(model, config)
print_trainable_parameters(model)

trainable params: 3407872 || all params: 4544008192 || trainable%: 0.07499704789264605


## Data

In [8]:
import transformers
from datasets import load_dataset
data = load_dataset("corbt/all-recipes")


In [9]:
def merge_columns(example):
    example["prediction"] = "This is an example of a recipe: \n\n" + example["input"]
    return example

data['train'] = data['train'].map(merge_columns)
data['train']["prediction"][:5]

['This is an example of a recipe: \n\nNo-Bake Nut Cookies\n\nIngredients:\n- 1 c. firmly packed brown sugar\n- 1/2 c. evaporated milk\n- 1/2 tsp. vanilla\n- 1/2 c. broken nuts (pecans)\n- 2 Tbsp. butter or margarine\n- 3 1/2 c. bite size shredded rice biscuits\n\nDirections:\n- In a heavy 2-quart saucepan, mix brown sugar, nuts, evaporated milk and butter or margarine.\n- Stir over medium heat until mixture bubbles all over top.\n- Boil and stir 5 minutes more. Take off heat.\n- Stir in vanilla and cereal; mix well.\n- Using 2 teaspoons, drop and shape into 30 clusters on wax paper.\n- Let stand until firm, about 30 minutes.',
 "This is an example of a recipe: \n\nJewell Ball'S Chicken\n\nIngredients:\n- 1 small jar chipped beef, cut up\n- 4 boned chicken breasts\n- 1 can cream of mushroom soup\n- 1 carton sour cream\n\nDirections:\n- Place chipped beef on bottom of baking dish.\n- Place chicken on top of beef.\n- Mix soup and cream together; pour over chicken. Bake, uncovered, at 275°

In [10]:
data['train'][0]

{'input': 'No-Bake Nut Cookies\n\nIngredients:\n- 1 c. firmly packed brown sugar\n- 1/2 c. evaporated milk\n- 1/2 tsp. vanilla\n- 1/2 c. broken nuts (pecans)\n- 2 Tbsp. butter or margarine\n- 3 1/2 c. bite size shredded rice biscuits\n\nDirections:\n- In a heavy 2-quart saucepan, mix brown sugar, nuts, evaporated milk and butter or margarine.\n- Stir over medium heat until mixture bubbles all over top.\n- Boil and stir 5 minutes more. Take off heat.\n- Stir in vanilla and cereal; mix well.\n- Using 2 teaspoons, drop and shape into 30 clusters on wax paper.\n- Let stand until firm, about 30 minutes.',
 'prediction': 'This is an example of a recipe: \n\nNo-Bake Nut Cookies\n\nIngredients:\n- 1 c. firmly packed brown sugar\n- 1/2 c. evaporated milk\n- 1/2 tsp. vanilla\n- 1/2 c. broken nuts (pecans)\n- 2 Tbsp. butter or margarine\n- 3 1/2 c. bite size shredded rice biscuits\n\nDirections:\n- In a heavy 2-quart saucepan, mix brown sugar, nuts, evaporated milk and butter or margarine.\n- Sti

In [11]:
data = data.map(lambda samples: tokenizer(samples['prediction']), batched=True)

Map:   0%|          | 0/2147248 [00:00<?, ? examples/s]

In [12]:
data

DatasetDict({
    train: Dataset({
        features: ['input', 'prediction', 'input_ids', 'attention_mask'],
        num_rows: 2147248
    })
})

### Pretrain Generate

In [13]:
batch = tokenizer("The best recipe for chocolate chip cookies goes:", return_tensors='pt')

with torch.cuda.amp.autocast():
  output_tokens = model.generate(**batch, max_new_tokens=500)

print('\n\n', tokenizer.decode(output_tokens[0], skip_special_tokens=True))

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.




 The best recipe for chocolate chip cookies goes: 1 cup of flour, 1 cup of butter, 1 cup of sugar, 1 cup of chocolate chips, and 1 cup of love. You can make a lot of cookies with that recipe. You can make a lot of friends with that recipe. And you can make a lot of money with that recipe. But the best thing about that recipe is that you can make a lot of memories with that recipe.
And that’s what we’re going to do today. We’re going to make a lot of memories with this recipe. We’re going to make a lot of memories with the best recipe for chocolate chip cookies.
This recipe is the best because it’s easy. It’s easy because it’s simple. It’s simple because it’s just a few ingredients. And it’s just a few ingredients because it’s the best recipe for chocolate chip cookies.


### Training

In [14]:
trainer = transformers.Trainer(
    model=model,
    train_dataset=data['train'],
    args=transformers.TrainingArguments(
        per_device_train_batch_size=4,
        gradient_accumulation_steps=4,
        warmup_steps=100,
        max_steps=100,
        learning_rate=2e-4,
        fp16=True,
        logging_steps=1,
        output_dir='outputs'
    ),
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False)
)
model.config.use_cache = False  # silence the warnings. Please re-enable for inference!
trainer.train()

max_steps is given, it will override any value given in num_train_epochs
[34m[1mwandb[0m: Currently logged in as: [33msameerabdulmohamed[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss
1,1.5566
2,1.5261
3,1.5031
4,1.5611
5,1.4521
6,1.537
7,1.4712
8,1.5053
9,1.5274
10,1.5733


TrainOutput(global_step=200, training_loss=1.2903247690200805, metrics={'train_runtime': 11002.2998, 'train_samples_per_second': 0.291, 'train_steps_per_second': 0.018, 'total_flos': 4.680027767999693e+16, 'train_loss': 1.2903247690200805, 'epoch': 0.0014902796509765057})

## Share adapters on the 🤗 Hub

In [16]:
# Saving the model
model_save_path = './llama_3_8b_recipes_qlora'
tokenizer_save_path = './llama_3_8b_recipes_qlora_tokenizer'
trainer.save_model(model_save_path)

# If you want to save the entire model including the configuration, use this method:
model.save_pretrained(model_save_path)

# Saving the tokenizer associated with the model
tokenizer.save_pretrained(tokenizer_save_path)

('./llama_3_8b_recipes_qlora_tokenizer/tokenizer_config.json',
 './llama_3_8b_recipes_qlora_tokenizer/special_tokens_map.json',
 './llama_3_8b_recipes_qlora_tokenizer/tokenizer.json')

In [None]:
# Load the model
model = AutoModelForCausalLM.from_pretrained(model_save_path)

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(tokenizer_save_path)

# Example of using the model for inference
inputs = tokenizer("Your input text here", return_tensors="pt")
outputs = model.generate(**inputs)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

In [14]:
# model.push_to_hub("samwit/bloom-7b1-lora-tagger",
#                   use_auth_token=True,
#                   commit_message="basic training",
#                   private=True)

## Load adapters from the Hub

In [15]:
# import torch
# from peft import PeftModel, PeftConfig
# from transformers import AutoModelForCausalLM, AutoTokenizer

# peft_model_id = "samwit/bloom-7b1-lora-tagger"
# config = PeftConfig.from_pretrained(peft_model_id)
# model = AutoModelForCausalLM.from_pretrained(config.base_model_name_or_path, return_dict=True, load_in_8bit=True, device_map='auto')
# tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)

# # Load the Lora model
# model = PeftModel.from_pretrained(model, peft_model_id)

## Inference

In [17]:
batch = tokenizer("The best recipe with ingredient amounts and instructions for chocolate chip cookies goes:", return_tensors='pt')

with torch.cuda.amp.autocast():
  output_tokens = model.generate(**batch, max_new_tokens=200)

print('\n\n', tokenizer.decode(output_tokens[0], skip_special_tokens=True))

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.




 The best recipe with ingredient amounts and instructions for chocolate chip cookies goes: 2 1/2 cups flour 1 tsp. salt 1 tsp. baking soda 1 tsp. vanilla 1/2 cup butter, softened 1/2 cup margarine, softened 1 1/2 cups brown sugar 1/2 cup white sugar 2 eggs 2 tsp. water 1 tsp. vanilla 2 1/2 cups chocolate chips. Mix flour, salt, baking soda and vanilla in a bowl. In another bowl, mix butter, margarine, brown sugar, white sugar, eggs and water. Add vanilla. Add dry ingredients to wet ingredients and mix. Add chocolate chips. Drop by teaspoon onto ungreased cookie sheet. Bake at 375 for 10 minutes. Makes 4 dozen. I use 1/2 cup of butter and 1/2 cup of margarine.


In [16]:
batch = tokenizer("“Training models with PEFT and LoRa is cool” ->: ", return_tensors='pt')

with torch.cuda.amp.autocast():
  output_tokens = model.generate(**batch, max_new_tokens=50)

print('\n\n', tokenizer.decode(output_tokens[0], skip_special_tokens=True))

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.




 “Training models with PEFT and LoRa is cool” ->: 1
 hopefully, this is a good idea.
 “Using LoRaWAN for Internet of Things” ->: 1
 “LoRaWAN: A Low-Power Wide Area Network for the Internet of Things” ->


In [19]:
model2 = AutoModelForCausalLM.from_pretrained(
    "meta-llama/meta-llama/Meta-Llama-3-8B",
    load_in_4bit=True,
    device_map='auto',
)

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [22]:
batch = tokenizer("The best recipe for chocolate chip cookies goes:", return_tensors='pt')

with torch.cuda.amp.autocast():
  output_tokens = model2.generate(**batch, max_new_tokens=500)

print('\n\n', tokenizer.decode(output_tokens[0], skip_special_tokens=True))



 The best recipe for chocolate chip cookies goes:
 nobody’s perfect, but you’re pretty close.
I was in my 20s when I got my first job at a bakery. I loved the work, and the owners were great. The bakery was a small family business and the owners had a daughter who was my age. We were friends, and when I was hired, I was given a crash course in how to make the most popular items: cookies, cakes, and breads.
The cookies I learned to make were the best of the best. They were soft, chewy, and had a hint of salt. They were also the most popular item in the bakery. The cakes I learned to make were the best of the best. They were soft, chewy, and had a hint of salt. They were also the most popular item in the bakery. The breads I learned to make were the best of the best. They were soft, chewy, and had a hint of salt. They were also the most popular item in the bakery.
The bakery was a small family business and the owners had a daughter who was my age. We were friends, and when I was hired,