# Finetune LLM using HuggingFace PEFT library, transformers and bitsandbytes.

Notebook goal: show how to fine-tune an LLM with LoRA using the `peft` library, with `bitsandbytes` used to load larger models in 8-bit.

## Installing Dependencies

In [None]:
!pip install -q bitsandbytes datasets accelerate loralib
!pip install -q git+https://github.com/huggingface/transformers.git@main git+https://github.com/huggingface/peft.git

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m105.0/105.0 MB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m536.7/536.7 kB[0m [31m41.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m280.0/280.0 kB[0m [31m34.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m38.3/38.3 MB[0m [31m39.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m15.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m16.3 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
ibis-framework 7.1.0 requires pyarrow<15,>=2, but you have pyarrow 15.0.0 which

## Loading Model

1. Load the **opt-6.7b** model, whose float16 weights take about 13 GB on the Hub.
2. Load it in 8-bit (which saves around 6 GB).

In [None]:
import os
os.environ["CUDA_VISIBLE_DEVICES"]="0"
import torch
import torch.nn as nn
import bitsandbytes as bnb
from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM
from transformers import BitsAndBytesConfig

In [None]:
# Load OPT-6.7B with its weights quantised to 8-bit.
# The quantisation settings are passed through `quantization_config`; giving
# `load_in_8bit=True` directly to `from_pretrained` is deprecated.
model = AutoModelForCausalLM.from_pretrained(
    "facebook/opt-6.7b",
    quantization_config=BitsAndBytesConfig(load_in_8bit=True),
    device_map='auto',
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/651 [00:00<?, ?B/s]

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


pytorch_model.bin.index.json:   0%|          | 0.00/41.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

pytorch_model-00001-of-00002.bin:   0%|          | 0.00/9.96G [00:00<?, ?B/s]

pytorch_model-00002-of-00002.bin:   0%|          | 0.00/3.36G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

  return self.fget.__get__(instance, owner)()


generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

In [None]:
# Fetch the tokenizer published under the same checkpoint name as the model.
tokenizer = AutoTokenizer.from_pretrained(
    "facebook/opt-6.7b"
)

tokenizer_config.json:   0%|          | 0.00/685 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/441 [00:00<?, ?B/s]

# Post-processing on the 8-bit model.

1. Freeze all of the model's layers.
2. Cast the layer-norm parameters to float32 for stability.

In [None]:
for weight in model.parameters():
  # Freeze every existing parameter.
  weight.requires_grad = False
  # One-dimensional (small) parameters are stored in fp32 for stability.
  if weight.dim() == 1:
    weight.data = weight.data.float()

# Memory-saving technique: keep fewer activations stored.
model.gradient_checkpointing_enable()
# Make sure gradients are computed for the model's input tensors.
model.enable_input_require_grads()

class CastOutputToFloat(nn.Sequential):
  """Sequential container whose result is always returned as float32."""

  def forward(self, x):
    # Run the wrapped modules as usual, then cast whatever they produce.
    result = super().forward(x)
    return result.to(torch.float32)
# Wrap the LM head so that its output is cast to float32.
model.lm_head = CastOutputToFloat(model.lm_head)

# LoRA

In [None]:
# using peft library and applying LoRA using get_peft_model utility function.
def print_trainable_parameters(model):
  """Print the trainable and total parameter counts of ``model``.

  Args:
    model: Object exposing ``named_parameters()`` that yields ``(name, param)``
      pairs, where each ``param`` provides ``numel()`` and ``requires_grad``.

  Prints a single line with the number of trainable parameters, the total
  number of parameters and the trainable share as a percentage. A model with
  no parameters reports 0.0 % instead of raising ``ZeroDivisionError``.
  """
  trainable_parameters = 0
  all_param = 0
  for _, param in model.named_parameters():
    count = param.numel()
    all_param += count
    if param.requires_grad:
      trainable_parameters += count
  # Guard the ratio: an empty model would otherwise divide by zero.
  trainable_pct = 100 * trainable_parameters / all_param if all_param else 0.0
  print(
      f"trainable parameters:{trainable_parameters} || all params: {all_param} || trainable%: {trainable_pct}"
  )

In [None]:
from peft import LoraConfig, get_peft_model

# LoRA settings: rank-16 adapters (alpha 32, dropout 0.05) attached to the
# `q_proj` and `v_proj` modules, with no bias terms trained.
config = LoraConfig(
    r = 16,
    lora_alpha = 32,
    target_modules = ['q_proj', 'v_proj'],
    lora_dropout = 0.05,
    bias = 'none',
    # Must be spelled exactly 'CAUSAL_LM' to match PEFT's causal-LM task type.
    task_type = 'CAUSAL_LM'
)

In [None]:
# Apply LoRA to the model with peft's `get_peft_model`, using `config`.
# NOTE: this rebinds `model`, so re-running the cell wraps the already-wrapped model.
model = get_peft_model(model, config)

In [None]:
# Report the trainable vs. total parameter counts of the LoRA-wrapped model.
print_trainable_parameters(model)

trainable parameters:8388608 || all params: 6666862592 || trainable%: 0.12582542214183376


# Training

In [None]:
import transformers
from datasets import load_dataset
# Download the quotes dataset and tokenize its `quote` column in batches.
data = load_dataset("Abirate/english_quotes").map(
    lambda batch: tokenizer(batch['quote']),
    batched = True,
)

Downloading readme:   0%|          | 0.00/5.55k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/647k [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/2508 [00:00<?, ? examples/s]

In [None]:
# Display the dataset's splits, features and row counts.
data

DatasetDict({
    train: Dataset({
        features: ['quote', 'author', 'tags', 'input_ids', 'attention_mask'],
        num_rows: 2508
    })
})

In [None]:
# Optimisation settings: 4 samples per device x 4 accumulation steps,
# 200 optimisation steps in total with the first 100 used for warm-up.
training_args = transformers.TrainingArguments(
    per_device_train_batch_size = 4,
    gradient_accumulation_steps = 4,
    warmup_steps = 100,
    max_steps = 200,
    learning_rate = 2e-4,
    fp16 = True,
    logging_steps = 1,
    output_dir = 'outputs',
)

# Language-modelling collator with masked-LM disabled (mlm=False).
collator = transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False)

trainer = transformers.Trainer(
    model = model,
    args = training_args,
    train_dataset = data['train'],
    data_collator = collator,
)

# Switch off the model's cache setting before training starts.
model.config.use_cache = False
trainer.train()



Step,Training Loss
1,2.25
2,2.3866
3,2.1958
4,2.2289
5,1.7662
6,2.3583
7,2.3382
8,2.1986
9,2.1757
10,2.4032


TrainOutput(global_step=200, training_loss=1.9264344310760497, metrics={'train_runtime': 2301.3217, 'train_samples_per_second': 1.391, 'train_steps_per_second': 0.087, 'total_flos': 1.164184860229632e+16, 'train_loss': 1.9264344310760497, 'epoch': 1.28})

# Sharing on the HuggingFace Hub

In [None]:
from huggingface_hub import notebook_login

In [None]:
# Show the Hugging Face Hub login widget for entering an access token.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
# Upload the trained adapter to the Hub repository.
# `token=True` uses the locally stored login token; it replaces the
# deprecated `use_auth_token` argument.
model.push_to_hub("SumitxThokar/opt-6.7b-lora-version1", token=True)



adapter_model.safetensors:   0%|          | 0.00/33.6M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/SumitxThokar/opt-6.7b-lora-version1/commit/75a1f6d96d448a0c8587e500de3f607d0aa08bcd', commit_message='Upload model', commit_description='', oid='75a1f6d96d448a0c8587e500de3f607d0aa08bcd', pr_url=None, pr_revision=None, pr_num=None)