# Free Notebook by Prompt Engineer
Youtube Channel: https://www.youtube.com/@PromptEngineer48

# For Fine-tuning of Llama Models using 4-bit Quantization

This notebook shows how to load Llama-2-7b-chat-hf in 4-bit precision
and then fine-tune it using the PEFT library from Hugging Face.


This notebook is based on a fork of [bnb-4bit-training](https://colab.research.google.com/drive/1VoYNfYDKcKRQRor98Zbf2-9VQTtGJ24k?usp=sharing#scrollTo=E0Nl5mWL0k2T)


# Installation of the Required Libraries

In [1]:
## This will install all the required libraries
!pip install -q -U bitsandbytes #8-bit optimizers, matrix multiplication (LLM.int8()), and quantization functions.
!pip install transformers #provides thousands of pretrained models to perform tasks on different modalities
!pip install -q -U git+https://github.com/huggingface/peft.git #only fine-tune a small number of (extra) model parameters #https://huggingface.co/docs/peft/index
!pip install -q -U git+https://github.com/huggingface/accelerate.git #https://huggingface.co/docs/accelerate/v0.11.0/en/index
!pip install -q datasets #provides access to large collection of datasets| easily add and share new datasets.

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.6/92.6 MB[0m [31m9.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting transformers
  Downloading transformers-4.35.0-py3-none-any.whl (7.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.9/7.9 MB[0m [31m45.3 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers)
  Downloading huggingface_hub-0.19.0-py3-none-any.whl (311 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m311.2/311.2 kB[0m [31m39.2 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.15,>=0.14 (from transformers)
  Downloading tokenizers-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m80.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 

In [2]:
# This will help access models from huggingface and pushing models to hugging face
!pip install huggingface_hub
from huggingface_hub import notebook_login

notebook_login()

# need to put your token for accessing the hugging face libraries, datasets, models etc.



VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

# Loading a Pre-trained Model from Huggingface

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

#AutoTokenizer: loads the appropriate tokenizer for a given model. A tokenizer is a tool that converts text into numerical tokens that can be fed to the model
#AutoModelForCausalLM: loads the appropriate causal language model for a given model identifier
#BitsAndBytesConfig: configures the quantization settings for a model. Quantization is a technique that reduces the memory and computation requirements of a model

model_id = "meta-llama/Llama-2-7b-chat-hf"
#This is a gated model: you need permission to access it.
#Open the model page on huggingface.co and request access with the account whose token you used to log in.

#now store the quantization settings
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True, #store the model weights in 4-bit precision | reduces memory requirements (computation uses bnb_4bit_compute_dtype below)
    bnb_4bit_use_double_quant=True, #double quantization | further reduce the memory footprint of the model by quantizing the quantization constants
    bnb_4bit_quant_type="nf4", #NF4 (4-bit NormalFloat) | 4-bit quantization data type used for the stored weights
    bnb_4bit_compute_dtype=torch.bfloat16 #dtype used for computation | bfloat16 is a 16-bit floating-point format that preserves more range than precision
)

tokenizer = AutoTokenizer.from_pretrained(model_id) #loads the appropriate tokenizer for the model

model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=bnb_config, device_map={"":0})
##  loads the model for the model identifier with the quantization settings; device_map={"":0} maps the whole model (the "" root module) to device 0

#### Summary: The code loads a pre-trained language model, configures it with BitsAndBytes quantization, and specifies the device on which the model should run.

Downloading (…)okenizer_config.json:   0%|          | 0.00/1.62k [00:00<?, ?B/s]

Downloading tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

Downloading (…)fetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading (…)of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

Downloading (…)of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

# Setting Up for Training

In [None]:
from peft import prepare_model_for_kbit_training #PEFT helper that prepares a quantized model for training

model.gradient_checkpointing_enable() #save memory by recomputing activations in the backward pass instead of storing them
model = prepare_model_for_kbit_training(model) #NOTE: rebinds `model`, so re-running this cell prepares an already-prepared model again

In [None]:
# A trainable parameter is one that gradient descent is allowed to update during training.
# This helper reports the total parameter count, the trainable count, and the trainable share in percent.

def print_trainable_parameters(model):
    """
    Print how many parameters of ``model`` are trainable.

    Iterates over ``model.named_parameters()``, measures each parameter with
    ``numel()`` (its number of elements) and counts it as trainable when its
    ``requires_grad`` flag is set. Prints a single summary line and returns
    nothing.
    """
    sizes = [(weights.numel(), weights.requires_grad) for _, weights in model.named_parameters()]
    total = sum(size for size, _ in sizes)
    trainable = sum(size for size, is_trainable in sizes if is_trainable)
    print(f"trainable params: {trainable} || all params: {total} || trainable%: {100 * trainable / total}")


In [None]:
### Using Low Rank Decomposition (LoRA) to represent the weight updates

from peft import LoraConfig, get_peft_model

#LoraConfig: configures the LoRA settings for a model.
#get_peft_model: wraps a model so that it can be fine-tuned with LoRA.

config = LoraConfig(
    r=8, # rank of the low-rank decomposition is 8
    lora_alpha=32, #scaling factor that controls the magnitude of the weight updates.
    target_modules=["self_attn.q_proj", "self_attn.k_proj", "self_attn.v_proj", "self_attn.o_proj"], #the query/key/value/output projections of the self-attention mechanism; these module names are specific to Llama models
    lora_dropout=0.05, #dropout probability applied inside the LoRA layers to reduce overfitting
    bias="none", #no bias parameters are trained
    task_type="CAUSAL_LM" #Causal: predict the next word, Other eg. Seq to Seq, Seq classification, Token Classification
)

model = get_peft_model(model, config) #returns a model that is ready for LoRA fine-tuning
#The above model will have the LoRA layers added to the target modules, and the original weights will be frozen
print_trainable_parameters(model) #prints the trainable parameters as per the function defined earlier

# Setting Up the Data

In [None]:
from datasets import load_dataset #loader function from the `datasets` package

# Load the quotes dataset and tokenize its "quote" field batch by batch in one chained expression.
data = load_dataset("Abirate/english_quotes").map(
    lambda batch: tokenizer(batch["quote"]),
    batched=True,
)

#### Summary: this cell loads the "Abirate/english_quotes" dataset with the "datasets" library and
#### tokenizes the text in its "quote" field using the tokenizer loaded earlier.
#### The result is a preprocessed dataset that can be used for natural language processing tasks
#### that work with text data, such as the causal language-model fine-tuning done below.

# Let's Train

In [None]:
import transformers

tokenizer.pad_token = tokenizer.eos_token #The tokenizer will use the same token for both padding and indicating the end of the sequence

trainer = transformers.Trainer(
    model=model,
    train_dataset=data["train"],
    args=transformers.TrainingArguments(
        per_device_train_batch_size=1,
        gradient_accumulation_steps=4, #accumulate the gradients over 4 steps before updating the model parameters. This gives a larger effective batch size when a bigger per-device batch does not fit in memory.
        warmup_steps=2, #the number of steps to gradually increase the learning rate from zero to the specified value.
        max_steps=10, # the maximum number of steps to perform during the training.
        learning_rate=2e-4, # the initial learning rate to use for the optimizer, which is a function that updates the model parameters based on the gradients.
        fp16=True, # 16-bit floating point for reducing the memory usage and speeding up the training
        logging_steps=1, # log the training metrics (such as the loss) every step.
        output_dir="outputs", #directory where to save the model checkpoints and the training logs.
        optim="paged_adamw_8bit" #name of the optimizer
    ),
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False), #mlm=False: causal (next-token) language modeling instead of masked language modeling
)
model.config.use_cache = False #turn off the key/value cache during training; it is only useful for generation and is switched back on before inference below.

trainer.train() #starts the training process

# Let's do some Inferencing

In [None]:
from transformers import TextStreamer #used below to print generated text as it is produced
model.config.use_cache = True #switch the cache back on (it was turned off for training)
model.eval() #evaluation mode deactivates some modules such as Dropout and BatchNorm to make the model more deterministic

In [None]:
# Define a stream *without* function calling capabilities
import json
import os

# Generation settings. To override them, save a generation_config.json file
# next to this notebook, for example:
# {
#   "max_length": 100,
#   "num_beams": 1,
#   "temperature": 0.7
# }
GENERATION_CONFIG_PATH = "generation_config.json"
DEFAULT_GENERATION_CONFIG = {"max_length": 100, "num_beams": 1, "temperature": 0.7}


def load_generation_config(path=GENERATION_CONFIG_PATH):
    """Return the generation settings stored as JSON at ``path``.

    When the file does not exist, a copy of ``DEFAULT_GENERATION_CONFIG`` is
    returned instead, so this cell also runs on a fresh runtime where the file
    has not been created yet.
    """
    if not os.path.exists(path):
        return dict(DEFAULT_GENERATION_CONFIG)
    with open(path, "r") as config_file:
        return json.load(config_file)


generation_config = load_generation_config()


def stream(user_prompt):
    """Generate a reply to ``user_prompt`` and stream it to the output as it is produced.

    The user prompt is wrapped, together with a fixed system prompt, in the
    [INST] / <<SYS>> markers, tokenized, moved to the device the model lives
    on and passed to ``model.generate`` with a ``TextStreamer``. Nothing is
    returned; the generated text is printed by the streamer.

    Uses the notebook-level ``model``, ``tokenizer`` and ``generation_config``.
    """
    system_prompt = 'You are a helpful assistant that provides accurate and concise responses'

    B_INST, E_INST = "[INST]", "[/INST]" #special tokens markers
    B_SYS, E_SYS = "<<SYS>>\n", "\n<</SYS>>\n\n" #special tokens markers

    prompt = f"{B_INST} {B_SYS}{system_prompt.strip()}{E_SYS}{user_prompt.strip()} {E_INST}\n\n"

    # Follow the model's own device instead of hard-coding "cuda:0", so this also
    # works when the model has been loaded somewhere else (e.g. on the CPU).
    device = next(model.parameters()).device
    inputs = tokenizer([prompt], return_tensors="pt").to(device) #tokenize, convert to PyTorch tensors and move to the model's device

    streamer = TextStreamer(tokenizer) #stream the data

    # 500 new tokens is the default; a max_new_tokens value in generation_config
    # replaces it instead of raising "got multiple values for keyword argument".
    generate_kwargs = {"max_new_tokens": 500, **generation_config}
    # max_new_tokens takes precedence over max_length when both are given, so the
    # ineffective max_length is dropped to avoid the conflicting-arguments warning.
    generate_kwargs.pop("max_length", None)

    _ = model.generate(**inputs, streamer=streamer, **generate_kwargs)

In [None]:
# Try the model on a sample prompt; the reply is printed by the streamer set up inside stream()
stream('Give me a report comparing an apple and a mango.')

# Pushing Model to Hub

In [None]:
# Extract the last portion of the base model id (the part after the final "/")
base_model_name = model_id.split("/")[-1]

## Change 'Prompt48' to your own Hugging Face user or organization name (this is the only place to edit)
hub_namespace = "Prompt48"
adapter_model = f"{hub_namespace}/{base_model_name}-fine-tuned-adapters-V1" # repo for the LoRA adapters
new_model = f"{hub_namespace}/{base_model_name}-fine-tuned-V1" # repo for the merged model

In [None]:
# Save the model (the path is the repo id, so the files land in a local folder of that name).
# token=True uses the token stored by notebook_login(); it replaces the deprecated use_auth_token argument.
model.save_pretrained(adapter_model, push_to_hub=True, token=True)

# Push the model to the hub
model.push_to_hub(adapter_model, token=True)

In [None]:
# reload the base model on the CPU in float16 (without the 4-bit quantization config used earlier)
# NOTE(review): trust_remote_code=True is presumably unnecessary for this model id — confirm and consider removing it
model = AutoModelForCausalLM.from_pretrained(model_id, device_map='cpu', trust_remote_code=True, torch_dtype=torch.float16) #, cache_dir=cache_dir)

In [None]:
from peft import PeftModel

# wrap the reloaded base model with the adapters stored under `adapter_model`
model = PeftModel.from_pretrained(model, adapter_model)

In [None]:
model = model.merge_and_unload() # merging adapters with the base model; `model` is rebound to the merged result

In [None]:
# pushing the merged model to the hub in shards of at most 5GB;
# token=True uses the token stored by notebook_login() and replaces the deprecated use_auth_token argument
model.push_to_hub(new_model, token=True, max_shard_size="5GB")

In [None]:
#Push the tokenizer to the same repo as the merged model
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
# token=True uses the token stored by notebook_login(); it replaces the deprecated use_auth_token argument
tokenizer.push_to_hub(new_model, token=True)