<a href="https://colab.research.google.com/github/VarjuAkos/Onlab/blob/main/Fine_Tuning_of_LLMs_Partial.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Fine-Tuning of LLMs with Hugging Face

## Step 1: Installing and importing the libraries for Hugging Face

In [None]:
!pip install -q accelerate==0.21.0 peft==0.4.0 bitsandbytes==0.40.2 transformers==4.31.0 trl==0.4.7

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.2/244.2 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m72.9/72.9 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.5/92.5 MB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m57.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.4/77.4 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m52.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.5/510.5 kB[0m [31m36.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━

In [None]:
!pip install huggingface_hub



In [None]:
import os
import torch
from trl import SFTTrainer
from datasets import load_dataset
from peft import LoraConfig, PeftModel
from transformers import (AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, HfArgumentParser, TrainingArguments, pipeline, logging)

  warn("The installed version of bitsandbytes was compiled without GPU support. "


/usr/local/lib/python3.10/dist-packages/bitsandbytes/libbitsandbytes_cpu.so: undefined symbol: cadam32bit_grad_fp32


## Step 2: Setting up links to Hugging Face datasets and models

In [None]:
# Hugging Face Hub repository ids used by this notebook.

# Base checkpoint that is loaded, quantized and fine-tuned.
model_identifier = "aboonaji/llama2finetune-v2"

# Dataset actually used for training (loaded in Step 7).
formatted_dataset = "aboonaji/wiki_medical_terms_llam2_format"

# Another dataset id; not loaded anywhere in the cells shown in this notebook.
source_dataset = "gamino/wiki_medical_terms"

## Step 3: Setting up all the QLoRA hyperparameters for fine-tuning

In [None]:
# LoRA hyperparameters; all three are forwarded to LoraConfig in Step 11.

# Dropout probability applied inside the LoRA layers (0.1, i.e. 10%);
# a regularizer against overfitting.
lora_hyper_dropout = 0.1

# Scaling factor for the LoRA update matrices. Larger values make the adapter
# adjustments more substantial but may lead to overfitting.
lora_hyper_alpha = 16

# Rank of the LoRA update matrices: trades adapter capacity against
# the number of trainable parameters.
lora_hyper_rank = 64

## Step 4: Setting up all the bitsandbytes hyperparameters for fine-tuning

In [None]:
# bitsandbytes quantization settings; consumed by BitsAndBytesConfig in Step 8.

# Load the model weights in 4-bit precision.
enable_4bit = True

# 4-bit quantization scheme to use ("nf4").
quant_type_bnb = "nf4"

# Name of the torch dtype used for computation on the quantized weights.
# Step 8 resolves this string with getattr(torch, ...).
compute_dtype_bnb = "float16"

# Nested (double) quantization is switched off.
double_quant_flag = False

## Step 5: Setting up all the training arguments hyperparameters for fine-tuning

In [None]:
# Hyperparameters forwarded to TrainingArguments in Step 12.

# --- Output / bookkeeping -------------------------------------------------
# Directory for trainer output.
results_dir = "./results"
# How often (in steps) a checkpoint is saved, and how often progress is logged.
checkpoint_interval = 0
log_interval = 25

# --- Training length ------------------------------------------------------
# Number of epochs and hard cap on optimizer steps. Both are passed to the
# trainer; a positive max_steps takes precedence over the epoch count.
epochs_count = 10
steps_limit = 100

# --- Numeric precision ----------------------------------------------------
# Both mixed-precision switches (fp16 and bfloat16) are off, so the trainer
# runs in its default precision.
enable_fp16 = False
enable_bf16 = False

# --- Batching and memory --------------------------------------------------
# Per-device batch sizes for training and evaluation.
train_batch_size = 4
eval_batch_size = 4
# Gradient accumulation steps: raises the effective batch size without
# raising memory use. 1 means no accumulation.
accumulate_steps = 1
# Gradient checkpointing: saves memory at the cost of extra computation.
checkpointing_flag = True
# Bucket samples of similar length together.
length_grouping = True

# --- Optimization ---------------------------------------------------------
# Optimizer and learning-rate scheduler names.
optimizer_type = "paged_adamw_32bit"
scheduler_type = "cosine"
# Peak learning rate.
train_learning_rate = 2e-4
# Fraction of the training steps spent warming the learning rate up.
warmup_percentage = 0.03
# Weight decay (regularization against overfitting).
decay_rate = 0.001
# Gradient clipping threshold: keeps gradients from growing large enough to
# destabilize training.
grad_norm_limit = 0.3

## Step 6: Setting up all the supervised fine-tuning arguments hyperparameters for fine-tuning

In [None]:
# Settings for the supervised fine-tuning trainer and for model placement.

# Device map handed to from_pretrained in Step 9: the "" key (the whole model)
# is mapped to device index 0.
device_assigment = {"": 0}

# Maximum sequence length passed to the trainer; None leaves the choice to
# the trainer.
sequence_lenght_max = None

# Packing combines several short sequences into a single training example to
# improve computational efficiency. It is disabled here.
enable_packing = False

## Step 7: Loading the dataset

In [None]:
# Fetch the "train" split of the pre-formatted dataset from the Hub.
training_data = load_dataset(
    formatted_dataset,
    split="train",
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading data:   0%|          | 0.00/54.1M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

## Step 8: Defining the QLoRA configuration

In [None]:
# Turn the dtype name from Step 4 (e.g. "float16") into the torch dtype object.
dtype_computation = getattr(torch, compute_dtype_bnb)

# Quantization configuration assembled from the Step 4 settings; it is passed
# to from_pretrained when the model is loaded.
bnb_setup = BitsAndBytesConfig(
    load_in_4bit=enable_4bit,
    bnb_4bit_compute_dtype=dtype_computation,
    bnb_4bit_quant_type=quant_type_bnb,
    bnb_4bit_use_double_quant=double_quant_flag,
)

## Step 9: Loading the pre-trained LLaMA 2 model

In [None]:
# Load the base checkpoint with the quantization settings from Step 8 and
# place it according to device_assigment (device index 0, so a CUDA-capable
# runtime is required for this cell).
llama_model = AutoModelForCausalLM.from_pretrained(
    model_identifier,
    quantization_config=bnb_setup,
    device_map=device_assigment,
)

# Turn off the key/value cache while training: it is only useful for
# generation and is incompatible with gradient checkpointing, which is enabled
# in the training arguments. (The attribute is `use_cache`; the previous
# `use_case` spelling set an attribute the model ignores.)
llama_model.config.use_cache = False

# Per the LlamaConfig documentation, values other than 1 activate a slower,
# more exact computation of the linear layers; 1 keeps the standard path.
llama_model.config.pretraining_tp = 1

config.json:   0%|          | 0.00/632 [00:00<?, ?B/s]

pytorch_model.bin.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

pytorch_model-00001-of-00002.bin:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

pytorch_model-00002-of-00002.bin:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

RuntimeError: Found no NVIDIA driver on your system. Please check that you have an NVIDIA GPU and installed a driver from http://www.nvidia.com/Download/index.aspx

## Step 10: Loading the pre-trained tokenizer for the LLaMA 2 model

In [None]:
# Tokenizer that ships with the base checkpoint.
llama_tokenizer = AutoTokenizer.from_pretrained(
    model_identifier,
    trust_remote_code=True,
)

# Pad on the right-hand side of each sequence, and reuse the end-of-sequence
# token as the padding token.
llama_tokenizer.padding_side = "right"
llama_tokenizer.pad_token = llama_tokenizer.eos_token

## Step 11: Setting up the configuration for the LoRA fine-tuning method

In [None]:
# LoRA adapter configuration for causal language modelling, built from the
# Step 3 hyperparameters.
#
# bias="none": no additional bias terms are introduced in the LoRA adaptation
# layers, so the pre-trained model's original bias terms are used without
# modification.
peft_setup = LoraConfig(
    r=lora_hyper_rank,
    lora_alpha=lora_hyper_alpha,
    lora_dropout=lora_hyper_dropout,
    bias="none",
    task_type="CAUSAL_LM",
)

## Step 12: Creating a training configuration by setting the training parameters

In [None]:


# Bundle the Step 5 hyperparameters into a TrainingArguments instance.
train_args = TrainingArguments(
    # output and bookkeeping
    output_dir=results_dir,
    save_steps=checkpoint_interval,
    logging_steps=log_interval,
    # training length
    num_train_epochs=epochs_count,
    max_steps=steps_limit,
    # numeric precision
    fp16=enable_fp16,
    bf16=enable_bf16,
    # batching and memory
    per_device_train_batch_size=train_batch_size,
    per_device_eval_batch_size=eval_batch_size,
    gradient_accumulation_steps=accumulate_steps,
    gradient_checkpointing=checkpointing_flag,
    group_by_length=length_grouping,
    # optimization
    optim=optimizer_type,
    learning_rate=train_learning_rate,
    lr_scheduler_type=scheduler_type,
    warmup_ratio=warmup_percentage,
    weight_decay=decay_rate,
    max_grad_norm=grad_norm_limit,
)

## Step 13: Creating the Supervised Fine-Tuning Trainer

In [None]:
# Assemble the supervised fine-tuning trainer from the quantized model, the
# training split, the tokenizer, the LoRA configuration and the training
# arguments built in Step 12.
llama_sftt_trainer = SFTTrainer(
    model=llama_model,
    # Pass the configured TrainingArguments *instance*; the previous code passed
    # the TrainingArguments class itself, so none of the Step 5 settings applied.
    args=train_args,
    train_dataset=training_data,
    tokenizer=llama_tokenizer,
    peft_config=peft_setup,
    # Name of the dataset column that holds the training text.
    dataset_text_field="text",
    max_seq_length=sequence_lenght_max,
    packing=enable_packing,
)

## Step 14: Training the model

In [None]:
# Run fine-tuning with the trainer assembled in Step 13. As the last expression
# in the cell, the value returned by train() is displayed as the cell output.
llama_sftt_trainer.train()

## Step 15: Chatting with the model

In [None]:
# Ask the model a question through a text-generation pipeline.
user_prompt = "Please tell me about Bursitis"

# max_length caps the total length of the generated sequence at 300.
text_generation_pipe = pipeline(
    task="text-generation",
    model=llama_model,
    tokenizer=llama_tokenizer,
    max_length=300,
)

# Wrap the question in the <s>[INST] ... [/INST] instruction template.
generation_result = text_generation_pipe(f"<s>[INST] {user_prompt} [/INST]")

# The pipeline returns a list of results; print the text of the first one.
# (A stray trailing "S" after this call previously made the cell a syntax error.)
print(generation_result[0]['generated_text'])