#### Setup and Environment Creation

In [None]:
'''
    Techniques we will be using:
    1. PEFT - Parameter-Efficient Fine-Tuning
    2. LoRA - Low-Rank Adaptation of Large Language Models
'''

In [None]:
# pip install -q accelerate peft bitsandbytes transformers trl
# Create a conda environment and run 
# pip install -r requirements.txt

# Python version used here: 3.12.4. Training runs with GPU acceleration.

#### Import all the required libraries

In [1]:
import os
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
# during transfer learning peft freezes the weights and only some weights will be retrained
from peft import LoraConfig, PeftModel 
from trl import SFTTrainer

  from .autonotebook import tqdm as notebook_tqdm


#### We will reformat our dataset to follow Llama 2 template

In [2]:
'''
    Llama 2 template
    
    <s>[INST] <<SYS>>
    System prompt
    <</SYS>>

    User prompt [/INST] Model answer </s>
'''

'\n    Llama 2 template\n    \n    <s>[INST] <<SYS>>\n    System prompt\n    <</SYS>>\n\n    User prompt [/INST] Model answer </s>\n'

In [3]:
# ---------------------------------------------------------------------------
# Model and dataset identifiers
# ---------------------------------------------------------------------------
# Base checkpoint that will be fine-tuned.
model_name = 'NousResearch/Llama-2-7b-chat-hf'
# Instruction dataset used for training.
dataset_name = 'mlabonne/guanaco-llama2-1k'
# Name for the fine-tuned model.
new_model = 'Llama-2-7b-chat-finetune'

# ---------------------------------------------------------------------------
# QLoRA parameters
# ---------------------------------------------------------------------------
# LoRA attention dimension (rank).
lora_r = 64
# LoRA scaling factor.
lora_alpha = 16
# Dropout probability applied in the LoRA layers.
lora_dropout = 0.1

# ---------------------------------------------------------------------------
# bitsandbytes parameters
# ---------------------------------------------------------------------------
# Load the base model in 4-bit precision.
use_4bit = True
# Name of the torch dtype used for computation with the 4-bit base model.
bnb_4bit_compute_dtype = 'float16'
# Quantization type: 'fp4' or 'nf4'.
bnb_4bit_quant_type = 'nf4'
# Nested (double) quantization.
use_nested_quant = False

# ---------------------------------------------------------------------------
# Training arguments
# ---------------------------------------------------------------------------
# Directory where model predictions and checkpoints are written.
output_dir = './results'
# Number of training epochs.
num_train_epochs = 1
# Mixed-precision training switches (both disabled here).
fp16 = False
bf16 = False
# Per-GPU batch size for training.
per_device_train_batch_size = 2
# Per-GPU batch size for evaluation.
per_device_eval_batch_size = 2
# Number of update steps over which gradients are accumulated.
gradient_accumulation_steps = 1
# Enable gradient checkpointing.
gradient_checkpointing = True
# Maximum gradient norm (gradient clipping).
max_grad_norm = 0.3
# Initial learning rate for the AdamW optimizer.
learning_rate = 2e-4
# Weight decay applied to all layers.
weight_decay = 0.001
# Optimizer to use.
optim = 'paged_adamw_32bit'
# Learning-rate schedule.
lr_scheduler_type = 'cosine'
# Number of training steps; overrides num_train_epochs (-1 leaves it unset).
max_steps = -1
# Ratio of steps used for a linear learning-rate warmup starting from 0.
warmup_ratio = 0.03
# Group sequences of similar length into the same batch; saves memory and
# speeds up training.
group_by_length = True
# Save a checkpoint every X update steps.
save_steps = 0
# Log every X update steps.
logging_steps = 25

# ---------------------------------------------------------------------------
# SFT parameters
# ---------------------------------------------------------------------------
# Maximum sequence length to use.
max_seq_length = None
# Pack multiple short examples into the same input sequence to increase efficiency.
packing = False
# Load the entire model on GPU 0.
device_map = {'': 0}

#### Load everything and start finetuning process

1. First of all, we want to load the dataset we defined. Here, our dataset is already preprocessed but, usually, this is where you would reformat the prompt, filter out bad text, combine multiple datasets, etc.


2. Then, we’re configuring bitsandbytes for 4-bit quantization.


3. Next, we're loading the Llama 2 model in 4-bit precision on a GPU with the corresponding tokenizer.


4. Finally, we're loading configurations for QLoRA, regular training parameters, and passing everything to the SFTTrainer. The training can finally start!

In [None]:
# End-to-end fine-tuning cell: load the dataset, build the 4-bit quantization
# config, load the base model and tokenizer, configure LoRA and the training
# arguments, then run supervised fine-tuning. Every setting comes from the
# configuration cell above.

## Load dataset
# The "text" column of this dataset is what SFTTrainer consumes below.
dataset = load_dataset(dataset_name, split='train')

## Configure bitsandbytes 4-bit quantization
# Resolve the dtype name from the config cell (e.g. 'float16') to the torch dtype object.
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=use_nested_quant,
)

# Print a hint when float16 compute is configured on a GPU whose major compute
# capability is >= 8. The is_available() guard keeps the capability query from
# raising on a machine without CUDA.
if compute_dtype == torch.float16 and use_4bit and torch.cuda.is_available():
    major, _ = torch.cuda.get_device_capability()
    if major >= 8:
        print("=" * 80)
        print("Your GPU supports bfloat16: accelerate training with bf16=True")
        print("=" * 80)

## Load base model
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map=device_map
)
# Disable the generation cache for training.
model.config.use_cache = False
# NOTE(review): presumably selects the non-tensor-parallel code path of the
# Llama implementation -- confirm against the model config documentation.
model.config.pretraining_tp = 1

## Load Llama tokenizer
# NOTE(review): trust_remote_code=True permits executing code shipped with the
# checkpoint; confirm it is actually required for this tokenizer.
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token
# Right-side padding works around a weird issue observed with fp16 training.
tokenizer.padding_side = 'right'

## Load LoRA configuration
peft_config = LoraConfig(
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    r=lora_r,
    bias="none",
    task_type="CAUSAL_LM",
)

## Set training arguments
# per_device_eval_batch_size and gradient_checkpointing are defined in the
# configuration cell; they are forwarded here so those settings take effect
# instead of being silently ignored.
training_arguments = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=per_device_train_batch_size,
    per_device_eval_batch_size=per_device_eval_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    gradient_checkpointing=gradient_checkpointing,
    optim=optim,
    save_steps=save_steps,
    logging_steps=logging_steps,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    fp16=fp16,
    bf16=bf16,
    max_grad_norm=max_grad_norm,
    max_steps=max_steps,
    warmup_ratio=warmup_ratio,
    group_by_length=group_by_length,
    lr_scheduler_type=lr_scheduler_type,
    report_to="tensorboard"
)

## Set supervised fine-tuning parameters
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    peft_config=peft_config,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    tokenizer=tokenizer,
    args=training_arguments,
    packing=packing,
)

## Train
trainer.train()