# Instruction Finetuning using QLoRA

This notebook shows how to perform instruction fine-tuning with the QLoRA PEFT method. The task is supervised fine-tuning (SFT) of a code LLM for function calling. The text refers to CodeLlama, but the cells below load `Qwen/Qwen2.5-Coder-7B-Instruct`.

In [None]:
import os
# Set the WANDB_PROJECT environment variable before the training libraries
# below are imported; the run later reports to "wandb" via TrainingArguments.
# NOTE(review): the project name says "codellama" but the model loaded in this
# notebook is a Qwen2.5-Coder checkpoint - confirm the name is intended.
os.environ["WANDB_PROJECT"]="codellama_instruct_finetuning"

from enum import Enum
from functools import partial
import pandas as pd
import torch
import json

from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, BitsAndBytesConfig, set_seed
from datasets import load_dataset
from trl import SFTTrainer
from peft import get_peft_model, LoraConfig, TaskType

# Fixed seed handed to transformers.set_seed so repeated runs start from the
# same random state.
seed = 42
set_seed(seed)

## Data preprocessing

In [None]:
# Identifiers passed to `from_pretrained` / `load_dataset` further down.
model_name = "Qwen/Qwen2.5-Coder-7B-Instruct"
dataset_name = "heegyu/glaive-function-calling-v2-formatted"

# Jinja chat template: every message is rendered as
# `<|im_start|>{role}\n{content}<|im_end|>\n`; when `add_generation_prompt`
# is set, `<|im_start|>assistant\n` is emitted after the last message.
template = """{% for message in messages %}\n{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% if loop.last and add_generation_prompt %}{{'<|im_start|>assistant\n' }}{% endif %}{% endfor %}"""

# Load the checkpoint's tokenizer and replace its chat template with ours.
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.chat_template = template

def preprocess(samples):
    """Render every conversation of a batch into one chat-formatted string.

    For each row a system message is built from the row's system prompt and
    its function description, placed in front of the conversation, and the
    result is rendered with the module-level ``tokenizer``'s chat template.

    Args:
        samples: A batch mapping column names to equally long lists. The
            columns ``"system_message"``, ``"function_description"`` and
            ``"conversations"`` are read; each conversation is expected to be
            a sequence of message dicts accepted by the chat template.

    Returns:
        A dict with the single key ``"content"`` holding one rendered string
        per input row, in input order.
    """
    batch = []
    for system_prompt, function_desc, conversation in zip(
        samples["system_message"],
        samples["function_description"],
        samples["conversations"],
    ):
        # The description is wrapped in brackets so that one or several
        # comma-separated JSON objects parse as a single JSON array.
        wrapped_desc = f"[{function_desc}]"
        try:
            function_desc_formatted = json.dumps(
                json.loads(wrapped_desc), indent=2, sort_keys=True
            )
        except (ValueError, RecursionError):
            # Not valid JSON (json.JSONDecodeError is a ValueError) or nested
            # too deeply to parse: fall back to the raw text instead of
            # dropping the row. Other exceptions are left to propagate.
            function_desc_formatted = wrapped_desc
        system_message = {"role": "system", "content": f"{system_prompt}\nfunctions: {function_desc_formatted}"}
        # Build a new list rather than inserting into `conversation`, so the
        # caller's batch is not mutated.
        messages = [system_message, *conversation]
        batch.append(tokenizer.apply_chat_template(messages, tokenize=False))
    return {"content": batch}

# Load the raw dataset, then swap all of its original columns for the single
# rendered "content" column produced by `preprocess`.
dataset = load_dataset(dataset_name)
original_columns = dataset["train"].column_names
dataset = dataset.map(preprocess, batched=True, remove_columns=original_columns)

# Hold out 10% of the "train" split as the "test" split.
dataset = dataset["train"].train_test_split(test_size=0.1)

# Inspect the resulting splits and one rendered example.
print(dataset)
print(dataset["train"][0])

In [None]:
from datasets import DatasetDict

# Expose the rendered conversations under the column name "text" in every
# split. The trainer below is created without an explicit text field, so it
# presumably relies on this name - confirm against the installed TRL version.
dataset = dataset.rename_column("content", "text")

# Show the splits again to confirm the new column name.
print(dataset)

In [None]:
# Number of examples in the training split.
print(dataset["train"].num_rows)

## Create the PEFT model

In [None]:
# LoRA adapter settings for a causal language model. Besides the attention
# and MLP projections, the adapter also targets `embed_tokens` and `lm_head`.
peft_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=8,
    lora_alpha=16,
    lora_dropout=0.1,
    target_modules=[
        "gate_proj",
        "q_proj",
        "lm_head",
        "o_proj",
        "k_proj",
        "embed_tokens",
        "down_proj",
        "up_proj",
        "v_proj",
    ],
)

In [None]:
# 4-bit quantization settings used when loading the base model: "nf4"
# quantization type, double quantization enabled, bfloat16 compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

In [None]:
class ChatmlSpecialTokens(str, Enum):
    """ChatML marker strings used as special tokens.

    Members are ``str`` instances, so they compare equal to their raw text.
    Declaration order is significant: ``list()`` returns values in it.
    """

    # Headers that open a turn for a given role.
    user = "<|im_start|>user"
    assistant = "<|im_start|>assistant"
    system = "<|im_start|>system"
    function_call = "<|im_start|>function-call"
    function_response = "<|im_start|>function-response"

    # End-of-turn, beginning-of-sequence and padding markers.
    eos_token = "<|im_end|>"
    bos_token = "<s>"
    pad_token = "<pad>"

    @classmethod
    def list(cls):
        """Return the string value of every member, in declaration order."""
        return [token.value for token in cls]

# Reload the tokenizer, this time overriding its pad/bos/eos tokens and
# registering every ChatML marker as an additional special token.
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    trust_remote_code=True,
    additional_special_tokens=ChatmlSpecialTokens.list(),
    eos_token=ChatmlSpecialTokens.eos_token.value,
    bos_token=ChatmlSpecialTokens.bos_token.value,
    pad_token=ChatmlSpecialTokens.pad_token.value,
)
# The freshly loaded tokenizer carries the checkpoint's own template again,
# so the custom chat template has to be re-applied.
tokenizer.chat_template = template

# Load the base model with the 4-bit quantization config, automatic device
# placement and the FlashAttention-2 attention implementation.
# NOTE(review): no torch_dtype is passed here - confirm the dtype chosen for
# the non-quantized modules suits flash_attention_2 and the bf16 training run.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    attn_implementation="flash_attention_2",
    device_map="auto",
    quantization_config=bnb_config,
)

# Resize the embeddings to the tokenizer's current length (which includes the
# added special tokens), padded up to a multiple of 8.
model.resize_token_embeddings(len(tokenizer), pad_to_multiple_of=8)


In [None]:
# Bare expression as the last statement of the cell: the notebook displays
# the model's representation as the cell output.
model

## Training

In [None]:
# Run identity and length.
output_dir = "Qwen2.5-Coder-7B_function_calling_instruct"
num_train_epochs = 1

# Batching: 2 sequences per device step, accumulated over 4 steps
# (8 sequences per device per optimizer update).
per_device_train_batch_size = 2
per_device_eval_batch_size = 2
gradient_accumulation_steps = 4

# Optimisation schedule.
learning_rate = 5e-4
lr_scheduler_type = "cosine"
warmup_ratio = 0.1
max_grad_norm = 1.0

# Logging cadence, in steps.
logging_steps = 5

# Defined for reference; it is not passed to TrainingArguments below.
max_seq_length = 2048

training_arguments = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=num_train_epochs,
    # Batching
    per_device_train_batch_size=per_device_train_batch_size,
    per_device_eval_batch_size=per_device_eval_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    # Optimisation
    learning_rate=learning_rate,
    lr_scheduler_type=lr_scheduler_type,
    warmup_ratio=warmup_ratio,
    weight_decay=0.1,
    max_grad_norm=max_grad_norm,
    bf16=True,
    # Memory: gradient checkpointing in non-reentrant mode
    gradient_checkpointing=True,
    gradient_checkpointing_kwargs={"use_reentrant": False},
    # Evaluation, checkpointing and reporting
    evaluation_strategy="epoch",
    save_strategy="no",
    logging_steps=logging_steps,
    report_to=["tensorboard", "wandb"],
    # Hub upload to a private repository
    push_to_hub=True,
    hub_private_repo=True,
)


In [None]:
# Number of examples to keep from each split after shuffling.
subset_sizes = {"train": 5000, "test": 500}

# Shuffle each split with a fixed seed and keep its first `size` rows.
subset_dataset = DatasetDict({
    split: dataset[split].shuffle(seed=42).select(range(size))
    for split, size in subset_sizes.items()
})

# Per-split handles, kept for cells that refer to the subsets by name.
train_subset = subset_dataset["train"]
test_subset = subset_dataset["test"]

In [None]:
# Build the supervised fine-tuning trainer on the down-sampled splits.
# Not set here, so the trainer's defaults apply: packing (True was tried),
# dataset_text_field ("content" was tried; the column is now "text"),
# max_seq_length, and dataset_kwargs (append_concat_token=False,
# add_special_tokens=False).
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    peft_config=peft_config,
    args=training_arguments,
    train_dataset=subset_dataset["train"],
    eval_dataset=subset_dataset["test"],
)

In [None]:
# Run the fine-tuning loop with the arguments configured above.
trainer.train()
# Persist the trained model. The training arguments use save_strategy="no",
# so this is the only explicit save in the notebook.
# NOTE(review): with push_to_hub=True this presumably also uploads to the Hub
# - confirm against the installed transformers version.
trainer.save_model()