<a href="https://colab.research.google.com/github/ThisIsFarhan/QLoRA-FineTuning-LLM/blob/main/QLoRA_FineTuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install accelerate peft bitsandbytes transformers trl

Collecting bitsandbytes
  Downloading bitsandbytes-0.45.5-py3-none-manylinux_2_24_x86_64.whl.metadata (5.0 kB)
Collecting trl
  Downloading trl-0.16.1-py3-none-any.whl.metadata (12 kB)
Collecting datasets>=3.0.0 (from trl)
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets>=3.0.0->trl)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets>=3.0.0->trl)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets>=3.0.0->trl)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets>=3.0.0->trl)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x8

In [None]:
# from huggingface_hub import notebook_login
# notebook_login()

In [2]:
# load the required packages.

import torch
from datasets import load_dataset, Dataset
from peft import LoraConfig, AutoPeftModelForCausalLM
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments
from trl import SFTTrainer
import os

In [3]:
# Run configuration shared by the cells below.
dataset = "latest_dataset.json"           # JSON file with the training records
model_id = "Qwen/Qwen2.5-0.5B-Instruct"   # checkpoint that gets fine-tuned
output_model = "qwen2.5-BlindAssistance"  # used as the trainer's output_dir

### Data preparation

In [4]:
# We need to reformat the data into the ChatML format.

def formatted_train(input, response) -> str:
    """Build one training example: a user turn followed by the assistant reply.

    Args:
        input: Text placed in the user turn.
        response: Text placed after the assistant header.

    Returns:
        The two turns separated by a newline. Only ``<|im_start|>`` markers are
        emitted; no ``<|im_end|>`` terminator is appended to either turn.
    """
    user_turn = "<|im_start|>user\n" + f"{input}"
    assistant_turn = "<|im_start|>assistant\n" + f"{response}"
    return user_turn + "\n" + assistant_turn

In [5]:
def prepare_train_data(data_id):
    """Load the JSON training file and add a ``text`` column holding the prompt.

    Args:
        data_id: Value passed to ``load_dataset`` as ``data_files`` (the path
            of the JSON file with ``input`` and ``output`` fields).

    Returns:
        The ``train`` split as a ``datasets.Dataset`` whose rows keep their
        original columns and gain a ``text`` column built from ``input`` and
        ``output`` by ``formatted_train``.
    """
    data = load_dataset("json", data_files=data_id, split="train")
    # Reuse the shared template helper so the training text format is defined
    # in exactly one place, and map directly on the Dataset instead of
    # round-tripping through a pandas DataFrame.
    return data.map(
        lambda row: {"text": formatted_train(row["input"], row["output"])}
    )

In [6]:
# Build the training dataset (original columns plus the formatted `text` column).
data = prepare_train_data(dataset)

Generating train split: 0 examples [00:00, ? examples/s]

In [7]:
# Show the dataset summary (column names and row count).
data

Dataset({
    features: ['id', 'input', 'output', 'text'],
    num_rows: 3018
})

In [8]:
# Inspect one record to check that the `text` column has the expected layout.
data[1030]

{'id': 1031,
 'input': 'left_input: bird, cat\nright_input: car\nup_input: traffic light, building\nbottom_input: person, bike',
 'output': 'In the distance, a traffic light and building stand tall. In front, a person rides a bike, while a bird and cat sit on a nearby bench to the left, and a car drives by on the right.',
 'text': '<|im_start|>user\nleft_input: bird, cat\nright_input: car\nup_input: traffic light, building\nbottom_input: person, bike\n<|im_start|>assistant\nIn the distance, a traffic light and building stand tall. In front, a person rides a bike, while a bird and cat sit on a nearby bench to the left, and a car drives by on the right.'}

### Load the Model (the Instruct variant, not the base version)

In [9]:
def get_model_and_tokenizer(mode_id):
    """Load a 4-bit quantized causal LM and its tokenizer for QLoRA training.

    Args:
        mode_id: Model identifier or path accepted by ``from_pretrained``.

    Returns:
        A ``(model, tokenizer)`` tuple. The model is loaded with NF4 4-bit
        quantization (double quantization, float16 compute dtype) and
        ``device_map="auto"``; its config has ``use_cache`` disabled.
    """
    tokenizer = AutoTokenizer.from_pretrained(mode_id)
    # Fall back to EOS only when the checkpoint defines no pad token.
    # Overwriting an existing pad token makes padding indistinguishable from a
    # genuine end-of-sequence token.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype="float16",
        bnb_4bit_use_double_quant=True,
    )
    model = AutoModelForCausalLM.from_pretrained(
        mode_id, quantization_config=bnb_config, device_map="auto"
    )
    # The generation cache is switched off for the training run.
    model.config.use_cache = False
    # NOTE(review): `pretraining_tp` looks carried over from Llama-style
    # recipes; presumably it has no effect for this architecture - confirm.
    model.config.pretraining_tp = 1
    return model, tokenizer

In [None]:
# !pip install -i https://test.pypi.org/simple/bitsandbytes

In [10]:
# Load the tokenizer and the 4-bit quantized model for `model_id`.
model, tokenizer = get_model_and_tokenizer(model_id)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/7.30k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/659 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/988M [00:00<?, ?B/s]

Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.


generation_config.json:   0%|          | 0.00/242 [00:00<?, ?B/s]

### Setting up the LoRA

In [11]:
# LoRA adapter settings handed to the trainer through `peft_config`.
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
)

In [12]:
# Optimisation and checkpointing settings for the fine-tuning run.
training_arguments = TrainingArguments(
    output_dir=output_model,
    per_device_train_batch_size=16,
    gradient_accumulation_steps=4,  # 16 * 4 = 64 sequences per optimizer step per device
    optim="paged_adamw_32bit",
    learning_rate=2e-4,
    lr_scheduler_type="cosine",
    save_strategy="epoch",
    logging_steps=10,
    # A positive `max_steps` overrides `num_train_epochs`, so the former
    # `num_train_epochs=3` was dead configuration and has been removed.
    # The run length is exactly 250 optimizer steps.
    max_steps=250,
    fp16=True,
    # push_to_hub=True
)

In [14]:
# Wire the quantized model, the prepared dataset and the LoRA settings into
# TRL's supervised fine-tuning trainer.
trainer = SFTTrainer(
    model=model,
    args=training_arguments,
    train_dataset=data,
    peft_config=peft_config,
    # Options left disabled for this run:
    #   dataset_text_field="text"
    #   tokenizer=tokenizer
    #   packing=False
    #   max_seq_length=1024
)

Converting train dataset to ChatML:   0%|          | 0/3018 [00:00<?, ? examples/s]

Applying chat template to train dataset:   0%|          | 0/3018 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/3018 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/3018 [00:00<?, ? examples/s]

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [15]:
# Run the fine-tuning loop; the returned TrainOutput is shown as the cell result.
# NOTE(review): the recorded output shows an interactive wandb login prompt;
# setting `report_to` in TrainingArguments would presumably avoid it - confirm.
trainer.train()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mfarhanak128[0m ([33mfarhanak128-comsats-university-islamabad[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss
10,3.1496
20,2.4234
30,1.8348
40,1.4718
50,1.2208
60,1.0707
70,0.9914
80,0.939
90,0.8773
100,0.8299


TrainOutput(global_step=250, training_loss=1.0123772602081298, metrics={'train_runtime': 453.3732, 'train_samples_per_second': 35.291, 'train_steps_per_second': 0.551, 'total_flos': 3135532155858432.0, 'train_loss': 1.0123772602081298})

### Merging the LoRA with the base model

In [16]:
from peft import AutoPeftModelForCausalLM, PeftModel
from transformers import AutoModelForCausalLM
import torch
import os

# Reload the base checkpoint unquantized in float16 so the LoRA weights can be
# merged into it. `load_in_8bit=False` (already the default) and
# `trust_remote_code=True` were dropped: the same checkpoint is loaded earlier
# in this notebook without remote code.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.float16,
    device_map="auto",
)

# Derive the checkpoint location from the training output directory instead of
# a hard-coded absolute Colab path; 250 matches `max_steps` in TrainingArguments.
model_path = os.path.join(output_model, "checkpoint-250")

# `from_transformers=True` is not a documented PeftModel.from_pretrained
# parameter, so it is no longer passed.
peft_model = PeftModel.from_pretrained(model, model_path, device_map="auto")

# Fold the adapter weights into the base weights and drop the PEFT wrappers.
model = peft_model.merge_and_unload()

In [17]:
# Display the module tree of the merged model.
model

Qwen2ForCausalLM(
  (model): Qwen2Model(
    (embed_tokens): Embedding(151936, 896)
    (layers): ModuleList(
      (0-23): 24 x Qwen2DecoderLayer(
        (self_attn): Qwen2Attention(
          (q_proj): Linear(in_features=896, out_features=896, bias=True)
          (k_proj): Linear(in_features=896, out_features=128, bias=True)
          (v_proj): Linear(in_features=896, out_features=128, bias=True)
          (o_proj): Linear(in_features=896, out_features=896, bias=False)
        )
        (mlp): Qwen2MLP(
          (gate_proj): Linear(in_features=896, out_features=4864, bias=False)
          (up_proj): Linear(in_features=896, out_features=4864, bias=False)
          (down_proj): Linear(in_features=4864, out_features=896, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): Qwen2RMSNorm((896,), eps=1e-06)
        (post_attention_layernorm): Qwen2RMSNorm((896,), eps=1e-06)
      )
    )
    (norm): Qwen2RMSNorm((896,), eps=1e-06)
    (rotary_emb): Qwen2RotaryEmbe

### Inference from the LLM

In [18]:
from transformers import GenerationConfig
from time import perf_counter

def generate_response(user_input):
    """Generate a reply for ``user_input`` and print it with the elapsed time.

    Uses the notebook-level ``model``, ``tokenizer`` and ``formatted_prompt``.

    Args:
        user_input: Text inserted into the user turn of the prompt.

    Returns:
        None. The decoded sequence (prompt plus completion, special tokens
        stripped) and the wall-clock duration are printed.
    """
    prompt = formatted_prompt(user_input)

    # `penalty_alpha` (a contrastive-search setting) was removed: that decoding
    # mode is only selected when sampling is off, so with `do_sample=True` it
    # had no effect and merely obscured which strategy is used.
    generation_config = GenerationConfig(
        do_sample=True,
        top_k=5,
        temperature=0.1,
        repetition_penalty=1.2,
        max_new_tokens=500,
        pad_token_id=tokenizer.eos_token_id,
    )
    start_time = perf_counter()

    # Tokenize once (the earlier duplicate call was dead code) and put the
    # tensors on whatever device the model lives on rather than assuming CUDA.
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    outputs = model.generate(**inputs, generation_config=generation_config)
    print(tokenizer.decode(outputs[0], skip_special_tokens=True))
    output_time = perf_counter() - start_time
    print(f"Time taken for inference: {round(output_time,2)} seconds")

In [19]:
def formatted_prompt(question) -> str:
    """Wrap ``question`` in a user turn and open the assistant turn.

    Args:
        question: Text placed in the user turn.

    Returns:
        The prompt ending with the assistant header (no trailing newline), in
        the same marker layout used for the training text.
    """
    pieces = ("<|im_start|>user", f"{question}", "<|im_start|>assistant")
    return "\n".join(pieces)

In [20]:
# Try the merged model on one sample scene description.
generate_response(user_input='left_input: phone, chair, lamp\nright_input: cabinet, person, human\nup_input: mouse, glasses\nbottom_input: laptop, shelf')

user
left_input: phone, chair, lamp
right_input: cabinet, person, human
up_input: mouse, glasses
bottom_input: laptop, shelf
assistant
In the distance, a mouse and glasses are visible. On top of a table, there is a laptop with its user on it, while to the left lies a chair and lamp, and in front of them sits a person holding a human figure. To their right, a bookshelf contains books stacked neatly against the wall.
Time taken for inference: 5.56 seconds


In [21]:
# Persist the merged model and then the tokenizer into the same directory so
# the folder can be reloaded with `from_pretrained`.
for _artifact in (model, tokenizer):
    _artifact.save_pretrained("qwen2.5_finetuned_model")

print("Model and tokenizer saved successfully!")

Model and tokenizer saved successfully!


In [22]:
# Archive the saved model directory into a single zip file under /content.
!zip -r /content/qwen2.5_finetuned_model.zip /content/qwen2.5_finetuned_model

  adding: content/qwen2.5_finetuned_model/ (stored 0%)
  adding: content/qwen2.5_finetuned_model/vocab.json (deflated 61%)
  adding: content/qwen2.5_finetuned_model/added_tokens.json (deflated 67%)
  adding: content/qwen2.5_finetuned_model/model.safetensors (deflated 22%)
  adding: content/qwen2.5_finetuned_model/tokenizer_config.json (deflated 83%)
  adding: content/qwen2.5_finetuned_model/generation_config.json (deflated 39%)
  adding: content/qwen2.5_finetuned_model/special_tokens_map.json (deflated 63%)
  adding: content/qwen2.5_finetuned_model/merges.txt (deflated 57%)
  adding: content/qwen2.5_finetuned_model/tokenizer.json (deflated 81%)
  adding: content/qwen2.5_finetuned_model/config.json (deflated 48%)


In [23]:
# Mount Google Drive into the Colab filesystem at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

# Copy the zipped model into the top level of the mounted Drive (MyDrive).
!cp /content/qwen2.5_finetuned_model.zip /content/drive/MyDrive/

Mounted at /content/drive


# Loading Saved Model

In [28]:
from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig
# Reload the merged model and tokenizer from the directory written above.
model_path = "qwen2.5_finetuned_model"  # Path to your saved model directory
model = AutoModelForCausalLM.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)
user_input = "left_input: tree\nright_input: bench\nup_input: kite\nbottom_input: frisbee, dog"

# Same marker layout as the training text: user turn, then an open assistant turn.
prompt = f"<|im_start|>user\n{user_input}\n<|im_start|>assistant"
# Tokenize and keep the tensors on the same device as the model.
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

# `penalty_alpha` (a contrastive-search setting) was removed: with
# `do_sample=True` the sampling strategy is used and the value had no effect.
generation_config = GenerationConfig(
    do_sample=True,
    top_k=5,
    temperature=0.5,
    repetition_penalty=1.2,
    max_new_tokens=60,
    pad_token_id=tokenizer.eos_token_id,
)

outputs = model.generate(**inputs, generation_config=generation_config)  # Generate text
response = tokenizer.decode(outputs[0], skip_special_tokens=True)  # Decode the output
print(response)

user
left_input: tree
right_input: bench
up_input: kite
bottom_input: frisbee, dog
assistant
In the distance, a kite dances in the wind. On top of the hill, a small dog plays with its toy while a left input points to a tall tree and right input sits on a bench.
