<a href="https://colab.research.google.com/github/ThisIsFarhan/QLoRA-FineTuning-LLM/blob/main/QLoRA_FineTuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install accelerate peft bitsandbytes transformers trl

Collecting bitsandbytes
  Using cached bitsandbytes-0.45.2-py3-none-manylinux_2_24_x86_64.whl.metadata (5.8 kB)
Collecting trl
  Using cached trl-0.15.1-py3-none-any.whl.metadata (11 kB)
Collecting datasets>=2.21.0 (from trl)
  Using cached datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets>=2.21.0->trl)
  Using cached dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets>=2.21.0->trl)
  Using cached xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets>=2.21.0->trl)
  Using cached multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Using cached nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Using cached nvidia_cuda_runtime_cu12-12.4.127-py3-

In [None]:
# from huggingface_hub import notebook_login
# notebook_login()

In [3]:
# load the required packages.

import torch
from datasets import load_dataset, Dataset
from peft import LoraConfig, AutoPeftModelForCausalLM
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments
from trl import SFTTrainer
import os

In [4]:
dataset="scene_descriptions_dataset.json"
model_id="Qwen/Qwen2.5-0.5B-Instruct"
output_model="qwen2.5-BlindAssistance"

### Data preparation

In [5]:
# we need to reformat the data in teh ChatML format.

def formatted_train(input,response)->str:
    return f"<|im_start|>user\n{input}\n<|im_start|>assistant\n{response}"

In [6]:
def prepare_train_data(data_id):
    data = load_dataset("json", data_files=data_id, split="train")
    data_df = data.to_pandas()
    data_df["text"] = data_df[["input", "output"]].apply(lambda x: "<|im_start|>user\n" + x["input"] + "\n<|im_start|>assistant\n" + x["output"], axis=1)
    data = Dataset.from_pandas(data_df)
    return data

In [7]:
data = prepare_train_data(dataset)

Generating train split: 0 examples [00:00, ? examples/s]

In [8]:
data

Dataset({
    features: ['id', 'input', 'output', 'text'],
    num_rows: 1000
})

In [9]:
data[0]

{'id': 1,
 'input': 'left_input: phone, chair, lamp\nright_input: cabinet, person, human\nup_input: mouse, glasses\nbottom_input: laptop, shelf',
 'output': 'In the left side area, there were a phone, a chair, and a lamp. In the right area, there were a cabinet, a person, and a human. In the above area, there were a mouse and a glasses. a laptop and a shelf were detected in the below section.',
 'text': '<|im_start|>user\nleft_input: phone, chair, lamp\nright_input: cabinet, person, human\nup_input: mouse, glasses\nbottom_input: laptop, shelf\n<|im_start|>assistant\nIn the left side area, there were a phone, a chair, and a lamp. In the right area, there were a cabinet, a person, and a human. In the above area, there were a mouse and a glasses. a laptop and a shelf were detected in the below section.'}

### Model the Model (not the base version)

In [10]:
def get_model_and_tokenizer(mode_id):

    tokenizer = AutoTokenizer.from_pretrained(mode_id)
    tokenizer.pad_token = tokenizer.eos_token
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype="float16", bnb_4bit_use_double_quant=True
    )
    model = AutoModelForCausalLM.from_pretrained(
        mode_id, quantization_config=bnb_config, device_map="auto"
    )
    model.config.use_cache=False
    model.config.pretraining_tp=1
    return model, tokenizer

In [None]:
# !pip install -i https://test.pypi.org/simple/bitsandbytes

In [11]:
model, tokenizer = get_model_and_tokenizer(model_id)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/7.30k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/659 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/988M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/242 [00:00<?, ?B/s]

### Setting up the LoRA

In [12]:
peft_config = LoraConfig(
        r=8, lora_alpha=16, lora_dropout=0.05, bias="none", task_type="CAUSAL_LM"
    )

In [13]:
training_arguments = TrainingArguments(
        output_dir=output_model,
        per_device_train_batch_size=16,
        gradient_accumulation_steps=4,
        optim="paged_adamw_32bit",
        learning_rate=2e-4,
        lr_scheduler_type="cosine",
        save_strategy="epoch",
        logging_steps=10,
        num_train_epochs=3,
        max_steps=250,
        fp16=True,
        # push_to_hub=True
    )

In [14]:
trainer = SFTTrainer(
        model=model,
        train_dataset=data,
        peft_config=peft_config,
        #dataset_text_field="text",
        args=training_arguments,
        tokenizer=tokenizer,
        #packing=False,
        #max_seq_length=1024
    )

  trainer = SFTTrainer(


Converting train dataset to ChatML:   0%|          | 0/1000 [00:00<?, ? examples/s]

Applying chat template to train dataset:   0%|          | 0/1000 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/1000 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [15]:
trainer.train()



<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mfarhanak128[0m ([33mfarhanak128-comsats-university-islamabad[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


Step,Training Loss
10,3.4219
20,2.6369
30,1.9838
40,1.5927
50,1.3199
60,1.133
70,0.9682
80,0.8535
90,0.7818
100,0.7222


TrainOutput(global_step=250, training_loss=0.9801149997711182, metrics={'train_runtime': 535.1351, 'train_samples_per_second': 29.899, 'train_steps_per_second': 0.467, 'total_flos': 3075267126226944.0, 'train_loss': 0.9801149997711182})

### Merging the LoRA with the base model

In [16]:
from peft import AutoPeftModelForCausalLM, PeftModel
from transformers import AutoModelForCausalLM
import torch
import os

model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16, load_in_8bit=False,
                                             device_map="auto",
                                             trust_remote_code=True)

model_path = "/content/qwen2.5-BlindAssistance/checkpoint-250"

peft_model = PeftModel.from_pretrained(model, model_path, from_transformers=True, device_map="auto")

model = peft_model.merge_and_unload()

In [17]:
model

Qwen2ForCausalLM(
  (model): Qwen2Model(
    (embed_tokens): Embedding(151936, 896)
    (layers): ModuleList(
      (0-23): 24 x Qwen2DecoderLayer(
        (self_attn): Qwen2Attention(
          (q_proj): Linear(in_features=896, out_features=896, bias=True)
          (k_proj): Linear(in_features=896, out_features=128, bias=True)
          (v_proj): Linear(in_features=896, out_features=128, bias=True)
          (o_proj): Linear(in_features=896, out_features=896, bias=False)
        )
        (mlp): Qwen2MLP(
          (gate_proj): Linear(in_features=896, out_features=4864, bias=False)
          (up_proj): Linear(in_features=896, out_features=4864, bias=False)
          (down_proj): Linear(in_features=4864, out_features=896, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): Qwen2RMSNorm((896,), eps=1e-06)
        (post_attention_layernorm): Qwen2RMSNorm((896,), eps=1e-06)
      )
    )
    (norm): Qwen2RMSNorm((896,), eps=1e-06)
    (rotary_emb): Qwen2RotaryEmbe

### Inference from the LLM

In [28]:
from transformers import GenerationConfig
from time import perf_counter

def generate_response(user_input):

  prompt = formatted_prompt(user_input)

  inputs = tokenizer([prompt], return_tensors="pt")
  generation_config = GenerationConfig(penalty_alpha=0.6,do_sample = True,
      top_k=5,temperature=0.5,repetition_penalty=1.2,
      max_new_tokens=500,pad_token_id=tokenizer.eos_token_id
  )
  start_time = perf_counter()

  inputs = tokenizer(prompt, return_tensors="pt").to('cuda')

  outputs = model.generate(**inputs, generation_config=generation_config)
  print(tokenizer.decode(outputs[0], skip_special_tokens=True))
  output_time = perf_counter() - start_time
  print(f"Time taken for inference: {round(output_time,2)} seconds")

In [29]:
def formatted_prompt(question)-> str:
    return f"<|im_start|>user\n{question}\n<|im_start|>assistant"

In [30]:
generate_response(user_input='left_input: phone, chair, lamp\nright_input: cabinet, person, human\nup_input: mouse, glasses\nbottom_input: laptop, shelf')



RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cpu and cuda:0! (when checking argument for argument index in method wrapper_CUDA__index_select)

In [21]:
model.save_pretrained("qwen2.5_finetuned_model")
tokenizer.save_pretrained("qwen2.5_finetuned_model")

print("Model and tokenizer saved successfully!")

Model and tokenizer saved successfully!


In [22]:
!zip -r /content/qwen2.5_finetuned_model.zip /content/qwen2.5_finetuned_model

  adding: content/qwen2.5_finetuned_model/ (stored 0%)
  adding: content/qwen2.5_finetuned_model/added_tokens.json (deflated 67%)
  adding: content/qwen2.5_finetuned_model/model.safetensors (deflated 22%)
  adding: content/qwen2.5_finetuned_model/tokenizer_config.json (deflated 83%)
  adding: content/qwen2.5_finetuned_model/special_tokens_map.json (deflated 63%)
  adding: content/qwen2.5_finetuned_model/tokenizer.json (deflated 81%)
  adding: content/qwen2.5_finetuned_model/generation_config.json (deflated 39%)
  adding: content/qwen2.5_finetuned_model/config.json (deflated 47%)
  adding: content/qwen2.5_finetuned_model/vocab.json (deflated 61%)
  adding: content/qwen2.5_finetuned_model/merges.txt (deflated 57%)


In [24]:
from google.colab import drive
drive.mount('/content/drive')

!cp /content/qwen2.5_finetuned_model.zip /content/drive/MyDrive/

Mounted at /content/drive


#Loading Saved Model

In [27]:
from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig
model_path = "qwen2.5_finetuned_model"  # Path to your saved model directory
model = AutoModelForCausalLM.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)
user_input = "left_input: phone, chair, lamp\nright_input: cabinet, person, human\nup_input: mouse, glasses\nbottom_input: laptop, shelf"

prompt = f"<|im_start|>user\n{user_input}\n<|im_start|>assistant"  # Format the prompt
inputs = tokenizer(prompt, return_tensors="pt")  # Tokenize the prompt

generation_config = GenerationConfig(penalty_alpha=0.6,do_sample = True,
      top_k=5,temperature=0.5,repetition_penalty=1.2,
      max_new_tokens=100,pad_token_id=tokenizer.eos_token_id
    )

outputs = model.generate(**inputs, generation_config=generation_config)  # Generate text
response = tokenizer.decode(outputs[0], skip_special_tokens=True)  # Decode the output
print(response)

user
left_input: phone, chair, lamp
right_input: cabinet, person, human
up_input: mouse, glasses
bottom_input: laptop, shelf
assistant

The left side portion contains a phone, a chair, and a lamp. The right area has a cabinet, a person, and a human.
a mouse and a glass were observed above.
lower half:
- laptop,
- shelf
