<a href="https://colab.research.google.com/github/ThisIsFarhan/LoRA-FineTuning-LLM/blob/main/LoRA_FineTuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [13]:
!pip install accelerate peft bitsandbytes transformers trl



In [14]:
# from huggingface_hub import notebook_login
# notebook_login()

In [15]:
# load the required packages.

import torch
from datasets import load_dataset, Dataset
from peft import LoraConfig, AutoPeftModelForCausalLM
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments
from trl import SFTTrainer
import os

In [16]:
# JSON file holding the training examples (read with load_dataset("json", ...)).
dataset = "scene_descriptions_dataset.json"

# Identifier of the pretrained model passed to from_pretrained().
model_id = "cognitivecomputations/TinyDolphin-2.8-1.1b"

# Name used as the trainer's output directory.
output_model = "TinyDolphin-BlindAssistance"

### Data preparation

In [17]:
# Each training example is rendered as a single user turn followed by a
# single assistant turn.
# NOTE(review): this was originally described as "ChatML", but the markers used
# here are <|user|> / <|assistant|> / </s> -- confirm this is the template the
# base model expects.

def formatted_train(input, response) -> str:
    """Render one (input, response) pair as a training string.

    Args:
        input: Text placed in the user turn.
        response: Text placed in the assistant turn.

    Returns:
        The prompt string: a ``<|user|>`` turn and an ``<|assistant|>`` turn,
        each terminated by ``</s>``.
    """
    template = "<|user|>\n{}</s>\n<|assistant|>\n{}</s>"
    return template.format(input, response)

In [18]:
def prepare_train_data(data_id):
    """Load the JSON training file and add the formatted ``text`` column.

    The prompt layout comes from ``formatted_train`` so the training template
    is defined in exactly one place.

    Args:
        data_id: Path (or list of paths) to the JSON data file(s); forwarded
            to ``load_dataset`` as ``data_files``.

    Returns:
        The ``train`` split as a ``Dataset`` with an extra ``text`` column
        holding the fully formatted example for each row.

    Raises:
        KeyError: If the file has no ``input`` or ``output`` column.
    """
    data = load_dataset("json", data_files=data_id, split="train")

    # Fail early with a clear message instead of deep inside the mapping step.
    missing = {"input", "output"} - set(data.column_names)
    if missing:
        raise KeyError(f"dataset is missing required column(s): {sorted(missing)}")

    # Dataset.map adds the column directly; no pandas round trip is needed.
    return data.map(
        lambda row: {"text": formatted_train(row["input"], row["output"])}
    )

In [19]:
# Build the training dataset (with the formatted "text" column) from the JSON file.
data = prepare_train_data(dataset)

In [20]:
# Show the dataset summary (feature names and row count).
data

Dataset({
    features: ['id', 'input', 'output', 'text'],
    num_rows: 1000
})

In [21]:
# Inspect the first record, including the generated "text" field.
data[0]

{'id': 1,
 'input': 'left_input: phone, chair, lamp\nright_input: cabinet, person, human\nup_input: mouse, glasses\nbottom_input: laptop, shelf',
 'output': 'In the left side area, there were a phone, a chair, and a lamp. In the right area, there were a cabinet, a person, and a human. In the above area, there were a mouse and a glasses. a laptop and a shelf were detected in the below section.',
 'text': '<|user|>\nleft_input: phone, chair, lamp\nright_input: cabinet, person, human\nup_input: mouse, glasses\nbottom_input: laptop, shelf</s>\n<|assistant|>\nIn the left side area, there were a phone, a chair, and a lamp. In the right area, there were a cabinet, a person, and a human. In the above area, there were a mouse and a glasses. a laptop and a shelf were detected in the below section.</s>'}

### Load the Model (not the base version)

In [22]:
def get_model_and_tokenizer(mode_id):
    """Load a 4-bit quantized causal LM together with its tokenizer.

    Args:
        mode_id: Model identifier passed to both ``from_pretrained`` calls.

    Returns:
        A ``(model, tokenizer)`` tuple. The tokenizer's pad token is set to
        its EOS token; the model has ``use_cache`` disabled and
        ``pretraining_tp`` set to 1 on its config.
    """
    # 4-bit NF4 weights with double quantization; compute dtype is float16.
    quant_settings = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype="float16",
        bnb_4bit_use_double_quant=True,
    )

    tok = AutoTokenizer.from_pretrained(mode_id)
    # Reuse the EOS token for padding.
    tok.pad_token = tok.eos_token

    lm = AutoModelForCausalLM.from_pretrained(
        mode_id,
        quantization_config=quant_settings,
        device_map="auto",
    )
    lm.config.use_cache = False
    lm.config.pretraining_tp = 1

    return lm, tok

In [11]:
# !pip install -i https://test.pypi.org/simple/bitsandbytes

In [23]:
# Load the quantized model and its tokenizer for the configured model id.
model, tokenizer = get_model_and_tokenizer(model_id)

### Setting up the LoRA

In [24]:
# LoRA adapter settings for causal language modelling: rank 8, alpha 16,
# 5% dropout on the adapter, and no bias parameters trained.
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
)

In [25]:
# Trainer hyper-parameters.
training_arguments = TrainingArguments(
        output_dir=output_model,
        per_device_train_batch_size=16,
        # 16 x 4 = 64 sequences per optimizer step on each device.
        gradient_accumulation_steps=4,
        optim="paged_adamw_32bit",
        learning_rate=2e-4,
        lr_scheduler_type="cosine",
        save_strategy="epoch",
        logging_steps=10,
        # Training length is set by max_steps alone. The former
        # num_train_epochs=3 was removed: a positive max_steps overrides it,
        # so it had no effect (the run ends at step 250 either way).
        max_steps=250,
        fp16=True,
        # push_to_hub=True
    )

In [29]:
# Supervised fine-tuning trainer: wraps the loaded model with the LoRA config
# and trains on the prepared dataset.
# Options the author left disabled: dataset_text_field="text", packing=False,
# max_seq_length=1024.
trainer = SFTTrainer(
    model=model,
    args=training_arguments,
    train_dataset=data,
    tokenizer=tokenizer,
    peft_config=peft_config,
)

  trainer = SFTTrainer(


Applying chat template to train dataset:   0%|          | 0/1000 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/1000 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [30]:
# Run fine-tuning. In the recorded run this prompted for a Weights & Biases
# API key before training started and returned a TrainOutput summary.
trainer.train()



<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mfarhanak128[0m ([33mfarhanak128-comsats-university-islamabad[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


Step,Training Loss
10,3.0514
20,2.2222
30,1.3209
40,0.912
50,0.7342
60,0.6444
70,0.5887
80,0.5397
90,0.4882
100,0.4715


TrainOutput(global_step=250, training_loss=0.7074475173950195, metrics={'train_runtime': 816.6436, 'train_samples_per_second': 19.592, 'train_steps_per_second': 0.306, 'total_flos': 1.094379573067776e+16, 'train_loss': 0.7074475173950195})

### Merging the LoRA with the base model

In [31]:
from peft import AutoPeftModelForCausalLM, PeftModel
from transformers import AutoModelForCausalLM
import torch
import os

# Reload the base model in float16 (no quantization config) so the LoRA
# weights can be merged into it. `load_in_8bit=False` (the default) and
# `trust_remote_code=True` were dropped: the same model id loads earlier in
# this notebook without remote code.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.float16,
    device_map="auto",
)

# The trainer writes checkpoints under `output_model` (its relative
# output_dir), so build the adapter path from that name instead of a
# hard-coded /content/... path. 250 is the step at which training stopped.
model_path = os.path.join(output_model, "checkpoint-250")

# NOTE(review): `from_transformers=True` is kept from the original call, but it
# does not appear to be a documented PeftModel.from_pretrained argument -- confirm
# whether it can be removed.
peft_model = PeftModel.from_pretrained(model, model_path, from_transformers=True, device_map="auto")

# Fold the adapter weights into the base weights and drop the PEFT wrappers.
model = peft_model.merge_and_unload()

In [32]:
# Display the merged model's module structure.
model

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32002, 2048)
    (layers): ModuleList(
      (0-21): 22 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2048, out_features=256, bias=False)
          (v_proj): Linear(in_features=2048, out_features=256, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=2048, out_features=5632, bias=False)
          (up_proj): Linear(in_features=2048, out_features=5632, bias=False)
          (down_proj): Linear(in_features=5632, out_features=2048, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
      )
    )
    (norm): LlamaRMSNorm((2048,), eps=1e-05)
    (rotary_emb): 

### Inference from the LLM

In [39]:
from transformers import GenerationConfig
from time import perf_counter

def generate_response(user_input):
    """Generate a reply for ``user_input`` and print it with the elapsed time.

    The input is wrapped with ``formatted_prompt``, tokenized once, moved to
    the device the model lives on, and passed to ``model.generate``. The
    decoded sequence (prompt included, special tokens stripped) and the
    wall-clock time are printed.

    Args:
        user_input: Raw text to place in the user turn of the prompt.

    Returns:
        None. Results are printed, not returned.
    """
    prompt = formatted_prompt(user_input)

    generation_config = GenerationConfig(
        penalty_alpha=0.6,
        do_sample=True,
        top_k=5,
        temperature=0.5,
        repetition_penalty=1.2,
        max_new_tokens=500,
        pad_token_id=tokenizer.eos_token_id,
    )

    start_time = perf_counter()

    # Tokenize once (the original tokenized twice and discarded the first
    # result) and follow the model's device rather than assuming 'cuda'.
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    outputs = model.generate(**inputs, generation_config=generation_config)
    print(tokenizer.decode(outputs[0], skip_special_tokens=True))

    output_time = perf_counter() - start_time
    print(f"Time taken for inference: {round(output_time,2)} seconds")

In [40]:
def formatted_prompt(question) -> str:
    """Wrap ``question`` as a user turn followed by an open assistant turn.

    Args:
        question: Text to place in the user turn.

    Returns:
        The prompt string, ending with the ``<|assistant|>`` marker so the
        model continues from there.
    """
    template = "<|user|>\n{}</s>\n<|assistant|>"
    return template.format(question)

In [41]:
# Try the merged model on a sample scene description.
generate_response(user_input='left_input: phone, chair, lamp\nright_input: cabinet, person, human\nup_input: mouse, glasses\nbottom_input: laptop, shelf')

<|user|>
left_input: phone, chair, lamp
right_input: cabinet, person, human
up_input: mouse, glasses
bottom_input: laptop, shelf
<|assistant|>
The left side portion contains a phone, a chair, and a lamp. The right area portion contains a cabinet, a person, and a human. In the top area, there were a mouse and a glasses. a laptop and a shelf were observed below.
Time taken for inference: 4.9 seconds


In [42]:
# Write the merged model and the tokenizer into the same directory.
save_dir = "my_lora_finetuned_model"
for artifact in (model, tokenizer):
    artifact.save_pretrained(save_dir)

print("Model and tokenizer saved successfully!")

Model and tokenizer saved successfully!


In [43]:
# Pack the saved model directory into a single zip archive.
!zip -r /content/my_lora_finetuned_model.zip /content/my_lora_finetuned_model

  adding: content/my_lora_finetuned_model/ (stored 0%)
  adding: content/my_lora_finetuned_model/tokenizer.model (deflated 55%)
  adding: content/my_lora_finetuned_model/config.json (deflated 46%)
  adding: content/my_lora_finetuned_model/added_tokens.json (deflated 25%)
  adding: content/my_lora_finetuned_model/tokenizer.json (deflated 85%)
  adding: content/my_lora_finetuned_model/tokenizer_config.json (deflated 74%)
  adding: content/my_lora_finetuned_model/generation_config.json (deflated 29%)
  adding: content/my_lora_finetuned_model/model.safetensors (deflated 22%)
  adding: content/my_lora_finetuned_model/special_tokens_map.json (deflated 73%)


In [44]:
from google.colab import drive
# Mount Google Drive inside the Colab VM at /content/drive.
drive.mount('/content/drive')

# Copy the zipped model into the root of MyDrive.
!cp /content/my_lora_finetuned_model.zip /content/drive/MyDrive/

Mounted at /content/drive


### Loading the Saved Model

In [45]:
from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig
# Directory the merged model and tokenizer were saved to.
model_path = "my_lora_finetuned_model"

# Reload both artifacts from disk.
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(model_path)

# Sampling settings for generation.
generation_config = GenerationConfig(
    penalty_alpha=0.6,
    do_sample=True,
    top_k=5,
    temperature=0.5,
    repetition_penalty=1.2,
    max_new_tokens=500,
    pad_token_id=tokenizer.eos_token_id,
)

# Sample scene description, wrapped in the user/assistant prompt layout.
user_input = "left_input: phone, chair, lamp\nright_input: cabinet, person, human\nup_input: mouse, glasses\nbottom_input: laptop, shelf"
prompt = f"<|user|>\n{user_input}</s>\n<|assistant|>"
inputs = tokenizer(prompt, return_tensors="pt")

# Generate, decode (special tokens stripped) and print the full sequence.
outputs = model.generate(**inputs, generation_config=generation_config)
response = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(response)

<|user|>
left_input: phone, chair, lamp
right_input: cabinet, person, human
up_input: mouse, glasses
bottom_input: laptop, shelf
<|assistant|>
a phone, a chair, and a lamp were detected in the left side section. The right area portion contains a cabinet, a person, and a human. In the top area, there was a mouse and a glasses. A laptop and a shelf were observed below.
