# Fine Tuning Base LLAMA as Chat [Instruct] Model

### Normalized Quantization
- **Normalized Quantization:** QLoRA uses normalized quantization to quantize the low-rank matrices. This technique ensures that the quantized weights are normalized to the same scale as the original weights, preserving the model's performance during fine-tuning.


In [1]:
!pip install -q accelerate -U
!pip install -q bitsandbytes -U
!pip install -q trl -U
!pip install -q peft -U
!pip install -q transformers -U
!pip install -q datasets -U

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m33.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m44.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m24.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m11.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [38]:
from datasets import load_dataset

dataset = load_dataset("UCSD26/medical_dialog", 'processed.en')
# dataset = dataset.shuffle(seed=0).select(range(10_000))

In [39]:
dataset

DatasetDict({
    train: Dataset({
        features: ['description', 'utterances'],
        num_rows: 482
    })
    validation: Dataset({
        features: ['description', 'utterances'],
        num_rows: 60
    })
    test: Dataset({
        features: ['description', 'utterances'],
        num_rows: 61
    })
})

In [40]:
dataset['train'][0]  # Jika dataset memiliki split 'train'

{'description': 'throat a bit sore and want to get a good imune booster, especially in light of the virus. please advise. have not been in contact with nyone with the virus.',
 'utterances': ['patient: throat a bit sore and want to get a good imune booster, especially in light of the virus. please advise. have not been in contact with nyone with the virus.',
  "doctor: during this pandemic. throat pain can be from a strep throat infection (antibiotics needed), a cold or influenza or other virus, or from some other cause such as allergies or irritants. usually, a person sees the doctor (call first) if the sore throat is bothersome, recurrent, or doesn't go away quickly. covid-19 infections tend to have cough, whereas strep throat usually lacks cough but has more throat pain. (3/21/20)"]}

In [41]:
dataset['validation'][0]  # Jika dataset memiliki split 'train'

{'description': 'good day. this morning i coughed for the very first time in a long time. with the corona virus around i feel the need to report this. i coughed for about 5 min. i have no fever, not tired and chest feels weird. what should i do?',
 'utterances': ['patient: good day. this morning i coughed for the very first time in a long time. with the corona virus around i feel the need to report this. i coughed for about 5 min. i have no fever, not tired and chest feels weird. what should i do?',
  'doctor: in brief: best to stay home right now stay home, consult here. disinfect everything and stay safe. we are here to answer your questions. would you like to video or text chat with me?']}

In [64]:
from transformers import pipeline

# base model
model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

In [65]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = "<PAD>"
tokenizer.padding_size="left"

def format_prompt(example):
    """Format the prompt using the <|user|> and <|assistant|> format"""
    chat = "\n".join(example['utterances'])  # Gabungkan percakapan
    prompt = tokenizer.apply_chat_template(chat, tokenize=False)
    return {'text': prompt}

# print(format_prompt(dataset[0])['text'])
dataset_train = dataset['train'].map(format_prompt)
dataset_validation = dataset['validation'].map(format_prompt)

Map:   0%|          | 0/482 [00:00<?, ? examples/s]

Map:   0%|          | 0/60 [00:00<?, ? examples/s]

In [66]:
dataset_train['utterances'][0]

['patient: throat a bit sore and want to get a good imune booster, especially in light of the virus. please advise. have not been in contact with nyone with the virus.',
 "doctor: during this pandemic. throat pain can be from a strep throat infection (antibiotics needed), a cold or influenza or other virus, or from some other cause such as allergies or irritants. usually, a person sees the doctor (call first) if the sore throat is bothersome, recurrent, or doesn't go away quickly. covid-19 infections tend to have cough, whereas strep throat usually lacks cough but has more throat pain. (3/21/20)"]

In [80]:
print(dataset_train[0])
print(dataset_validation[0])

{'description': 'throat a bit sore and want to get a good imune booster, especially in light of the virus. please advise. have not been in contact with nyone with the virus.', 'utterances': ['patient: throat a bit sore and want to get a good imune booster, especially in light of the virus. please advise. have not been in contact with nyone with the virus.', "doctor: during this pandemic. throat pain can be from a strep throat infection (antibiotics needed), a cold or influenza or other virus, or from some other cause such as allergies or irritants. usually, a person sees the doctor (call first) if the sore throat is bothersome, recurrent, or doesn't go away quickly. covid-19 infections tend to have cough, whereas strep throat usually lacks cough but has more throat pain. (3/21/20)"], 'text': ''}
{'description': 'good day. this morning i coughed for the very first time in a long time. with the corona virus around i feel the need to report this. i coughed for about 5 min. i have no fever

In [81]:
print(len(dataset_train), len(dataset_validation))
print(dataset_train[0], dataset_validation[0])

482 60
{'description': 'throat a bit sore and want to get a good imune booster, especially in light of the virus. please advise. have not been in contact with nyone with the virus.', 'utterances': ['patient: throat a bit sore and want to get a good imune booster, especially in light of the virus. please advise. have not been in contact with nyone with the virus.', "doctor: during this pandemic. throat pain can be from a strep throat infection (antibiotics needed), a cold or influenza or other virus, or from some other cause such as allergies or irritants. usually, a person sees the doctor (call first) if the sore throat is bothersome, recurrent, or doesn't go away quickly. covid-19 infections tend to have cough, whereas strep throat usually lacks cough but has more throat pain. (3/21/20)"], 'text': ''} {'description': 'good day. this morning i coughed for the very first time in a long time. with the corona virus around i feel the need to report this. i coughed for about 5 min. i have n

In [82]:
sample_text = dataset_train[0]["text"]
tokens = tokenizer(sample_text, padding=True, truncation=True, return_tensors="pt")
print(tokens)

{'input_ids': tensor([[1]]), 'attention_mask': tensor([[1]])}


### Testing Base LLAMA Model
- Let's see how base Tiny-LLAMA performs out of the box

## Model Configuration for Training

In [83]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

In [68]:
# do the  4-bit quantization configuration in Q-LORA
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype='float16',
    bnb_4bit_use_double_quant=True
)

In [69]:
print(tokenizer.chat_template)

{% for message in messages %}
{% if message['role'] == 'user' %}
{{ '<|user|>
' + message['content'] + eos_token }}
{% elif message['role'] == 'system' %}
{{ '<|system|>
' + message['content'] + eos_token }}
{% elif message['role'] == 'assistant' %}
{{ '<|assistant|>
'  + message['content'] + eos_token }}
{% endif %}
{% if loop.last and add_generation_prompt %}
{{ '<|assistant|>' }}
{% endif %}
{% endfor %}


In [97]:
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map = "auto",
    quantization_config=bnb_config
)

ValueError: Some modules are dispatched on the CPU or the disk. Make sure you have enough GPU RAM to fit the quantized model. If you want to dispatch the model on the CPU or the disk while keeping these modules in 32-bit, you need to set `llm_int8_enable_fp32_cpu_offload=True` and pass a custom `device_map` to `from_pretrained`. Check https://huggingface.co/docs/transformers/main/en/main_classes/quantization#offload-between-cpu-and-gpu for more details. 

In [85]:
model.config.use_cache=False
model.config.pretraining_tp=1

# Prepare LoRA Configuration for PEFT Fine tuning


In [93]:
model

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 2048)
    (layers): ModuleList(
      (0-21): 22 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2048, out_features=256, bias=False)
          (v_proj): Linear(in_features=2048, out_features=256, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=2048, out_features=5632, bias=False)
          (up_proj): Linear(in_features=2048, out_features=5632, bias=False)
          (down_proj): Linear(in_features=5632, out_features=2048, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
      )
    )
    (norm): LlamaRMSNorm((2048,), eps=1e-05)
    (rotary_emb): 

In [94]:
from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model

peft_config = LoraConfig(
    lora_alpha=32,
    lora_dropout=0.1,
    r=64,
    bias='none',
    task_type='CAUSAL_LM',
    target_modules=['q_proj', 'k_proj', 'v_proj', 'o_proj', 'gate_proj', 'up_proj', 'down_proj']
)

model = prepare_model_for_kbit_training(model)

model = get_peft_model(model, peft_config)

w = 2048 * 256
→ Biasanya ini adalah jumlah parameter dalam weight matrix asli sebelum LoRA diterapkan.

a = 2048 * 64
→ Ini bisa merujuk ke matriks adaptasi LoRA pertama dengan rank r=64.

b = 64 * 256
→ Ini bisa merujuk ke matriks adaptasi LoRA kedua.

a + b
→ Total jumlah elemen dalam matriks LoRA tambahan.

(a + b) / w
→ Rasio pengurangan parameter dengan LoRA dibandingkan full fine-tuning.

In [95]:
w = 2048*256
a = 2048*64
b = 64*256

w, a, b, a+b, (a+b)/w

(524288, 131072, 16384, 147456, 0.28125)

## Model Fine Tuning

In [96]:
from transformers import TrainingArguments, Trainer
from trl import SFTTrainer

output_dir = "train_dir"

args = TrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,  # Tambahkan batch size untuk validasi
    gradient_accumulation_steps=4,
    optim="paged_adamw_32bit",
    learning_rate=2e-4,
    lr_scheduler_type="cosine",
    num_train_epochs=1,
    logging_steps=10,
    evaluation_strategy="steps",  # Aktifkan validasi selama pelatihan
    eval_steps=10,  # Jalankan validasi setiap 10 langkah
    save_strategy="steps",  # Simpan model setiap epoch
    fp16=True,
    gradient_checkpointing=True,
    load_best_model_at_end=True,  # Gunakan model terbaik di akhir pelatihan
    metric_for_best_model="loss",  # Gunakan loss sebagai metrik evaluasi
    greater_is_better=False  # Karena loss lebih kecil lebih baik
)

trainer = SFTTrainer(
    model=model,
    train_dataset=dataset_train,
    eval_dataset=dataset_validation,  # Tambahkan dataset validasi
    tokenizer=tokenizer,
    args=args,
    peft_config=peft_config
)


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = SFTTrainer(


Map:   0%|          | 0/482 [00:00<?, ? examples/s]

NotImplementedError: Cannot copy out of meta tensor; no data! Please use torch.nn.Module.to_empty() instead of torch.nn.Module.to() when moving module from meta to a different device.

In [None]:
import torch

if torch.cuda.is_available():
    print("GPU Tersedia:", torch.cuda.get_device_name(0))
else:
    print("Menggunakan CPU")

In [None]:
import os
os.environ["WANDB_DISABLED"] = "true"

import wandb
wandb.init(mode="disabled")

trainer.train()

In [None]:
trainer.model.save_pretrained("TinyLlama-1.1B-qlora2")

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
!zip -r /content/drive/MyDrive/TinyLlama-1.1B-qlora2.zip /content/TinyLlama-1.1B-qlora2

## Load Pre-Trained PEFT Model for Prediction

In [None]:
from peft import AutoPeftModelForCausalLM

model = AutoPeftModelForCausalLM.from_pretrained(
    "TinyLlama-1.1B-qlora",
    device_map='auto'
)

merged_model = model.merge_and_unload()

In [None]:
from transformers import pipeline

prompt = """<|user|>
Tell me something about Large Language Models.</s>
<|assistant|>
"""

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = "<PAD>"
tokenizer.padding_size="left"

pipe = pipeline(task='text-generation', model=merged_model, tokenizer=tokenizer)
output = pipe(prompt)
print(output[0]['generated_text'])

In [None]:
!zip -r tiny_llama_qlora_adapter.zip TinyLlama-1.1B-qlora