In [None]:
!pip install -q -U transformers datasets accelerate bitsandbytes trl peft evaluate

In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, TrainingArguments
from trl import setup_chat_format, SFTTrainer
from peft import LoraConfig

import torch

# Full Fine-Tuning vs PEFT (Parameter Efficient Fine-Tuning)

## Model: Llama-2-7b

### What Makes Our Llama Fine-Tuning Expensive?
- 2 bytes for the weight
- 2 bytes for the gradient
- 4 + 8 bytes for the Adam optimizer states

At 16 bytes per trainable parameter, fully fine-tuning a 7B-parameter model requires about 112GB of memory (excluding the intermediate hidden states). Given that the largest widely available GPUs offer up to 80GB of VRAM, this makes full fine-tuning challenging and less accessible to everyone. To bridge this gap, Parameter-Efficient Fine-Tuning (PEFT) methods have been widely adopted by the community.

## What is PEFT?

![Scaling Down to Scale Up: A Guide to Parameter-Efficient Fine-Tuning](https://drive.google.com/uc?export=view&id=1wIWeCvKvbr5CSYiUlr_qUotPJ6vT92Rg)
- Image taken from the paper: Scaling Down to Scale Up: A Guide to Parameter-Efficient Fine-Tuning

## What is LoRA?

The LoRA method by Hu et al. from the Microsoft team came out in 2021 and works by attaching extra trainable parameters to a model (which we will call the base model). The original weight matrices remain frozen and do not receive any further updates. To produce the final output, the results from the original and the adapted weights are combined.

**This approach has several advantages:**
- LoRA makes fine-tuning more efficient by drastically reducing the number of trainable parameters.
- The original pre-trained weights are kept frozen, which means you can have multiple lightweight and portable LoRA models for various downstream tasks built on top of them.
- LoRA is orthogonal to many other parameter-efficient methods and can be combined with many of them.
- The performance of models fine-tuned using LoRA is comparable to the performance of fully fine-tuned models.
- LoRA does not add any inference latency when adapter weights are merged with the base model.

![Adapter merging with base model](https://pytorch.org/assets/images/finetune-llms/fg2.gif)


In [None]:
# LoRA adapter settings, following the QLoRA paper and Sebastian Raschka's experiments.
# `LoraConfig` is already imported in the notebook's import cell.
# NOTE(review): assuming the usual lora_alpha / r scaling of the adapter update,
# alpha=128 with r=8 gives a factor of 16 — confirm this is intended.
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=8,                          # adapter rank
    lora_alpha=128,
    lora_dropout=0.05,
    bias="none",
    target_modules="all-linear",  # narrower alternative: ["q_proj", "k_proj", "v_proj"]
)

## Quantization Methods: QLoRA

You can read more about quantization features in this specific section of the documentation: https://huggingface.co/docs/transformers/main_classes/quantization

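When using QLoRA with the Adam optimizer, a 4-bit base model, and mixed-precision mode, we need to allocate:

* ~0.5 bytes for the weight of each (frozen) base-model parameter
* 2 bytes for the gradient of each trainable parameter
* 4 + 8 bytes for the Adam optimizer states of each trainable parameter

Because the base model is frozen, the gradient and optimizer-state costs apply only to the small set of trainable LoRA adapter parameters, which is what makes QLoRA so much cheaper than full fine-tuning.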


In [None]:
# Load the base weights in 4-bit NF4 with double quantization; compute runs in bfloat16.
# NOTE(review): bfloat16 is the preferred compute dtype on newer hardware that supports it;
# on GPUs without bf16 support a different dtype is presumably needed — confirm.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

In [None]:
# Hugging Face Hub id of the checkpoint to fine-tune
model_id = "Trendyol/Trendyol-LLM-7b-chat-v0.1"

# Tokenizer for the same checkpoint; right-side padding is set to prevent warnings
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.padding_side = 'right'

# Base model, quantized on load with the BitsAndBytes settings defined earlier in the notebook
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=quantization_config,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)

In [None]:
# Display the module tree; the attention/MLP projections should be listed as Linear4bit
# if the 4-bit quantized load took effect.
model

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(44222, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=11008, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=11008, bias=False)
          (down_proj): Linear4bit(in_features=11008, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    (norm): Lla

In [None]:
from peft import get_peft_model

def print_trainable_parameters(model):
    """
    Print the number of trainable parameters in the model.

    Parameters are gathered from ``model.named_parameters()``. Two corrections
    are applied compared with a naive ``numel()`` sum:

    * Parameters whose class is named ``Params4bit`` (bitsandbytes 4-bit
      weights) store two 4-bit values per byte, so their ``numel()`` is scaled
      by ``2 * element_size()`` to recover the logical parameter count.
      Without this a 4-bit model reports roughly half of its real size.
    * A model without any parameters reports ``0.0`` percent instead of
      raising ``ZeroDivisionError``.

    Args:
        model: object exposing ``named_parameters()`` (e.g. a ``torch.nn.Module``).

    Returns:
        None. One summary line is written to stdout.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        num_params = param.numel()
        # Matched by class name so bitsandbytes does not have to be imported here.
        if param.__class__.__name__ == "Params4bit":
            num_params *= 2 * param.element_size()
        all_param += num_params
        if param.requires_grad:
            trainable_params += num_params
    trainable_pct = 100 * trainable_params / all_param if all_param else 0.0
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {trainable_pct}"
    )

# Wrap the quantized base model with the LoRA adapters and report the trainable share.
# NOTE(review): the same `model` and `peft_config` are also passed to SFTTrainer further down.
# get_peft_model may already have modified `model` in place, so the adapters could end up
# being applied twice — confirm, and consider handing `peft_model` to the trainer instead.
peft_model = get_peft_model(model, peft_config)
print_trainable_parameters(peft_model)

trainable params: 19988480 || all params: 3620524032 || trainable%: 0.5520880354150899


## Dataset Preparation

In [None]:
# Hub id of the translated ChatDoctor dataset; only its "train" split is loaded.
DATASET_ID = "oguuzhansahin/chatdoctor-translated"
dataset = load_dataset(DATASET_ID, split="train")
print(dataset)

Dataset({
    features: ['output', 'input'],
    num_rows: 13293
})


In [None]:
# Inspect one raw record: each row has an 'input' and an 'output' text field.
dataset[1]

{'output': "Sohbet Doktor'u kullanarak için teşekkür ederim. Bir doktora görünmenizi öneririm. Bebeğinizin yaşı itibariyle yaygın olan bronşiyolit adında bir akciğer enfeksiyonu geçiriyor olabilir. Bu, genellikle bir virüs tarafından neden olur. Hırıltıyı hafifletmek ve ayrıca tıkanıklıkla başa çıkmak için Albuterol'ün nebülizasyon yoluyla kullanılması gerekmektedir. Soğuk algınlığı için bir dekonjestan da kullanılabilir. Ayrıca, diğer hastalıkları (örn. zatürree) dışlamak için bir göğüs röntgeni yapılmasını da tavsiye ederim.\nSaygılarımla,\nMark Rosario\nGenel Pediatri/Pediyatrik Pulmonoloji",
 'input': 'Beş aylık çok tıkanıklığı olan ve korkunç bir öksürüğü bulunan bir bebeğim var. Öksürüğü gıcırdıyor/kaba ve havlamalı sesli. Öksürüklerinde ve yukarı çıkan mukusta boğulmaya başladı. Ayrıca ateşi ve akıntılı burnu var. Onu acil bakıma götürmeli miyim?'}

In [None]:
# System prompt (in Turkish) that opens every training conversation.
system_message = "Sen hastalara yardım eden Sohbet Doktorusun. Hastaların şikayetlerini dinleyip onlara çözüm öner."


def create_conversation(sample):
    """Turn one raw record into a three-turn chat stored under "messages".

    The turns are, in order: the shared system prompt, the record's "input"
    as the user turn, and the record's "output" as the assistant turn.
    """
    turns = [
        ("system", system_message),
        ("user", sample["input"]),
        ("assistant", sample["output"]),
    ]
    return {"messages": [{"role": role, "content": text} for role, text in turns]}


# Hold out 5% of the rows. The fixed seed makes the split reproducible; without it
# every fresh run produces a different train/test partition.
# NOTE: this cell rebinds `dataset` from a single split to a dict of splits, so re-running it
# requires re-running the dataset loading cell first.
dataset = dataset.train_test_split(test_size=0.05, seed=42)

# Replace the raw columns with the chat-formatted "messages" column in both splits.
dataset = dataset.map(
    create_conversation,
    remove_columns=dataset["train"].column_names,
    batched=False,
)

print("Dataset: ",dataset["train"][0]["messages"])

Map:   0%|          | 0/12628 [00:00<?, ? examples/s]

Map:   0%|          | 0/665 [00:00<?, ? examples/s]

Dataset:  [{'content': 'Sen hastalara yardım eden Sohbet Doktorusun. Hastaların şikayetlerini dinleyip onlara çözüm öner.', 'role': 'system'}, {'content': 'Doktor, ayağımda çok fazla tahrişe neden olan ve ayağımı düzgün bir şekilde hareket ettirmemi zorlaştıran bir cilt büyümesi var. Aynı zamanda parmağımda bir yumru da var. Buna ne sebep oluyor olabilir?', 'role': 'user'}, {'content': 'Belirtilerinize göre, bir halluks valgusunuz olabilir. Halluks valgus, büyük ayak parmağının tabanındaki eklem üzerinde oluşan kemiksi bir çıkıntıdır ve cilt tahrişine hatta gözle görülür bir kitleye neden olabilir. Ayak hareket yeteneğinizi etkileyebilir ve yürürken rahatsızlık yaratma ihtimali bile vardır.', 'role': 'assistant'}]


In [None]:
# Tokenizer state before the chat format is applied (compare with the display after setup_chat_format).
tokenizer

LlamaTokenizerFast(name_or_path='Trendyol/Trendyol-LLM-7b-chat-v0.1', vocab_size=44222, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '</s>'}, clean_up_tokenization_spaces=False),  added_tokens_decoder={
	0: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	1: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
}

In [None]:
# Apply TRL's chat format to the model and tokenizer.
# Judging by the tokenizer displays before/after this cell, bos/eos/pad become
# <|im_start|>/<|im_end|> and two new token ids (44222, 44223) are added.
# NOTE(review): this rebinds `model` and `tokenizer`, so the cell is probably not safe to run twice — confirm.
model, tokenizer = setup_chat_format(model, tokenizer)

In [None]:
# Tokenizer state after setup_chat_format: check the special tokens and the added token ids.
tokenizer

LlamaTokenizerFast(name_or_path='Trendyol/Trendyol-LLM-7b-chat-v0.1', vocab_size=44222, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|im_start|>', 'eos_token': '<|im_end|>', 'unk_token': '<unk>', 'pad_token': '<|im_end|>', 'additional_special_tokens': ['<|im_start|>', '<|im_end|>']}, clean_up_tokenization_spaces=False),  added_tokens_decoder={
	0: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	1: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	44222: AddedToken("<|im_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	44223: AddedToken("<|im_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [None]:
# Training hyperparameters; values marked "QLoRA" follow the QLoRA paper.
# NOTE(review): warmup_ratio is set while lr_scheduler_type is the plain "constant" scheduler.
# Check the transformers scheduler docs to confirm the warmup actually takes effect
# (a separate "constant_with_warmup" scheduler type exists).
args = TrainingArguments(
    # --- output / logging ---
    output_dir="trendyol-chat-doctor",   # checkpoint directory (and repository id if pushed)
    save_strategy="epoch",               # one checkpoint per epoch
    logging_steps=10,                    # log every 10 steps
    # --- schedule ---
    num_train_epochs=5,
    per_device_train_batch_size=2,
    learning_rate=2e-4,                  # QLoRA
    lr_scheduler_type="constant",
    warmup_ratio=0.03,                   # QLoRA
    max_grad_norm=0.3,                   # QLoRA
    # --- optimizer / precision ---
    optim="adamw_torch_fused",           # fused AdamW
    bf16=True,                           # bfloat16 precision
    tf32=True,                           # tf32 precision
    # --- optional knobs, disabled for this run ---
    # gradient_accumulation_steps=2,
    # gradient_checkpointing=True,       # save memory
    # push_to_hub=True,
    # report_to="tensorboard",
)

# Maximum sequence length for the model and for packing the dataset.
max_seq_length = 1024

# NOTE(review): only dataset["train"] is used here; the held-out "test" split is not passed
# as an evaluation set, so no evaluation runs during training.
# NOTE(review): passing tokenizer / max_seq_length / packing / dataset_kwargs directly to
# SFTTrainer depends on the installed TRL version; with an unpinned install newer releases
# may reject these arguments — confirm against the TRL version in use.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    args=args,
    train_dataset=dataset["train"],
    peft_config=peft_config,
    max_seq_length=max_seq_length,
    packing=True,
    dataset_kwargs={
        "append_concat_token": False,  # no additional separator token is needed
        "add_special_tokens": False,   # the chat template already supplies the special tokens
    },
)


# Run the fine-tuning loop configured by `args`.
trainer.train()
# Persist the final model; no path is given, so presumably it is written to args.output_dir — confirm.
trainer.save_model()