# Hello! and welcome to my Typhoon fine tuning notebook.


Hello, and welcome to my Typhoon fine-tuning notebook!

Learn more about the model: https://arxiv.org/abs/2312.13951

# Install requirements

In [1]:

import IPython
import sys

def clean_notebook():
    """Clear the current cell output (with ``wait=True``) and print a confirmation."""
    confirmation = "Notebook cleaned."
    IPython.display.clear_output(wait=True)
    print(confirmation)

# Installs Unsloth, Xformers (Flash Attention) and all other packages!

!pip install datasets peft accelerate bitsandbytes

# Clean up the notebook
clean_notebook()

Notebook cleaned.


# Set up environment variables

These are set from the Kaggle Secrets collection. If you're running in another environment, they can be set in a `.env` file instead.

In [2]:
import os

# "code" is only a placeholder: never commit a real Hugging Face token here.
# setdefault keeps a token the environment already provides (for example one
# exported from Kaggle Secrets or loaded from a .env file) instead of
# overwriting it with the placeholder.
os.environ.setdefault('HF_TOKEN', "code")


# Select dataset

In [3]:
# Hugging Face Hub dataset id; passed to `load_dataset(dataset_id)` in the preprocessing section.
dataset_id = "Thaweewat/thai-med-pack"

# Load tokenizer and model

In [4]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig


# Hub id of the base checkpoint that gets fine-tuned.
model_id = "scb10x/llama-3-typhoon-v1.5-8b"

# Quantization settings handed to from_pretrained below.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,  # load the weights in 4-bit
    bnb_4bit_use_double_quant=True,  # enable double quantization
    bnb_4bit_quant_type="nf4",  # use the "nf4" quantization type
    bnb_4bit_compute_dtype=torch.bfloat16  # run computations in bfloat16
)

tokenizer = AutoTokenizer.from_pretrained(model_id)
# device_map="auto" leaves device placement to the library.
model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=bnb_config, device_map="auto")


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Then we have to apply some preprocessing to the model to prepare it for training. For that, use the `prepare_model_for_kbit_training` function from PEFT.

# Set up trainable parameters

In [5]:
from peft import prepare_model_for_kbit_training

# Turn on gradient checkpointing, then let PEFT prepare the quantized model for training.
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

In [6]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.

    Args:
        model: any object exposing ``named_parameters()`` that yields
            ``(name, param)`` pairs, where each ``param`` provides ``numel()``
            and a ``requires_grad`` flag.

    The counts come straight from ``param.numel()``.
    NOTE(review): for 4-bit quantized layers this appears to count packed
    storage elements (the run in this notebook reports ~4.6B parameters for an
    "8b" checkpoint) — confirm before quoting the percentage.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        count = param.numel()
        all_param += count
        if param.requires_grad:
            trainable_params += count
    # A model without parameters would otherwise raise ZeroDivisionError.
    trainable_pct = 100 * trainable_params / all_param if all_param else 0.0
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {trainable_pct}"
    )

In [7]:
# Print the module tree to inspect the layers of the loaded model.
print(model)

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear4bit(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    (norm): Ll

In [8]:
from peft import LoraConfig, get_peft_model

# LoRA adapter settings; get_peft_model below wraps the quantized model with them.
config = LoraConfig(
    r=20,  # LoRA rank
    lora_alpha=32,  # LoRA alpha (scaling) hyperparameter
    target_modules="all-linear",  # attach adapters to the model's linear layers
#     target_modules=["lm_head"],  # kept for reference: restrict the adapters to lm_head only
    lora_dropout=0.05,  # dropout applied inside the LoRA layers
    bias="none",
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, config)
# Report how many parameters are trainable after wrapping.
print_trainable_parameters(model)

trainable params: 52428800 || all params: 4593029120 || trainable%: 1.141486340064833


# Preprocess dataset

Each sample is reformatted into an `### Instruction: / ### Input: / ### Response:` prompt, and the response is wrapped in `<answer></answer>` XML tags. Feel free to modify the template; I just like it this way :D

In [9]:
import os
import re
from datasets import load_dataset

# Load the dataset selected in the "Select dataset" section.
data = load_dataset(dataset_id)


def _search_tagged(pattern: str, text: str):
    """Return the first case-insensitive match of ``pattern`` in ``text``, or None.

    The line-local search is tried first. Only when it finds nothing is the
    search retried with ``re.DOTALL``, so questions or answers that span
    several lines are still matched.
    """
    found = re.search(pattern, text, re.IGNORECASE)
    if found is None:
        found = re.search(pattern, text, re.IGNORECASE | re.DOTALL)
    return found


def parse(text: str) -> dict:
    """Convert one ``[INST] question [/INST] answer </s>`` sample into the training prompt.

    Args:
        text: raw sample text containing ``[INST]``, ``[/INST]`` and ``</s>`` markers.

    Returns:
        A dict with a single key ``"spec"`` holding the formatted prompt, with
        the question under ``### Input:`` and the answer wrapped in
        ``<answer> ... </answer>`` under ``### Response:``.

    Raises:
        ValueError: if the markers cannot be found; the offending text is
            printed first to make the bad sample easy to locate.

    NOTE(review): the template writes ``<s>`` and ``</s>`` as literal text;
    whether the tokenizer maps them to its BOS/EOS tokens is not visible
    here — confirm for this model's tokenizer.
    """
    question_search = _search_tagged(r'\[INST\](.*)\[/INST\]', text)
    answer_search = _search_tagged(r'\[/INST\](.*)\</s\>', text)
    if question_search is None or answer_search is None:
        print(text)
        raise ValueError("expected '[INST] question [/INST] answer </s>' in sample text")

    question = question_search.group(1).strip()
    answer = answer_search.group(1).strip()
    spec = f"""<s>Below is an instruction that describes a task. Write a response that appropriately completes the request.
### Instruction:
Act as a doctor and response the input question from a patient in Thai language with XML format <answer>Your answer</answer>
### Input:
{question}
### Response:
<answer> {answer} </answer>
</s>"""
    return {
        "spec": spec
    }


# Build the prompt for every sample. This map is not batched, so the function
# receives one example per call (a batch_size argument would have no effect).
data = data.map(lambda sample: parse(sample["text"]), num_proc=os.cpu_count())
# Tokenize the prompts in batches of 8.
data = data.map(lambda samples: tokenizer(samples["spec"]), batched=True, batch_size=8,num_proc=os.cpu_count())

In [None]:
# Display the processed dataset object.
data

In [None]:
from IPython.display import Markdown, display


# Index of the training sample to preview.
idx = 5

# Render the raw text and the generated prompt of the same sample, separated by a rule.
example = data['train'][idx]
display(Markdown(f"**text:**\n\n{example['text']}"))
display(Markdown("---"))
display(Markdown(f"**spec:**\n\n{example['spec']}"))


# Training

## Train

In [None]:
import transformers
import torch
from torch.utils.data import DataLoader
from transformers import DataCollatorForLanguageModeling, Trainer, TrainingArguments

# Clear the GPU cache
torch.cuda.empty_cache()

# Use EOS as the padding token so the collator can pad batches
tokenizer.pad_token = tokenizer.eos_token

# Collator for causal language modelling (mlm=False)
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

# Set up the training arguments
training_args = TrainingArguments(
    per_device_train_batch_size=15,
    gradient_accumulation_steps=4,  # 15 * 4 = 60 samples per optimizer step per device
    max_steps=35,
    learning_rate=1e-5,
    logging_steps=1,
    output_dir="outputs",
    #optim="paged_adamw_8bit",
    # load_best_model_at_end was dropped: with save_strategy="no" and no
    # evaluation dataset there is never a checkpoint to reload, so the flag
    # only suggested a behaviour that could not happen.
    save_strategy="no"
)

# Define the trainer
trainer = Trainer(
    model=model,
    train_dataset=data["train"],
    args=training_args,
    data_collator=data_collator
)

# Disable caching to silence warnings during training (enable for inference)
model.config.use_cache = False

# Enable gradient checkpointing for memory efficiency
model.gradient_checkpointing_enable()

# Train the model
trainer.train()


## Export model

In [None]:
# If the trainer holds the model inside a wrapper exposing `.module`
# (distributed/parallel training), unwrap it; otherwise use the model as is.
model_to_save = getattr(trainer.model, 'module', trainer.model)
# Write the model files into the "outputs" directory.
model_to_save.save_pretrained("outputs")

## Test model

In [None]:
from transformers import  StoppingCriteria, StoppingCriteriaList

class KeywordsStoppingCriteria(StoppingCriteria):
    """Stop generation when the answer has been closed.

    Generation stops when either
      * the decoded text (whitespace-stripped) ends with ``</answer>``, or
      * the generated token ids end with one of the stop sequences.

    Args:
        keywords_ids: stop sequences as token ids. Each entry may be a list or
            tuple of ids, an object with ``tolist()``, or a single id.

    Only the first sequence in the batch (``input_ids[0]``) is inspected, and
    the module-level ``tokenizer`` is used for decoding.
    """

    def __init__(self, keywords_ids:list):
        self._i = 0
        # Normalise every entry to a plain list of ints so it can be compared
        # against the tail of the generated ids.
        self.keywords = [self._as_id_list(ids) for ids in keywords_ids]

    @staticmethod
    def _as_id_list(ids) -> list:
        """Convert one stop entry (sequence, tensor-like or single id) to a list of ints."""
        if hasattr(ids, "tolist"):
            ids = ids.tolist()
        if isinstance(ids, (list, tuple)):
            return [int(token_id) for token_id in ids]
        return [int(ids)]

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
        text = tokenizer.decode(
                input_ids[0],
                skip_special_tokens=True
        )
        if text.strip().endswith("</answer>"):
            return True
        # Progress output: show the text generated so far every 50 calls.
        if self._i % 50 == 0:
            print(text)
            print("-" * 16)
        self._i += 1
        # The previous check compared the last token (a scalar tensor) against
        # a list of id lists, which never matched as intended. Compare the
        # tail of the generated ids with each whole stop sequence instead.
        generated = input_ids[0].tolist()
        return any(
            len(sequence) > 0 and generated[-len(sequence):] == sequence
            for sequence in self.keywords
        )

stop_words = ['</answer>']

# Encode without special tokens so each id list contains only the stop word
# itself (no leading BOS id that could never appear at the end of a generation).
stop_ids = [tokenizer.encode(w, add_special_tokens=False) for w in stop_words]
stop_criteria = KeywordsStoppingCriteria(stop_ids)

stopping_criteria = StoppingCriteriaList([stop_criteria])


# Same prompt template as the training data, ending right after "### Response:".
text = """<s>Below is an instruction that describes a task. Write a response that appropriately completes the request.
### Instruction:
Act as a doctor and response the input question from a patient in Thai language with XML format <answer>Your answer</answer>
### Input:
คือผมอยากทราบว่า อาการที่ผมเป็นตอนนี้คือกรดไหลย้อน หรือ เป็นสัญญาณของพิษสุนัขบ้าครับ ผมมีอาการ เเน่นๆ อึดอัดที่คอ เเล้วก็ กลืนน้ำลายลำบากครับ ก่อนหน้านี้มีไข้ต่ำ ปวดหัวนิดหน่อยครับ ช่วง 1 เดือนก่อน ผมทำงานเดินทางโดยจักรยานครับ ทางผ่านมีสุนัขอยู่ตามทางเยอะมากๆ (เเต่จากที่เห็นไม่ได้เห่าเเละไล่ตามผมครับ) เเล้วมาพึ่งมาเป็นอาการดังกล่าวช่วงนี้ครับ ผมจึงไม่เเน่ใจว่าเป็นกรดไหลย้อนหรือเป็นสัญญานอาการเเรกเริ่มของพิษสุนัขบ้าหรอครับ ผมอ่านเเล้วเห็นอาการคล้ายๆกันครับคุณ รบกวนด้วยนะครับ
### Response:"""

# Generate with the adapter that was just trained (`model_to_save`).
# The earlier `get_peft_model(model, LoraConfig.from_pretrained('outputs'))`
# call was removed: it wrapped the already-adapted model in a second, untrained
# adapter and its result was never used for generation.
# eval() switches off dropout (including LoRA dropout) for inference.
model_to_save.eval()

# Put the inputs on whatever device the model lives on instead of assuming "cuda:0".
device = model_to_save.device

inputs = tokenizer(text, return_tensors="pt").to(device)
outputs = model_to_save.generate(
    **inputs, max_new_tokens=400,
    stopping_criteria=stopping_criteria,
    pad_token_id=tokenizer.eos_token_id,
    no_repeat_ngram_size=10,
    forced_eos_token_id=tokenizer.eos_token_id,
    do_sample=True,
    top_p=0.95,
    use_cache=True,  # caching was disabled for training; re-enable it for generation
)



display(tokenizer.decode(outputs[0], skip_special_tokens=True))

In [None]:
# Show the most recent generation again without re-running generate().
display(tokenizer.decode(outputs[0], skip_special_tokens=True))

# (Optional) Push trained model to your Huggingface account

In [None]:
# Upload the saved model object and the tokenizer to the "typhoon-med" repo on the Hub.
model_to_save.push_to_hub("typhoon-med")
tokenizer.push_to_hub("typhoon-med")
# NOTE(review): this also uploads `model_to_save.config`, presumably the base
# model's config — confirm that it belongs in the same repo.
model_to_save.config.push_to_hub("typhoon-med")