# Hello! and welcome to my Typhoon fine tuning notebook.

<center>
<img src="https://opentyphoon.ai/_next/image?url=%2Fimages%2Flogo.png&w=384&q=75" alt="drawing" width="1400" class="center"/>
</center>

Hello, and welcome to my Typhoon fine-tuning notebook.

This notebook is a modified version of the [bnb-4bit](https://colab.research.google.com/drive/1Vvju5kOyBsDr7RX_YAvp6ZsSOoSMjhKD?usp=sharing#scrollTo=kEESIVXyESi-) notebook, using the model `scb10x/typhoon-7b` from SCB 10X and the dataset `Thaweewat/thai-med-pack`. I also configured it to run in the Kaggle environment on a P100 GPU using `bitsandbytes`.
<br/>
Learn more about the model: https://arxiv.org/abs/2312.13951

# Install requirements

In [1]:
!pip install -q -U bitsandbytes
!pip install -q -U git+https://github.com/huggingface/transformers.git
!pip install -q -U git+https://github.com/huggingface/peft.git
!pip install -q -U git+https://github.com/huggingface/accelerate.git
!pip install -q datasets

# Set up environment variables

These are read from the Kaggle secrets collection. If you're running in another environment, they can be set in a `.env` file instead.

In [2]:
import os
from kaggle_secrets import UserSecretsClient

# Read the Hugging Face token from the Kaggle secret named "HUGGING_FACE"
# and expose it through the HF_TOKEN environment variable.
user_secrets = UserSecretsClient()
os.environ["HF_TOKEN"] = user_secrets.get_secret("HUGGING_FACE")


# Passed to TrainingArguments(report_to=...) in the training cell.
# Change to "none" if you do not want to record training results.
report_to = "wandb"
if report_to != "none":
    # Only look the key up when logging is enabled, so the notebook still
    # runs for users who have not stored a WANDB_API_KEY secret.
    os.environ["WANDB_API_KEY"] = user_secrets.get_secret("WANDB_API_KEY")

# Select dataset

In [3]:
# Identifier of the dataset to fine-tune on; it is passed to `load_dataset` in the preprocessing cell.
dataset_id = "Thaweewat/thai-med-pack"

# Load tokenizer and model

In [4]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

model_id = "scb10x/typhoon-7b"

# Pick the 4-bit compute dtype from the GPU actually in use: bfloat16 only on
# devices with compute capability >= 8, float16 otherwise. This notebook targets
# a Kaggle P100 and trains with fp16=True, so float16 is the matching choice there.
_bf16_capable = (
    torch.cuda.is_available() and torch.cuda.get_device_capability(0)[0] >= 8
)
compute_dtype = torch.bfloat16 if _bf16_capable else torch.float16

# 4-bit NF4 quantization with double quantization.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=compute_dtype,
)

tokenizer = AutoTokenizer.from_pretrained(model_id)
# device_map={"": 0} places the whole model on GPU 0.
model = AutoModelForCausalLM.from_pretrained(
    model_id, quantization_config=bnb_config, device_map={"": 0}
)

tokenizer_config.json:   0%|          | 0.00/915 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/563k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.99M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/72.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/595 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/25.1k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/4.57G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Then we have to apply some preprocessing to the model to prepare it for training. For that use the `prepare_model_for_kbit_training` method from PEFT.

# Set up trainable parameters

In [5]:
from peft import prepare_model_for_kbit_training

# Turn on gradient checkpointing before handing the model to PEFT.
model.gradient_checkpointing_enable()
# Apply PEFT's preprocessing for training a k-bit quantized model (the model was loaded in 4-bit).
model = prepare_model_for_kbit_training(model)

In [6]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.

    Args:
        model: Any object whose `named_parameters()` yields `(name, param)`
            pairs, where each `param` exposes `numel()` and `requires_grad`.

    Prints one line with the trainable count, the total count and the
    trainable percentage. A model without parameters reports 0.0% instead of
    raising ZeroDivisionError.
    """
    # NOTE(review): for 4-bit quantized layers `numel()` presumably reports the
    # packed storage size, so "all params" may undercount — confirm.
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        count = param.numel()
        all_param += count
        if param.requires_grad:
            trainable_params += count
    # Guard the division so an empty model does not crash the report.
    trainable_pct = 100 * trainable_params / all_param if all_param else 0.0
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {trainable_pct}"
    )

In [7]:
# Print the module tree to inspect the model's layers before the LoRA adapters are attached.
print(model)

MistralForCausalLM(
  (model): MistralModel(
    (embed_tokens): Embedding(35219, 4096)
    (layers): ModuleList(
      (0-31): 32 x MistralDecoderLayer(
        (self_attn): MistralSdpaAttention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): MistralRotaryEmbedding()
        )
        (mlp): MistralMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear4bit(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): MistralRMSNorm()
        (post_attention_layernorm): MistralRMSNorm()
      )
    )

In [8]:
from peft import LoraConfig, get_peft_model

# LoRA settings gathered in one mapping so they are easy to read and tweak.
lora_settings = dict(
    r=128,
    lora_alpha=24,
    target_modules="all-linear",
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)
config = LoraConfig(**lora_settings)

# Wrap the prepared model with the adapters and report how much of it will train.
model = get_peft_model(model, config)
print_trainable_parameters(model)

trainable params: 335544320 || all params: 4113985536 || trainable%: 8.156186186455274


# Preprocess dataset

I parse each sample into an `### Instruction: / ### Input: / ### Response:` prompt and wrap the response in `<answer></answer>` XML tags. Feel free to modify the template; I just like it this way :D

In [9]:
import re
from datasets import load_dataset

# Load the dataset selected by `dataset_id`; later code reads its "train" split and "text" column.
data = load_dataset(dataset_id)


def parse(text: str) -> dict:
    """
    Converts one raw `[INST] question [/INST] answer </s>` sample into the
    instruction-style prompt used for training.

    Args:
        text: Raw sample text. The question is taken from between `[INST]`
            and `[/INST]`, the answer from between `[/INST]` and `</s>`
            (markers are matched case-insensitively; `.` does not cross
            newlines).

    Returns:
        A dict with the single key "spec" holding the formatted prompt.

    Raises:
        ValueError: If the question or the answer markers cannot be found.
            The offending text is included in the message.
    """
    question_search = re.search(r'\[INST\](.*)\[/INST\]', text, re.IGNORECASE)
    answer_search = re.search(r'\[/INST\](.*)\</s\>', text, re.IGNORECASE)
    # Fail with an explicit, descriptive error instead of an AttributeError on None.
    if question_search is None or answer_search is None:
        missing = "question" if question_search is None else "answer"
        raise ValueError(f"Could not find the {missing} markers in sample: {text!r}")

    question = question_search.group(1).strip()
    answer = answer_search.group(1).strip()
    spec = f"""<s>Below is an instruction that describes a task. Write a response that appropriately completes the request.
### Instruction:
Act as a doctor and response the input question from a patient in Thai language with XML format <answer>Your answer</answer>
### Input:
{question}
### Response:
<answer> {answer} </answer>
</s>"""
    return {
        "spec": spec
    }
    

def _build_spec(sample):
    """Turns one raw dataset row into its formatted prompt via `parse`."""
    return parse(sample["text"])


def _tokenize_specs(batch):
    """Tokenizes a batch of formatted prompts."""
    return tokenizer(batch["spec"])


# First pass builds the prompt row by row; second pass tokenizes it in batches.
data = data.map(_build_spec)
data = data.map(_tokenize_specs, batched=True)

Downloading readme:   0%|          | 0.00/21.0 [00:00<?, ?B/s]

Downloading data: 100%|██████████| 185M/185M [00:01<00:00, 148MB/s]


Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/189190 [00:00<?, ? examples/s]

Map:   0%|          | 0/189190 [00:00<?, ? examples/s]

# Training

## Train

In [10]:
import transformers
import torch

torch.cuda.empty_cache()

# Reuse EOS as the padding token so the data collator has a token to pad with.
# NOTE(review): DataCollatorForLanguageModeling presumably masks pad-token labels,
# so with pad == EOS the EOS positions may be excluded from the loss — confirm this is intended.
tokenizer.pad_token = tokenizer.eos_token

trainer = transformers.Trainer(
    model=model,
    train_dataset=data["train"],
    args=transformers.TrainingArguments(
        per_device_train_batch_size=1,
        gradient_accumulation_steps=128,  # 1 x 128 = 128 samples per optimizer step per device
        max_steps=30,
        learning_rate=4e-5,
        fp16=True,
        logging_steps=1,
        output_dir="outputs",
        optim="paged_adamw_8bit",
        report_to=report_to,
        # No eval dataset is passed and no checkpoints are saved, so there is no
        # "best" checkpoint to reload; `load_best_model_at_end` is left at its default.
        save_strategy="no",
    ),
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)
model.config.use_cache = False  # silence the warnings. Please re-enable for inference!

trainer.train()

2024-04-20 04:58:08.034898: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-20 04:58:08.034997: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-20 04:58:08.169472: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
max_steps is given, it will override any value given in num_train_epochs
[34m[1mwandb[0m: Currently logged in as: [33mbatprem[0m ([33mto-the-gold[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: wandb version 0.16.6 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade
[34m[1mwandb[0m: Tra

Step,Training Loss
1,2.7321
2,2.6015
3,2.4532
4,2.3515
5,2.2382
6,2.1187
7,2.1583
8,2.0566
9,2.0724
10,1.9851


TrainOutput(global_step=30, training_loss=2.0182278553644815, metrics={'train_runtime': 14382.0131, 'train_samples_per_second': 0.267, 'train_steps_per_second': 0.002, 'total_flos': 6.107959920132096e+16, 'train_loss': 2.0182278553644815, 'epoch': 0.020297055869760557})

## Export model

In [11]:
# If the trainer's model is wrapped for distributed/parallel training, unwrap it
# through its `module` attribute; otherwise use the model as is.
model_to_save = getattr(trainer.model, "module", trainer.model)
model_to_save.save_pretrained("outputs")

## Test model

In [12]:
from transformers import  StoppingCriteria, StoppingCriteriaList

class KeywordsStoppingCriteria(StoppingCriteria):
    """
    Stops generation when the decoded text ends with `</answer>` or when the
    generated ids end with one of the given keyword id sequences.

    Only the first sequence of the batch (`input_ids[0]`) is inspected. The
    decoded text is printed every 50 calls as a progress trace. Decoding uses
    the notebook-level `tokenizer`.
    """

    def __init__(self, keywords_ids:list):
        """
        Args:
            keywords_ids: Stop keywords as token ids. Each entry may be a
                single id or a sequence of ids (list, tuple or tensor).
        """
        self._i = 0  # number of calls that did not stop on the text check
        self.keywords = keywords_ids
        # Normalised copy: every keyword as a plain list of ints, so it can be
        # compared against the tail of the generated ids.
        self._keyword_seqs = [self._as_id_list(k) for k in keywords_ids]

    @staticmethod
    def _as_id_list(keyword) -> list:
        """Converts a single id, a list/tuple of ids or a tensor into a list of ints."""
        if hasattr(keyword, "tolist"):
            keyword = keyword.tolist()
        if isinstance(keyword, (list, tuple)):
            return [int(token_id) for token_id in keyword]
        return [int(keyword)]

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
        """Returns True when generation should stop, False otherwise."""
        text = tokenizer.decode(
                input_ids[0],
                skip_special_tokens=True
        )
        if text.strip().endswith("</answer>"):
            return True
        if self._i % 50 == 0:
            print(text)
            print("-" * 16)
        self._i += 1
        # Compare plain Python ints: testing a tensor scalar for membership in a
        # list of id lists does not compare token ids as intended.
        generated = input_ids[0].tolist()
        return any(
            seq and generated[-len(seq):] == seq
            for seq in self._keyword_seqs
        )

stop_words = ['</answer>']

# Encode each stop word without special tokens, so the id sequences contain
# only the keyword itself and no BOS/EOS markers added by the tokenizer.
stop_ids = [tokenizer.encode(w, add_special_tokens=False) for w in stop_words]
stop_criteria = KeywordsStoppingCriteria(stop_ids)

stopping_criteria = StoppingCriteriaList([stop_criteria])


# Inference prompt: the same instruction template that `parse` builds for training,
# with a sample patient question as the input and the response left open for the model.
text = """<s>Below is an instruction that describes a task. Write a response that appropriately completes the request.
### Instruction:
Act as a doctor and response the input question from a patient in Thai language with XML format <answer>Your answer</answer>
### Input:
คือผมอยากทราบว่า อาการที่ผมเป็นตอนนี้คือกรดไหลย้อน หรือ เป็นสัญญาณของพิษสุนัขบ้าครับ ผมมีอาการ เเน่นๆ อึดอัดที่คอ เเล้วก็ กลืนน้ำลายลำบากครับ ก่อนหน้านี้มีไข้ต่ำ ปวดหัวนิดหน่อยครับ ช่วง 1 เดือนก่อน ผมทำงานเดินทางโดยจักรยานครับ ทางผ่านมีสุนัขอยู่ตามทางเยอะมากๆ (เเต่จากที่เห็นไม่ได้เห่าเเละไล่ตามผมครับ) เเล้วมาพึ่งมาเป็นอาการดังกล่าวช่วงนี้ครับ ผมจึงไม่เเน่ใจว่าเป็นกรดไหลย้อนหรือเป็นสัญญานอาการเเรกเริ่มของพิษสุนัขบ้าหรอครับ ผมอ่านเเล้วเห็นอาการคล้ายๆกันครับคุณ รบกวนด้วยนะครับ
### Response:"""

device = "cuda:0"

# Generate with the adapter that was just trained (`model_to_save`). The model is
# not wrapped with `get_peft_model` again: that would attach a second, untrained
# adapter to the same underlying model instead of reusing the trained weights.
# Switch to eval mode so LoRA dropout is off, and re-enable the KV cache that was
# disabled for training.
model_to_save.eval()
model_to_save.config.use_cache = True

inputs = tokenizer(text, return_tensors="pt").to(device)
outputs = model_to_save.generate(
    **inputs, max_new_tokens=400,
    stopping_criteria=stopping_criteria,
    pad_token_id=tokenizer.eos_token_id,
    no_repeat_ngram_size=10,
    forced_eos_token_id=tokenizer.eos_token_id,
    do_sample=True,
    top_p=0.95
)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Below is an instruction that describes a task. Write a response that appropriately completes the request.
### Instruction:
Act as a doctor and response the input question from a patient in Thai language with XML format <answer>Your answer</answer>
### Input:
คือผมอยากทราบว่า อาการที่ผมเป็นตอนนี้คือกรดไหลย้อน หรือ เป็นสัญญาณของพิษสุนัขบ้าครับ ผมมีอาการ เเน่นๆ อึดอัดที่คอ เเล้วก็ กลืนน้ำลายลำบากครับ ก่อนหน้านี้มีไข้ต่ำ ปวดหัวนิดหน่อยครับ ช่วง 1 เดือนก่อน ผมทำงานเดินทางโดยจักรยานครับ ทางผ่านมีสุนัขอยู่ตามทางเยอะมากๆ (เเต่จากที่เห็นไม่ได้เห่าเเละไล่ตามผมครับ) เเล้วมาพึ่งมาเป็นอาการดังกล่าวช่วงนี้ครับ ผมจึงไม่เเน่ใจว่าเป็นกรดไหลย้อนหรือเป็นสัญญานอาการเเรกเริ่มของพิษสุนัขบ้าหรอครับ ผมอ่านเเล้วเห็นอาการคล้ายๆกันครับคุณ รบกวนด้วยนะครับ
### Response:

----------------
Below is an instruction that describes a task. Write a response that appropriately completes the request.
### Instruction:
Act as a doctor and response the input question from a patient in Thai language with XML format <answer>You

# (Optional) Push the trained model to your Hugging Face account

In [13]:
# Single place for the target repository name used by all three uploads.
hub_repo_id = "typhoon-med"

# Upload the trained model, then the tokenizer, then the model config.
model_to_save.push_to_hub(hub_repo_id)
tokenizer.push_to_hub(hub_repo_id)
model_to_save.config.push_to_hub(hub_repo_id)

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/batprem/typhoon-med/commit/fb93b27843b43d37a908777c6d13515fe174d741', commit_message='Upload config', commit_description='', oid='fb93b27843b43d37a908777c6d13515fe174d741', pr_url=None, pr_revision=None, pr_num=None)

In [14]:
# NOTE(review): this tokenizes the raw "text" column and rebinds `data`. Nothing after
# this cell uses the result, so it looks like a leftover — confirm and remove.
# Presumably it also replaces the tokenizer columns computed from "spec" earlier, so
# re-running the training cell afterwards would train on the raw text — TODO confirm.
data = data.map(lambda samples: tokenizer(samples["text"]), batched=True)

Map:   0%|          | 0/189190 [00:00<?, ? examples/s]