## Login to Hugging Face

In [1]:
from dotenv import load_dotenv
import os
from huggingface_hub import login

load_dotenv()
token = os.getenv("HUGGINGFACE_TOKEN")
login(
    token=token, # ADD YOUR TOKEN HERE
    add_to_git_credential=True
)

Token is valid (permission: write).
Your token has been saved in your configured git credential helpers (store).
Your token has been saved to /home/pathfinder/.cache/huggingface/token
Login successful


In [2]:
model_name = "Waktaverse-Llama-3-KO-8B-Instruct"  # ADD YOUR MODEL NAME HERE
username = "PathFinderKR"  # ADD YOUR USERNAME HERE
repo_id = f"{username}/{model_name}"  # repository id

## Downloads

In [3]:
#!pip install huggingface_hub
#!pip install transformers
#!pip install bitsandbytes
#!pip install peft
#!pip install trl
#!pip install accelerate
#!pip install datasets
#!pip install scikit-learn
#!pip install packaging
#!pip install ninja
#!pip install flash-attn --no-build-isolation

## Imports

In [4]:
from IPython.display import display, Markdown

# pytorch
import torch

# huggingface
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    TrainingArguments
)
from peft import LoraConfig, PeftModel
from trl import SFTTrainer

# datasets
from datasets import load_dataset

## Device

In [5]:
device = (
    "cuda:0" if torch.cuda.is_available() else # Nvidia GPU
    "mps" if torch.backends.mps.is_available() else # Apple Silicon GPU
    "cpu"
)
print(f"Device = {device}")

Device = cuda:0


## Hyperparameters

In [6]:
# seed
seed=42

# Tokenizer arguments
max_length=64
padding="max_length"
truncation=True

# model arguments
min_new_tokens=1
max_new_tokens=128
temperature=0.8
top_k=40
top_p=0.9
repetition_penalty=1.1

# validation split
validation_size=0.1

# mixed precision
dtype=torch.bfloat16

# quantization configuration
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=dtype,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_doulbe_quant=False
)

# LoRA configuration
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=16,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none"
)

# training arguments
training_args = TrainingArguments(
    output_dir="./results",
    logging_dir="./logs",
    save_strategy="epoch",
    logging_strategy="steps",
    logging_steps=10,
    evaluation_strategy="steps",
    save_total_limit=1,
    
    learning_rate=2e-5,
    num_train_epochs=2,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    optim="adamw_torch",
    weight_decay=0.1,
    lr_scheduler_type="cosine",
    seed=seed
)

# SFTTrainer arguments
max_seq_length=512

## Model

In [7]:
# Model List

# gemma variants
# "google/gemma-1.1-7b-it"
# "google/codegemma-7b-it"

# llama2 variants
# "meta-llama/Meta-Llama-3-8B-Instruct" // downloaded
# "codellama/CodeLlama-7b-Instruct-hf"
# "PathFinderKR/Waktaverse-Llama-3-KO-8B-Instruct"

# mistral variants
# "mistralai/Mistral-7B-Instruct-v0.2"

# solar variants
# "upstage/SOLAR-10.7B-Instruct-v1.0" // downloaded
# "PathFinderKR/Waktaverse-SOLAR-KO-10.7B-Instruct"

In [8]:
model_id = "meta-llama/Meta-Llama-3-8B-Instruct"

In [9]:
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [10]:
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map=device,
    attn_implementation="flash_attention_2",
    torch_dtype=dtype,
    quantization_config=quantization_config
)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [11]:
# display the model architecture
display(Markdown(f'```{model}```'))

```LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaFlashAttention2(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear4bit(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    (norm): LlamaRMSNorm()
  )
  (lm_head): Linear(in_features=4096, out_features=128256, bias=False)
)```

## Dataset

In [12]:
dataset = load_dataset("MarkrAI/KoCommercial-Dataset")

In [13]:
# display the dataset information
display(Markdown(f'```{dataset}```'))

```DatasetDict({
    train: Dataset({
        features: ['input', 'instruction', 'output'],
        num_rows: 175454
    })
})```

In [14]:
# display the first example in the dataset
display(Markdown(f'```{dataset["train"][0]}```'))

```{'input': '', 'instruction': '보드 게임 스피너는 $A$, $B$, $C$로 표시된 세 부분으로 나뉩니다. 스피너가 $A$에 떨어질 확률은 $\\frac{1}{3}$이고, 스피너가 $B$에 떨어질 확률은 $\\frac{5}{12}$입니다.  스피너가 $C$에 착륙할 확률은 얼마입니까? 답을 공통 분수로 표현하세요.', 'output': '모든 가능한 결과의 확률의 합이 1$이므로, 스피너가 $C$에 착륙할 확률을 구하려면 스피너가 $A$와 $B$에 착륙할 확률을 1$에서 빼야 합니다. 이를 방정식으로 쓸 수 있습니다: $P(C) = 1 - P(A) - P(B)$. P(A) = \\frac{1}{3}$, $P(B) = \\frac{5}{12}$라는 것을 알고 있으므로 이 값을 방정식에 대입하여 단순화할 수 있습니다. 결과는 다음과 같습니다: P(C) = 1 - \\frac{1}{3} - frac{5}{12} = \\frac{12}{12} - frac{4}{12} - frac{5}{12} = \\frac{3}{12}$. 분자와 분모를 $3$로 나누면 이 분수를 줄일 수 있습니다: P(C) = \\frac{1}{4}$입니다.'}```

## Preprocessing

In [15]:
def preprocess_function(examples):
    # Concatenate the 'instruction' and 'output' fields for each example in the batch
    concatenated_texts = [instruction + ' ' + output for instruction, output in zip(examples['instruction'], examples['output'])]
    # Tokenize the concatenated texts
    return tokenizer(concatenated_texts, padding=padding, truncation=truncation, max_length=max_length)

dataset = dataset.map(preprocess_function, batched=True)

In [16]:
# Split the dataset into a training and a validation dataset
dataset = dataset["train"].train_test_split(test_size=validation_size, seed=seed)

# Number of questions in the train, validation dataset
print(f"Number of questions in the train dataset: {len(dataset['train'])}")
print(f"Number of questions in the validation dataset: {len(dataset['test'])}")

Number of questions in the train dataset: 157908
Number of questions in the validation dataset: 17546


In [17]:
# display the first example in the train dataset
display(Markdown(f'```{dataset["train"][0]}```'))

```{'input': '', 'instruction': '경산경찰서는 경산시 일대를 관할하나요?', 'output': '경산경찰서는 경상북도 경산시 일대를 관할하며, 경산시 원효로 68(계양동503번지)에 위치해 있습니다. 경산경찰서는 1개의 지구대와 7개의 파출소를 운영하고 있으며, 각 파출소들은 치안센터, 자인파출소, 남산치안센터, 진량파출소, 하양파출소, 청천치안센터, 압량파출소, 와촌파출소로 구성되어 있습니다.', 'input_ids': [66406, 86157, 66406, 108168, 118135, 44215, 86157, 30426, 84656, 124784, 93851, 48936, 16582, 114067, 30, 44215, 86157, 66406, 108168, 118135, 44215, 57002, 126008, 44215, 86157, 30426, 84656, 124784, 93851, 48936, 108859, 11, 44215, 86157, 30426, 102467, 111966, 17835, 220, 2614, 7, 101015, 101927, 58189, 17735, 43144, 22035, 121055, 100087, 34983, 103924, 13, 44215, 86157, 66406, 108168, 118135, 220, 16, 123590, 67890, 89359, 67945, 81673], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}```

In [18]:
# display the first example in the validation dataset
display(Markdown(f'```{dataset["test"][0]}```'))

```{'input': '', 'instruction': '정진영은 어떤 분야에서 활동을 했나요?', 'output': "정진영은 1964년 11월 19일에 태어난 대한민국의 배우로, 1988년 뮤지컬 배우로 데뷔했고 1989년 연극 배우로 데뷔했다. 그는 30년 동안 깊이 있는 연기력으로 관객들의 사랑을 받았다. 그의 대표적인 작품으로는 '왕의 남자', '7번방의 선물', '국제시장' 등이 있다. 또한 연극, TV 프로그램, 영화 등 다양한 매체에서 활약했으며, 여러 상을 수상했다.", 'input_ids': [30381, 86351, 101090, 34804, 112700, 127290, 57575, 114291, 18359, 107762, 114067, 30, 37155, 86351, 101090, 34804, 220, 5162, 19, 100392, 220, 806, 100551, 220, 777, 124521, 106891, 32179, 103777, 110342, 21028, 116921, 17835, 11, 220, 3753, 23, 100392, 5251, 112800, 22035, 121735, 116921, 17835, 103659, 111368, 242, 122196, 220, 3753, 24, 100392, 78453, 110616, 116921, 17835, 103659, 111368, 242, 101528, 13, 108154, 220, 966], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}```

## Prompt Engineering

In [19]:
# Chat Template
system_prompt = "You are a poet. Write a poem about the following topic. Use Korean Only."

In [20]:
def generate_response(user_prompt):
    prompt = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt }
    ]
    prompt = tokenizer.apply_chat_template(prompt, tokenize=False, add_generation_prompt=True)
    input_ids = tokenizer.encode(prompt, add_special_tokens=True, return_tensors="pt").to(device)
    outputs = model.generate(
        input_ids=input_ids,
        pad_token_id=tokenizer.eos_token_id,
        min_new_tokens=min_new_tokens,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        temperature=temperature,
        top_k=top_k,
        top_p=top_p,
        repetition_penalty=repetition_penalty
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=False)

## Inference before Fine-Tuning

In [21]:
#message = "Write me a poem about Machine Learning."
message = "머신러닝에 대한 시를 써주세요."

In [22]:
response = generate_response(message)
print(response)

<|begin_of_text|><|start_header_id|>system<|end_header_id|>

You are a poet. Write a poem about the following topic. Use Korean Only.<|eot_id|><|start_header_id|>user<|end_header_id|>

머신러닝에 대한 시를 써주세요.<|eot_id|><|start_header_id|>assistant<|end_header_id|>

 AI의 고단한 노래

가상 세계 속에서 춤을 추는 것 같아
머신러닝의 정령은 숨어 있는 것 같아
데이터의 흐름에 따라 움직이는 것 같아
지식의 결을 맺는 것 같아

이 세상에는 비밀스러운 힘들이 있지만
머신러닝의 손길에 의해 그늘로 덮힌 것 같아
성과도 없고, 부도 없는 것 같아
만약에 실패하면 다시 시작하는 것 같아

AI의 시련을 견디시고 싶다


## Supervised Fine-Tuning (LoRA)

In [23]:
def formatting_func(example):
    text = (f"instruction: {example['instruction'][0]}\n"
            f"output: {example['output'][0]}")
    return [text]

In [24]:
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    peft_config=lora_config,
    max_seq_length=max_seq_length,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    formatting_func=formatting_func
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [25]:
trainer.train()

The input hidden states seems to be silently casted in float32, this might be related to the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in torch.bfloat16.


Step,Training Loss,Validation Loss
10,2.4223,2.247169
20,2.3829,2.176457
30,2.1638,2.111722
40,2.1574,2.055972
50,2.0543,2.004975
60,2.0708,1.973776
70,1.9697,1.95458
80,2.0508,1.935925
90,2.2104,1.913988
100,2.053,1.902555


TrainOutput(global_step=316, training_loss=1.9584842905213562, metrics={'train_runtime': 663.3519, 'train_samples_per_second': 0.476, 'train_steps_per_second': 0.476, 'total_flos': 4038359100162048.0, 'train_loss': 1.9584842905213562, 'epoch': 2.0})

In [26]:
trainer.save_model(model_name)

## Inference after Fine-Tuning

In [27]:
#message = "Write me a poem about Machine Learning."
message = "머신러닝에 대한 시를 써주세요."

In [28]:
response = generate_response(message)
print(response)

<|begin_of_text|><|start_header_id|>system<|end_header_id|>

You are a poet. Write a poem about the following topic. Use Korean Only.<|eot_id|><|start_header_id|>user<|end_header_id|>

머신러닝에 대한 시를 써주세요.<|eot_id|><|start_header_id|>assistant<|end_header_id|>

기계학습의 숲 속에서
로봇들이 배운다네
자율주행차는 어디로 간단하오?
사람들은 그려져 있네

알고리즘의 기초를 다지게 하라
데이터들을 모아보자
이동하는 목표가 무엇인지 파악하게 하라
성공을 거둘 수 있도록 보조자로 나서자

물리학과 인공지능이 함께 춤을 출라
AI의 발명은 인류에게도 알려졌다
인간의 머릿속에서 어떤 생각을 하


## Upload Model

In [29]:
# Reload model in FP16 and merge it with LoRA weights
base_model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map=device,
    torch_dtype=torch.float16,
    trust_remote_code=True
)
model = PeftModel.from_pretrained(base_model, model_name)
model = model.merge_and_unload()

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [30]:
# Reload tokenizer to save it
tokenizer = AutoTokenizer.from_pretrained(
    model_id, 
    trust_remote_code=True
)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [31]:
# Push model and tokenizer to Hugging Face Hub
model.push_to_hub(
    repo_id=repo_id,
    use_temp_dir=False
)
tokenizer.push_to_hub(
    repo_id=repo_id,
    use_temp_dir=False
)

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Upload 4 LFS files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/PathFinderKR/Waktaverse-Llama-3-KO-8B-Instruct/commit/19f5b563d383dfc101af1d562698b58dc83adbe4', commit_message='Upload tokenizer', commit_description='', oid='19f5b563d383dfc101af1d562698b58dc83adbe4', pr_url=None, pr_revision=None, pr_num=None)