## Login to Hugging Face

In [1]:
from dotenv import load_dotenv
import os
from huggingface_hub import login

load_dotenv()
token = os.getenv("HUGGINGFACE_TOKEN")
login(
    token=token,  # ADD YOUR TOKEN HERE
    add_to_git_credential=True
)

Token is valid (permission: write).
Your token has been saved in your configured git credential helpers (store).
Your token has been saved to /home/pathfinder/.cache/huggingface/token
Login successful


In [2]:
model_name = "Waktaverse-Llama-3-KO-8B-Instruct"  # ADD YOUR MODEL NAME HERE
username = "PathFinderKR"  # ADD YOUR USERNAME HERE
repo_id = f"{username}/{model_name}"  # repository id

## Login to Weights & Biases

In [3]:
import wandb


api_key = os.getenv("WANDB_API_KEY")
wandb.login(
    key=api_key  # ADD YOUR API KEY HERE
)
wandb.init(project=model_name)

[34m[1mwandb[0m: Currently logged in as: [33mpathfinderkr[0m ([33mwaktaverse[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /home/pathfinder/.netrc


## Downloads

In [4]:
#!pip install huggingface_hub
#!pip install wandb
#!pip install transformers
#!pip install bitsandbytes
#!pip install peft
#!pip install trl
#!pip install accelerate
#!pip install datasets
#!pip install scikit-learn
#!pip install packaging
#!pip install ninja
#!pip install flash-attn --no-build-isolation

## Imports

In [5]:
from IPython.display import display, Markdown

# pytorch
import torch

# huggingface
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    TrainingArguments
)
from peft import LoraConfig, PeftModel
from trl import SFTTrainer, DataCollatorForCompletionOnlyLM

# datasets
from datasets import load_dataset

## Device

In [6]:
# Device setup
device = (
    "cuda:0" if torch.cuda.is_available() else # Nvidia GPU
    "mps" if torch.backends.mps.is_available() else # Apple Silicon GPU
    "cpu"
)
print(f"Device = {device}")

Device = cuda:0


In [7]:
# Flash Attention Implementation
if device == "cuda:0":
    if torch.cuda.get_device_capability()[0] >= 8: # Ampere, Ada, or Hopper GPUs
        attn_implementation = "flash_attention_2"
        torch_dtype = torch.bfloat16
    else:
        attn_implementation = "eager"
        torch_dtype = torch.float16
else:
    attn_implementation = "eager"
    torch_dtype = torch.float32
print(f"Attention Implementation = {attn_implementation}")

Attention Implementation = flash_attention_2


## Hyperparameters

In [8]:
################################################################################
# seed
################################################################################
seed=42
torch.manual_seed(seed)

################################################################################
# Tokenizer parameters
################################################################################
max_length=1024
padding="do_not_pad" # "max_length", "longest", "do_not_pad"
truncation=True

################################################################################
# Generation parameters
################################################################################
num_return_sequences=1
min_new_tokens=1
max_new_tokens=1024
do_sample=True # True for sampling, False for greedy decoding
temperature=0.6
top_k=40
top_p=0.9
repetition_penalty=1.1

################################################################################
# Dataset parameters
################################################################################
validation_size=0.1

################################################################################
# bitsandbytes parameters
################################################################################
load_in_4bit=True
bnb_4bit_compute_dtype=torch_dtype
bnb_4bit_quant_type="nf4" # "nf4", #fp4"
bnb_4bit_use_double_quant=True

################################################################################
# LoRA parameters
################################################################################
task_type="CAUSAL_LM"
target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
r=8
lora_alpha=16
lora_dropout=0.05
bias="none"

################################################################################
# TrainingArguments parameters
################################################################################
output_dir="./results"
logging_dir="./logs"
save_strategy="epoch" # "steps", "epoch"
logging_strategy="steps" # "steps", "epoch"
if logging_strategy == "steps":
    logging_steps=10
else:
    logging_steps=None
evaluation_strategy="steps" # "steps", "epoch"
if evaluation_strategy == "steps":
    eval_steps=10
else:
    eval_steps=None
save_total_limit=1
report_to="wandb"

learning_rate=2e-5
num_train_epochs=1
per_device_train_batch_size=1
per_device_eval_batch_size=2
gradient_accumulation_steps=4
optim="adamw_torch" # "sgd", "adamw_torch"
weight_decay=0.1
lr_scheduler_type="cosine" # "constant", "linear", "cosine"
warmup_steps=10
warmup_ratio=0.1

################################################################################
# SFT parameters
################################################################################
max_seq_length=1024
packing=True # packing not supported for masking instructions

## Model

In [9]:
# Model List

# gemma variants
# "google/gemma-1.1-7b-it"
# "google/codegemma-7b-it"

# llama2 variants
# "meta-llama/Meta-Llama-3-8B-Instruct" // downloaded
# "codellama/CodeLlama-7b-Instruct-hf"
# "PathFinderKR/Waktaverse-Llama-3-KO-8B-Instruct"

# mistral variants
# "mistralai/Mistral-7B-Instruct-v0.2"

# solar variants
# "upstage/SOLAR-10.7B-Instruct-v1.0" // downloaded
# "PathFinderKR/Waktaverse-SOLAR-KO-10.7B-Instruct"

In [10]:
# Model ID for base model
model_id = "meta-llama/Meta-Llama-3-8B-Instruct"

In [11]:
# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.add_special_tokens({'pad_token': '<|pad_id|>'}) # add padding token
tokenizer.padding_side = "right"

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [12]:
# Quantization
quantization_config = BitsAndBytesConfig(
    load_in_4bit=load_in_4bit,
    bnb_4bit_compute_dtype=bnb_4bit_compute_dtype,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_use_double_quant=bnb_4bit_use_double_quant
)

In [13]:
# Load model
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map=device,
    attn_implementation=attn_implementation,
    torch_dtype=torch_dtype,
    quantization_config=quantization_config
)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [14]:
# setup chat format, remove this if the model already has chat format
#from trl import setup_chat_format
#model, tokenizer = setup_chat_format(model, tokenizer)

In [15]:
# display the model architecture
display(Markdown(f'```{model}```'))

```LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaFlashAttention2(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear4bit(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    (norm): LlamaRMSNorm()
  )
  (lm_head): Linear(in_features=4096, out_features=128256, bias=False)
)```

## Dataset

In [16]:
# Dataset ID
dataset_id = "MarkrAI/KoCommercial-Dataset"

In [17]:
# Load the dataset
dataset = load_dataset(dataset_id)

In [18]:
# Dataset information
dataset

DatasetDict({
    train: Dataset({
        features: ['input', 'instruction', 'output'],
        num_rows: 175454
    })
})

In [19]:
# Dataset example
dataset["train"][0]

{'input': '',
 'instruction': '보드 게임 스피너는 $A$, $B$, $C$로 표시된 세 부분으로 나뉩니다. 스피너가 $A$에 떨어질 확률은 $\\frac{1}{3}$이고, 스피너가 $B$에 떨어질 확률은 $\\frac{5}{12}$입니다.  스피너가 $C$에 착륙할 확률은 얼마입니까? 답을 공통 분수로 표현하세요.',
 'output': '모든 가능한 결과의 확률의 합이 1$이므로, 스피너가 $C$에 착륙할 확률을 구하려면 스피너가 $A$와 $B$에 착륙할 확률을 1$에서 빼야 합니다. 이를 방정식으로 쓸 수 있습니다: $P(C) = 1 - P(A) - P(B)$. P(A) = \\frac{1}{3}$, $P(B) = \\frac{5}{12}$라는 것을 알고 있으므로 이 값을 방정식에 대입하여 단순화할 수 있습니다. 결과는 다음과 같습니다: P(C) = 1 - \\frac{1}{3} - frac{5}{12} = \\frac{12}{12} - frac{4}{12} - frac{5}{12} = \\frac{3}{12}$. 분자와 분모를 $3$로 나누면 이 분수를 줄일 수 있습니다: P(C) = \\frac{1}{4}$입니다.'}

## Preprocessing

In [20]:
# Split the dataset into a training and a validation dataset
dataset = dataset["train"].train_test_split(test_size=validation_size, seed=seed)

# Number of questions in the train, validation dataset
print(f"Number of questions in the train dataset: {len(dataset['train'])}")
print(f"Number of questions in the validation dataset: {len(dataset['test'])}")

Number of questions in the train dataset: 157908
Number of questions in the validation dataset: 17546


In [21]:
# Dataset examples
print(dataset["train"][0]["instruction"])
print(dataset["train"][0]["input"])
print(dataset["train"][0]["output"])

경산경찰서는 경산시 일대를 관할하나요?

경산경찰서는 경상북도 경산시 일대를 관할하며, 경산시 원효로 68(계양동503번지)에 위치해 있습니다. 경산경찰서는 1개의 지구대와 7개의 파출소를 운영하고 있으며, 각 파출소들은 치안센터, 자인파출소, 남산치안센터, 진량파출소, 하양파출소, 청천치안센터, 압량파출소, 와촌파출소로 구성되어 있습니다.


In [22]:
print(dataset["test"][0]["instruction"])
print(dataset["test"][0]["input"])
print(dataset["test"][0]["output"])

정진영은 어떤 분야에서 활동을 했나요?

정진영은 1964년 11월 19일에 태어난 대한민국의 배우로, 1988년 뮤지컬 배우로 데뷔했고 1989년 연극 배우로 데뷔했다. 그는 30년 동안 깊이 있는 연기력으로 관객들의 사랑을 받았다. 그의 대표적인 작품으로는 '왕의 남자', '7번방의 선물', '국제시장' 등이 있다. 또한 연극, TV 프로그램, 영화 등 다양한 매체에서 활약했으며, 여러 상을 수상했다.


In [23]:
# Train on only a subset of the dataset for demonstration purposes
dataset["train"] = dataset["train"].shuffle(seed=seed).select(range(10000))
dataset["test"] = dataset["test"].shuffle(seed=seed).select(range(1000))

## Inference before Fine-Tuning

In [24]:
def generate_response(system ,user):
    messages = [
        {"role": "system", "content": system},
        {"role": "user", "content": user}
    ]
    prompt = tokenizer.apply_chat_template(
        messages, 
        tokenize=False, 
        add_generation_prompt=False
    )
    
    input_ids = tokenizer.encode(
        prompt,
        max_length=max_length,
        padding=padding,
        truncation=truncation,
        add_special_tokens=True,
        return_tensors="pt"
    ).to(device)
    
    outputs = model.generate(
        input_ids=input_ids,
        pad_token_id=tokenizer.eos_token_id,
        num_return_sequences=num_return_sequences,
        min_new_tokens=min_new_tokens,
        max_new_tokens=max_new_tokens,
        do_sample=do_sample,
        temperature=temperature,
        top_k=top_k,
        top_p=top_p,
        repetition_penalty=repetition_penalty
    )
    
    return tokenizer.decode(outputs[0], skip_special_tokens=False)

In [25]:
system_prompt = "You are a helpful assistant. Respond to the following user prompt. Use Korean only. 한국어만 사용하세요."

In [26]:
#user_prompt = "Write me a poem about Machine Learning."
user_prompt = "머신러닝에 대한 시를 써주세요."

In [27]:
response = generate_response(system_prompt, user_prompt)
print(response)

<|begin_of_text|><|begin_of_text|><|start_header_id|>system<|end_header_id|>

You are a helpful assistant. Respond to the following user prompt. Use Korean only. 한국어만 사용하세요.<|eot_id|><|start_header_id|>user<|end_header_id|>

머신러닝에 대한 시를 써주세요.<|eot_id|><|start_header_id|>assistant<|end_header_id|>

머신 러닝의 노래

머신 러닝의 세계에서 우리는
데이터를 모아가며 살아간다
가치 있는 패턴을 찾아내고
미래를 예측하는 꿈을 꾼다

머신 러닝의 힘은
데이터의 무한한 가치를 높인다
추론하고 예측하고 학습하고
새로운 것을 찾아내는 그곳에선

머신 러닝의 세계에서 우리는
지혜와 지식을 얻는 길을 찾는다
정보의海에 헤엄을 칠 때마다
새로운 것을 만나게 된다

머신 러닝의 노래는 끝나지 않아요
계속되는 노래로 새로운 것을 찾아내요
머신 러닝의 세계에서 우리는
꿈과 희망을 향해 간다

(Translation:

Song of Machine Learning

In the world of machine learning, we live amidst data
Gathering and living with it, finding valuable patterns
And dreaming of predicting the future

The power of machine learning is
Increasing the value of data without bounds
Inferencing, predicting, and learning, we find new things

In the world of machine learning, we find our way
To wisdom and knowledge
As we swim in the sea of informat

## Supervised Fine-Tuning (LoRA)

In [28]:
# Alpaca dataset format: 
# {"instruction": [str],
#   "input": [str],
#   "output": [str]}

def prompt_no_input(example):
    prompt = (
        "Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n"
        
        "### Instruction: \n"
        f"{example['instruction']}\n\n"
        
        "### Response: \n"
    )
    return prompt

def prompt_with_input(example):
    prompt = (
        "Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n"
        
        "### Instruction: \n"
        f"{example['instruction']}\n\n"
        
        "### Input: \n"
        f"{example['input']}\n\n"
        
        "### Response: \n"
    )
    return prompt

In [29]:
def formatting_func(example):
    if "input" in example:
        return prompt_with_input(example)
    else:
        return prompt_no_input(example)

In [30]:
lora_config = LoraConfig(
    task_type=task_type,
    target_modules=target_modules,
    r=r,
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    bias=bias
)

In [31]:
training_args = TrainingArguments(
    output_dir=output_dir,
    logging_dir=logging_dir,
    save_strategy=save_strategy,
    logging_strategy=logging_strategy,
    logging_steps=logging_steps,
    evaluation_strategy=evaluation_strategy,
    eval_steps=eval_steps,
    save_total_limit=save_total_limit,
    report_to=report_to,
    
    learning_rate=learning_rate,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=per_device_train_batch_size,
    per_device_eval_batch_size=per_device_eval_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    optim=optim,
    weight_decay=weight_decay,
    lr_scheduler_type=lr_scheduler_type,
    warmup_steps=warmup_steps,
    warmup_ratio=warmup_ratio,
    seed=seed
)

In [32]:
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    peft_config=lora_config,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    formatting_func=formatting_func,
    max_seq_length=max_seq_length,
    packing=packing
)

Generating train split: 0 examples [00:00, ? examples/s]

In [33]:
trainer.train()

The input hidden states seems to be silently casted in float32, this might be related to the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in torch.bfloat16.


Step,Training Loss,Validation Loss
10,1.7516,1.702056
20,1.6266,1.474596
30,1.4706,1.329012
40,1.3148,1.262907
50,1.2516,1.221031
60,1.2491,1.184153
70,1.1824,1.149721
80,1.104,1.120406
90,1.0994,1.102488
100,1.106,1.090456


TrainOutput(global_step=177, training_loss=1.2137212699415993, metrics={'train_runtime': 27993.9435, 'train_samples_per_second': 0.025, 'train_steps_per_second': 0.006, 'total_flos': 3.2737287192182784e+16, 'train_loss': 1.2137212699415993, 'epoch': 1.0})

In [34]:
wandb.finish()
trainer.save_model(model_name)

VBox(children=(Label(value='0.003 MB of 0.003 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
eval/loss,█▆▄▃▃▂▂▂▁▁▁▁▁▁▁▁▁
eval/runtime,▁▄▄▂▂▃▅▄█▂▆▃▇▄▅▃▄
eval/samples_per_second,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
eval/steps_per_second,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/epoch,▁▁▁▁▂▂▂▂▃▃▃▃▄▄▄▄▄▄▅▅▅▅▆▆▆▆▆▆▇▇▇▇███
train/global_step,▁▁▁▁▂▂▂▂▃▃▃▃▄▄▄▄▄▄▅▅▅▅▆▆▆▆▆▆▇▇▇▇███
train/grad_norm,█▃▂▁▂▁▁▁▁▂▂▁▂▃▂▂▂
train/learning_rate,███▇▇▇▆▅▅▄▃▃▂▂▁▁▁
train/loss,█▇▅▄▃▃▃▂▂▂▂▂▁▂▁▂▂

0,1
eval/loss,1.06419
eval/runtime,726.9115
eval/samples_per_second,0.1
eval/steps_per_second,0.051
total_flos,3.2737287192182784e+16
train/epoch,1.0
train/global_step,177.0
train/grad_norm,0.8222
train/learning_rate,0.0
train/loss,1.1001


## Inference after Fine-Tuning

In [35]:
system_prompt = "You are a helpful assistant. Respond to the following user prompt. Use Korean only. 한국어만 사용하세요."

In [36]:
#user_prompt = "Write me a poem about Machine Learning."
user_prompt = "머신러닝에 대한 시를 써주세요."

In [37]:
response = generate_response(system_prompt, user_prompt)
print(response)

<|begin_of_text|><|begin_of_text|><|start_header_id|>system<|end_header_id|>

You are a helpful assistant. Respond to the following user prompt. Use Korean only. 한국어만 사용하세요.<|eot_id|><|start_header_id|>user<|end_header_id|>

머신러닝에 대한 시를 써주세요.<|eot_id|><|start_header_id|>assistant<|end_header_id|>

이 요청을 이해하지 못했어요. "시"가 무엇을 의미하는지 궁금합니다. 머신 러닝에 대한 시를 작성하라는 요구인가요? 또는 다른 의미일지도 모릅니다. 더 많은 정보를 필요로 합니다. 

이 질문에 답하기 위해 추가 정보를 제공해 주세요.<|eot_id|>


## Upload Model

In [38]:
# Flush memory
import gc
gc.collect()

del trainer, model
torch.cuda.empty_cache()

In [39]:
# Reload model in FP16 and merge it with LoRA weights
base_model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map=device,
    torch_dtype=torch.float16
)
model = PeftModel.from_pretrained(base_model, model_name)
model = model.merge_and_unload()

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/187 [00:00<?, ?B/s]

In [40]:
# Reload tokenizer to save it
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [41]:
# Push model and tokenizer to Hugging Face Hub
model.push_to_hub(
    repo_id=repo_id,
    use_temp_dir=False
)
tokenizer.push_to_hub(
    repo_id=repo_id,
    use_temp_dir=False
)

Upload 4 LFS files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/PathFinderKR/Waktaverse-Llama-3-KO-8B-Instruct/commit/4f252bb1e6e955f50bb54bc8a532a2a8651fc272', commit_message='Upload tokenizer', commit_description='', oid='4f252bb1e6e955f50bb54bc8a532a2a8651fc272', pr_url=None, pr_revision=None, pr_num=None)