# Importing Libraries

In [1]:
import os
import random
import numpy as np
from dotenv import load_dotenv
from dataclasses import dataclass

# pytorch
import torch

# huggingface
import huggingface_hub
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, TrainingArguments
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model
from trl import SFTTrainer
from datasets import load_dataset

# wandb
import wandb

# Hyperparameters

In [None]:
@dataclass
class CONFIG:
    debug: bool = False
    
    # Model
    model_id: str = "meta-llama/Meta-Llama-3-8B-Instruct"
    username: str = "PathFinderKR"
    repo_id: str = f"{username}/{model_id}"
    
    # Dataset
    dataset_id: str = "MarkrAI/KoCommercial-Dataset"
    
    # Tokenizer parameters
    max_length: int = 8192
    padding: str = "do_not_pad"
    truncation: bool = True
    
    # Generation parameters
    num_return_sequences: int = 1
    max_new_tokens: int = 1024
    do_sample: bool = True
    temperature: float = 0.6
    top_p: float = 0.9
    repetition_penalty: float = 1.1
    
    # Device
    device: torch.device = None
    attn_implementation: str = None
    
    # bitsandbytes parameters
    load_in_4bit: bool = True
    bnb_4bit_compute_dtype: torch.dtype = torch.bfloat16
    bnb_4bit_quant_type: str = "nf4"
    bnb_4bit_use_double_quant: bool = True
    
    # LoRA parameters
    task_type: str = "CAUSAL_LM"
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
    r: int = 8
    lora_alpha: int = 16
    lora_dropout: float = 0.05
    bias: str = "none"
    
    # Training parameters
    output_dir: str = "./results"
    logging_dir: str = "./logs"
    save_strategy: str = "epoch"
    logging_strategy: str = "steps"
    logging_steps: int = 10
    save_total_limit: int = 1
    report_to: str = "wandb"
    
    num_train_epochs: int = 1
    per_device_train_batch_size: int = 1
    gradient_accumulation_steps: int = 1
    gradient_checkpointing: bool = True
    bf16: bool = True
    learning_rate: float = 2e-5
    lr_scheduler_type: str = "cosine"
    warmup_ratio: float = 0.1
    optim: str = "paged_adamw_32bit"
    weight_decay: float = 0.01
    
    # SFT parameters
    max_seq_length: int = 4096
    packing: bool = True
    
    # Seed
    seed: int = 101

# Reproducibility

In [None]:
def set_seed(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True
    print(f"Seed: {seed}")
    
set_seed(CONFIG.seed)

# Device

In [None]:
def configure_device():
    if torch.cuda.is_available():
        device = torch.device("cuda")
        num_gpu = torch.cuda.device_count()
        print("> Running on GPU", end=' | ')
        print("Num of GPUs: ", num_gpu)
    elif torch.backends.mps.is_available():
        device = torch.device("mps")
        print("> Running on MPS")
    else:
        device = torch.device("cpu")
        print("> Running on CPU")
    return device

CONFIG.device = configure_device()

In [None]:
def configure_attn_implementation(device):
    if device == "cuda":
        if torch.cuda.get_device_capability()[0] >= 8: # Ampere, Ada, or Hopper GPUs
            attn_implementation = "flash_attention_2"
            torch_dtype = torch.bfloat16
        else:
            attn_implementation = "eager"
            torch_dtype = torch.float16
    else:
        attn_implementation = "eager"
        torch_dtype = torch.float32
    return attn_implementation, torch_dtype

CONFIG.attn_implementation, CONFIG.torch_dtype = configure_attn_implementation(CONFIG.device)

# Debug

In [None]:
if CONFIG.debug:
    CONFIG.num_train_epochs = 1

# Hugging Face

In [2]:
if not CONFIG.debug:
    load_dotenv()
    token = os.getenv("HUGGINGFACE_TOKEN")
    huggingface_hub.login(token=token, add_to_git_credential=True)

Token is valid (permission: write).
Your token has been saved in your configured git credential helpers (store).
Your token has been saved to /home/pathfinder/.cache/huggingface/token
Login successful


# Weights & Biases

In [4]:
if not CONFIG.debug:
    api_key = os.getenv("WANDB_API_KEY")
    wandb.login(key=api_key)
    wandb.init(project=CONFIG.model_id)

[34m[1mwandb[0m: Currently logged in as: [33mpathfinderkr[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /home/pathfinder/.netrc


# Tokenizer

In [9]:
tokenizer = AutoTokenizer.from_pretrained(CONFIG.model_id)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [10]:
tokenizer.pad_token = tokenizer.eos_token

In [11]:
print(f"Vocabulary size: {len(tokenizer)}")
print(f"Special tokens: {tokenizer.special_tokens_map}")
print(f"Padding side: {tokenizer.padding_side}")

Vocabulary size: 128256
Special tokens: {'bos_token': '<|begin_of_text|>', 'eos_token': '<|eot_id|>', 'pad_token': '<|eot_id|>'}
Padding side: right


# Model

In [13]:
quantization_config = BitsAndBytesConfig(
    load_in_4bit=CONFIG.load_in_4bit,
    bnb_4bit_compute_dtype=CONFIG.bnb_4bit_compute_dtype,
    bnb_4bit_quant_type=CONFIG.bnb_4bit_quant_type,
    bnb_4bit_use_double_quant=CONFIG.bnb_4bit_use_double_quant
)

In [14]:
model = AutoModelForCausalLM.from_pretrained(
    CONFIG.model_id,
    device_map=CONFIG.device,
    attn_implementation=CONFIG.attn_implementation,
    torch_dtype=CONFIG.torch_dtype,
    quantization_config=quantization_config,
    low_cpu_mem_usage=True
)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [15]:
print(model)
print(f"Number of parameters: {model.num_parameters() / 1e9:.2f}")

```LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaFlashAttention2(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear4bit(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    (norm): LlamaRMSNorm()
  )
  (lm_head): Linear(in_features=4096, out_features=128256, bias=False)
)```

# Dataset

In [18]:
dataset = load_dataset(CONFIG.dataset_id)

In [19]:
dataset

DatasetDict({
    train: Dataset({
        features: ['input', 'instruction', 'output'],
        num_rows: 175454
    })
})

In [20]:
print(dataset["train"][0]["instruction"])
print(dataset["train"][0]["input"])
print(dataset["train"][0]["output"])

보드 게임 스피너는 $A$, $B$, $C$로 표시된 세 부분으로 나뉩니다. 스피너가 $A$에 떨어질 확률은 $\frac{1}{3}$이고, 스피너가 $B$에 떨어질 확률은 $\frac{5}{12}$입니다.  스피너가 $C$에 착륙할 확률은 얼마입니까? 답을 공통 분수로 표현하세요.

모든 가능한 결과의 확률의 합이 1$이므로, 스피너가 $C$에 착륙할 확률을 구하려면 스피너가 $A$와 $B$에 착륙할 확률을 1$에서 빼야 합니다. 이를 방정식으로 쓸 수 있습니다: $P(C) = 1 - P(A) - P(B)$. P(A) = \frac{1}{3}$, $P(B) = \frac{5}{12}$라는 것을 알고 있으므로 이 값을 방정식에 대입하여 단순화할 수 있습니다. 결과는 다음과 같습니다: P(C) = 1 - \frac{1}{3} - frac{5}{12} = \frac{12}{12} - frac{4}{12} - frac{5}{12} = \frac{3}{12}$. 분자와 분모를 $3$로 나누면 이 분수를 줄일 수 있습니다: P(C) = \frac{1}{4}$입니다.


# Preprocessing

In [21]:
dataset = dataset.shuffle(seed=CONFIG.seed)

In [22]:
# Alpaca dataset format: 
# {"instruction": [str],
#   "input": [str],
#   "output": [str]}

# Korean
def prompt_without_input(example):
    text = (
        "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n"
        "다음은 작업을 설명하는 지시사항입니다. 요청을 적절하게 완료하는 응답을 작성하세요.<|eot_id|>"
        
        "<|start_header_id|>user<|end_header_id|>\n\n"
        f"{example['instruction']}<|eot_id|>"
        
        "<|start_header_id|>assistant<|end_header_id|>\n\n"
        f"{example['output']}<|eot_id|>"
        )
    return {'text': text}
    
def prompt_with_input(example):
    text = ( 
        "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n"
        "다음은 작업을 설명하는 지시사항과, 함께 쌍을 이루어 제공되는 입력입니다. 요청을 적절하게 완료하는 응답을 작성하세요.<|eot_id|>"
        
        "<|start_header_id|>user<|end_header_id|>\n\n"
        f"{example['instruction']}"
        f"{example['input']}<|eot_id|>"
        
        "<|start_header_id|>assistant<|end_header_id|>\n\n"
        f"{example['output']}<|eot_id|>"
        )
    return {'text': text}

def create_alpaca_prompt(example):
    # if input is not provided
    if example["input"] == "":
        return prompt_without_input(example)
    # if input is provided
    else:
        return prompt_with_input(example)
    
dataset = dataset.map(create_alpaca_prompt)
dataset = dataset.remove_columns(["instruction", "input", "output"])

In [23]:
print(dataset["train"][0]["text"])

<|begin_of_text|><|start_header_id|>system<|end_header_id|>

다음은 작업을 설명하는 지시사항입니다. 요청을 적절하게 완료하는 응답을 작성하세요.<|eot_id|><|start_header_id|>user<|end_header_id|>

정진영은 어떤 분야에서 활동을 했나요?<|eot_id|><|start_header_id|>assistant<|end_header_id|>

정진영은 1964년 11월 19일에 태어난 대한민국의 배우로, 1988년 뮤지컬 배우로 데뷔했고 1989년 연극 배우로 데뷔했다. 그는 30년 동안 깊이 있는 연기력으로 관객들의 사랑을 받았다. 그의 대표적인 작품으로는 '왕의 남자', '7번방의 선물', '국제시장' 등이 있다. 또한 연극, TV 프로그램, 영화 등 다양한 매체에서 활약했으며, 여러 상을 수상했다.<|eot_id|>


# Supervised Fine-Tuning (LoRA)

In [24]:
model = prepare_model_for_kbit_training(model)

In [25]:
lora_config = LoraConfig(
    task_type=CONFIG.task_type,
    target_modules=CONFIG.target_modules,
    r=CONFIG.r,
    lora_alpha=CONFIG.lora_alpha,
    lora_dropout=CONFIG.lora_dropout,
    bias=CONFIG.bias
)

In [26]:
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

trainable params: 20,971,520 || all params: 8,051,232,768 || trainable%: 0.2605


In [27]:
training_args = TrainingArguments(
    output_dir=CONFIG.output_dir,
    logging_dir=CONFIG.logging_dir,
    save_strategy=CONFIG.save_strategy,
    logging_strategy=CONFIG.logging_strategy,
    logging_steps=CONFIG.logging_steps,
    save_total_limit=CONFIG.save_total_limit,
    report_to=CONFIG.report_to,
    
    num_train_epochs=CONFIG.num_train_epochs,
    per_device_train_batch_size=CONFIG.per_device_train_batch_size,
    gradient_accumulation_steps=CONFIG.gradient_accumulation_steps,
    gradient_checkpointing=CONFIG.gradient_checkpointing,
    bf16=CONFIG.bf16,
    learning_rate=CONFIG.learning_rate,
    lr_scheduler_type=CONFIG.lr_scheduler_type,
    warmup_ratio=CONFIG.warmup_ratio,
    optim=CONFIG.optim,
    weight_decay=CONFIG.weight_decay
)

In [28]:
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    peft_config=lora_config,
    args=training_args,
    train_dataset=dataset["train"],
    dataset_text_field="text",
    max_seq_length=CONFIG.max_seq_length,
    packing=CONFIG.packing
)

Generating train split: 0 examples [00:00, ? examples/s]

In [29]:
trainer.train()

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
The input hidden states seems to be silently casted in float32, this might be related to the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in torch.bfloat16.


Step,Training Loss
10,2.3947
20,2.2446
30,2.2702
40,2.3241
50,2.2428
60,2.2621
70,2.2091
80,2.2812
90,2.3502
100,2.2655




TrainOutput(global_step=14393, training_loss=1.540078560273841, metrics={'train_runtime': 286180.9991, 'train_samples_per_second': 0.05, 'train_steps_per_second': 0.05, 'total_flos': 2.6620778223564227e+18, 'train_loss': 1.540078560273841, 'epoch': 1.0})

In [30]:
wandb.finish()
trainer.save_model(CONFIG.model_id)

VBox(children=(Label(value='0.003 MB of 0.003 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
train/epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
train/global_step,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
train/grad_norm,▅▁▄▅█▆▇▆▇▇▅▆▅▅▆▇▅▆▆▅▆▆▇▇▆▇▆▆▆█▇▆▆▆▇▇▆▇▆▆
train/learning_rate,▂▃▅▆██████▇▇▇▇▇▆▆▆▆▅▅▅▄▄▄▃▃▃▃▂▂▂▂▂▁▁▁▁▁▁
train/loss,█▅▄▃▃▂▂▂▃▃▄▁▂▂▃▂▂▂▃▂▁▁▂▂▂▂▁▂▁▂▃▂▂▃▂▂▂▃▂▁

0,1
total_flos,2.6620778223564227e+18
train/epoch,1.0
train/global_step,14393.0
train/grad_norm,1.56807
train/learning_rate,0.0
train/loss,1.4811
train_loss,1.54008
train_runtime,286180.9991
train_samples_per_second,0.05
train_steps_per_second,0.05


