# Importing Libraries

In [1]:
import os
from dotenv import load_dotenv
import random
import numpy as np
import matplotlib.pyplot as plt
from dataclasses import dataclass

# PyTorch
import torch

# Huggingface
import huggingface_hub
from transformers import AutoTokenizer, AutoModelForCausalLM, TextStreamer
from datasets import load_dataset
from trl import SFTConfig, SFTTrainer, DataCollatorForCompletionOnlyLM

# Weights & Biases
import wandb

# Hyperparameters

In [2]:
@dataclass
class CONFIG:
    """Central hyperparameter namespace for the fine-tuning notebook.

    The notebook reads and writes these attributes directly on the class
    (e.g. ``CONFIG.device = ...``), so the defaults below are the effective
    settings. Derived values (``model_id``, ``model_name``, ``repo_id``,
    ``report_to``) are computed once, while the class body executes; they do
    NOT update if ``model_type``, ``model_size`` or ``debug`` are reassigned
    afterwards.
    """
    debug: bool = False

    # Model
    model_type: str = "base"  # "base", "instruct"
    model_size: str = "1B"  # "1B", "3B"
    # Table lookup instead of nested if/elif: an unsupported type/size pair now
    # fails right here with a KeyError instead of leaving `model_id` undefined.
    model_id: str = {
        ("base", "1B"): "meta-llama/Llama-3.2-1B",
        ("base", "3B"): "meta-llama/Llama-3.2-3B",
        ("instruct", "1B"): "meta-llama/Llama-3.2-1B-Instruct",
        ("instruct", "3B"): "meta-llama/Llama-3.2-3B-Instruct",
    }[(model_type, model_size)]

    # HuggingFace Hub
    username: str = "PathFinderKR"
    model_name: str = f"Llama-3.2-KO-{model_size}-Instruct"
    repo_id: str = f"{username}/{model_name}"

    # Data
    dataset_id: str = "MarkrAI/KOpen-HQ-Hermes-2.5-60K"
    validation_size: float = 0.1

    # Training
    output_dir: str = "./results"
    logging_dir: str = "./logs"
    save_strategy: str = "epoch"
    logging_strategy: str = "steps"
    logging_steps: int = 10
    evaluation_strategy: str = "epoch"
    save_total_limit: int = 1
    # Must be the string "none" to disable reporting: passing None makes
    # TrainingArguments fall back to reporting to every installed integration.
    report_to: str = "none" if debug else "wandb"

    num_train_epochs: int = 1
    per_device_train_batch_size: int = 2
    gradient_accumulation_steps: int = 4
    gradient_checkpointing: bool = True
    bf16: bool = True
    learning_rate: float = 2e-5
    lr_scheduler_type: str = "cosine"
    warmup_ratio: float = 0.1
    optim: str = "adamw_torch"
    weight_decay: float = 0.01
    # NOTE(review): 4086 looks like a typo for 4096 — confirm before changing.
    max_seq_length: int = 4086

    # Inference
    # NOTE(review): 128000 new tokens is effectively "run until a stop token";
    # confirm this is intended rather than a copy of the context/vocab size.
    max_new_tokens: int = 128000
    do_sample: bool = True
    temperature: float = 0.7
    top_p: float = 0.9
    repetition_penalty: float = 1.1

    # Device (both are filled in by later cells)
    device: torch.device = None
    attn_implementation: str = None
    torch_dtype: torch.dtype = torch.bfloat16

    # Seed
    seed: int = 42

# Reproducibility

In [3]:
def set_seed(seed):
    """Seed Python, NumPy and PyTorch RNGs and put cuDNN in deterministic mode.

    Args:
        seed: Integer seed applied to every random number generator.
    """
    random.seed(seed)
    # Only affects processes started after this point; the hash seed of the
    # running interpreter is fixed at start-up.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every visible GPU, not just the current one.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Benchmark mode auto-tunes kernels per run, which can pick different
    # algorithms between runs and defeats the deterministic flag above.
    torch.backends.cudnn.benchmark = False
    print(f"Seed: {seed}")
    
# Apply the configured seed before any later cell draws random numbers.
set_seed(CONFIG.seed)

Seed: 42


# Device

In [4]:
def configure_device():
    """Return the best available torch device, preferring CUDA, then MPS, then CPU.

    Prints a one-line summary of the selected backend (including the GPU count
    for CUDA).

    Returns:
        torch.device: ``cuda``, ``mps`` or ``cpu``.
    """
    if torch.cuda.is_available():
        selected = torch.device("cuda")
        gpu_total = torch.cuda.device_count()
        print("> Running on GPU", end=' | ')
        print("Num of GPUs: ", gpu_total)
        return selected

    if torch.backends.mps.is_available():
        selected = torch.device("mps")
        print("> Running on MPS")
        return selected

    selected = torch.device("cpu")
    print("> Running on CPU")
    return selected

# Record the detected device on CONFIG so later cells can read it.
CONFIG.device = configure_device()

> Running on GPU | Num of GPUs:  1


In [5]:
def configure_attn_implementation(device):
    """Choose the attention backend to request when loading the model.

    Args:
        device: A ``torch.device`` or a device string such as ``"cuda"``.

    Returns:
        ``"flash_attention_2"`` on CUDA GPUs with compute capability 8.x or
        newer when the ``flash_attn`` package is importable, ``"eager"`` on
        older CUDA GPUs, and ``None`` (use the library default) in every other
        case.
    """
    import importlib.util

    # Normalise to torch.device and compare on `.type`: comparing a
    # torch.device object directly against the string "cuda" does not match,
    # which made the CUDA branch unreachable for torch.device inputs.
    device = torch.device(device)
    if device.type != "cuda":
        return None

    major, _minor = torch.cuda.get_device_capability(device)
    if major < 8:  # pre-Ampere GPUs
        return "eager"

    # Ampere, Ada, or Hopper GPUs: FlashAttention-2 needs the optional
    # `flash_attn` package; without it fall back to the library default
    # rather than requesting a backend that is not installed.
    if importlib.util.find_spec("flash_attn") is None:
        return None
    return "flash_attention_2"

# Pick the attention backend for the detected device and record it on CONFIG.
CONFIG.attn_implementation= configure_attn_implementation(CONFIG.device)

# Debugging

In [6]:
# Debug runs cap training at a single epoch.
# NOTE(review): CONFIG.num_train_epochs already defaults to 1, so this
# override currently changes nothing.
if CONFIG.debug:
    CONFIG.num_train_epochs = 1

# HuggingFace

In [7]:
# Load variables from a local .env file into the environment, then
# authenticate with the Hugging Face Hub using the token found there.
load_dotenv()
huggingface_hub.login(
    token=os.getenv("HUGGINGFACE_TOKEN"),  # None if the variable is unset
    add_to_git_credential=True
)

Token is valid (permission: write).
Your token has been saved in your configured git credential helpers (store).
Your token has been saved to /home/pathfinder/.cache/huggingface/token
Login successful


# Weights & Biases

In [8]:
# Track the run in Weights & Biases only outside debug mode.
if not CONFIG.debug:
    wandb.login(
        key=os.getenv("WANDB_API_KEY")  # read from the environment; None if unset
    )
    # The W&B project is named after the model being produced.
    wandb.init(
        project=CONFIG.model_name,
    )

[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mpathfinderkr[0m. Use [1m`wandb login --relogin`[0m to force relogin


# Utility Functions

In [9]:
def generate_base_model(prompt):
    """Continue a raw text prompt with the notebook-level model and return the decoded text.

    Relies on the notebook-level ``tokenizer``, ``model`` and ``streamer``
    objects and on the sampling settings in ``CONFIG``. Tokens are also
    streamed through ``streamer`` while they are generated.

    Args:
        prompt: Plain text to continue.

    Returns:
        str: Prompt plus continuation, decoded with special tokens kept.
    """
    # Calling the tokenizer (instead of `.encode`) also yields the attention
    # mask, which `generate` cannot infer reliably when pad and EOS share an id.
    inputs = tokenizer(
        prompt,
        add_special_tokens=True,
        return_tensors="pt"
    ).to(CONFIG.device)

    output = model.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_new_tokens=CONFIG.max_new_tokens,
        do_sample=CONFIG.do_sample,
        temperature=CONFIG.temperature,
        top_p=CONFIG.top_p,
        repetition_penalty=CONFIG.repetition_penalty,
        pad_token_id=tokenizer.pad_token_id,
        streamer=streamer
    )

    return tokenizer.decode(output[0], skip_special_tokens=False)

In [10]:
# Llama-3-Instruct template
def prompt_template(system, user):
    """Render a single-turn Llama-3-Instruct prompt that ends at the open assistant header.

    Args:
        system: Text of the system message.
        user: Text of the user message.

    Returns:
        str: System turn, user turn, then the assistant header awaiting a reply.
    """
    system_turn = "<|start_header_id|>system<|end_header_id|>\n\n" + f"{system}<|eot_id|>"
    user_turn = "<|start_header_id|>user<|end_header_id|>\n\n" + f"{user}<|eot_id|>"
    assistant_header = "<|start_header_id|>assistant<|end_header_id|>\n\n"
    return "".join((system_turn, user_turn, assistant_header))

def generate_instruct_model(system, user):
    """Generate an assistant reply for one system/user exchange.

    Builds the prompt with ``prompt_template`` and relies on the
    notebook-level ``tokenizer``, ``model`` and ``streamer`` objects plus the
    sampling settings in ``CONFIG``. Tokens are also streamed through
    ``streamer`` while they are generated.

    Args:
        system: Text of the system message.
        user: Text of the user message.

    Returns:
        str: Prompt plus reply, decoded with special tokens kept.
    """
    prompt = prompt_template(system, user)

    # Calling the tokenizer (instead of `.encode`) also yields the attention
    # mask, which `generate` cannot infer reliably when pad and EOS share an id.
    inputs = tokenizer(
        prompt,
        add_special_tokens=True,
        return_tensors="pt"
    ).to(CONFIG.device)

    # The chat template closes every turn with <|eot_id|>, which is not
    # necessarily the tokenizer's EOS token. Stop on it as well as on any
    # EOS id already configured, otherwise generation can run on until
    # max_new_tokens is exhausted.
    stop_ids = {tokenizer.eos_token_id, tokenizer.convert_tokens_to_ids("<|eot_id|>")}
    configured_eos = model.generation_config.eos_token_id
    if configured_eos is not None:
        stop_ids.update([configured_eos] if isinstance(configured_eos, int) else configured_eos)
    stop_ids.discard(None)  # token missing from the vocabulary

    outputs = model.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_new_tokens=CONFIG.max_new_tokens,
        do_sample=CONFIG.do_sample,
        temperature=CONFIG.temperature,
        top_p=CONFIG.top_p,
        repetition_penalty=CONFIG.repetition_penalty,
        eos_token_id=sorted(stop_ids),
        pad_token_id=tokenizer.pad_token_id,
        streamer=streamer
    )

    return tokenizer.decode(outputs[0], skip_special_tokens=False)

In [11]:
# Alpaca dataset format: 
# {"input": [str],
#  "instruction": [str],
#   "output": [str]}

def prompt_without_input(example):
    """Format an example that has no ``input`` as a user turn plus assistant reply.

    Args:
        example: Mapping with ``instruction`` and ``output`` strings.

    Returns:
        dict: ``{'text': ...}`` holding the rendered training text.
    """
    text = (
        f"<|start_header_id|>user<|end_header_id|>\n\n"
        f"{example['instruction']}<|eot_id|>"

        f"<|start_header_id|>assistant<|end_header_id|>\n\n"
        # Close the assistant turn so the model is trained to emit the
        # end-of-turn token instead of never learning where a reply stops.
        f"{example['output']}<|eot_id|>"
    )
    return {'text': text}

def prompt_with_input(example):
    """Format an example whose ``input`` is rendered as the system message.

    Args:
        example: Mapping with ``input``, ``instruction`` and ``output`` strings.

    Returns:
        dict: ``{'text': ...}`` holding the rendered training text.
    """
    text = (
        f"<|start_header_id|>system<|end_header_id|>\n\n"
        f"{example['input']}<|eot_id|>"

        f"<|start_header_id|>user<|end_header_id|>\n\n"
        f"{example['instruction']}<|eot_id|>"

        f"<|start_header_id|>assistant<|end_header_id|>\n\n"
        # Close the assistant turn so the model is trained to emit the
        # end-of-turn token instead of never learning where a reply stops.
        f"{example['output']}<|eot_id|>"
    )
    return {'text': text}

def formatting_func(example):
    """Render one Alpaca-style example into Llama-3 chat text.

    Args:
        example: Mapping with ``input``, ``instruction`` and ``output`` keys.

    Returns:
        dict: ``{'text': ...}``; the system turn is included only when
        ``input`` is non-empty.
    """
    # Treat a missing (None) input like an empty one, so it is not rendered
    # as the literal system message "None".
    if not example["input"]:
        return prompt_without_input(example)
    return prompt_with_input(example)

In [12]:
def plot_token_length(fields):
    """Plot a histogram and print summary statistics of token lengths per field.

    Reads the notebook-level ``dataset["train"]`` split and ``tokenizer``.
    Empty values are ignored, and a field with no non-empty values is skipped
    with a message.

    Args:
        fields: Iterable of column names in ``dataset["train"]``.
    """
    for field in fields:
        # Read the column directly and tokenize it in one batched call; this
        # avoids building a dict per row and one tokenizer call per example.
        texts = [text for text in dataset["train"][field] if text]
        if not texts:
            # max()/min() below would raise on an empty list.
            print(f"No non-empty values for field '{field}'; skipping.")
            continue
        token_lengths = [len(ids) for ids in tokenizer(texts)["input_ids"]]

        plt.figure(figsize=(10, 5))
        plt.hist(token_lengths, bins=50, color='skyblue', edgecolor='black')
        plt.xlabel(f'{field.capitalize()} Length')
        plt.ylabel('Frequency')
        plt.title(f'{field.capitalize()} Token Length Distribution')
        plt.show()

        print(f"Max {field} token length: {max(token_lengths)}")
        print(f"Min {field} token length: {min(token_lengths)}")
        print(f"Mean {field} token length: {np.mean(token_lengths):.2f}")
        print(f"Standard deviation of {field} token length: {np.std(token_lengths):.2f}")

# Tokenizer

In [13]:
# Load the tokenizer that belongs to CONFIG.model_id; pad on the right.
tokenizer = AutoTokenizer.from_pretrained(
    CONFIG.model_id,
    padding_side="right"
)
# Reuse the EOS token id for padding.
# NOTE(review): with pad == eos, collators that mask padding in the labels may
# also mask genuine EOS tokens — confirm this is acceptable.
tokenizer.pad_token_id = tokenizer.eos_token_id
# Streamer passed to model.generate by the generate_* helpers.
streamer = TextStreamer(tokenizer)

In [14]:
# Quick sanity check of the tokenizer's vocabulary size and special tokens.
print(f"Vocabulary size: {tokenizer.vocab_size}")
print(f"Special tokens: {tokenizer.all_special_tokens}")

Vocabulary size: 128000
Special tokens: ['<|begin_of_text|>', '<|end_of_text|>']


# Model

In [15]:
# Load the causal LM onto the configured device with the configured
# attention backend and dtype.
model = AutoModelForCausalLM.from_pretrained(
    CONFIG.model_id,
    device_map=CONFIG.device,
    attn_implementation=CONFIG.attn_implementation,
    torch_dtype=CONFIG.torch_dtype,
    use_cache=False  # presumably off because training uses gradient checkpointing — TODO confirm
)

In [16]:
# Show the module tree and the parameter count in billions.
print(model)
print(f"Number of parameters: {model.num_parameters() / 1e9:.2f}B")

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 2048)
    (layers): ModuleList(
      (0-15): 16 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2048, out_features=512, bias=False)
          (v_proj): Linear(in_features=2048, out_features=512, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=2048, out_features=8192, bias=False)
          (up_proj): Linear(in_features=2048, out_features=8192, bias=False)
          (down_proj): Linear(in_features=8192, out_features=2048, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
      )
    )
    (norm):

In [17]:
# In debug mode, run one free-form continuation to check that generation works
# before any fine-tuning.
if CONFIG.debug:
    sample_text = "Machine learning:"
    sample_generated_text = generate_base_model(sample_text)
    print(sample_generated_text)

# Dataset

In [18]:
# Load the instruction dataset named in CONFIG.dataset_id.
dataset = load_dataset(CONFIG.dataset_id)

In [19]:
# Display the available splits, columns and row counts.
dataset

DatasetDict({
    train: Dataset({
        features: ['input', 'instruction', 'output'],
        num_rows: 60061
    })
})

In [20]:
# In debug mode, show the three raw columns of the first training example.
if CONFIG.debug:
    first_example = dataset['train'][0]
    for column in ("input", "instruction", "output"):
        print(f"{column}:\n{first_example[column]}")

input:
귀하는 항상 설명을 제공하는 도움이 되는 조수입니다. 5살짜리 아이에게 대답한다고 생각하세요.
instruction:
Review:
자비로운 속임수에 대한 씁쓸한 현대 코미디 , 영화 제작자의 시대 작품에 필적하지는 않지만 여전히 볼만한 가치가 있습니다.
이 영화 리뷰 문장이 부정적인가요, 긍정적인가요?
output:
이 영화 리뷰 문장은 대부분 긍정적입니다. 리뷰어는 이 영화가 달콤함과 재미가 잘 어우러져 있으며, 감독의 다른 영화만큼 훌륭하지는 않지만 여전히 볼 만한 가치가 있다고 평가합니다.


In [21]:
# In debug mode, inspect token-length distributions of the raw columns.
if CONFIG.debug:
    plot_token_length(["input", "instruction", "output"])

KeyboardInterrupt: 

# Preprocessing

In [None]:
# Shuffle with the configured seed so the row order is repeatable.
dataset = dataset.shuffle(seed=CONFIG.seed)

In [None]:
# Render every example into a single chat-formatted "text" column, then drop
# the raw columns that were folded into it.
dataset = dataset.map(formatting_func)
dataset = dataset.remove_columns(["instruction", "input", "output"])

In [None]:
# In debug mode, print the rendered text of the first two training rows.
if CONFIG.debug:
    for row_index in range(2):
        print(dataset["train"][row_index]["text"])

In [None]:
# In debug mode, inspect the token-length distribution of the rendered text.
if CONFIG.debug:
    plot_token_length(["text"])

In [None]:
# Hold out a validation split. The seed is passed explicitly, as in the
# shuffle cell, so the split does not depend on implicit global RNG state.
dataset = dataset["train"].train_test_split(
    test_size=CONFIG.validation_size,
    seed=CONFIG.seed
)

In [None]:
# Display the resulting train/test splits.
dataset

In [None]:
# Release cached, unused GPU memory before training starts.
torch.cuda.empty_cache()

# Supervised Fine-Tuning

In [None]:
# Trainer hyperparameters, all sourced from CONFIG.
training_args = SFTConfig(
    output_dir=CONFIG.output_dir,
    logging_dir=CONFIG.logging_dir,
    save_strategy=CONFIG.save_strategy,
    logging_strategy=CONFIG.logging_strategy,
    logging_steps=CONFIG.logging_steps,
    # NOTE(review): newer transformers releases rename this argument to
    # `eval_strategy` — confirm against the installed version.
    evaluation_strategy=CONFIG.evaluation_strategy,
    save_total_limit=CONFIG.save_total_limit,
    report_to=CONFIG.report_to,
    # The Trainer re-seeds its RNGs from this value; pass the notebook seed so
    # CONFIG.seed governs training instead of the library default.
    seed=CONFIG.seed,

    num_train_epochs=CONFIG.num_train_epochs,
    per_device_train_batch_size=CONFIG.per_device_train_batch_size,
    gradient_accumulation_steps=CONFIG.gradient_accumulation_steps,
    gradient_checkpointing=CONFIG.gradient_checkpointing,
    bf16=CONFIG.bf16,
    learning_rate=CONFIG.learning_rate,
    lr_scheduler_type=CONFIG.lr_scheduler_type,
    warmup_ratio=CONFIG.warmup_ratio,
    optim=CONFIG.optim,
    weight_decay=CONFIG.weight_decay,
    dataset_text_field="text",
    max_seq_length=CONFIG.max_seq_length
)

# The collator locates this assistant header in each sample; the string must
# match the header written by the prompt-formatting functions exactly.
response_template = "<|start_header_id|>assistant<|end_header_id|>\n\n"
collator = DataCollatorForCompletionOnlyLM(
    tokenizer=tokenizer,
    response_template=response_template
)

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    data_collator=collator
)

In [None]:
# Run supervised fine-tuning with the trainer configured above.
trainer.train()

In [None]:
# Outside debug mode, save the fine-tuned model and close the W&B run.
if not CONFIG.debug:
    trainer.save_model(CONFIG.model_name)  # CONFIG.model_name doubles as the save location
    wandb.finish()

# Inference

In [None]:
# Smoke-test the fine-tuned model with a Korean prompt
# (system: "You are a kind helper.", user: "What is machine learning?").
sample_system = "당신은 친절한 도우미입니다."
sample_user = "머신러닝이 무엇인가요?"
sample_generated_response = generate_instruct_model(sample_system, sample_user)

# Upload

In [None]:
# Outside debug mode, publish the tokenizer first and then the model to the
# same Hub repository.
if not CONFIG.debug:
    for artifact in (tokenizer, model):
        artifact.push_to_hub(
            repo_id=CONFIG.repo_id,
            use_temp_dir=False
        )