## Importing Libraries

In [None]:
import os
from dotenv import load_dotenv
from IPython.display import display, Markdown

# pytorch
import torch

# huggingface
from huggingface_hub import login
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, TrainingArguments
from peft import prepare_model_for_kbit_training, LoraConfig, PeftModel, get_peft_model
from trl import SFTTrainer
from datasets import load_dataset

# wandb
import wandb

## Login to Hugging Face

In [None]:
# Load environment variables via python-dotenv, then log in to Hugging Face
# with the value stored under HUGGINGFACE_TOKEN.
load_dotenv()
token = os.environ.get("HUGGINGFACE_TOKEN")  # ADD YOUR TOKEN HERE
login(token=token, add_to_git_credential=True)

In [None]:
# The repository id has the form "<username>/<model_name>"
username = "PathFinderKR"  # ADD YOUR USERNAME HERE
model_name = "Waktaverse-Llama-3-KO-8B-Instruct"  # ADD YOUR MODEL NAME HERE
repo_id = "/".join((username, model_name))  # repository id

## Login to Weights & Biases

In [None]:
# Log in to Weights & Biases with the key stored under WANDB_API_KEY and
# start a run in a project named after the model.
api_key = os.environ.get("WANDB_API_KEY")  # ADD YOUR API KEY HERE
wandb.login(key=api_key)
wandb.init(project=model_name)

## Device

In [None]:
# Device setup: first CUDA GPU if present, otherwise MPS, otherwise the CPU
if torch.cuda.is_available():  # Nvidia GPU
    device = "cuda:0"
elif torch.backends.mps.is_available():  # Apple Silicon GPU
    device = "mps"
else:
    device = "cpu"
print(f"Device = {device}")

In [None]:
# Attention implementation and matching dtype.
# FlashAttention-2 with bfloat16 is selected only on CUDA devices whose major
# compute capability is at least 8; everything else uses eager attention.
if device != "cuda:0":
    attn_implementation, torch_dtype = "eager", torch.float32
elif torch.cuda.get_device_capability()[0] >= 8:  # Ampere, Ada, or Hopper GPUs
    attn_implementation, torch_dtype = "flash_attention_2", torch.bfloat16
else:
    attn_implementation, torch_dtype = "eager", torch.float16
print(f"Attention Implementation = {attn_implementation}")

## Hyperparameters

In [None]:
################################################################################
# Tokenizer parameters
################################################################################
max_length = 8192
padding = "do_not_pad"  # one of "max_length", "longest", "do_not_pad"
truncation = True

################################################################################
# Generation parameters
################################################################################
num_return_sequences = 1
max_new_tokens = 1024
do_sample = True  # True for sampling, False for greedy decoding
temperature = 0.6
top_p = 0.9
repetition_penalty = 1.1

################################################################################
# bitsandbytes parameters
################################################################################
load_in_4bit = True
bnb_4bit_quant_type = "nf4"  # "nf4" or "fp4"
bnb_4bit_use_double_quant = True
bnb_4bit_compute_dtype = torch_dtype  # same dtype the model is loaded with

################################################################################
# LoRA parameters
################################################################################
task_type = "CAUSAL_LM"
target_modules = [
    "q_proj",
    "k_proj",
    "v_proj",
    "o_proj",
    "gate_proj",
    "up_proj",
    "down_proj",
]
r = 8
lora_alpha = 16
lora_dropout = 0.1
bias = "none"

################################################################################
# Training parameters
################################################################################
# Output, checkpointing and logging
output_dir = "./results"
logging_dir = "./logs"
save_strategy = "epoch"
logging_strategy = "steps"  # "steps", "epoch"
# logging_steps only applies when logging by steps
logging_steps = 10 if logging_strategy == "steps" else None
save_total_limit = 1
report_to = "wandb"

# Optimization
num_train_epochs = 1
per_device_train_batch_size = 2
gradient_accumulation_steps = 1
gradient_checkpointing = True
# bf16 mixed precision is only valid on hardware with bfloat16 support, so
# follow the dtype chosen during device setup instead of hard-coding True.
bf16 = torch_dtype == torch.bfloat16
learning_rate = 2e-5
lr_scheduler_type = "cosine"  # "constant", "linear", "cosine"
warmup_ratio = 0.1
optim = "paged_adamw_8bit"
weight_decay = 0.1

################################################################################
# SFT parameters
################################################################################
packing = True
max_seq_length = 1024

## Tokenizer

In [None]:
# Identifier of the Korean tokenizer handed to AutoTokenizer.from_pretrained
tokenizer_id = "saltlux/Ko-Llama3-Luxia-8B"

In [None]:
# Load the tokenizer identified by tokenizer_id
tokenizer = AutoTokenizer.from_pretrained(tokenizer_id)

In [None]:
# Register "<|pad_id|>" as the tokenizer's padding token
tokenizer.add_special_tokens({"pad_token": "<|pad_id|>"})

In [None]:
# Report the vocabulary size, the special-token map and the padding side
print(
    f"Vocabulary size: {len(tokenizer)}",
    f"Special tokens: {tokenizer.special_tokens_map}",
    f"Padding side: {tokenizer.padding_side}",
    sep="\n",
)

## Model

In [None]:
# Identifier of the base model that gets fine-tuned
model_id = "meta-llama/Meta-Llama-3-8B-Instruct"

In [None]:
# Quantization settings, assembled from the bitsandbytes hyperparameters
quantization_config = BitsAndBytesConfig(
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_use_double_quant=bnb_4bit_use_double_quant,
    bnb_4bit_compute_dtype=bnb_4bit_compute_dtype,
    load_in_4bit=load_in_4bit,
)

In [None]:
# Load the quantized base model onto the selected device, using the attention
# implementation and dtype chosen during device setup
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=quantization_config,
    torch_dtype=torch_dtype,
    attn_implementation=attn_implementation,
    device_map=device,
    low_cpu_mem_usage=True,
)

In [None]:
# Resize the model's token embeddings to the tokenizer's length, which
# includes the padding token added earlier
model.resize_token_embeddings(len(tokenizer))

In [None]:
# Render the model's string representation between code fences
display(Markdown("```" + str(model) + "```"))

In [None]:
# Parameter count reported in billions, rounded to two decimals
print("Number of parameters (in billions): " + format(model.num_parameters() / 1e9, ".2f"))

## Dataset

In [None]:
# Identifier of the dataset passed to load_dataset
dataset_id = "MarkrAI/KoCommercial-Dataset"

In [None]:
# Load the dataset identified by dataset_id
dataset = load_dataset(dataset_id)

In [None]:
# Dataset information (the bare expression is shown as the cell output)
dataset

In [None]:
# Show the instruction, input and output of the first training example,
# one field per line
print(
    dataset["train"][0]["instruction"],
    dataset["train"][0]["input"],
    dataset["train"][0]["output"],
    sep="\n",
)

## Preprocessing

In [None]:
# Shuffle the dataset with a fixed seed so the order is reproducible
dataset = dataset.shuffle(seed=42)

In [None]:
# Alpaca dataset format: 
# {"instruction": [str],
#   "input": [str],
#   "output": [str]}

# Korean
def prompt_without_input(example):
    """Format an instruction-only example as system, user and assistant turns.

    Args:
        example: Mapping with "instruction" and "output" entries.

    Returns:
        A dict with a single "text" key. The text opens with
        "<|begin_of_text|>" and every turn is closed by "<|eot_id|>".
    """
    system_turn = (
        "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n"
        "다음은 작업을 설명하는 지시사항입니다. 요청을 적절하게 완료하는 응답을 작성하세요.<|eot_id|>"
    )
    user_turn = (
        "<|start_header_id|>user<|end_header_id|>\n\n"
        f"{example['instruction']}<|eot_id|>"
    )
    assistant_turn = (
        "<|start_header_id|>assistant<|end_header_id|>\n\n"
        f"{example['output']}<|eot_id|>"
    )
    return {"text": system_turn + user_turn + assistant_turn}
    
def prompt_with_input(example):
    """Format an instruction-plus-input example as system, user and assistant turns.

    The user turn holds the instruction, a blank line, and then the input, so
    the two fields stay distinguishable instead of being fused into one run
    of text.

    Args:
        example: Mapping with "instruction", "input" and "output" entries.

    Returns:
        A dict with a single "text" key. The text opens with
        "<|begin_of_text|>" and every turn is closed by "<|eot_id|>".
    """
    text = (
        "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n"
        "다음은 작업을 설명하는 지시사항과, 함께 쌍을 이루어 제공되는 입력입니다. 요청을 적절하게 완료하는 응답을 작성하세요.<|eot_id|>"

        "<|start_header_id|>user<|end_header_id|>\n\n"
        # Blank line between the instruction and its accompanying input
        f"{example['instruction']}\n\n"
        f"{example['input']}<|eot_id|>"

        "<|start_header_id|>assistant<|end_header_id|>\n\n"
        f"{example['output']}<|eot_id|>"
    )
    return {'text': text}

def create_alpaca_prompt(example):
    """Pick the prompt format that matches an Alpaca-style example.

    An example whose "input" is missing content (None, empty, or only
    whitespace) is formatted with prompt_without_input; otherwise
    prompt_with_input is used.

    Args:
        example: Mapping with "instruction", "input" and "output" entries.

    Returns:
        The dict produced by the selected formatter (a single "text" key).
    """
    extra_input = example["input"]
    # A None or blank input would otherwise be rendered into the user turn
    # as the literal text "None" or as stray whitespace.
    if extra_input is None or not str(extra_input).strip():
        return prompt_without_input(example)
    return prompt_with_input(example)
    
# Apply the alpaca prompt to the dataset; each example gains a "text" column
dataset = dataset.map(create_alpaca_prompt)

In [None]:
# Display the formatted "text" of the first training example
print(dataset["train"][0]["text"])

## Supervised Fine-Tuning (LoRA)

In [None]:
# Prepare the quantized model for k-bit training (PEFT helper)
model = prepare_model_for_kbit_training(model)

In [None]:
# LoRA adapter configuration built from the LoRA hyperparameters
lora_config = LoraConfig(
    task_type=task_type,
    target_modules=target_modules,
    r=r,
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    bias=bias,
    # The token embeddings were resized to the new tokenizer, so the rows for
    # its additional tokens start out untrained. Adapters on the projection
    # layers alone never update them, so train and save the embedding matrix
    # and the output head in full alongside the adapters.
    # NOTE(review): this adds two full vocabulary-sized matrices to the
    # trainable parameters - confirm the memory budget allows it.
    modules_to_save=["embed_tokens", "lm_head"],
)

In [None]:
# Wrap the model with the LoRA configuration, then print the number of
# trainable parameters
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

In [None]:
# Trainer settings, all taken from the training hyperparameters
training_args = TrainingArguments(
    # optimization
    optim=optim,
    learning_rate=learning_rate,
    lr_scheduler_type=lr_scheduler_type,
    warmup_ratio=warmup_ratio,
    weight_decay=weight_decay,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    gradient_checkpointing=gradient_checkpointing,
    bf16=bf16,
    # checkpointing
    output_dir=output_dir,
    save_strategy=save_strategy,
    save_total_limit=save_total_limit,
    # logging
    logging_dir=logging_dir,
    logging_strategy=logging_strategy,
    logging_steps=logging_steps,
    report_to=report_to,
)

In [None]:
# Supervised fine-tuning trainer over the formatted "text" column.
# NOTE(review): every "text" value already begins with <|begin_of_text|>;
# confirm the trainer's tokenization does not prepend a second BOS token.
# NOTE(review): the model has already been wrapped by get_peft_model, so
# peft_config is presumably redundant here - verify against the TRL version.
trainer = SFTTrainer(
    model=model,
    args=training_args,
    peft_config=lora_config,
    tokenizer=tokenizer,
    train_dataset=dataset["train"],
    dataset_text_field="text",
    packing=packing,
    max_seq_length=max_seq_length,
)

In [None]:
# Run the fine-tuning loop
trainer.train()

In [None]:
# Close the Weights & Biases run, then save the trained model into a local
# directory named after model_name
wandb.finish()
trainer.save_model(model_name)

## Inference

In [None]:
def prompt_template(system, user):
    """Build an inference prompt that ends with an open assistant header.

    Args:
        system: Text placed in the system turn.
        user: Text placed in the user turn.

    Returns:
        The prompt string: "<|begin_of_text|>", the system and user turns
        (each closed by "<|eot_id|>"), then the assistant header.
    """
    layout = (
        "<|begin_of_text|>"
        "<|start_header_id|>system<|end_header_id|>\n\n{system}<|eot_id|>"
        "<|start_header_id|>user<|end_header_id|>\n\n{user}<|eot_id|>"
        "<|start_header_id|>assistant<|end_header_id|>\n\n"
    )
    return layout.format(system=system, user=user)

In [None]:
def generate_response(system, user):
    """Generate a reply for a system/user prompt pair with the loaded model.

    Args:
        system: Text for the system turn.
        user: Text for the user turn.

    Returns:
        The first generated sequence decoded to text (prompt followed by the
        continuation), with special tokens kept.
    """
    prompt = prompt_template(system, user)

    # The template already starts with <|begin_of_text|>, so the tokenizer
    # must not add special tokens of its own. Calling the tokenizer (rather
    # than encode) also yields the attention mask that generate() expects.
    encoded = tokenizer(
        prompt,
        max_length=max_length,
        padding=padding,
        truncation=truncation,
        add_special_tokens=False,
        return_tensors="pt",
    ).to(device)

    # Every turn in the template ends with <|eot_id|>, so stop on it as well
    # as on the tokenizer's eos token; otherwise generation keeps running
    # past the end of the assistant turn until max_new_tokens is reached.
    stop_ids = [tokenizer.eos_token_id]
    eot_id = tokenizer.convert_tokens_to_ids("<|eot_id|>")
    if eot_id is not None and eot_id != tokenizer.unk_token_id and eot_id not in stop_ids:
        stop_ids.append(eot_id)

    # The model is left in training mode after fine-tuning; switch to
    # evaluation mode so dropout is disabled while generating.
    model.eval()

    outputs = model.generate(
        input_ids=encoded["input_ids"],
        attention_mask=encoded["attention_mask"],
        eos_token_id=stop_ids,
        pad_token_id=tokenizer.eos_token_id,
        num_return_sequences=num_return_sequences,
        max_new_tokens=max_new_tokens,
        do_sample=do_sample,
        temperature=temperature,
        top_p=top_p,
        repetition_penalty=repetition_penalty,
    )

    return tokenizer.decode(outputs[0], skip_special_tokens=False)

In [None]:
# Korean prompts used for the sample generation; the English alternatives
# are kept commented out.
# system_prompt = "You are a helpful assistant. Respond to the following user prompt."
# user_prompt = "Write me a poem about Machine Learning."
system_prompt = "다음 지시사항에 대한 응답을 작성해주세요."
user_prompt = "머신러닝에 대한 시를 써주세요."

In [None]:
# Generate a reply for the sample prompts and print the decoded text
response = generate_response(system_prompt, user_prompt)
print(response)