## Login to Hugging Face

In [None]:
from dotenv import load_dotenv
import os
from huggingface_hub import login

# Authenticate this session with the Hugging Face Hub.
# load_dotenv() loads variables from a local .env file into the environment,
# so HUGGINGFACE_TOKEN can be kept out of the notebook itself.
load_dotenv()
token = os.getenv("HUGGINGFACE_TOKEN")
login(
    token=token, # os.getenv returns None when HUGGINGFACE_TOKEN is not set
    add_to_git_credential=True
)

In [None]:
# Target Hub repository, built as "<username>/<model_name>".
# `model_name` is also the local directory the trained adapter is saved to
# and later reloaded from.
model_name = "Waktaverse-Llama-3-KO-8B-Instruct"  # ADD YOUR MODEL NAME HERE
username = "PathFinderKR"  # ADD YOUR USERNAME HERE
repo_id = f"{username}/{model_name}"  # repository id

## Downloads

In [None]:
#!pip install huggingface_hub
#!pip install transformers
#!pip install bitsandbytes
#!pip install peft
#!pip install trl
#!pip install accelerate
#!pip install datasets
#!pip install scikit-learn
#!pip install packaging
#!pip install ninja
#!pip install flash-attn --no-build-isolation

## Imports

In [None]:
from IPython.display import display, Markdown

# pytorch
import torch

# huggingface
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    TrainingArguments
)
from peft import LoraConfig, PeftModel
from trl import SFTTrainer, DataCollatorForCompletionOnlyLM

# datasets
from datasets import load_dataset

## Device

In [None]:
# Pick the first available backend, in order of preference:
# Nvidia GPU (CUDA), then Apple Silicon GPU (MPS), then plain CPU.
if torch.cuda.is_available():
    device = "cuda:0"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"
print(f"Device = {device}")

## Hyperparameters

In [None]:
# Random seed, shared by the train/validation split and the trainer.
seed = 42

# Tokenizer arguments (applied when encoding the inference prompt).
max_length = 256
padding = "do_not_pad"
truncation = True

# Generation arguments (forwarded to model.generate).
num_return_sequences = 1
early_stopping = True
num_beams = 3
min_new_tokens = 1
max_new_tokens = 512
do_sample = True
temperature = 0.6
top_k = 40
top_p = 0.9
repetition_penalty = 1.1

# Fraction of the data held out for validation.
validation_size = 0.1

# Tensor dtype used for model loading and 4-bit compute.
dtype = torch.bfloat16

# quantization configuration
# Weights are loaded in 4-bit using the "nf4" quantization type;
# computation on the quantized weights runs in `dtype`.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=dtype,
    bnb_4bit_quant_type="nf4"
)

# LoRA configuration
# Rank-8 adapters (alpha 16, dropout 0.05) are attached to the attention
# projections (q/k/v/o) and the MLP projections (gate/up/down); bias terms
# are not trained.
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=8,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none"
)

# training arguments
# Checkpoints go to ./results once per epoch (at most one is kept);
# metrics are logged to ./logs every 1000 steps.
training_args = TrainingArguments(
    output_dir="./results",
    logging_dir="./logs",
    save_strategy="epoch",
    logging_strategy="steps",
    logging_steps=1000,
    evaluation_strategy="steps",  # NOTE(review): no eval_steps is given — presumably falls back to logging_steps; confirm
    save_total_limit=1,
    # optimization
    learning_rate=2e-5,
    num_train_epochs=2,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    optim="adamw_torch",
    weight_decay=0.1,
    lr_scheduler_type="cosine",
    seed=seed
)

# SFTTrainer arguments
max_seq_length=512  # passed to SFTTrainer as max_seq_length

## Model

In [None]:
# Model List

# gemma variants
# "google/gemma-1.1-7b-it"
# "google/codegemma-7b-it"

# llama variants
# "meta-llama/Meta-Llama-3-8B-Instruct" // downloaded
# "codellama/CodeLlama-7b-Instruct-hf"
# "PathFinderKR/Waktaverse-Llama-3-KO-8B-Instruct"

# mistral variants
# "mistralai/Mistral-7B-Instruct-v0.2"

# solar variants
# "upstage/SOLAR-10.7B-Instruct-v1.0" // downloaded
# "PathFinderKR/Waktaverse-SOLAR-KO-10.7B-Instruct"

In [None]:
# Base checkpoint to fine-tune; any entry from the model list above can be used here.
model_id = "meta-llama/Meta-Llama-3-8B-Instruct"

In [None]:
# Load the tokenizer for `model_id` and register a dedicated '<|pad|>' token
# as its padding token; padding is applied on the right.
tokenizer = AutoTokenizer.from_pretrained(model_id)
special_tokens_dict = {'pad_token': '<|pad|>'}
num_added_tokens = tokenizer.add_special_tokens(special_tokens_dict)  # count of tokens newly added to the vocabulary
# NOTE(review): no resize_token_embeddings call is visible in this notebook —
# confirm the new token's id fits the model's embedding table before any
# batch is actually padded.
tokenizer.pad_token = '<|pad|>'
tokenizer.padding_side = "right"

In [None]:
# Load the base model onto `device`, quantized per `quantization_config`,
# with `dtype` as the torch dtype and FlashAttention-2 as the attention backend.
# NOTE(review): flash_attention_2 and 4-bit bitsandbytes presumably need a
# CUDA device — confirm before running with device "mps" or "cpu".
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map=device,
    attn_implementation="flash_attention_2",
    torch_dtype=dtype,
    quantization_config=quantization_config
)

In [None]:
# display the model architecture
# The fence markers sit on their own lines: without the newlines the first
# line of the repr is parsed as the fence's info string and the closing
# backticks are not recognised as a closing fence.
display(Markdown(f'```\n{model}\n```'))

## Dataset

In [None]:
# Load the "MarkrAI/KoCommercial-Dataset" instruction dataset.
dataset = load_dataset("MarkrAI/KoCommercial-Dataset")

In [None]:
# dataset information (the bare expression makes the notebook display the object)
dataset

In [None]:
# dataset example: first record of the "train" split
dataset["train"][0]

## Preprocessing

In [None]:
# Alpaca dataset format
def preprocess_function(examples):
    """Return the Alpaca-style columns of ``examples`` unchanged.

    Args:
        examples: Mapping that holds "instruction", "input" and "output"
            entries.

    Returns:
        dict: Exactly those three keys, each mapped to the value found in
        ``examples``.
    """
    alpaca_columns = ("instruction", "input", "output")
    return {column: examples[column] for column in alpaca_columns}
    
# Apply preprocess_function to the dataset in batches.
dataset = dataset.map(preprocess_function, batched=True)

In [None]:
# Split the dataset into a training and a validation dataset
# `validation_size` is the held-out fraction; the fixed seed keeps the split
# reproducible. The validation part is stored under the "test" key.
dataset = dataset["train"].train_test_split(test_size=validation_size, seed=seed)

# Number of questions in the train, validation dataset
print(f"Number of questions in the train dataset: {len(dataset['train'])}")
print(f"Number of questions in the validation dataset: {len(dataset['test'])}")

In [None]:
# First training record, printed one field per line.
for field in ("instruction", "input", "output"):
    print(dataset["train"][0][field])

In [None]:
# First validation record, printed one field per line.
for field in ("instruction", "input", "output"):
    print(dataset["test"][0][field])

## Inference before Fine-Tuning

In [None]:
def generate_response(system, user):
    """Generate a reply to a system + user message pair.

    Uses the notebook-level `tokenizer`, `model`, `device` and the
    tokenizer/generation hyperparameters defined earlier.

    Args:
        system: Content of the system message.
        user: Content of the user message.

    Returns:
        str: The first returned sequence decoded with special tokens kept;
        it contains the prompt followed by the generated text.
    """
    messages = [
        {"role": "system", "content": system},
        {"role": "user", "content": user}
    ]
    # add_generation_prompt=True appends the assistant header, so the model
    # writes the assistant's answer instead of continuing after the user turn.
    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

    # The rendered template already carries its special tokens (Llama 3's
    # template emits the BOS itself), so the tokenizer must not add them again.
    encoded = tokenizer(
        prompt,
        max_length=max_length,
        padding=padding,
        truncation=truncation,
        add_special_tokens=False,
        return_tensors="pt"
    ).to(device)
    outputs = model.generate(
        input_ids=encoded["input_ids"],
        # Passed explicitly so generate() does not have to guess the mask
        # from the pad token id.
        attention_mask=encoded["attention_mask"],
        eos_token_id=tokenizer.convert_tokens_to_ids("<|eot_id|>"),
        pad_token_id=tokenizer.eos_token_id,
        num_return_sequences=num_return_sequences,
        early_stopping=early_stopping,
        num_beams=num_beams,
        min_new_tokens=min_new_tokens,
        max_new_tokens=max_new_tokens,
        do_sample=do_sample,
        temperature=temperature,
        top_k=top_k,
        top_p=top_p,
        repetition_penalty=repetition_penalty
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=False)

In [None]:
# System message used for the pre-fine-tuning baseline generation.
system_prompt = "You are a poet. Write a poem about the following topic. Use Korean Only."

In [None]:
# User request in Korean; the commented-out line is an English alternative.
#user_prompt = "Write me a poem about Machine Learning."
user_prompt = "머신러닝에 대한 시를 써주세요."

In [None]:
# Baseline: generate and print a reply before any fine-tuning has run.
response = generate_response(system_prompt, user_prompt)
print(response)

## Supervised Fine-Tuning (LoRA)

In [None]:
def formatting_func(example):
    """Render a batch of Alpaca-style records as training prompts.

    Args:
        example: Batch mapping with parallel "instruction", "input" and
            "output" sequences.

    Returns:
        list[str]: One prompt per record, made of an "### Instruction:",
        an "### Input:" and an "### Output:" line, each ending in a newline.
    """
    # zip walks the three columns in lockstep instead of indexing by position.
    records = zip(example['instruction'], example['input'], example['output'])
    return [
        f"### Instruction: {instruction}\n"
        f"### Input: {input_text}\n"
        f"### Output: {output_text}\n"
        for instruction, input_text, output_text in records
    ]

In [None]:
# Marker that separates prompt from completion; it must match the
# "### Output:" prefix written by formatting_func.
response_template = "### Output:"
# NOTE(review): the completion-only collator presumably limits the loss to
# the tokens after `response_template` — confirm against the TRL docs.
data_collator = DataCollatorForCompletionOnlyLM(response_template=response_template, tokenizer=tokenizer)

In [None]:
# Build the supervised fine-tuning trainer: `model` is trained with the LoRA
# settings in `lora_config`, examples are rendered by `formatting_func`,
# batched by `data_collator`, and evaluated on the "test" (validation) split.
trainer = SFTTrainer(
    model=model,
    args=training_args,
    peft_config=lora_config,
    max_seq_length=max_seq_length,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    formatting_func=formatting_func,
    data_collator=data_collator
)

In [None]:
# Run fine-tuning with the settings in `training_args`.
trainer.train()

In [None]:
# Save the trained model to a local directory named after `model_name`.
trainer.save_model(model_name)

## Inference after Fine-Tuning

In [None]:
# Same system message as the baseline run, so the two outputs are comparable.
system_prompt = "You are a poet. Write a poem about the following topic. Use Korean Only."

In [None]:
# Same user request as the baseline run; the commented-out line is an English alternative.
#user_prompt = "Write me a poem about Machine Learning."
user_prompt = "머신러닝에 대한 시를 써주세요."

In [None]:
# Generate and print a reply again, now that training has run.
response = generate_response(system_prompt, user_prompt)
print(response)

## Upload Model

In [None]:
# Reload model in FP16 and merge it with LoRA weights
# A fresh, non-quantized copy of the base checkpoint is loaded, the adapter
# saved under `model_name` is attached to it, and merge_and_unload() folds
# the adapter into the base weights.
base_model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map=device,
    torch_dtype=torch.float16,
    trust_remote_code=True
)
model = PeftModel.from_pretrained(base_model, model_name)
model = model.merge_and_unload()

In [None]:
# Reload tokenizer to save it
# This is a fresh copy of the base tokenizer: it reuses the EOS token for
# padding and does not carry the '<|pad|>' token registered for training.
tokenizer = AutoTokenizer.from_pretrained(
    model_id, 
    trust_remote_code=True
)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

In [None]:
# Push model and tokenizer to Hugging Face Hub
# Both uploads go to the same repository, `repo_id`.
model.push_to_hub(
    repo_id=repo_id,
    use_temp_dir=False
)
tokenizer.push_to_hub(
    repo_id=repo_id,
    use_temp_dir=False
)