## Login to Hugging Face

In [1]:
from dotenv import load_dotenv
import os
from huggingface_hub import login

load_dotenv()
token = os.getenv("HUGGINGFACE_TOKEN")
login(
    token=token, # ADD YOUR TOKEN HERE
    add_to_git_credential=True
)

Token is valid (permission: write).
Your token has been saved in your configured git credential helpers (store).
Your token has been saved to /home/pathfinder/.cache/huggingface/token
Login successful


In [2]:
model_name = "Waktaverse-SOLAR-KO-10.7B-Instruct"     # ADD YOUR MODEL NAME HERE
username = "PathFinderKR"  # ADD YOUR USERNAME HERE
repo_id = f"{username}/{model_name}"  # repository id

## Downloads

In [3]:
#!pip install huggingface_hub
#!pip install transformers
#!pip install bitsandbytes
#!pip install peft
#!pip install trl
#!pip install accelerate
#!pip install datasets
#!pip install scikit-learn
#!pip install packaging
#!pip install ninja
#!pip install flash-attn --no-build-isolation

## Imports

In [4]:
# pytorch
import torch

# huggingface
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    TrainingArguments
)
from peft import LoraConfig, PeftModel
from trl import SFTTrainer

# datasets
from datasets import load_dataset

## Device

In [5]:
device = (
    "cuda:0" if torch.cuda.is_available() else # Nvidia GPU
    "mps" if torch.backends.mps.is_available() else # Apple Silicon GPU
    "cpu"
)
print(f"Device = {device}")

Device = cuda:0


## Hyperparameters

In [6]:
# seed
seed=42

# Tokenizer arguments
max_length=64
padding="max_length"
truncation=True

# model arguments
max_new_tokens=500

# validation split
validation_size=0.1

# mixed precision
dtype=torch.bfloat16

# quantization configuration
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=dtype,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_doulbe_quant=False
)

# LoRA configuration
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=16,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none"
)

# training arguments
training_args = TrainingArguments(
    output_dir="./results",
    logging_dir="./logs",
    save_strategy="epoch",
    logging_strategy="steps",
    logging_steps=10,
    evaluation_strategy="steps",
    save_total_limit=1,
    
    learning_rate=2e-5,
    num_train_epochs=2,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    optim="adamw_torch",
    weight_decay=0.1,
    lr_scheduler_type="cosine",
    seed=seed
)

# SFTTrainer arguments
max_seq_length=512

## Model

In [7]:
# Model List

# gemma variants
# "google/gemma-2b-it"
# "google/gemma-7b-it" // downloaded

# llama2 variants
# "meta-llama/Llama-2-7b-chat-hf"
# "meta-llama/Llama-2-13b-chat-hf"
# "codellama/CodeLlama-7b-Instruct-hf"
# "codellama/CodeLlama-13b-Instruct-hf"

# mistral variants
# "mistralai/Mistral-7B-Instruct-v0.2"
# "mistralai/Mixtral-8x7B-Instruct-v0.1"

# solar variants
# "upstage/SOLAR-10.7B-Instruct-v1.0" // downloaded
# "PathFinderKR/Waktaverse-SOLAR-KO-10.7B-Instruct"

In [8]:
model_id = "upstage/SOLAR-10.7B-Instruct-v1.0"

In [9]:
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

In [10]:
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map=device,
    attn_implementation="flash_attention_2",
    torch_dtype=dtype,
    quantization_config=quantization_config
)

Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

In [11]:
model

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 4096, padding_idx=2)
    (layers): ModuleList(
      (0-47): 48 x LlamaDecoderLayer(
        (self_attn): LlamaFlashAttention2(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear4bit(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    

## Dataset

In [12]:
dataset = load_dataset("MarkrAI/KoCommercial-Dataset")

In [13]:
dataset

DatasetDict({
    train: Dataset({
        features: ['input', 'instruction', 'output'],
        num_rows: 175454
    })
})

In [14]:
dataset["train"][0]

{'input': '',
 'instruction': '보드 게임 스피너는 $A$, $B$, $C$로 표시된 세 부분으로 나뉩니다. 스피너가 $A$에 떨어질 확률은 $\\frac{1}{3}$이고, 스피너가 $B$에 떨어질 확률은 $\\frac{5}{12}$입니다.  스피너가 $C$에 착륙할 확률은 얼마입니까? 답을 공통 분수로 표현하세요.',
 'output': '모든 가능한 결과의 확률의 합이 1$이므로, 스피너가 $C$에 착륙할 확률을 구하려면 스피너가 $A$와 $B$에 착륙할 확률을 1$에서 빼야 합니다. 이를 방정식으로 쓸 수 있습니다: $P(C) = 1 - P(A) - P(B)$. P(A) = \\frac{1}{3}$, $P(B) = \\frac{5}{12}$라는 것을 알고 있으므로 이 값을 방정식에 대입하여 단순화할 수 있습니다. 결과는 다음과 같습니다: P(C) = 1 - \\frac{1}{3} - frac{5}{12} = \\frac{12}{12} - frac{4}{12} - frac{5}{12} = \\frac{3}{12}$. 분자와 분모를 $3$로 나누면 이 분수를 줄일 수 있습니다: P(C) = \\frac{1}{4}$입니다.'}

In [15]:
def preprocess_function(examples):
    # Concatenate the 'instruction' and 'output' fields for each example in the batch
    concatenated_texts = [instruction + ' ' + output for instruction, output in zip(examples['instruction'], examples['output'])]
    # Tokenize the concatenated texts
    return tokenizer(concatenated_texts, padding=padding, truncation=truncation, max_length=max_length)

dataset = dataset.map(preprocess_function, batched=True)

In [16]:
# Split the dataset into a training and a validation dataset
dataset = dataset["train"].train_test_split(test_size=validation_size, seed=seed)

# Number of questions in the train, validation dataset
print(f"Number of questions in the train dataset: {len(dataset['train'])}")
print(f"Number of questions in the validation dataset: {len(dataset['test'])}")

Number of questions in the train dataset: 157908
Number of questions in the validation dataset: 17546


In [17]:
print(dataset["train"][0])

{'input': '', 'instruction': '경산경찰서는 경산시 일대를 관할하나요?', 'output': '경산경찰서는 경상북도 경산시 일대를 관할하며, 경산시 원효로 68(계양동503번지)에 위치해 있습니다. 경산경찰서는 1개의 지구대와 7개의 파출소를 운영하고 있으며, 각 파출소들은 치안센터, 자인파출소, 남산치안센터, 진량파출소, 하양파출소, 청천치안센터, 압량파출소, 와촌파출소로 구성되어 있습니다.', 'input_ids': [1, 28705, 29653, 30336, 29653, 239, 179, 179, 29305, 29175, 28705, 29653, 30336, 29236, 28705, 29415, 29634, 29200, 28705, 30224, 29723, 29136, 29695, 29517, 28804, 28705, 29653, 30336, 29653, 239, 179, 179, 29305, 29175, 28705, 29653, 29578, 238, 185, 132, 29599, 28705, 29653, 30336, 29236, 28705, 29415, 29634, 29200, 28705, 30224, 29723, 29136, 31718, 28725, 28705, 29653, 30336, 29236, 28705, 29816, 31481, 29143, 28705], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [18]:
print(dataset["test"][0])

{'input': '', 'instruction': '정진영은 어떤 분야에서 활동을 했나요?', 'output': "정진영은 1964년 11월 19일에 태어난 대한민국의 배우로, 1988년 뮤지컬 배우로 데뷔했고 1989년 연극 배우로 데뷔했다. 그는 30년 동안 깊이 있는 연기력으로 관객들의 사랑을 받았다. 그의 대표적인 작품으로는 '왕의 남자', '7번방의 선물', '국제시장' 등이 있다. 또한 연극, TV 프로그램, 영화 등 다양한 매체에서 활약했으며, 여러 상을 수상했다.", 'input_ids': [1, 28705, 29233, 30345, 30478, 29538, 28705, 29433, 238, 153, 167, 28705, 30217, 30263, 29148, 29305, 28705, 31273, 29714, 29189, 28705, 30676, 29695, 29517, 28804, 28705, 29233, 30345, 30478, 29538, 28705, 28740, 28774, 28784, 28781, 31479, 28705, 28740, 28740, 30839, 28705, 28740, 28774, 29415, 29148, 28705, 30533, 29433, 31876, 28705, 29634, 29282, 31463, 30779, 29187, 28705, 30403, 29687, 29143, 28725, 28705, 28740, 28774, 28783], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


## Inference before Fine-Tuning

In [19]:
# Chat Template
def generate_response(prompt):
    messages = [{"role": "user", "content": prompt }]
    chat = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    input_ids = tokenizer.encode(chat, add_special_tokens=False, return_tensors="pt")
    outputs = model.generate(input_ids=input_ids.to(model.device), max_new_tokens=max_new_tokens)
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return response

In [20]:
#prompt = "Write me a poem about Machine Learning."
prompt = "머신러닝에 대한 시를 써주세요."

In [21]:
response = generate_response(prompt)
print(response)

### User:
머신러닝에 대한 시를 써주세요.

### Assistant:
Machine Learning Sonnet

In code and data, we create a scene,
A world for algorithms to meet and greet,
A place where patterns, hidden, are seen,
And knowledge grows with every beat.

A wond'ring mind, a curious quest,
To learn from past, to see the future,
A dance of math and art, a blest,
Collaboration, a true pursuer.

A teacher, not of rote, but thought,
A guide to find, a path to tread,
A hand that points, a mind that's taught,
A tool to shape, a force to lead.

A symphony of bits and bytes,
A symbiosis of human and machine,
A harmony of minds, day and night,
A dance of logic, pure and clean.

So let us sing, this art of ours,
A tale of growth, a story grand,
A journey of the mind, it soars,
In this machine learning land.


## Supervised Fine-Tuning (LoRA)

In [22]:
def formatting_func(example):
    text = (f"instruction: {example['instruction'][0]}\n"
            f"output: {example['output'][0]}")
    return [text]

In [23]:
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    peft_config=lora_config,
    max_seq_length=max_seq_length,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    formatting_func=formatting_func
)

Map:   0%|          | 0/17546 [00:00<?, ? examples/s]

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [24]:
trainer.train()

The input hidden states seems to be silently casted in float32, this might be related to the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in torch.bfloat16.


Step,Training Loss,Validation Loss
10,1.8871,1.596649
20,1.519,1.426786
30,1.4591,1.338195
40,1.3289,1.269173
50,1.3068,1.240992
60,1.2714,1.22373
70,1.2667,1.214138
80,1.2027,1.20372
90,1.3313,1.194492
100,1.3138,1.187984


TrainOutput(global_step=316, training_loss=1.1843909372257282, metrics={'train_runtime': 3723.0792, 'train_samples_per_second': 0.085, 'train_steps_per_second': 0.085, 'total_flos': 8600474396196864.0, 'train_loss': 1.1843909372257282, 'epoch': 2.0})

In [25]:
trainer.save_model(model_name)

## Inference after Fine-Tuning

In [26]:
#prompt = "Write me a poem about Machine Learning."
prompt = "머신러닝에 대한 시를 써주세요."

In [27]:
response = generate_response(prompt)
print(response)

### User:
머신러닝에 대한 시를 써주세요.

### Assistant:
머신러닝의 마음속에서 
어떤 것들이 꿈을 꾸고 있나요?
숫자와 문자, 그리고 이미지들이 
어떤 상상을 나누고 있나요?

그들은 느낌과 감정을 배워가며 
자신들의 지성을 키워나갑니다.
그들은 느낌과 감정을 배워가며 
자신들의 지성을 키워나갑니다.

그들은 느낌과 감정을 배워가며 
자신들의 지성을 키워나갑니다.
그들은 느낌과 감정을 배워가며 
자신들의 지성을 키워나갑니다.

그들은 느낌과 감정을 배워가며 
자신들의 지성을 키워나갑니다.
그들은 느낌과 감정을 배워가며 
자신들의 지성을 키워나갑니다.

그들은 느낌과 감정을 배워가며 
자신들의 지성을 키워나갑니다.
그들은 느낌과 감정을 배워가며 
자신들의 지성을 키워나갑니다.

그들은 느낌과 감정을 배워가며 
자신들의 지성을 키워나갑니다.
그들은 느낌과 감정을 배워가며 
자신들의 지성을 키워나갑니다.

그들은 느낌과 감정을 배워가며 
자신들의


## Upload Model

In [28]:
# Reload model in FP16 and merge it with LoRA weights
base_model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map=device,
    torch_dtype=torch.float16,
    trust_remote_code=True
)
model = PeftModel.from_pretrained(base_model, model_name)
model = model.merge_and_unload()

Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

In [29]:
# Reload tokenizer to save it
tokenizer = AutoTokenizer.from_pretrained(
    model_id, 
    trust_remote_code=True
)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

In [30]:
# Push model and tokenizer to Hugging Face Hub
model.push_to_hub(
    repo_id=repo_id,
    use_temp_dir=False
)
tokenizer.push_to_hub(
    repo_id=repo_id,
    use_temp_dir=False
)

README.md:   0%|          | 0.00/6.82k [00:00<?, ?B/s]

model-00005-of-00005.safetensors:   0%|          | 0.00/1.69G [00:00<?, ?B/s]

model-00003-of-00005.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

Upload 5 LFS files:   0%|          | 0/5 [00:00<?, ?it/s]

model-00001-of-00005.safetensors:   0%|          | 0.00/4.94G [00:00<?, ?B/s]

model-00002-of-00005.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00004-of-00005.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/PathFinderKR/Waktaverse-SOLAR-KO-10.7B-Instruct/commit/0e79570b4559df9badbedef06ce6056a7a0fba36', commit_message='Upload tokenizer', commit_description='', oid='0e79570b4559df9badbedef06ce6056a7a0fba36', pr_url=None, pr_revision=None, pr_num=None)