In [None]:
pip install --upgrade pip
!pip install trl wandb
!pip install -U bitsandbytes
!pip install "unsloth[cu121-ampere-torch220] @ git+https://github.com/unslothai/unsloth.git"
!pip install datasets

In [2]:
import pandas as pd
import numpy as np
from tqdm import tqdm

import torch
import torch.nn.functional as F
from datasets import load_dataset, Dataset
import huggingface_hub
from transformers import (
    pipeline,
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    EarlyStoppingCallback,
    DataCollatorForSeq2Seq
)

from trl.core import LengthSampler
from trl import (
    PPOTrainer,
    PPOConfig,
    AutoModelForCausalLMWithValueHead,
    create_reference_model,
    DPOConfig,
    DPOTrainer,
)
from peft import (
    get_peft_model,
    AutoPeftModel,
    AutoPeftModelForCausalLM,
    PeftModel,
    LoraConfig,
    LoftQConfig,
    TaskType,
)
from trl import SFTTrainer, SFTConfig
# from unsloth import is_bfloat16_supported
# from unsloth import FastLanguageModel
# from unsloth.chat_templates import (
#     get_chat_template,
#     train_on_responses_only,
#     standardize_sharegpt,
# )

  warn(


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


### Model load

In [None]:
huggingface_hub.login()

In [3]:
model_id = "meta-llama/Llama-3.2-3B-Instruct"
model_id = 'qa_kor_v11'
model_id = 'Bllossom/llama-3.2-Korean-Bllossom-3B'
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)

==((====))==  Unsloth 2024.12.8: Fast Llama patching. Transformers: 4.46.3.
   \\   /|    GPU: NVIDIA A100 80GB PCIe. Max memory: 79.254 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.0+cu124. CUDA: 8.0. CUDA Toolkit: 12.4. Triton: 3.1.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post2. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/184 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/54.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/454 [00:00<?, ?B/s]

In [None]:
tokenizer.pad_token = tokenizer.eos_token
tokenizer.pad_token_id = tokenizer.eos_token_id

### LoRA

In [4]:
model

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 3072, padding_idx=128004)
    (layers): ModuleList(
      (0-27): 28 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear4bit(in_features=3072, out_features=3072, bias=False)
          (k_proj): Linear4bit(in_features=3072, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=3072, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=3072, out_features=3072, bias=False)
          (rotary_emb): LlamaExtendedRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=3072, out_features=8192, bias=False)
          (up_proj): Linear4bit(in_features=3072, out_features=8192, bias=False)
          (down_proj): Linear4bit(in_features=8192, out_features=3072, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((3072,), eps=1e-05)
        (post_attention_layernorm): Llam

In [5]:
lora_config = LoraConfig(
    r=4,  # Low-rank 매트릭스의 차원
    lora_alpha=4,  # LoRA에서 사용되는 스케일링 계수 -
    lora_dropout=0,  # Dropout 비율
    task_type=TaskType.CAUSAL_LM,  # 작업 유형: 시퀀스 분류
    target_modules=['q_proj', 'k_proj', 'v_proj', 'o_proj', 'gate_proj', 'up_proj', 'down_proj'],  # LoRA 적용할 모듈
)

# LoRA 모델 생성
#model = get_peft_model(model, lora_config)

Unsloth 2024.12.8 patched 28 layers with 28 QKV layers, 28 O layers and 28 MLP layers.


### Dataset load

In [None]:
dataset = load_dataset('HoJL/law_expc')
dataset

### Data preprocessing

In [8]:
def format_example(row):
    instruction = """너는 주어지는 내용만 보고 질문에 답을 하고 왜 이런 답을 했는지 추론도 해주는 역할이야. 
    반드시 한국어로 답변해줘.
    """
    
    inputs = f"""
## 내용:
{row['관계법령_정리']}

## 질문:
{row['질의요지']}
    """
    answer = f"""
## 추론:
{row['이유_요약']}

## 답:
{row['회답']}
    """
    
    messages = [
    {"role": "system", "content": f"{instruction}"},
    {"role": "user", "content": f"{inputs}"},
    {"role": "assistant", "content":f"{answer}"},
    ]
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False
    )
    return {"text": text}

In [None]:
dataset_train = dataset['train'].map(format_example, batch_size=True)
dataset_valid = dataset['validation'].map(format_example, batch_size=True)

In [None]:
dataset_train['text'][3]

### Set trainer

In [12]:
training_args = SFTConfig(
    per_device_train_batch_size=4,
    #gradient_accumulation_steps=4,
    per_device_eval_batch_size=4,
    #eval_accumulation_steps=4,
    num_train_epochs = 6,
    warmup_steps=10,
    eval_strategy="steps",
    do_eval=True,
    eval_steps=20,
    save_steps=500,
    learning_rate=2e-5,
    logging_steps=20,
    output_dir='./llama3.2_3b_qa_kor_v16',
    report_to='wandb',
    dataset_text_field='text',
    packing=False,
)

Map (num_proc=16):   0%|          | 0/60407 [00:00<?, ? examples/s]

In [None]:
trainer = SFTTrainer(
    model=model,
    peft_config=lora_config,
    tokenizer=tokenizer,
    train_dataset=dataset_train,
    eval_dataset=dataset_valid,
    args=training_args,
)

In [None]:
model

### Train

In [13]:
trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 60,407 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 7,551
 "-____-"     Number of trainable parameters = 12,156,928
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Step,Training Loss
300,2.6034
600,2.4214
900,2.3666
1200,2.312
1500,2.2526
1800,2.2021
2100,2.1541
2400,2.1053
2700,2.0738
3000,2.0392


TrainOutput(global_step=7551, training_loss=1.9925331587571053, metrics={'train_runtime': 7084.6241, 'train_samples_per_second': 8.526, 'train_steps_per_second': 1.066, 'total_flos': 4.658930097637171e+17, 'train_loss': 1.9925331587571053, 'epoch': 1.0})

### Save model

In [14]:
trainer.save_model('./llama_3.2_3b_qa_v1')

In [16]:
model.save_pretrained("./llama_3.2_3b_qa_v1")

### VLLM을 위한 float16변환

In [18]:
base_model = "./llama_3.2_3b_qa_v1_model" # 병합을 수행할 베이스 모델
save_method = (
    "merged_16bit"  # "merged_4bit", "merged_4bit_forced", "merged_16bit", "lora"
)


In [19]:
model.save_pretrained_merged(
    base_model,
    tokenizer,
    save_method=save_method,  # 저장 방식을 16비트 병합으로 설정
)

Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 646.71 out of 944.44 RAM for saving.
Unsloth: Saving model... This might take 5 minutes ...


100%|██████████| 28/28 [00:00<00:00, 105.21it/s]


Unsloth: Saving tokenizer... Done.
Done.


In [15]:
!zip -r llama_3.2_3b_qa_v1.zip ./llama_3.2_3b_qa_v1

/bin/bash: line 1: zip: command not found


### GGUF변환

In [None]:
!python llama.cpp/convert_hf_to_gguf.py qa --outtype bf16 --outfile qa_law.gguf

### Inference Test

In [29]:
instruction = """너는 주어지는 내용만을 보고 질문에 답을 하고 왜 이런 답을 했는지 추론도 해주는 역할이야. 
반드시 한국어로 답변해줘.
답변은 아래형식과 같이 추론과 답으로 이루어져 있고 마크다운으로 내줘.

### 추론:
### 답:
"""
inputs = f"""
### 내용:
{dataset['test'][2].get('관계법령_정리')}

### 질문:
{dataset['test'][2].get('질의요지')}
"""
messages = [
{"role": "system", "content": f"{instruction}"},
{"role": "user", "content": f"{inputs}"}
]
input = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=False
)

In [None]:
tokenizer.pad_token_id = tokenizer.eos_token_id 
inputs = tokenizer(
    input,
    return_tensors="pt",
).to("cuda")


In [None]:
from transformers import TextStreamer


model.generation_config.pad_token_id = tokenizer.pad_token_id
terminators = [
    tokenizer.convert_tokens_to_ids("<|end_of_text|>"),
    tokenizer.convert_tokens_to_ids("<|eot_id|>")
]
#skip_special_tokens=True,
text_streamer = TextStreamer(tokenizer,  skip_prompt=True)
_ = model.generate(
    **inputs,
    streamer=text_streamer,
    max_new_tokens=4096,  # 최대 생성 토큰 수를 설정합니다.
    eos_token_id=terminators,  # 생성을 멈출 기준을 설정합니다.
    temperature=0.1,
    repetition_penalty=1.1,
)