### 필수 패키지 설치

In [1]:
%pip install torch transformers datasets peft bitsandbytes trl huggingface_hub flash-attn

Collecting transformers
  Downloading transformers-4.57.6-py3-none-any.whl.metadata (43 kB)
Collecting datasets
  Downloading datasets-4.5.0-py3-none-any.whl.metadata (19 kB)
Collecting peft
  Downloading peft-0.18.1-py3-none-any.whl.metadata (14 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.49.1-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Collecting trl
  Downloading trl-0.27.0-py3-none-any.whl.metadata (11 kB)
Collecting huggingface_hub
  Downloading huggingface_hub-1.3.2-py3-none-any.whl.metadata (13 kB)
  Downloading huggingface_hub-0.36.0-py3-none-any.whl.metadata (14 kB)
Collecting regex!=2019.12.17 (from transformers)
  Downloading regex-2026.1.15-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl.metadata (40 kB)
Collecting tokenizers<=0.23.0,>=0.22.0 (from transformers)
  Downloading tokenizers-0.22.2-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.3 kB)
Collecting safetensors>=0.4.3 (from transformers)
  Do

In [None]:
import os
import torch
from datasets import load_dataset

from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig
)

from peft import LoraConfig, PeftModel, get_peft_model, prepare_model_for_kbit_training
from trl import SFTTrainer, SFTConfig

from huggingface_hub import login
from dotenv import load_dotenv

### 1. 설정

In [None]:
# --- Model, dataset and output locations ---
MODEL_ID = "kakaocorp/kanana-nano-2.1b-instruct"
DATASET_ID = "BCCard/BCAI-Finance-Kor"
OUTPUT_DIR = "./kanana-finance-adapter-a100"

# --- Training hyperparameters ---
MAX_SAMPLES = 20_000
NUM_EPOCHS = 1
LEARNING_RATE = 2e-4
# The A100 has plenty of memory, so the per-device batch could be raised further.
BATCH_SIZE = 8
# Accumulation steps are reduced to balance the larger per-device batch.
GRAD_ACC_STEPS = 2

### 2. 모델/토크나이저 로드

In [None]:
print(f">>> 모델 '{MODEL_ID}' 로드")

# 4-bit NF4 quantization with bfloat16 as the compute dtype.
bnb_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type="nf4",
    load_in_4bit=True,
)

model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    # Flash Attention is requested for faster training.
    attn_implementation="flash_attention_2",
    # Place the whole model on the currently selected CUDA device.
    device_map={"": torch.cuda.current_device()},
    quantization_config=bnb_config,
    # Load in bfloat16 as well, matching the compute dtype above.
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
)

model.config.pretraining_tp = 1
model.config.use_cache = False

# Gradient checkpointing is left off: the original author judged it unnecessary
# for a ~2B model on an A100 and preferred the faster training.
model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=False)

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
tokenizer.padding_side = "right"
# Fall back to the EOS token for padding only when no pad token is defined.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

### 3. LoRA 설정

In [None]:
print(">>> LoRA 어댑터 장착")

# LoRA over the q/k/v/o projection modules: rank 16, alpha 32, dropout 0.05,
# no bias parameters trained.
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
)

# Wrap the model loaded earlier with the adapter configuration.
model = get_peft_model(model, peft_config)

### 4. 데이터셋 로드 및 포맷팅

In [None]:
print(">>> 데이터셋 로드 및 포맷팅")

# One seed drives both the shuffle and the train/test split, so the held-out
# set is the same on every Restart & Run All.
SPLIT_SEED = 42

full_dataset = load_dataset(DATASET_ID, split="train")

# Clamp the sample size so select() cannot index past the end when the dataset
# has fewer than MAX_SAMPLES rows.
sample_count = min(MAX_SAMPLES, len(full_dataset))
sampled_dataset = full_dataset.shuffle(seed=SPLIT_SEED).select(range(sample_count))

# 90% train / 10% test.
dataset_dict = sampled_dataset.train_test_split(test_size=0.1, seed=SPLIT_SEED)

def apply_chat_template(examples, tok=None):
    """Render a batch of instruction/output pairs into single training strings.

    Each pair becomes a three-message conversation (fixed system prompt, user
    instruction, assistant output) rendered through the tokenizer's chat
    template. If rendering raises, a plain "User/Assistant" string is used
    instead.

    Args:
        examples: Batch mapping with parallel ``'instruction'`` and ``'output'``
            sequences.
        tok: Object exposing ``apply_chat_template``. When omitted, the
            notebook-level ``tokenizer`` is used.

    Returns:
        ``{"text": [...]}`` with one rendered string per input pair.
    """
    if tok is None:
        tok = tokenizer

    output_texts = []
    for instruction, output in zip(examples['instruction'], examples['output']):
        messages = [
            {"role": "system", "content": "당신은 금융 전문 AI 상담사입니다."},
            {"role": "user", "content": instruction},
            {"role": "assistant", "content": output}
        ]
        try:
            text = tok.apply_chat_template(messages, tokenize=False, add_generation_prompt=False)
        except Exception:
            # Best-effort fallback when the template cannot be rendered. Only
            # Exception is caught, so KeyboardInterrupt and SystemExit still
            # stop the run instead of being silently swallowed.
            text = f"User: {instruction}\nAssistant: {output}<|end_of_text|>"
        output_texts.append(text)
    return {"text": output_texts}

# Replace every original column of both splits with the single rendered "text" column.
for split_name in ('train', 'test'):
    split = dataset_dict[split_name]
    dataset_dict[split_name] = split.map(
        apply_chat_template,
        batched=True,
        remove_columns=split.column_names,
    )

### 5. 학습 진행 후 저장

In [None]:
# Per-example token cap. It is passed to the constructor as `max_length`, the
# field name used by the trl 0.27.0 release recorded in this notebook's install
# log. Assigning `training_args.max_seq_length` after construction only creates
# an unused attribute, so the cap would silently stay at the library default.
MAX_SEQ_LENGTH = 1024

training_args = SFTConfig(
    output_dir=OUTPUT_DIR,
    num_train_epochs=NUM_EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=GRAD_ACC_STEPS,
    learning_rate=LEARNING_RATE,
    weight_decay=0.01,

    # Train in bf16 with fp16 turned off.
    fp16=False,
    bf16=True,

    logging_steps=1,
    logging_first_step=True,
    eval_strategy="steps",
    eval_steps=100,
    save_strategy="steps",
    save_steps=100,
    save_total_limit=2,
    optim="paged_adamw_32bit",
    group_by_length=True,
    packing=False,
    dataset_text_field="text",
    max_length=MAX_SEQ_LENGTH,
    gradient_checkpointing=False,
)

trainer = SFTTrainer(
    args=training_args,
    model=model,
    processing_class=tokenizer,
    train_dataset=dataset_dict['train'],
    eval_dataset=dataset_dict['test'],
)

print(">>> 학습 시작")
trainer.train()

# Write the trained model and then the tokenizer into the same output directory.
print(f">>> 모델 저장: {OUTPUT_DIR}")
for artifact in (trainer.model, tokenizer):
    artifact.save_pretrained(OUTPUT_DIR)
print(">>> 모델 저장 완료")

>>> A100 모드: 모델 'kakaocorp/kanana-nano-2.1b-instruct' 로드 중...
>>> LoRA 어댑터 장착...
>>> 데이터셋 로드 및 포맷팅...


Resolving data files:   0%|          | 0/19 [00:00<?, ?it/s]

Tokenizing eval dataset:   0%|          | 0/2000 [00:00<?, ? examples/s]

Truncating eval dataset:   0%|          | 0/2000 [00:00<?, ? examples/s]

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': 128009, 'pad_token_id': 128001}.


>>> A100 학습 시작...


Casting fp32 inputs back to torch.bfloat16 for flash-attn compatibility.


Step,Training Loss,Validation Loss
100,0.9838,0.849949
200,1.1445,0.79202
300,0.9891,0.760536
400,0.9379,0.734244
500,0.9922,0.720133


>>> 모델 저장 중: ./kanana-finance-adapter-a100
>>> 완료!


### HuggingFace 업로드

In [None]:
# Paths and target repository
BASE_MODEL_ID = "kakaocorp/kanana-nano-2.1b-instruct"
ADAPTER_DIR = "./kanana-finance-adapter-a100"
NEW_REPO_ID = "YOUR_ID/kanana-finance-adapter-a100"

# Authenticate with the Hub using HF_TOKEN from the environment (.env is loaded first)
load_dotenv()
HF_TOKEN = os.getenv("HF_TOKEN")
login(token=HF_TOKEN)

# Load the unquantized base model in bfloat16, plus its tokenizer
print(">>> 베이스 모델 로드")
base_model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL_ID,
    device_map="auto",
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
    return_dict=True,
)
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_ID, trust_remote_code=True)

# Attach the saved LoRA adapter and merge it into the base model
print(f">>> LoRA 어댑터 병합: {ADAPTER_DIR}")
model = PeftModel.from_pretrained(base_model, ADAPTER_DIR).merge_and_unload()
print(">>> 모델 타입:", model.dtype)

# Push the merged model and the tokenizer to the same Hub repository
print(f">>> {NEW_REPO_ID} 리포지토리로 업로드")
model.push_to_hub(NEW_REPO_ID, use_temp_dir=False, safe_serialization=True)
tokenizer.push_to_hub(NEW_REPO_ID, use_temp_dir=False)

print(">>> 업로드 완료")