##### Reference
https://blog.naver.com/PostView.naver?blogId=se2n&logNo=223443729640&parentCategoryNo=&categoryNo=74&viewDate=&isShowPopularPosts=false&from=postView

In [None]:
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, TrainingArguments
from transformers import EarlyStoppingCallback
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from trl import SFTTrainer
import gc

In [None]:
# Read the training and validation CSV files with the generic "csv" loader.
# A single file passed via `data_files` lands in the "train" split, so that
# split is requested directly for both datasets.
train_data = load_dataset("csv", data_files="./data/train.csv", split="train")
eval_data = load_dataset("csv", data_files="./data/dev.csv", split="train")

In [None]:
def prompts(example):
    """Build one chat-formatted training string per dialogue/summary pair.

    Args:
        example: Mapping with "dialogue" and "summary" entries, each an
            iterable of values. The two are paired positionally with ``zip``,
            so the shorter one determines how many strings are produced.

    Returns:
        list[str]: One string per pair, laid out as a user turn (a Korean
        "please summarize the following conversation" instruction followed by
        the dialogue) and an assistant turn (the summary), each terminated by
        ``<|eot_id|>``.
    """
    # NOTE(review): the template hard-codes <|begin_of_text|>. If the tokenizer
    # also prepends a BOS token when encoding, sequences would start with two
    # of them -- confirm against the tokenizer and the inference-time prompt.
    user_template = "다음 대화를 요약해주세요.\n\n{dialogue}"
    assistant_template = "{summary}"

    return [
        "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n"
        + user_template.format(dialogue=dialogue)
        + "<|eot_id|>\n<|start_header_id|>assistant<|end_header_id|>\n\n"
        + assistant_template.format(summary=summary)
        + "<|eot_id|>"
        for dialogue, summary in zip(example["dialogue"], example["summary"])
    ]

In [None]:
# Hub identifier of the checkpoint that is fine-tuned in this notebook.
BASE_MODEL = "MLP-KTLim/llama-3-Korean-Bllossom-8B"

# bitsandbytes settings: 4-bit NF4 weights with double quantization,
# using float16 as the compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

# Load the model with the quantization settings above; device placement is
# left to device_map="auto".
model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    torch_dtype=torch.float16,
    device_map="auto",
    quantization_config=bnb_config,
)

In [None]:
# Turn on gradient checkpointing to reduce memory usage during training.
model.gradient_checkpointing_enable()

# Get the quantized model's parameters ready for k-bit training.
model = prepare_model_for_kbit_training(model)

# LoRA settings: adapters are attached to the quantized model so that only the
# adapter parameters are trained.
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
    bias="none",
    target_modules=[
        "q_proj",
        "o_proj",
        "k_proj",
        "v_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
    ],
)

# Wrap the model with the adapters described by lora_config, then report how
# many parameters remain trainable.
model = get_peft_model(model, lora_config)

model.print_trainable_parameters()

In [None]:
# Tokenizer that belongs to BASE_MODEL. Padding is applied on the right and
# the EOS token doubles as the padding token.
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
tokenizer.padding_side = "right"
tokenizer.pad_token = tokenizer.eos_token

In [None]:
from transformers.trainer_callback import TrainerCallback


class CUDAMemoryCleanupCallback(TrainerCallback):
    """Trainer callback that periodically frees unused CUDA memory.

    Every ``cleanup_interval`` optimizer steps it runs the Python garbage
    collector, releases cached CUDA blocks, and prints the amount of GPU
    memory that is still allocated.

    Args:
        cleanup_interval: Number of global steps between two cleanups.
            Must be a positive integer.

    Raises:
        ValueError: If ``cleanup_interval`` is not positive.
    """

    def __init__(self, cleanup_interval=100):
        # A zero interval would otherwise fail with ZeroDivisionError on the
        # first training step; reject it up front with a clear message.
        if cleanup_interval <= 0:
            raise ValueError(
                f"cleanup_interval must be a positive integer, got {cleanup_interval!r}"
            )
        self.cleanup_interval = cleanup_interval

    def on_step_end(self, args, state, control, **kwargs):
        """Run the cleanup when the global step is a multiple of the interval."""
        if state.global_step % self.cleanup_interval != 0:
            return

        # Collect garbage first: tensors that are only kept alive by reference
        # cycles must be destroyed before their memory can be handed back by
        # empty_cache().
        gc.collect()
        torch.cuda.empty_cache()
        print(f"Step {state.global_step}: CUDA memory cleaned up.")
        print(f"Current GPU memory usage: {torch.cuda.memory_allocated() / 1024**2:.2f} MB")


cuda_cleanup_callback = CUDAMemoryCleanupCallback(cleanup_interval=50)

In [None]:
# Early stopping: training ends after 5 consecutive evaluations without
# improvement, where a change smaller than 0.001 does not count as improvement.
early_stopping = EarlyStoppingCallback(
    early_stopping_threshold=0.001,
    early_stopping_patience=5,
)

In [None]:
# Training hyper-parameters. Checkpoints are written and evaluation is run on
# the same 300-step cadence so that the best checkpoint can be restored at the
# end of training.
training_args = TrainingArguments(
    output_dir="outputs",
    num_train_epochs=10,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    optim="paged_adamw_8bit",
    warmup_ratio=0.05,
    learning_rate=2e-5,
    fp16=True,
    logging_steps=100,
    push_to_hub=False,
    report_to='none',
    save_strategy="steps",
    save_total_limit=3,  # keep at most 3 checkpoints
    save_steps=300,  # save every 300 steps
    eval_steps=300,  # evaluate every 300 steps
    evaluation_strategy="steps",  # evaluate on the same schedule as checkpoint saving
    load_best_model_at_end=True,  # reload the best checkpoint when training finishes
    metric_for_best_model="loss",  # the best checkpoint is chosen by loss
)

trainer = SFTTrainer(
    model=model,
    # Hand over the tokenizer configured in this notebook. Without it the
    # trainer builds its own tokenizer, so the pad_token / padding_side chosen
    # here would not be the ones used for training (while this tokenizer is
    # the one saved next to the model afterwards).
    tokenizer=tokenizer,
    train_dataset=train_data,
    eval_dataset=eval_data,
    max_seq_length=512,
    args=training_args,
    # `model` has already been wrapped with get_peft_model(model, lora_config),
    # so the LoRA config is intentionally not passed a second time.
    formatting_func=prompts,
    callbacks=[cuda_cleanup_callback, early_stopping],
)

trainer.train()

In [None]:
# Write the trained model and then the tokenizer into the same directory.
output_dir = "./fine_tuned_model"
for artifact in (trainer.model, tokenizer):
    artifact.save_pretrained(output_dir)
print(f"파인튜닝된 모델이 {output_dir}에 저장되었습니다.")