In [6]:
!pip install -q -U transformers datasets accelerate peft bitsandbytes trl

In [7]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [8]:
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
)
from peft import LoraConfig, get_peft_model
from trl import SFTTrainer

model_name = "deepseek-ai/deepseek-coder-6.7b-instruct"

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True
)
model.config.use_cache = False
model.config.pretraining_tp = 1

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

print("✅ 모델 및 토크나이저 로딩 완료")

config.json:   0%|          | 0.00/760 [00:00<?, ?B/s]

ImportError: The installed version of bitsandbytes (<0.43.1) requires CUDA, but CUDA is not available. You may need to install PyTorch with CUDA support or upgrade bitsandbytes to >=0.43.1.

In [ ]:
dataset = load_dataset("json", data_files="alpaca_dataset.jsonl", split="train")

train_test_split = dataset.train_test_split(test_size=0.1)
train_dataset = train_test_split["train"]
test_dataset = train_test_split["test"]

print(f"훈련 데이터셋 크기: {len(train_dataset)}")
print(f"테스트 데이터셋 크기: {len(test_dataset)}")

In [ ]:
# LoRA 설정
lora_config = LoraConfig(
    r=64,
    lora_alpha=64,
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
)

model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

training_arguments = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,                     
    per_device_train_batch_size=8,          
    gradient_accumulation_steps=2,          
    optim="paged_adamw_32bit",              
    save_steps=50,
    logging_steps=10,
    learning_rate=2e-4,
    weight_decay=0.001,
    bf16=True,                              
    max_grad_norm=0.3,
    max_steps=-1,
    warmup_ratio=0.03,
    group_by_length=True,                   
    lr_scheduler_type="cosine",
    dataloader_num_workers=4,               
    remove_unused_columns=False,
    
    # A100 최적화 추가 설정
    dataloader_pin_memory=True,            
    tf32=True,                             
    save_total_limit=5,
    eval_strategy="steps" if test_dataset else "no",
    eval_steps=50 if test_dataset else None,
    load_best_model_at_end=True if test_dataset else False,
    metric_for_best_model="eval_loss" if test_dataset else None,
    greater_is_better=False if test_dataset else None,
    
    # 메모리 최적화 (80GB라 여유 있음)
    per_device_eval_batch_size=8,          
    prediction_loss_only=True,
)

In [ ]:
def formatting_prompts_func(example):
    """단일 예제를 포맷팅"""
    instruction = example["instruction"]
    input_text = example["input"] if example["input"] else ""
    output = example["output"]
    
    if input_text:
        text = f"### Instruction:\n{instruction}\n\n### Input:\n{input_text}\n\n### Response:\n{output}"
    else:
        text = f"### Instruction:\n{instruction}\n\n### Response:\n{output}"
    
    return text
    
trainer = SFTTrainer(
    model=model,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    peft_config=lora_config,
    args=training_arguments,
    formatting_func=formatting_prompts_func
)

In [ ]:
print("🚀 훈련을 시작합니다...")
trainer.train()
print("✅ 훈련이 완료되었습니다.")

new_model_name = "deepseek-coder-finetuned-r64a64-qkvo"
trainer.model.save_pretrained(new_model_name)
print(f"✅ 모델이 '{new_model_name}'에 저장되었습니다.")