In [1]:
!pip install -q -U transformers datasets accelerate peft bitsandbytes
!pip install -q -U trl  
!pip install -q -U sentencepiece  

In [1]:
import transformers
import datasets
import peft
import torch

print("=" * 80)
print("패키지 버전 확인")
print("=" * 80)
print(f"transformers: {transformers.__version__}")
print(f"datasets: {datasets.__version__}")
print(f"peft: {peft.__version__}")
print(f"torch: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")

if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.1f} GB")
print("=" * 80)

패키지 버전 확인
transformers: 4.57.1
datasets: 4.4.1
peft: 0.18.0
torch: 2.8.0+cu128
CUDA available: True
GPU: NVIDIA RTX 4000 Ada Generation
GPU Memory: 19.7 GB


In [2]:
from datasets import load_dataset
import numpy as np

print("=" * 80)
print("데이터셋 로딩")
print("=" * 80)

quiz_dataset = load_dataset("Alleinzellgaenger/sherlock-holmes-qa")

print(f"데이터셋 로드 완료")
print(f"총 샘플 수: {len(quiz_dataset['train'])}")
print("\n데이터셋 구조:")
print(quiz_dataset)

데이터셋 로딩
데이터셋 로드 완료
총 샘플 수: 140

데이터셋 구조:
DatasetDict({
    train: Dataset({
        features: ['question', 'answer', 'story_id', 'story_title'],
        num_rows: 140
    })
})


In [3]:
print("\n" + "=" * 80)
print("데이터 샘플 확인 (처음 3개)")
print("=" * 80)

for i in range(3):
    sample = quiz_dataset['train'][i]
    print(f"\n{'='*80}")
    print(f"샘플 {i+1}")
    print(f"{'='*80}")
    print(f"출처: {sample['story_title']}")
    print(f"질문: {sample['question'][:100]}...")
    print(f"정답 길이: {len(sample['answer'])} 글자")


데이터 샘플 확인 (처음 3개)

샘플 1
출처: I. A SCANDAL IN BOHEMIA
질문: How does Holmes regard emotions and love, and in what way is Irene Adler an exception to his usual s...
정답 길이: 379 글자

샘플 2
출처: II. THE RED-HEADED LEAGUE
질문: How did the Red-Headed League scheme function, and what was its ultimate purpose?...
정답 길이: 389 글자

샘플 3
출처: III. A CASE OF IDENTITY
질문: Which concrete clues and observations led Holmes to conclude that "Mr. Hosmer Angel" was actually Mi...
정답 길이: 480 글자


In [4]:
print("\n" + "=" * 80)
print("데이터 형식 변환 (Gemma instruction format)")
print("=" * 80)

def format_for_gemma(example):
    """
    Gemma-2 instruction format으로 변환
    """
    text = f"""<start_of_turn>user
{example['question']}<end_of_turn>
<start_of_turn>model
{example['answer']}<end_of_turn>"""
    
    return {"text": text}

# 데이터 변환
formatted_dataset = quiz_dataset['train'].map(format_for_gemma)

print(f"데이터 변환 완료: {len(formatted_dataset)}개")


데이터 형식 변환 (Gemma instruction format)
데이터 변환 완료: 140개


In [5]:
# 변환된 샘플 확인
print("\n변환된 데이터 샘플 (처음 500자):")
print(formatted_dataset[0]['text'][:500])
print("...\n")

# Train/Val 분할
split_dataset = formatted_dataset.train_test_split(test_size=0.1, seed=42)

print(f"Train: {len(split_dataset['train'])}개")
print(f"Val: {len(split_dataset['test'])}개")
print("=" * 80)


변환된 데이터 샘플 (처음 500자):
<start_of_turn>user
How does Holmes regard emotions and love, and in what way is Irene Adler an exception to his usual stance?<end_of_turn>
<start_of_turn>model
Holmes regards emotions—especially love—as abhorrent intrusions that would disturb his perfectly balanced reasoning, likening them to grit in a sensitive instrument; nevertheless Irene Adler is uniquely remarkable to him—“the” woman who eclipses all others in his eyes, though he claims no romantic attachment and remembers her only as bei
...

Train: 126개
Val: 14개


In [6]:
import gc
import torch

# 기존 객체 삭제
if 'trainer' in globals():
    del trainer
if 'model' in globals():
    del model
    
gc.collect()
torch.cuda.empty_cache()

print("메모리 완전 정리")

메모리 완전 정리


In [7]:
from transformers import AutoTokenizer, AutoModelForCausalLM

print("\n" + "=" * 80)
print("모델 로딩")
print("=" * 80)

model_name = "google/gemma-2-2b-it"

# 토크나이저
print("토크나이저 로딩...")
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

print(f"토크나이저 로드 완료")

# 모델
print("\n모델 로딩 중...")
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map="auto",
)

print("모델 로드 완료!")

# 파라미터 수
total_params = sum(p.numel() for p in model.parameters())
print(f"\n총 파라미터: {total_params:,} ({total_params/1e9:.2f}B)")

# GPU 메모리
if torch.cuda.is_available():
    allocated = torch.cuda.memory_allocated(0) / 1024**3
    reserved = torch.cuda.memory_reserved(0) / 1024**3
    print(f"\nGPU 메모리: {allocated:.2f} GB / {reserved:.2f} GB")

print("=" * 80)


모델 로딩
토크나이저 로딩...


`torch_dtype` is deprecated! Use `dtype` instead!


토크나이저 로드 완료

모델 로딩 중...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

모델 로드 완료!

총 파라미터: 2,614,341,888 (2.61B)

GPU 메모리: 4.87 GB / 4.95 GB


In [8]:
print("\n" + "=" * 80)
print("데이터 토크나이징")
print("=" * 80)

def tokenize_function(examples):
    """토크나이징 함수"""
    result = tokenizer(
        examples["text"],
        truncation=True,
        max_length=512,
        padding="max_length",
    )
    result["labels"] = result["input_ids"].copy()
    return result

# 토크나이징 실행
print("토크나이징 중...")
tokenized_train = split_dataset['train'].map(
    tokenize_function,
    batched=True,
    remove_columns=split_dataset['train'].column_names
)

tokenized_val = split_dataset['test'].map(
    tokenize_function,
    batched=True,
    remove_columns=split_dataset['test'].column_names
)

print(f"토크나이징 완료")
print(f"  - Train: {len(tokenized_train)}개")
print(f"  - Val: {len(tokenized_val)}개")

# 샘플 확인
sample = tokenized_train[0]
print(f"\n토큰 길이: {len(sample['input_ids'])}")
print(f"디코딩 샘플: {tokenizer.decode(sample['input_ids'][:50])}...")

print("=" * 80)


데이터 토크나이징
토크나이징 중...


Map:   0%|          | 0/14 [00:00<?, ? examples/s]

토크나이징 완료
  - Train: 126개
  - Val: 14개

토큰 길이: 512
디코딩 샘플: <bos><start_of_turn>user
How did Holmes structure his investigation using “two lines” of inquiry, and why was that distinction important to solving the case?<end_of_turn>
<start_of_turn>model
Holmes split the problem into line A (the domestic scandal around Lady Beatrice — moods...


In [9]:
from peft import LoraConfig, get_peft_model

print("\n" + "=" * 80)
print("LoRA 설정 및 적용")
print("=" * 80)

lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)

print("LoRA 설정:")
print(f"  - rank: {lora_config.r}")
print(f"  - alpha: {lora_config.lora_alpha}")
print(f"  - target_modules: {lora_config.target_modules}")

# LoRA 적용
model = get_peft_model(model, lora_config)

print("\nLoRA 적용 완료!")
model.print_trainable_parameters()

# gradient 활성화
model.enable_input_require_grads()

trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
total_params = sum(p.numel() for p in model.parameters())

print(f"\n학습 파라미터: {trainable_params:,} ({trainable_params/1e6:.2f}M)")
print(f"학습 비율: {100 * trainable_params / total_params:.4f}%")
print("=" * 80)


LoRA 설정 및 적용
LoRA 설정:
  - rank: 16
  - alpha: 32
  - target_modules: {'k_proj', 'q_proj', 'v_proj', 'o_proj'}

LoRA 적용 완료!
trainable params: 6,389,760 || all params: 2,620,731,648 || trainable%: 0.2438

학습 파라미터: 6,389,760 (6.39M)
학습 비율: 0.2438%


In [10]:
from transformers import TrainingArguments, EarlyStoppingCallback

print("\n" + "=" * 80)
print("Training Arguments 설정")
print("=" * 80)

training_args = TrainingArguments(
    output_dir="../models/sherlock-lora",
    num_train_epochs=10,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=4,
    learning_rate=2e-4,  # LoRA는 좀 더 높은 LR
    weight_decay=0.01,
    fp16=True,
    logging_steps=10,
    logging_dir="../outputs/logs",
    save_strategy="epoch",
    eval_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    report_to="none",
    save_total_limit=2,
    gradient_checkpointing=True,
    gradient_checkpointing_kwargs={"use_reentrant": False},
)

print("Training Arguments 설정 완료")
print(f"\n주요 설정:")
print(f"  - Epochs: {training_args.num_train_epochs}")
print(f"  - Batch size: {training_args.per_device_train_batch_size}")
print(f"  - Gradient accumulation: {training_args.gradient_accumulation_steps}")
print(f"  - Effective batch size: {training_args.per_device_train_batch_size * training_args.gradient_accumulation_steps}")
print(f"  - Learning rate: {training_args.learning_rate}")
print("=" * 80)


Training Arguments 설정
Training Arguments 설정 완료

주요 설정:
  - Epochs: 10
  - Batch size: 2
  - Gradient accumulation: 4
  - Effective batch size: 8
  - Learning rate: 0.0002


In [11]:
from transformers import Trainer, default_data_collator

print("\n" + "=" * 80)
print("Trainer 초기화")
print("=" * 80)

# Early Stopping
early_stopping = EarlyStoppingCallback(
    early_stopping_patience=2,
    early_stopping_threshold=0.01
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    data_collator=default_data_collator,
    callbacks=[early_stopping],
)

print("Trainer 초기화 완료!")
print(f"\n학습 준비:")
print(f"  - 모델: Gemma-2-2B + LoRA (r=16)")
print(f"  - Train: {len(tokenized_train)}개")
print(f"  - Val: {len(tokenized_val)}개")
print(f"  - Early Stopping: patience=2")

The model is already on multiple devices. Skipping the move to device specified in `args`.



Trainer 초기화
Trainer 초기화 완료!

학습 준비:
  - 모델: Gemma-2-2B + LoRA (r=16)
  - Train: 126개
  - Val: 14개
  - Early Stopping: patience=2


In [None]:
# 메모리 최종 정리
gc.collect()
torch.cuda.empty_cache()
model.gradient_checkpointing_enable()

print("\n" + "=" * 80)
print("학습 시작!")
print("=" * 80)

# 학습 실행
trainer.train()

print("\n" + "=" * 80)
print("학습 완료!")
print("=" * 80)

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.



학습 시작!


Epoch,Training Loss,Validation Loss
1,1.4754,0.853224
2,0.7191,0.76074
3,0.6535,0.738366
4,0.5674,0.74124


In [19]:
print("\n모델 저장 중...")
model.save_pretrained("../models/sherlock-lora-final")
tokenizer.save_pretrained("../models/sherlock-lora-final")
print("모델 저장 완료: ../models/sherlock-lora-final")

print("\n" + "=" * 80)
print("모든 작업 완료!")
print("=" * 80)


모델 저장 중...
모델 저장 완료: ./sherlock-lora-final

모든 작업 완료!
