In [1]:
!pip install transformers peft datasets bitsandbytes accelerate ipywidgets




In [4]:
# 출력 개수 제한 + linear 계층만 필터링
for name, module in model.named_modules():
    if "linear" in str(type(module)).lower():
        print(name)


gpt_neox.layers.0.attention.query_key_value
gpt_neox.layers.0.attention.dense
gpt_neox.layers.0.mlp.dense_h_to_4h
gpt_neox.layers.0.mlp.dense_4h_to_h
gpt_neox.layers.1.attention.query_key_value
gpt_neox.layers.1.attention.dense
gpt_neox.layers.1.mlp.dense_h_to_4h
gpt_neox.layers.1.mlp.dense_4h_to_h
gpt_neox.layers.2.attention.query_key_value
gpt_neox.layers.2.attention.dense
gpt_neox.layers.2.mlp.dense_h_to_4h
gpt_neox.layers.2.mlp.dense_4h_to_h
gpt_neox.layers.3.attention.query_key_value
gpt_neox.layers.3.attention.dense
gpt_neox.layers.3.mlp.dense_h_to_4h
gpt_neox.layers.3.mlp.dense_4h_to_h
gpt_neox.layers.4.attention.query_key_value
gpt_neox.layers.4.attention.dense
gpt_neox.layers.4.mlp.dense_h_to_4h
gpt_neox.layers.4.mlp.dense_4h_to_h
gpt_neox.layers.5.attention.query_key_value
gpt_neox.layers.5.attention.dense
gpt_neox.layers.5.mlp.dense_h_to_4h
gpt_neox.layers.5.mlp.dense_4h_to_h
gpt_neox.layers.6.attention.query_key_value
gpt_neox.layers.6.attention.dense
gpt_neox.layers.6.mlp.

### [Step 1] 모델 로드 및 QloRA 테스트

In [6]:
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import get_peft_model, LoraConfig, TaskType, prepare_model_for_kbit_training

# 1. 모델 지정
model_id = "beomi/KoAlpaca-Polyglot-5.8B"  # 또는 1.3B, 1.3B도 충분!

# 2. 토크나이저 & 모델 불러오기 (8bit 로딩 + GPU 자동 할당)
tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=False)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    load_in_8bit=True,
    device_map="auto"
)

# 3. LoRA 학습을 위한 모델 준비
model = prepare_model_for_kbit_training(model)

# 4. KoAlpaca 전용 target_modules로 LoRA 설정

lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["query_key_value", "dense", "dense_h_to_4h", "dense_4h_to_h"],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.CAUSAL_LM
)
# 5. LoRA 적용된 모델 생성
model = get_peft_model(model, lora_config)

# 6. 확인용 출력
model.print_trainable_parameters()


The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Loading checkpoint shards:   0%|          | 0/13 [00:00<?, ?it/s]

trainable params: 14,680,064 || all params: 5,899,739,136 || trainable%: 0.2488


###  [Step 2] 데이터셋 로딩 및 전처리 코드

In [7]:
from datasets import load_dataset

# 1. JSONL 로딩
dataset = load_dataset("json", data_files="/app/workspace/prompt_pairs.jsonl", split="train")

# 2. 전처리 함수 정의
def format_prompt(example):
    prompt = f"### 질문: {example['instruction']}\n### 답변:"
    return tokenizer(prompt, text_target=example['output'], truncation=True, padding="max_length", max_length=512)

# 3. 전처리 적용
tokenized_dataset = dataset.map(format_prompt)


Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/49 [00:00<?, ? examples/s]

### [Step 3] Trainer 구성 및 학습 실행

In [8]:
from transformers import TrainingArguments, Trainer, DataCollatorForLanguageModeling

# 1. 학습 설정
training_args = TrainingArguments(
    output_dir="/app/workspace/qlora_outputs",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    num_train_epochs=3,
    learning_rate=2e-4,
    logging_steps=1,
    bf16=True,
    save_strategy="no",
    report_to="none"
)

# 2. Trainer 구성
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer,
    data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False)
)

# 3. 학습 시작
trainer.train()


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
  trainer = Trainer(
Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Step,Training Loss
1,6.2108
2,6.1466
3,5.9164
4,4.8853
5,4.954
6,4.0224
7,4.0065
8,3.5958
9,3.2665
10,2.8737


TrainOutput(global_step=36, training_loss=2.776077366537518, metrics={'train_runtime': 163.5681, 'train_samples_per_second': 0.899, 'train_steps_per_second': 0.22, 'total_flos': 2555352667127808.0, 'train_loss': 2.776077366537518, 'epoch': 2.938775510204082})

### [Step 4] 학습된 LoRA adapter 저장

In [None]:
model.save_pretrained("/app/workspace/qlora_outputs/lora_adapter")
tokenizer.save_pretrained("/app/workspace/qlora_outputs/lora_adapter")


### [Step 5] 추론 코드

In [None]:
from transformers import BitsAndBytesConfig
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

# 1. 8bit 로딩 설정
bnb_config = BitsAndBytesConfig(
    load_in_8bit=True,
    llm_int8_threshold=6.0,
    llm_int8_skip_modules=None,
    llm_int8_enable_fp32_cpu_offload=True  # 핵심!
)

# 2. Base 모델 로딩
base_model = AutoModelForCausalLM.from_pretrained(
    "beomi/KoAlpaca-Polyglot-5.8B",
    device_map="auto",
    quantization_config=bnb_config
)

# 3. LoRA adapter 붙이기
model = PeftModel.from_pretrained(base_model, "/app/workspace/qlora_outputs/lora_adapter")
model.eval()

# 4. Tokenizer 불러오기
tokenizer = AutoTokenizer.from_pretrained("/app/workspace/qlora_outputs/lora_adapter", use_fast=False)

#5. 추론함수
def rewrite_prompt(input_ko):
    prompt = f"### 질문: {input_ko}\n### 답변:"
    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
    if 'token_type_ids' in inputs:
        inputs.pop('token_type_ids')
    outputs = model.generate(
        **inputs,
        max_new_tokens=100,
        do_sample=True,
        temperature=0.7,
        top_p=0.9
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

Loading checkpoint shards:   0%|          | 0/13 [00:00<?, ?it/s]

Some parameters are on the meta device because they were offloaded to the cpu.
Some parameters are on the meta device because they were offloaded to the cpu.


In [None]:
# 5. 테스트!
example = "창문 너머를 바라보는 고양이를 그리고싶은데, 미드저니 고도화 프롬프트 작성해줘"
print("💬 결과:\n", rewrite_prompt(example))

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


💬 결과:
 ### 질문: 창문 너머를 바라보는 고양이를 그리고싶은데, 미드저니 고도화 프롬프트 작성해줘
### 답변: 고양이를 그리려면 고양이의 시선처리를 위한 사진첩을 만들어줘


: 

In [None]:
# # GPU 사용 가능여부 확인
# import torch
# print(torch.__version__)
# print("GPU 사용 가능:", torch.cuda.is_available())
