## gemma-2-2b-it, chatgpt 3o-mini-high 코드

In [1]:
import os
import torch
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    TrainingArguments,
    Trainer,
    default_data_collator
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
os.environ["CUDA_VISIBLE_DEVICES"] = "1"
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("Using device:", device)


# Accelerator 패치: __init__ 메서드를 안전하게 한 번만 패치 (커널 재시작 후 실행 권장)
from accelerate import Accelerator
if not hasattr(Accelerator, "_original_init"):
    Accelerator._original_init = Accelerator.__init__
    def patched_accelerator_init(self, *args, **kwargs):
        kwargs["device_placement"] = False
        Accelerator._original_init(self, *args, **kwargs)
    Accelerator.__init__ = patched_accelerator_init

  from .autonotebook import tqdm as notebook_tqdm


Using device: cuda:0


In [2]:
print("Using device:", torch.cuda.current_device())

Using device: 0


In [3]:
# 4-bit 양자화 설정 (BitsAndBytesConfig 사용)
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",            # NF4 양자화 방식 사용
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16
)

# QLoRA용 LoRA 구성 설정
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    #target_modules=["q_proj", "v_proj"],  # 모델 구조에 따라 조정 필요할 수 있음
    target_modules=["q_proj", "o_proj", "k_proj", "v_proj", "gate_proj", "up_proj", "down_proj"],
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM"
)

from peft import LoraConfig

lora_config = LoraConfig(
    r=6,  # LoRA 랭크 (작을수록 적은 파라미터를 학습하지만 표현력이 감소) 작은 모델에선 8~12 쓰기도함
    lora_alpha=8,  # LoRA의 scaling factor (r과 함께 조정하여 학습 효과를 조절)
    lora_dropout=0.05,  # LoRA 레이어에 적용할 드롭아웃 비율 (과적합 방지)
    
    # LoRA를 적용할 모델 내 특정 모듈 (주로 Self-Attention & FFN 관련 모듈)
    target_modules=[
        "q_proj",  # Query Projection: Self-Attention에서 Query 벡터 생성
        "k_proj",  # Key Projection: Self-Attention에서 Key 벡터 생성
        "v_proj",  # Value Projection: Self-Attention에서 Value 벡터 생성
        "o_proj",  # Output Projection: Self-Attention 결과를 출력하는 모듈
        "gate_proj",  # FFN의 게이트 연산 모듈 (활성화 함수 조정)
        "up_proj",  # FFN에서 차원을 확장하는 모듈 (hidden_dim → expanded_dim)
        "down_proj",  # FFN에서 확장된 차원을 다시 줄이는 모듈 (expanded_dim → hidden_dim)
    ],

    task_type="CAUSAL_LM",  # 인과적 언어 모델링 (GPT 계열 모델에 적합)
)

In [4]:
BASE_MODEL = "google/gemma-2b-it"

model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL, 
    quantization_config=bnb_config,
    device_map={"": torch.cuda.current_device()}
	#device_map={"": 1}
    )
# k-bit 학습을 위한 모델 준비 (QLoRA 방식 적용 전)
model = prepare_model_for_kbit_training(model)
# 모델에 LoRA 적용
model = get_peft_model(model, lora_config)

# 토크나이저 초기화
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.10it/s]


In [5]:
# CSV 파일 경로
data_file = 'extracted_documents_신문기사.csv'

# CSV 파일로부터 데이터셋 로드
dataset = load_dataset("csv", data_files={"train": data_file})
print("데이터셋 로드 완료.")
dataset

데이터셋 로드 완료.


DatasetDict({
    train: Dataset({
        features: ['original_text', 'summary_text'],
        num_rows: 243983
    })
})

In [6]:
def preprocess_function(example):
    # 프롬프트 생성: 원문과 요약 요청을 명시
    prompt = "원문:\n" + example["original_text"] + "\n\n요약:\n"
    target = example["summary_text"]
    full_text = prompt + target

    # 전체 텍스트 토큰화 (truncation, padding 적용)
    tokenized_full = tokenizer(full_text, truncation=True) #, padding="max_length", max_length=512)
    
    # 프롬프트 토큰 길이 계산 (패딩 전 실제 길이)
    prompt_ids = tokenizer(prompt, add_special_tokens=False)["input_ids"]
    prompt_length = len(prompt_ids)
    
    # 전체 토큰 시퀀스에서 프롬프트 부분은 손실 계산에서 제외 (-100 처리)
    labels = tokenized_full["input_ids"].copy()
    for i in range(prompt_length):
        if i < len(labels):
            labels[i] = -100
    tokenized_full["labels"] = labels
    return tokenized_full

In [7]:
def filter_none_and_empty(example):
    # None이거나 공백 문자열인 경우 False 반환
    orig = example.get("original_text")
    summ = example.get("summary_text")
    if orig is None or summ is None:
        return False
    # 문자열이지만 공백만 있는 경우도 제거
    if orig.strip() == "" or summ.strip() == "":
        return False
    return True

# 필터 적용하여 None이나 빈 문자열인 행 제거
dataset["train"] = dataset["train"].filter(filter_none_and_empty)


In [23]:
# 5. 데이터셋에 전처리 함수 적용 (각 예제를 개별적으로 처리)
tokenized_dataset = dataset["train"].map(preprocess_function, batched=False)#, device_map={"": 1})
# 학습에 필요한 "input_ids", "attention_mask", "labels"만 남김
cols_to_remove = [col for col in tokenized_dataset.column_names if col not in ["input_ids", "attention_mask", "labels"]]
tokenized_dataset = tokenized_dataset.remove_columns(cols_to_remove)
print("토큰화 및 전처리 완료.")

tokenized_dataset


토큰화 및 전처리 완료.


Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 243979
})

In [9]:
# 저장 경로 지정
save_path = "/home/wanted-1/potenup-workspace/Project/project3/team2/SY/Gemma/tokenized_dataset"

# tokenized_dataset을 디스크에 저장
tokenized_dataset.save_to_disk(save_path)
print("Tokenized dataset saved to", save_path)

Saving the dataset (6/6 shards): 100%|██████████| 243979/243979 [00:00<00:00, 248932.06 examples/s]

Tokenized dataset saved to /home/wanted-1/potenup-workspace/Project/project3/team2/SY/Gemma/tokenized_dataset





In [28]:
shards = [tokenized_dataset.shard(num_shards=4, index=i) for i in range(4)]

# 각 샤드의 예제 수 확인
for i, shard in enumerate(shards):
    print(f"Shard {i}: {len(shard)} examples")

Shard 0: 60995 examples
Shard 1: 60995 examples
Shard 2: 60995 examples
Shard 3: 60994 examples


In [29]:
# TrainingArguments 설정
training_args = TrainingArguments(
    output_dir="./qlora_gemma",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=1,
    num_train_epochs=1,
    learning_rate=5e-5,
    fp16=True,
    logging_steps=2000,
    save_steps=1000,
    save_total_limit=2,
    report_to="none",
    optim="adamw_torch"
)

# Trainer 초기화
trainer = Trainer(
    model=model,#.to(device),
    args=training_args,
    train_dataset=shards[0],
    data_collator=default_data_collator
)
print("Trainer 초기화 완료.")

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Trainer 초기화 완료.


In [30]:
# 학습 시작
trainer.train()
print("학습 완료.")


  return fn(*args, **kwargs)


Step,Training Loss
2000,0.8416
4000,0.8017
6000,0.7754
8000,0.7783
10000,0.772
12000,0.7496
14000,0.7652
16000,0.7561
18000,0.7386
20000,0.7455



Cannot access gated repo for url https://huggingface.co/google/gemma-2b-it/resolve/main/config.json.
Access to model google/gemma-2b-it is restricted and you are not in the authorized list. Visit https://huggingface.co/google/gemma-2b-it to ask for access. - silently ignoring the lookup for the file config.json in google/gemma-2b-it.
  return fn(*args, **kwargs)

Cannot access gated repo for url https://huggingface.co/google/gemma-2b-it/resolve/main/config.json.
Access to model google/gemma-2b-it is restricted and you are not in the authorized list. Visit https://huggingface.co/google/gemma-2b-it to ask for access. - silently ignoring the lookup for the file config.json in google/gemma-2b-it.
  return fn(*args, **kwargs)

Cannot access gated repo for url https://huggingface.co/google/gemma-2b-it/resolve/main/config.json.
Access to model google/gemma-2b-it is restricted and you are not in the authorized list. Visit https://huggingface.co/google/gemma-2b-it to ask for access. - silently

학습 완료.


In [31]:
# LoRA 어댑터만 저장
trainer.model.save_pretrained("path/to/save/lora_adapters_news")



Cannot access gated repo for url https://huggingface.co/google/gemma-2b-it/resolve/main/config.json.
Access to model google/gemma-2b-it is restricted and you are not in the authorized list. Visit https://huggingface.co/google/gemma-2b-it to ask for access. - silently ignoring the lookup for the file config.json in google/gemma-2b-it.


In [None]:
ADAPTER_MODEL = "lora_adapter"

trainer.model.save_pretrained(ADAPTER_MODEL)


Invalid credentials in Authorization header - silently ignoring the lookup for the file config.json in google/gemma-2b-it.


: 

### Q-LoRA 와 모델 합치기 

In [22]:
from peft import PeftModel

model = AutoModelForCausalLM.from_pretrained(BASE_MODEL, torch_dtype=torch.float16)
model = PeftModel.from_pretrained(model, ADAPTER_MODEL, torch_dtype=torch.float16)

model = model.merge_and_unload()
model.save_pretrained('gemma-2b-it-sum')

# !ls -alh ./gemma-2b-it-sum-ko

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  3.35it/s]


### Fine-tuned 모델 로드 

In [24]:
BASE_MODEL = "google/gemma-2-2b-it"
FINETUNE_MODEL = "gemma-2-2b-it-sum"

finetune_model = AutoModelForCausalLM.from_pretrained(FINETUNE_MODEL, device_map={"": torch.cuda.current_device()})
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, add_special_tokens=True)

OSError: There was a specific connection error when trying to load gemma-2-2b-it-sum:
401 Client Error: Unauthorized for url: https://huggingface.co/gemma-2-2b-it-sum/resolve/main/config.json (Request ID: Root=1-67c40ece-3317012c7e6d85e30ba2e096;100a0a21-48ad-4b7f-aaa3-afcdb6936b43)

Invalid credentials in Authorization header

In [None]:
pipe_finetuned = pipeline("text-generation", model=finetune_model, tokenizer=tokenizer, max_new_tokens=512)
doc = dataset['test']['document'][10]

messages = [
    {
        "role": "user",
        "content": "다음 글을 요약해주세요:\n\n{}".format(doc)
    }
]
prompt = pipe.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)


In [None]:
outputs = pipe_finetuned(
    prompt,
    do_sample=True,
    temperature=0.2,
    top_k=50,
    top_p=0.95,
    add_special_tokens=True
)
print(outputs[0]["generated_text"][len(prompt):])