In [1]:
%pip install -U "transformers>=4.43.0" "datasets>=2.19.0" "accelerate>=0.33.0" \
"trl>=0.9.6" "peft>=0.11.1" "bitsandbytes>=0.43.1" \
"sentencepiece" "pandas" "python-docx" "tqdm" "scikit-learn"


Collecting transformers>=4.43.0
  Using cached transformers-4.56.2-py3-none-any.whl.metadata (40 kB)
Collecting datasets>=2.19.0
  Using cached datasets-4.1.1-py3-none-any.whl.metadata (18 kB)
Collecting accelerate>=0.33.0
  Using cached accelerate-1.10.1-py3-none-any.whl.metadata (19 kB)
Collecting trl>=0.9.6
  Using cached trl-0.23.0-py3-none-any.whl.metadata (11 kB)
Collecting peft>=0.11.1
  Using cached peft-0.17.1-py3-none-any.whl.metadata (14 kB)
Collecting bitsandbytes>=0.43.1
  Using cached bitsandbytes-0.47.0-py3-none-win_amd64.whl.metadata (11 kB)
Collecting sentencepiece
  Downloading sentencepiece-0.2.1-cp312-cp312-win_amd64.whl.metadata (10 kB)
Using cached transformers-4.56.2-py3-none-any.whl (11.6 MB)
Using cached datasets-4.1.1-py3-none-any.whl (503 kB)
Using cached accelerate-1.10.1-py3-none-any.whl (374 kB)
Using cached trl-0.23.0-py3-none-any.whl (564 kB)
Using cached peft-0.17.1-py3-none-any.whl (504 kB)
Using cached bitsandbytes-0.47.0-py3-none-win_amd64.whl (60.7 

In [4]:
# 주피터 커널에 설치 (런타임 재시작 불필요)
import sys, subprocess
def pip_install(pkgs):
    print("Installing:", pkgs)
    subprocess.check_call([sys.executable, "-m", "pip", "install", "-U"] + pkgs)

pip_install(["huggingface_hub>=0.24.6", "safetensors"])
# (옵션) 없으면 같이 깔기
# pip_install(["transformers>=4.43.0"])
# pip_install(["tqdm"])
import huggingface_hub, safetensors
print("huggingface_hub:", huggingface_hub.__version__)
print("safetensors    :", safetensors.__version__)


Installing: ['huggingface_hub>=0.24.6', 'safetensors']
huggingface_hub: 0.35.1
safetensors    : 0.6.2


[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m


In [5]:
# === One-Shot: 환경 세팅 → 안전 다운로드(순차/재개) → 검증 ===
import os, json, shutil, pathlib, subprocess
from huggingface_hub import snapshot_download

REPO_ID   = "upstage/solar-pro-preview-instruct"
MODEL_DIR = pathlib.Path("/workspace/solar_model/model")
BASE      = pathlib.Path("/workspace")

# 0) 환경: 캐시/임시를 영구 볼륨(/workspace)로 고정 + Xet/CAS 비활성화
TMP = BASE / "tmp"
HF  = BASE / "hf_cache"
for p in [TMP, HF/"huggingface/hub", HF/"datasets", HF/"transformers", MODEL_DIR]:
    p.mkdir(parents=True, exist_ok=True)

os.environ["TMPDIR"]                 = str(TMP)
os.environ["HF_HOME"]                = str(HF/"huggingface")
os.environ["HUGGINGFACE_HUB_CACHE"]  = str(HF/"huggingface/hub")
os.environ["TRANSFORMERS_CACHE"]     = str(HF/"transformers")
os.environ["HF_DATASETS_CACHE"]      = str(HF/"datasets")
os.environ["HF_HUB_DISABLE_XET"]     = "1"   # CAS/Xet 끄기
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "0"  # 안정 우선(느려도 OK)

# 루트 캐시에 다시 쓰지 않도록 심볼릭 링크
subprocess.run("rm -rf /root/.cache/huggingface 2>/dev/null || true", shell=True, check=False)
subprocess.run("mkdir -p /root/.cache && ln -s /workspace/hf_cache/huggingface /root/.cache/huggingface", shell=True, check=False)

# 1) 실패 찌꺼기(.incomplete) 제거
subprocess.run('find /workspace -name "*.incomplete" -delete', shell=True, check=False)

# 2) 전체 다운로드(재개 가능, 단일 스레드)
allow = [
    "config.json",
    "model.safetensors.index.json",
    "special_tokens_map.json",
    "tokenizer.json",
    "tokenizer.model",
    "tokenizer_config.json",
    "added_tokens.json",
    "model-*-of-*.safetensors",
]
print("== Download start ==")
local_path = snapshot_download(
    repo_id=REPO_ID,
    local_dir=str(MODEL_DIR),
    local_dir_use_symlinks=False,   # 실제 파일 저장
    allow_patterns=allow,
    resume_download=True,
    max_workers=1,                  # 단일 스레드(안정)
)
print("== Download done to:", local_path)

# 3) 무결성 검증
index_path = MODEL_DIR / "model.safetensors.index.json"
assert index_path.exists(), "index 파일이 없습니다."
index = json.loads(index_path.read_text(encoding="utf-8"))
shards = sorted(set(index.get("weight_map", {}).values()))
missing = [s for s in shards if not (MODEL_DIR / s).exists()]
sizes   = {s: (MODEL_DIR/s).stat().st_size if (MODEL_DIR/s).exists() else 0 for s in shards}
tiny    = [s for s, sz in sizes.items() if sz < 100*1024*1024]  # 100MB 미만 의심

print("\n== Verify ==")
print("총 샤드:", len(shards))
print("누락 샤드:", missing)
print("의심(너무 작은) 샤드 수:", len(tiny))
for s in tiny[:3]:
    print(" - tiny:", s, f"{sizes[s]/1e9:.2f} GB")

# 4) 디스크 요약
print("\n== Disk ==")
print(subprocess.check_output(["df","-h","/workspace"]).decode())

# 5) 최종 요약
if not missing and not tiny:
    print("✅ 모든 샤드/파일 정상 다운로드로 보입니다.")
else:
    print("⚠️ 일부 파일이 미완료/의심입니다. 이 셀을 다시 실행하면 이어받기(resume) 됩니다.")


== Download start ==


For more details, check out https://huggingface.co/docs/huggingface_hub/main/en/guides/download#download-files-to-local-folder.


Fetching 16 files:   0%|          | 0/16 [00:00<?, ?it/s]

model-00002-of-00009.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00003-of-00009.safetensors:   0%|          | 0.00/4.90G [00:00<?, ?B/s]

model-00004-of-00009.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00005-of-00009.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00006-of-00009.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00007-of-00009.safetensors:   0%|          | 0.00/4.90G [00:00<?, ?B/s]

model-00008-of-00009.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00009-of-00009.safetensors:   0%|          | 0.00/4.79G [00:00<?, ?B/s]

== Download done to: /workspace/solar_model/model

== Verify ==
총 샤드: 9
누락 샤드: []
의심(너무 작은) 샤드 수: 0

== Disk ==
Filesystem                    Size  Used Avail Use% Mounted on
mfs#ca-mtl-3.runpod.net:9421  420T  329T   91T  79% /workspace

✅ 모든 샤드/파일 정상 다운로드로 보입니다.


In [7]:
from huggingface_hub import snapshot_download
from pathlib import Path
import os

REPO_ID = "upstage/solar-pro-preview-instruct"
MODEL_DIR = Path("/workspace/solar_model/model")
MODEL_DIR.mkdir(parents=True, exist_ok=True)

# 안정 옵션(느려도 OK)
os.environ["HF_HUB_DISABLE_XET"] = "1"
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "0"

# .py 파일들과 generation_config.json만 추가로 받습니다.
extra_allow = [
    "*.py",
    "generation_config.json",
]
path = snapshot_download(
    repo_id=REPO_ID,
    local_dir=str(MODEL_DIR),
    local_dir_use_symlinks=False,
    allow_patterns=extra_allow,
    resume_download=True,
    max_workers=1,
)
print("✅ code files downloaded to:", path)

# 확인
py_files = sorted(p.name for p in MODEL_DIR.glob("*.py"))
print("PY files:", py_files)
assert any("configuration_" in f for f in py_files), "configuration_*.py가 아직 없어요"
assert any("modeling_" in f for f in py_files), "modeling_*.py가 아직 없어요"


For more details, check out https://huggingface.co/docs/huggingface_hub/main/en/guides/download#download-files-to-local-folder.


Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

configuration_solar.py: 0.00B [00:00, ?B/s]

generation_config.json:   0%|          | 0.00/164 [00:00<?, ?B/s]

modeling_solar.py: 0.00B [00:00, ?B/s]

vllm_solar.py: 0.00B [00:00, ?B/s]

✅ code files downloaded to: /workspace/solar_model/model
PY files: ['configuration_solar.py', 'modeling_solar.py', 'vllm_solar.py']


In [8]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch, os

# 경고 줄이려면(선택): TRANSFORMERS_CACHE 대신 HF_HOME만 쓰도록 설정
os.environ.pop("TRANSFORMERS_CACHE", None)

bnb = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

tok = AutoTokenizer.from_pretrained("/workspace/solar_model/model", use_fast=False, local_files_only=True)
if tok.pad_token is None:
    tok.pad_token = tok.eos_token

model = AutoModelForCausalLM.from_pretrained(
    "/workspace/solar_model/model",
    quantization_config=bnb,
    device_map="auto",
    trust_remote_code=True,   # 이제 .py 파일이 있으니 OK
    local_files_only=True
)
print("✅ 로컬 모델 로드 OK")


Loading checkpoint shards:   0%|          | 0/9 [00:00<?, ?it/s]

✅ 로컬 모델 로드 OK


In [9]:
import os, pathlib, subprocess, json

BASE = pathlib.Path("/workspace/solar_model")
RAW  = BASE / "dataset" / "raw"
PROC = BASE / "dataset" / "processed"
OUTS = BASE / "outputs" / "solar22b_qLoRA_dapt"
MODEL_DIR = BASE / "model"   # 이미 받아둔 SOLAR 22B 로컬 경로

for p in [PROC, OUTS]: p.mkdir(parents=True, exist_ok=True)

# 캐시/임시 모두 /workspace로 고정 (컨테이너 디스크 사용 방지)
TMP = pathlib.Path("/workspace/tmp"); TMP.mkdir(parents=True, exist_ok=True)
HF  = pathlib.Path("/workspace/hf_cache")
for p in [HF/"huggingface/hub", HF/"datasets", HF/"transformers"]: p.mkdir(parents=True, exist_ok=True)

os.environ["TMPDIR"] = str(TMP)
os.environ["HF_HOME"] = str(HF/"huggingface")
os.environ["HUGGINGFACE_HUB_CACHE"] = str(HF/"huggingface/hub")
os.environ["TRANSFORMERS_CACHE"] = str(HF/"transformers")
os.environ["HF_DATASETS_CACHE"] = str(HF/"datasets")
os.environ["HF_HUB_DISABLE_XET"] = "1"
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "0"
os.environ["HF_HUB_OFFLINE"] = "1"  # 로컬만 사용

# 루트 캐시로 새는 것 방지
!rm -rf /root/.cache/huggingface 2>/dev/null || true
!mkdir -p /root/.cache && ln -s /workspace/hf_cache/huggingface /root/.cache/huggingface

print("BASE:", BASE)
print("RAW :", RAW, "exists:", RAW.exists())
print("MODEL_DIR:", MODEL_DIR, "exists:", MODEL_DIR.exists())
!df -h /workspace


BASE: /workspace/solar_model
RAW : /workspace/solar_model/dataset/raw exists: True
MODEL_DIR: /workspace/solar_model/model exists: True
Filesystem                    Size  Used Avail Use% Mounted on
mfs#ca-mtl-3.runpod.net:9421  420T  329T   91T  79% /workspace


In [10]:
import importlib, sys, subprocess, os
need = []
for m in ["transformers","datasets","accelerate","trl","peft","bitsandbytes","sentencepiece","pandas","docx","tqdm","sklearn","safetensors"]:
    try:
        importlib.import_module(m)
    except Exception:
        need.append(m)
print("Missing:", need or "None")

if need:
    os.environ["PIP_CACHE_DIR"] = "/workspace/pip_cache"
    subprocess.check_call([sys.executable,"-m","pip","install","-U",
        "transformers>=4.43.0","datasets>=2.19.0","accelerate>=0.33.0",
        "trl>=0.9.6","peft>=0.11.1","bitsandbytes>=0.43.1",
        "sentencepiece","pandas","python-docx","tqdm","scikit-learn","safetensors"])


Missing: None


In [11]:
from pathlib import Path
loan_files   = sorted((RAW/"loan").glob("*.docx"))
notice_files = sorted((RAW/"notice").glob("*.docx"))
vac_files    = sorted((RAW/"vocancy").glob("*.txt"))  # 폴더명: vocancy

print("[counts] loan:", len(loan_files), "notice:", len(notice_files), "vacancy:", len(vac_files))
for p in (loan_files[:2] + notice_files[:2] + vac_files[:2]):
    print(" -", p)
assert loan_files or notice_files or vac_files, "❌ 원시 파일을 찾지 못했습니다."


[counts] loan: 12 notice: 3 vacancy: 3
 - /workspace/solar_model/dataset/raw/loan/1.청년전용_보증부월세대출.docx
 - /workspace/solar_model/dataset/raw/loan/10.버팀목전세자금.docx
 - /workspace/solar_model/dataset/raw/notice/서울지역본부 청년매입임대주택 예비입주자 모집공고 (1).docx
 - /workspace/solar_model/dataset/raw/notice/서울지역본부 청년매입임대주택 예비입주자 모집공고.docx
 - /workspace/solar_model/dataset/raw/vocancy/25년3차_신혼·신생아매입임대Ⅰ_공급주택목록(서울지역본부).txt
 - /workspace/solar_model/dataset/raw/vocancy/25년3차_신혼·신생아매입임대Ⅱ(전세형)_공급주택목록(서울지역본부).txt


In [12]:
import re, os
from docx import Document

def read_docx(path):
    doc = Document(str(path))
    paras = []
    for p in doc.paragraphs:
        t = p.text.strip()
        if t:
            paras.append(t)
    return "\n".join(paras)

def read_txt(path, encs=("utf-8","cp949","euc-kr")):
    for enc in encs:
        try:
            return Path(path).read_text(encoding=enc)
        except Exception:
            continue
    return Path(path).read_text(encoding="utf-8", errors="ignore")

ZW = re.compile(r"[\u200b-\u200f\u202a-\u202e]")  # zero-width 등
def clean_text(s: str) -> str:
    s = ZW.sub("", s)
    s = s.replace("\r\n", "\n").replace("\r", "\n")
    s = re.sub(r"[ \t]+", " ", s)
    s = re.sub(r"\n{3,}", "\n\n", s).strip()
    return s

def save_proc(name, text):
    p = PROC / f"{name}.txt"
    p.write_text(text, encoding="utf-8")
    return p

proc_paths = []
for p in loan_files:
    txt = clean_text(read_docx(p))
    proc_paths.append(save_proc(f"loan__{p.stem}", txt))
for p in notice_files:
    txt = clean_text(read_docx(p))
    proc_paths.append(save_proc(f"notice__{p.stem}", txt))
for p in vac_files:
    txt = clean_text(read_txt(p))
    proc_paths.append(save_proc(f"vacancy__{p.stem}", txt))

print("processed files:", len(proc_paths))
print("sample:", proc_paths[:3])


processed files: 18
sample: [PosixPath('/workspace/solar_model/dataset/processed/loan__1.청년전용_보증부월세대출.txt'), PosixPath('/workspace/solar_model/dataset/processed/loan__10.버팀목전세자금.txt'), PosixPath('/workspace/solar_model/dataset/processed/loan__11.갱신만료_임차인_지원_버팀목전세자금.txt')]


In [13]:
import json, random
from pathlib import Path

random.seed(42)
all_txts = sorted(PROC.glob("*.txt"))
random.shuffle(all_txts)

split = int(len(all_txts) * 0.9) if len(all_txts) > 1 else len(all_txts)
train_files = all_txts[:split]
val_files   = all_txts[split:]

def to_jsonl(files, out_path):
    with open(out_path, "w", encoding="utf-8") as f:
        for fp in files:
            text = Path(fp).read_text(encoding="utf-8").strip()
            if not text: 
                continue
            # 문서별 헤더를 넣어 문맥 경계 명확히
            hdr = f"[도메인:{'대출' if 'loan__' in fp.name else '공고' if 'notice__' in fp.name else '주택목록'}] 문서명:{fp.stem}\n\n"
            rec = {"text": hdr + text}
            f.write(json.dumps(rec, ensure_ascii=False) + "\n")
    return out_path

train_jsonl = PROC / "dapt_train.jsonl"
val_jsonl   = PROC / "dapt_val.jsonl"
to_jsonl(train_files, train_jsonl)
to_jsonl(val_files, val_jsonl)

print("train:", train_jsonl, train_jsonl.stat().st_size, "bytes")
print("val  :", val_jsonl,   val_jsonl.stat().st_size, "bytes")
print("n_train docs:", len(train_files), "n_val docs:", len(val_files))


train: /workspace/solar_model/dataset/processed/dapt_train.jsonl 1267147 bytes
val  : /workspace/solar_model/dataset/processed/dapt_val.jsonl 31967 bytes
n_train docs: 16 n_val docs: 2


In [14]:
from datasets import load_dataset
ds = load_dataset("json", data_files={"train": str(train_jsonl), "validation": str(val_jsonl)})
print(ds)
print("sample train text snippet:\n", ds["train"][0]["text"][:300])


Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 16
    })
    validation: Dataset({
        features: ['text'],
        num_rows: 2
    })
})
sample train text snippet:
 [도메인:공고] 문서명:notice__서울지역본부 청년매입임대주택 예비입주자 모집공고

서울지역본부 청년매입임대주택 예비입주자 모집공고
청년 매입임대주택은 LH에서 주택을 매입하여 청년(19세~39세), 대학생 및 취업준비생을 대상으로시중시세 40~50% 수준으로 임대하는 주택입니다.
LH에서는 마이홈센터(☏1600-1004, 내선번호 2번→3번) 및 서울지역본부 매입임대 상담센터(☏02-2015-1040)를 통해 모집공고에 대한 안내가 이루어질 수 있도록 상담을 실시하고 있습니다. (다만, 상담내용은 신청 참고자료로만 활용하여 주


In [15]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

bnb = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,  # A100 OK
)

tok = AutoTokenizer.from_pretrained(str(MODEL_DIR), use_fast=False, local_files_only=True)
if tok.pad_token is None:
    tok.pad_token = tok.eos_token

base = AutoModelForCausalLM.from_pretrained(
    str(MODEL_DIR),
    quantization_config=bnb,
    device_map="auto",
    trust_remote_code=True,
    local_files_only=True,
)
base.config.use_cache = False  # grad checkpointing 호환

print("✅ base model ready")


Loading checkpoint shards:   0%|          | 0/9 [00:00<?, ?it/s]

✅ base model ready


In [24]:
# TRL SFTTrainer가 받는 "키워드"를 런타임에 자동 감지해 맞춰주고,
# 안 맞으면 HF Trainer 파이프라인으로 자동 폴백합니다.

import inspect, sys, math, torch
from pathlib import Path

from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
try:
    from trl import SFTTrainer, SFTConfig
    HAS_TRL = True
except Exception:
    HAS_TRL = False

# ==== 공통: LoRA 타깃(구조는 LLaMA 계열이라 아래가 정석) ====
TARGET_MODULES = ["q_proj","k_proj","v_proj","o_proj","gate_proj","up_proj","down_proj"]

# ==== QLoRA 준비 ====
base.config.use_cache = False
try:
    base.gradient_checkpointing_enable()
except Exception:
    pass
base = prepare_model_for_kbit_training(base)

lora_config = LoraConfig(
    r=32, lora_alpha=32, lora_dropout=0.05, bias="none",
    target_modules=TARGET_MODULES, task_type="CAUSAL_LM",
)
base = get_peft_model(base, lora_config)
base.print_trainable_parameters()

print(f"TRL available? {HAS_TRL}")

# ==== TRL 경로 시도 ====
def try_trl():
    from trl import SFTTrainer, SFTConfig
    # SFTConfig: 버전마다 받는 인자가 다르니 필터링
    base_cfg = dict(
        output_dir=str(OUTS),
        num_train_epochs=2,
        per_device_train_batch_size=1,
        per_device_eval_batch_size=1,
        gradient_accumulation_steps=8,
        learning_rate=2e-4,
        lr_scheduler_type="cosine",
        warmup_ratio=0.03,
        logging_steps=10,
        eval_steps=200,
        save_steps=200,
        save_total_limit=2,
        gradient_checkpointing=True,
        bf16=True,           # A100이면 bf16 OK (안되면 밑에서 자동 대체)
        report_to="none",
    )
    sig_cfg = set(inspect.signature(SFTConfig.__init__).parameters.keys())
    cfg = {k:v for k,v in base_cfg.items() if k in sig_cfg}

    # 평가 전략 키워드 호환
    if "evaluation_strategy" in sig_cfg:
        cfg["evaluation_strategy"] = "steps"
    elif "eval_strategy" in sig_cfg:
        cfg["eval_strategy"] = "steps"
    elif "do_eval" in sig_cfg:
        cfg["do_eval"] = True

    # bf16 미지원 시 fp16 사용
    if "bf16" not in sig_cfg and "fp16" in sig_cfg:
        cfg["fp16"] = True

    train_args = SFTConfig(**cfg)

    # SFTTrainer: 지원 키워드만 전달
    sig_tr = set(inspect.signature(SFTTrainer.__init__).parameters.keys())
    tr_kwargs = dict(model=base, args=train_args)

    if "train_dataset" in sig_tr:  tr_kwargs["train_dataset"] = ds["train"]
    if "eval_dataset"  in sig_tr and len(ds["validation"])>0: tr_kwargs["eval_dataset"] = ds["validation"]
    if "peft_config"   in sig_tr:  tr_kwargs["peft_config"] = lora_config
    if "tokenizer"     in sig_tr:  tr_kwargs["tokenizer"] = tok
    if "dataset_text_field" in sig_tr: tr_kwargs["dataset_text_field"] = "text"
    if "packing"       in sig_tr:  tr_kwargs["packing"] = True
    if "max_seq_length" in sig_tr: tr_kwargs["max_seq_length"] = 4096  # Solar 2는 4096까지 OK(메모리 여유 없으면 3072/2048)

    trainer = SFTTrainer(**tr_kwargs)
    print("✅ TRL SFTTrainer ready")
    print("  • SFTConfig keys:", sorted(cfg.keys()))
    used = {k for k in tr_kwargs.keys() if k in sig_tr}
    print("  • SFTTrainer keys:", sorted(used))
    return trainer

trainer = None
if HAS_TRL:
    try:
        trainer = try_trl()
    except Exception as e:
        print(f"[TRL 경로 실패] -> {type(e).__name__}: {e}")
        trainer = None

# ==== 폴백: HF Trainer (사전 토크나이즈/패킹) ====
if trainer is None:
    print("↩️  HF Trainer로 자동 폴백합니다 (사전 토크나이즈/패킹 방식).")
    from datasets import load_dataset
    from transformers import DataCollatorForLanguageModeling, TrainingArguments, Trainer

    # 토큰화 + 패킹
    block_size = 3072  # OOM 시 2048로 낮추세요
    def tokenize_fn(batch):
        return tok(batch["text"], add_special_tokens=False, truncation=False)

    tokenized = ds.map(tokenize_fn, batched=True, remove_columns=[c for c in ds["train"].column_names if c!="text"])
    def group_texts(examples):
        concat = []
        for ids in examples["input_ids"]:
            concat.extend(ids)
        total_len = (len(concat)//block_size)*block_size
        concat = concat[:total_len]
        chunks = [concat[i:i+block_size] for i in range(0,total_len,block_size)]
        return {"input_ids": chunks, "labels": chunks.copy(), "attention_mask":[[1]*len(x) for x in chunks]}

    train_tok = tokenized["train"].map(group_texts, batched=True, remove_columns=["text"])
    eval_tok  = tokenized["validation"].map(group_texts, batched=True, remove_columns=["text"])

    optim_name = "paged_adamw_8bit"
    args = TrainingArguments(
        output_dir=str(OUTS),
        num_train_epochs=2,
        per_device_train_batch_size=1,
        per_device_eval_batch_size=1,
        gradient_accumulation_steps=8,
        learning_rate=2e-4,
        lr_scheduler_type="cosine",
        warmup_ratio=0.03,
        logging_steps=10,
        evaluation_strategy="steps",
        eval_steps=200,
        save_steps=200,
        save_total_limit=2,
        gradient_checkpointing=True,
        bf16=True,                # 안되면 fp16=True로
        optim=optim_name,
        report_to="none",
    )
    collator = DataCollatorForLanguageModeling(tokenizer=tok, mlm=False)

    trainer = Trainer(
        model=base,
        args=args,
        train_dataset=train_tok,
        eval_dataset=eval_tok if len(eval_tok)>0 else None,
        data_collator=collator,
    )
    print("✅ HF Trainer ready (fallback)")


trainable params: 209,715,200 || all params: 22,349,747,200 || trainable%: 0.9383
TRL available? True


The repository /workspace/solar_model/model contains custom code which must be executed to correctly load the model. You can inspect the repository content at /workspace/solar_model/model .
 You can inspect the repository content at https://hf.co//workspace/solar_model/model.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N]  y




Adding EOS to train dataset:   0%|          | 0/16 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/16 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (18618 > 4096). Running this sequence through the model will result in indexing errors


Truncating train dataset:   0%|          | 0/16 [00:00<?, ? examples/s]

Adding EOS to eval dataset:   0%|          | 0/2 [00:00<?, ? examples/s]

Tokenizing eval dataset:   0%|          | 0/2 [00:00<?, ? examples/s]

Truncating eval dataset:   0%|          | 0/2 [00:00<?, ? examples/s]

✅ TRL SFTTrainer ready
  • SFTConfig keys: ['bf16', 'eval_steps', 'eval_strategy', 'gradient_accumulation_steps', 'gradient_checkpointing', 'learning_rate', 'logging_steps', 'lr_scheduler_type', 'num_train_epochs', 'output_dir', 'per_device_eval_batch_size', 'per_device_train_batch_size', 'report_to', 'save_steps', 'save_total_limit', 'warmup_ratio']
  • SFTTrainer keys: ['args', 'eval_dataset', 'model', 'peft_config', 'train_dataset']


In [26]:
# 메모리 정리
import gc, torch
try:
    del trainer
except:
    pass
try:
    del base
except:
    pass
gc.collect(); torch.cuda.empty_cache()

# ① Base(4bit)만 다시 로드 — LoRA 부착 X
from transformers import AutoModelForCausalLM
base = AutoModelForCausalLM.from_pretrained(
    str(MODEL_DIR),
    quantization_config=bnb,         # FT-6에서 만든 BitsAndBytesConfig
    device_map="auto",
    trust_remote_code=True,
    local_files_only=True,
)
base.config.use_cache = False
try:
    base.gradient_checkpointing_enable()
except Exception:
    pass

# ② TRL SFTTrainer를 'peft_config'만 주고 생성(TRL이 알아서 LoRA 부착)
from trl import SFTTrainer, SFTConfig
from inspect import signature
from peft import LoraConfig, TaskType

lora_config = LoraConfig(
    r=32, lora_alpha=32, lora_dropout=0.05, bias="none",
    target_modules=["q_proj","k_proj","v_proj","o_proj","gate_proj","up_proj","down_proj"],
    task_type=TaskType.CAUSAL_LM,
)

# SFTConfig가 허용하는 키만 적용(버전 호환)
base_cfg = dict(
    output_dir=str(OUTS),
    num_train_epochs=2,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=8,
    learning_rate=2e-4,
    lr_scheduler_type="cosine",
    warmup_ratio=0.03,
    logging_steps=10,
    eval_steps=200,
    save_steps=200,
    save_total_limit=2,
    gradient_checkpointing=True,
    bf16=True,
    report_to="none",
)
cfg_keys = set(signature(SFTConfig.__init__).parameters.keys())
cfg = {k:v for k,v in base_cfg.items() if k in cfg_keys}
if "evaluation_strategy" in cfg_keys:
    cfg["evaluation_strategy"] = "steps"
elif "eval_strategy" in cfg_keys:
    cfg["eval_strategy"] = "steps"
elif "do_eval" in cfg_keys:
    cfg["do_eval"] = True
if "bf16" not in cfg_keys and "fp16" in cfg_keys:
    cfg["fp16"] = True

train_args = SFTConfig(**cfg)

# SFTTrainer도 허용 키만
tr_sig = set(signature(SFTTrainer.__init__).parameters.keys())
tr_kwargs = dict(model=base, args=train_args, peft_config=lora_config,
                 train_dataset=ds["train"])
if "eval_dataset" in tr_sig and len(ds["validation"])>0:
    tr_kwargs["eval_dataset"] = ds["validation"]
# (너의 TRL 버전에선 tokenizer/dataset_text_field/max_seq_length가 인자로 없어서 자동 처리됨)

trainer = SFTTrainer(**tr_kwargs)
print("✅ Clean SFTTrainer ready (no double-LoRA)")


Loading checkpoint shards:   0%|          | 0/9 [00:00<?, ?it/s]

The repository /workspace/solar_model/model contains custom code which must be executed to correctly load the model. You can inspect the repository content at /workspace/solar_model/model .
 You can inspect the repository content at https://hf.co//workspace/solar_model/model.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N]  y


Tokenizing eval dataset:   0%|          | 0/2 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (10700 > 4096). Running this sequence through the model will result in indexing errors


Truncating eval dataset:   0%|          | 0/2 [00:00<?, ? examples/s]

✅ Clean SFTTrainer ready (no double-LoRA)


In [27]:
train_result = trainer.train()
trainer.save_model(str(OUTS / "checkpoint-last"))
print("✅ 학습 완료")

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': 32007}.
  return fn(*args, **kwargs)


Step,Training Loss,Validation Loss


✅ 학습 완료


In [28]:
from pathlib import Path
adapter_dir = OUTS / "lora_adapter"
adapter_dir.mkdir(parents=True, exist_ok=True)
trainer.model.save_pretrained(str(adapter_dir))  # LoRA 가중치
tok.save_pretrained(str(adapter_dir))
print("✅ LoRA 어댑터 저장:", adapter_dir)


✅ LoRA 어댑터 저장: /workspace/solar_model/outputs/solar22b_qLoRA_dapt/lora_adapter


In [None]:
import torch
from transformers import AutoModelForCausalLM
from peft import PeftModel

base_eval = AutoModelForCausalLM.from_pretrained(
    str(MODEL_DIR),
    quantization_config=bnb,     # FT-6에서 만든 BitsAndBytesConfig 재사용
    device_map="auto",
    trust_remote_code=True,
    local_files_only=True,
)
model_lora = PeftModel.from_pretrained(base_eval, str(adapter_dir))

def generate(prompt, max_new_tokens=300, temperature=0.2, top_p=0.9):
    x = tok(prompt, return_tensors="pt").to(model_lora.device)
    with torch.no_grad():
        y = model_lora.generate(**x, max_new_tokens=max_new_tokens, temperature=temperature, top_p=top_p, do_sample=True)
    print(tok.decode(y[0], skip_special_tokens=True))

generate("‘청년전용 보증부월세대출’의 대상과 대출한도를 핵심만 요약해줘.")


Loading checkpoint shards:   0%|          | 0/9 [00:00<?, ?it/s]