In [None]:
# ==============================================
# RunPod Jupyter — Solar 22B + LoRA 최소 환경 원샷
#  * Torch(2.4.1+cu121) -> LLM 스택 -> flash-attn 순서 고정
# ==============================================
import os, sys
os.environ["HF_HOME"] = "/workspace/hf"
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"

# 1) Torch 먼저 (CUDA 12.1)
%pip install -U --no-cache-dir --index-url https://download.pytorch.org/whl/cu121 \
  torch==2.4.1+cu121

# 2) LLM 스택 (필요한 것만)
%pip install -U --no-cache-dir \
  transformers==4.44.2 \
  peft==0.12.0 \
  accelerate==0.34.2 \
  huggingface-hub==0.35.1 \
  hf_transfer==0.1.9 \
  hf-xet==1.1.10 \
  safetensors==0.6.2 \
  sentencepiece==0.2.1 \
  einops==0.8.1 \
  tqdm==4.67.1 \
  bitsandbytes==0.43.3

# 3) flash-attn 은 반드시 Torch 설치 후
%pip install -U --no-cache-dir --no-build-isolation flash-attn==2.6.3
!pip install --upgrade pip
!pip install -q sentence-transformers chromadb tiktoken kss

# 4) 버전 한 줄 점검(실패 시 바로 에러)
import torch, transformers, peft, bitsandbytes as bnb, flash_attn, huggingface_hub
print(f"[Python] {sys.version.split()[0]}")
print(f"[Torch ] {torch.__version__} | CUDA: {torch.version.cuda} | GPU: {torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'CPU'}")
print("transformers:", transformers.__version__)
print("peft:", peft.__version__)
print("bitsandbytes:", bnb.__version__)
print("flash_attn:", flash_attn.__version__)
print("huggingface-hub:", huggingface_hub.__version__)
print("HF_HOME:", os.environ.get("HF_HOME"))

Looking in indexes: https://download.pytorch.org/whl/cu121
Collecting torch==2.4.1+cu121
  Downloading https://download.pytorch.org/whl/cu121/torch-2.4.1%2Bcu121-cp311-cp311-linux_x86_64.whl (799.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m799.0/799.0 MB[0m [31m322.9 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch==2.4.1+cu121)
  Downloading https://download.pytorch.org/whl/cu121/nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.7/23.7 MB[0m [31m490.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nvidia-cuda-runtime-cu12==12.1.105 (from torch==2.4.1+cu121)
  Downloading https://download.pytorch.org/whl/cu121/nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m823.6/823.6 kB[0m [31m611.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting

In [3]:
# Solar 22B + LoRA (QLoRA 4bit)
# Loads the 4-bit quantized base model, its tokenizer, the adapter's chat
# template (if the adapter repo ships one) and attaches the LoRA weights.
# Produces: `tok`, `model`, `end_id`.
import os, pathlib, importlib

# ---- env/cache
# huggingface_hub reads these variables when it is first imported, so they must
# be set BEFORE the HF imports below for this cell to work on a fresh kernel.
os.environ.setdefault("HF_HOME", "/workspace/hf")
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
HF_CACHE = os.environ["HF_HOME"]

import torch
from huggingface_hub import hf_hub_download
from huggingface_hub.utils import EntryNotFoundError
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel

# ---- repos
BASE    = "upstage/solar-pro-preview-instruct"
ADAPTER = "venus141004/Solar_Finetuned"
SUBF    = "lora_adapter"

# ---- 4-bit quant config
bnb_cfg = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# ---- perf toggles (safe on Ampere+)
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True

# ---- load base
# NOTE(review): trust_remote_code executes Python shipped in the model repo and
# no revision is pinned, so new upstream code is picked up automatically.
# Consider passing revision="<commit sha>" once a vetted commit is chosen.
base = AutoModelForCausalLM.from_pretrained(
    BASE,
    quantization_config=bnb_cfg,
    device_map="auto",
    trust_remote_code=True,
    cache_dir=HF_CACHE,
)

# ---- tokenizer (+ optional chat template from adapter)
tok = AutoTokenizer.from_pretrained(
    BASE, use_fast=False, trust_remote_code=True, cache_dir=HF_CACHE
)
if tok.pad_token is None:
    # Reuse EOS as the padding token when the tokenizer defines none.
    tok.pad_token = tok.eos_token

# The template really is optional: if the adapter repo has no
# chat_template.jinja, keep whatever template the base tokenizer ships with.
# cache_dir replaces the deprecated local_dir_use_symlinks/local_dir combo and
# matches how every other download in this cell is cached.
try:
    tpl_path = hf_hub_download(
        ADAPTER, "chat_template.jinja",
        repo_type="model", subfolder=SUBF,
        cache_dir=HF_CACHE,
    )
except EntryNotFoundError:
    print(f"[warn] chat_template.jinja not found in {ADAPTER}/{SUBF}; keeping the base tokenizer template")
else:
    tok.chat_template = pathlib.Path(tpl_path).read_text(encoding="utf-8")

# ---- attach LoRA
model = PeftModel.from_pretrained(
    base, ADAPTER, subfolder=SUBF, cache_dir=HF_CACHE
)

# ---- eos id for chat template
# Prefer the "<|im_end|>" token when the vocabulary has it; otherwise fall back
# to the tokenizer's regular EOS id.
end_id = tok.convert_tokens_to_ids("<|im_end|>") if "<|im_end|>" in tok.get_vocab() else tok.eos_token_id

# Now ready: use `tok`, `model`, `end_id` for generation later.


config.json: 0.00B [00:00, ?B/s]

configuration_solar.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/upstage/solar-pro-preview-instruct:
- configuration_solar.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_solar.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/upstage/solar-pro-preview-instruct:
- modeling_solar.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model.safetensors.index.json: 0.00B [00:00, ?B/s]

Downloading shards:   0%|          | 0/9 [00:00<?, ?it/s]

model-00001-of-00009.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00002-of-00009.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00003-of-00009.safetensors:   0%|          | 0.00/4.90G [00:00<?, ?B/s]

model-00004-of-00009.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00005-of-00009.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00006-of-00009.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00007-of-00009.safetensors:   0%|          | 0.00/4.90G [00:00<?, ?B/s]

model-00008-of-00009.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00009-of-00009.safetensors:   0%|          | 0.00/4.79G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/9 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/164 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

added_tokens.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/575 [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

For more details, check out https://huggingface.co/docs/huggingface_hub/main/en/guides/download#download-files-to-local-folder.


chat_template.jinja:   0%|          | 0.00/291 [00:00<?, ?B/s]

adapter_config.json:   0%|          | 0.00/384 [00:00<?, ?B/s]

lora_adapter/adapter_model.safetensors:   0%|          | 0.00/839M [00:00<?, ?B/s]

In [None]:
# --- Chroma 빌드 ---
import torch
import os, re, json, uuid
from pathlib import Path
from typing import List, Dict, Any

DATA_DIR   = Path("/workspace/data")     # directory scanned by iter_docs for .txt / .jsonl files
CHROMA_DIR = Path("/workspace/chroma")   # on-disk location of the persistent Chroma index
COLL_NAME  = "lh_corpus"                 # name of the Chroma collection that receives the chunks

EMBED_ID   = "intfloat/multilingual-e5-small"  # SentenceTransformer model id used for embeddings

BATCH      = 64        # batch_size passed to emb.encode
MAX_TOK    = 480       # default per-chunk token budget in chunk_text (counted with `enc`)
OVERLAP    = 96        # default token budget of trailing sentences repeated in the next chunk

# Make sure the index directory exists before the Chroma client opens it.
CHROMA_DIR.mkdir(parents=True, exist_ok=True)

# Tokenizer / sentence splitting
import tiktoken, kss
# cl100k_base is only used to COUNT tokens while chunking.
# NOTE(review): the embedding model uses its own tokenizer, so these counts are
# an approximation of what the embedder sees — confirm the budget is adequate.
enc = tiktoken.get_encoding("cl100k_base")

def filename_meta(p: Path) -> Dict[str, Any]:
    """Derive chunk metadata from a corpus file name.

    Expected name layout: ``<category>__<stem>.<txt|jsonl>`` where ``<stem>``
    may start with an ordinal such as ``"3. "`` or ``"3) "``.

    Args:
        p: Path of the source file; only ``p.name`` / ``p.stem`` are used.

    Returns:
        Dict with keys ``category`` (str, ``"unknown"`` when the name does not
        follow the layout), ``order`` (int or None when no ordinal prefix is
        present), ``title`` (stem without ordinal, underscores turned into
        spaces) and ``fname`` (the original file name).
    """
    # The stem may itself contain dots ("1. Title"), so it is matched with
    # `.+` rather than `[^.]+`; the extension check is case-insensitive to
    # agree with iter_docs, which lowercases the suffix before filtering.
    m = re.match(r"(\w+)__(.+)\.(txt|jsonl)$", p.name, flags=re.IGNORECASE)
    category, raw = (m.group(1), m.group(2)) if m else ("unknown", p.stem)
    # Optional ordinal prefix: digits followed by "." or ")".
    m2 = re.match(r"(\d+)[\.\)]\s*(.+)", raw)
    order = int(m2.group(1)) if m2 else None
    title = (m2.group(2) if m2 else raw).replace("_", " ").strip()
    return {"category": category, "order": order, "title": title, "fname": p.name}

def chunk_text(text: str, max_tokens=MAX_TOK, overlap=OVERLAP):
    """Greedily pack the sentences of ``text`` into token-bounded chunks.

    Sentences come from ``kss.split_sentences`` and are measured with the
    module-level ``enc`` tokenizer. Each sentence is tokenized exactly once;
    its count is cached next to it instead of being recomputed when the
    overlap window is built.

    Args:
        text: Raw document text.
        max_tokens: Token budget per chunk. A chunk is closed as soon as the
            next sentence would push it over this budget.
        overlap: Token budget of trailing sentences copied from a closed chunk
            into the start of the next one; ``0`` (or less) disables overlap.

    Returns:
        List of chunk strings, each made of sentences joined by single spaces.
        A single sentence longer than ``max_tokens`` is not split and ends up
        in a chunk that exceeds the budget.
    """
    chunks = []
    window = []        # (sentence, token_count) pairs of the chunk being built
    window_tok = 0     # running token total of `window`
    for sent in kss.split_sentences(text, backend="punct"):
        n_tok = len(enc.encode(sent))
        if window and window_tok + n_tok > max_tokens:
            # Close the current chunk, then seed the next one with as many
            # trailing sentences as fit into the overlap budget.
            chunks.append(" ".join(s for s, _ in window))
            carried, carried_tok = [], 0
            if overlap > 0:
                for pair in reversed(window):
                    if carried_tok + pair[1] > overlap:
                        break
                    carried.insert(0, pair)
                    carried_tok += pair[1]
            window, window_tok = carried, carried_tok
        window.append((sent, n_tok))
        window_tok += n_tok
    if window:
        chunks.append(" ".join(s for s, _ in window))
    return chunks

def iter_docs(root: Path):
    """Yield ``(path, text)`` for every ``.txt`` / ``.jsonl`` file directly in ``root``.

    * ``.txt``  — the whole file is yielded as one text (even when empty).
    * ``.jsonl`` — every non-blank line is parsed as JSON; the ``"text"`` field
      (or ``"content"`` when ``"text"`` is missing/empty) of each object is
      collected and the pieces are joined with blank lines. Files that yield
      no text are skipped.

    Malformed JSON lines, non-object lines and non-string text fields are
    reported and skipped so one bad record does not abort the whole build.
    Entries that are not regular files (e.g. a directory named ``x.txt``) are
    ignored. Files are read as UTF-8 with undecodable bytes dropped.

    Args:
        root: Directory to scan (non-recursive), in sorted name order.

    Yields:
        Tuples of (file path, extracted text).
    """
    files = sorted(root.glob("*"))
    print(f"[scan] {len(files)} files found in {root}")
    for p in files:
        suffix = p.suffix.lower()
        if suffix not in (".txt", ".jsonl") or not p.is_file():
            continue
        print(f"[start] {p.name}")
        if suffix == ".txt":
            text = p.read_text(encoding="utf-8", errors="ignore")
            print(f"[loaded] {p.name} ({len(text)} chars)")
            yield p, text
        else:
            lines = p.read_text(encoding="utf-8", errors="ignore").splitlines()
            buf = []
            for lineno, line in enumerate(lines, 1):
                line = line.strip()
                if not line:
                    continue
                try:
                    obj = json.loads(line)
                except json.JSONDecodeError as exc:
                    print(f"[warn] {p.name}:{lineno}: invalid JSON skipped ({exc.msg})")
                    continue
                if not isinstance(obj, dict):
                    print(f"[warn] {p.name}:{lineno}: expected a JSON object, got {type(obj).__name__}; skipped")
                    continue
                t = obj.get("text") or obj.get("content")
                # Only strings can be joined below; anything else is ignored.
                if isinstance(t, str) and t:
                    buf.append(t)
            text = "\n\n".join(buf)
            print(f"[loaded] {p.name} ({len(text)} chars, {len(buf)} items)")
            if text:
                yield p, text

# 임베딩/Chroma
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings

# Embed every chunk of every corpus file and store it in the persistent Chroma
# collection. The cell is safe to re-run: rows of a file are replaced, not
# duplicated.

# Use the GPU when one is visible, otherwise fall back to CPU instead of
# failing inside SentenceTransformer.
DEVICE_EMB = "cuda" if torch.cuda.is_available() else "cpu"
emb = SentenceTransformer(EMBED_ID, device=DEVICE_EMB, trust_remote_code=True)

client = chromadb.PersistentClient(path=str(CHROMA_DIR), settings=Settings(allow_reset=True))
coll   = client.get_or_create_collection(COLL_NAME, metadata={"hnsw:space": "cosine"})

WRITE_BATCH = 1000   # rows per upsert call; keeps each write below the backend's max batch size

before = coll.count()
added  = 0           # number of chunks written in this run (not the net growth of the collection)

print(f"[BUILD] data: {DATA_DIR}")
for p, raw in iter_docs(DATA_DIR):
    # Drop None-valued keys (e.g. a missing `order`): not every Chroma version
    # accepts None as a metadata value.
    meta   = {k: v for k, v in filename_meta(p).items() if v is not None}
    chunks = chunk_text(raw)
    print(f"[chunk] {p.name}: {len(chunks)}")

    # Remove rows left over from a previous build of this file so a re-run (or a
    # file that now produces fewer chunks) leaves no stale or duplicate entries.
    coll.delete(where={"fname": p.name})
    if not chunks:
        continue

    # Deterministic ids: same collection + file + chunk index -> same id.
    ids   = [str(uuid.uuid5(uuid.NAMESPACE_URL, f"{COLL_NAME}/{p.name}#{i}")) for i in range(len(chunks))]
    # NOTE(review): the multilingual-e5 model card asks for a "passage: " prefix
    # on indexed texts and "query: " on queries. It is not added here because
    # the query-side code is not visible; change both sides together.
    vecs  = emb.encode(
        chunks,
        normalize_embeddings=True,
        batch_size=BATCH,
        show_progress_bar=False,
    )
    metas = [dict(meta, chunk_idx=i) for i in range(len(chunks))]
    for start in range(0, len(chunks), WRITE_BATCH):
        stop = start + WRITE_BATCH
        coll.upsert(
            documents=chunks[start:stop],
            embeddings=vecs[start:stop].tolist(),
            ids=ids[start:stop],
            metadatas=metas[start:stop],
        )
    added += len(chunks)
    print(f"[add]   {p.name}: {len(chunks)} chunks")

after = coll.count()

print("\n=== Chroma build done ===")
print(f"collection : {COLL_NAME}")
print(f"index_dir  : {CHROMA_DIR}")
print(f"chunks     : {before} -> {after}  (+{added})")
print(f"device(emb): {DEVICE_EMB} | GPU: {torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'N/A'}")

modules.json:   0%|          | 0.00/387 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/57.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/655 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/471M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/443 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/167 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/200 [00:00<?, ?B/s]