In [None]:
# !pip install --upgrade pip
# !pip install torch --index-url https://download.pytorch.org/whl/cu118  # CUDA 11.8 기준
# !pip install transformers datasets peft bitsandbytes huggingface_hub python-dotenv chromadb
# !pip install einops timm

In [49]:
import json, os, re
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, AutoModel
from transformers import Trainer, TrainingArguments
from peft import LoraConfig, get_peft_model, TaskType, prepare_model_for_kbit_training, PeftModel
from datasets import load_dataset
import chromadb
from huggingface_hub import login

In [3]:
from dotenv import load_dotenv
login(token=os.getenv('HF_TOKEN'))

Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


In [4]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [None]:
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16
)

In [6]:
model_name = "naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B"

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    trust_remote_code=True,
    quantization_config=quant_config,
    torch_dtype=torch.bfloat16,
    device_map="auto"
)

if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token


`torch_dtype` is deprecated! Use `dtype` instead!


image_processing_hyperclovax.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B:
- image_processing_hyperclovax.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B:
- image_processing_hyperclovax.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [7]:
#  k-bit 학습 준비
base_model = prepare_model_for_kbit_training(base_model)

In [8]:
# LoRA 설정
lora_config = LoraConfig(
    task_type='CAUSAL_LM',
    inference_mode=False,
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
    bias='none',
    target_modules=["q_proj","v_proj"]
)

# LoRA 적용
model = get_peft_model(base_model, lora_config)
model.enable_input_require_grads()
model.gradient_checkpointing_enable()
model.print_trainable_parameters()

trainable params: 3,616,768 || all params: 3,724,860,288 || trainable%: 0.0971


In [24]:
# 토크나이징 함수
def tokenize_function(example):
    input_text = example["instruction"] + "\n" + example["input"] + "\n"
    target_text = example["output"]

    # input_text 토크나이징
    input_ids_only = tokenizer(input_text, truncation=True, padding=False, add_special_tokens=False)["input_ids"]
    input_len = len(input_ids_only)

    # 전체 텍스트 토크나이징
    full_text = input_text + target_text
    tokenized = tokenizer(full_text, truncation=True, padding="max_length", max_length=256)

    # labels 생성, input 부분 -100 처리
    tokenized["labels"] = tokenized["input_ids"].copy()
    tokenized["labels"][:input_len] = [-100] * input_len

    return tokenized

In [23]:
# 데이터셋 로드
dataset = load_dataset("json", data_files={
    "train": "train.jsonl",
    "validation": "valid.jsonl"
})

print(dataset["train"][0])


{'instruction': '다음 질문에서 법령 검색 키워드를 추출하라.', 'input': '자동차의 등화장치 색상이 규정을 어겼을 경우 어떻게 되나요?', 'output': '등화장치, 색상, 위반'}


In [27]:
tokenized_datasets = dataset.map(tokenize_function, batched=False)
tokenized_datasets = tokenized_datasets.remove_columns(["instruction", "input", "output"])

Map:   0%|          | 0/1157 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Map:   0%|          | 0/129 [00:00<?, ? examples/s]

In [32]:
training_args = TrainingArguments(
    output_dir="./hyperclovax_lora",
    save_strategy="epoch",           # step보다 epoch 기준 체크포인트 추천
    save_total_limit=2,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=8,
    learning_rate=2e-4,
    weight_decay=0.01,
    num_train_epochs=3,
    logging_dir='./logs',
    logging_steps=50,
    fp16=True,
    push_to_hub=False,
    report_to='none',
    remove_unused_columns=True        # tokenized dataset 컬럼만 사용
)


In [34]:
# Trainer 생성
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer
)

  trainer = Trainer(


In [35]:
trainer.train()

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Step,Training Loss
50,86.847
100,71.2631


TrainOutput(global_step=111, training_loss=77.81674991642032, metrics={'train_runtime': 571.7702, 'train_samples_per_second': 6.071, 'train_steps_per_second': 0.194, 'total_flos': 1.8043149806862336e+16, 'train_loss': 77.81674991642032, 'epoch': 3.0})

In [36]:
model.save_pretrained("./hyperclovax_lora_final")
tokenizer.save_pretrained("./hyperclovax_lora_final")

('./hyperclovax_lora_final/tokenizer_config.json',
 './hyperclovax_lora_final/special_tokens_map.json',
 './hyperclovax_lora_final/chat_template.jinja',
 './hyperclovax_lora_final/vocab.json',
 './hyperclovax_lora_final/merges.txt',
 './hyperclovax_lora_final/added_tokens.json',
 './hyperclovax_lora_final/tokenizer.json')

In [41]:
base_model_name = "naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B"
tokenizer = AutoTokenizer.from_pretrained("./hyperclovax_lora_final")
base_model = AutoModelForCausalLM.from_pretrained(base_model_name)
model = PeftModel.from_pretrained(base_model, "./hyperclovax_lora_final")

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [42]:
# Hugging Face Hub에 push
model.push_to_hub("poketmon/hyperclovax_lora_3B")
tokenizer.push_to_hub("poketmon/hyperclovax_lora_3B")

Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

README.md: 0.00B [00:00, ?B/s]

CommitInfo(commit_url='https://huggingface.co/poketmon/hyperclovax_lora_3B/commit/abadcc89dbf5d2d592aac951b0ab8570b2ad3036', commit_message='Upload tokenizer', commit_description='', oid='abadcc89dbf5d2d592aac951b0ab8570b2ad3036', pr_url=None, repo_url=RepoUrl('https://huggingface.co/poketmon/hyperclovax_lora_3B', endpoint='https://huggingface.co', repo_type='model', repo_id='poketmon/hyperclovax_lora_3B'), pr_revision=None, pr_num=None)

In [43]:
# 테스트
model.eval()
model.to(device)

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): HCXVisionForCausalLM(
      (vision_model): SiglipVisionModel(
        (vision_model): SiglipVisionTransformer(
          (embeddings): SiglipVisionEmbeddings(
            (patch_embedding): Conv2d(3, 1152, kernel_size=(14, 14), stride=(14, 14), padding=valid)
            (position_embedding): Embedding(729, 1152)
          )
          (encoder): SiglipEncoder(
            (layers): ModuleList(
              (0-26): 27 x SiglipEncoderLayer(
                (layer_norm1): LayerNorm((1152,), eps=1e-06, elementwise_affine=True)
                (self_attn): SiglipAttention(
                  (k_proj): Linear(in_features=1152, out_features=1152, bias=True)
                  (v_proj): lora.Linear(
                    (base_layer): Linear(in_features=1152, out_features=1152, bias=True)
                    (lora_dropout): ModuleDict(
                      (default): Dropout(p=0.1, inplace=False)
                    )
               

In [None]:
prompt = (
    "다음 질문에서 법령 키워드만 JSON 배열 형태로 출력하라. "
    "추가 설명 없이 키워드만:\n"
    "가끔 이륜자동차를 탈 때 보호모를 안 하면 단속될 수 있나요?"
)

inputs = tokenizer(prompt, return_tensors="pt").to(device)

with torch.no_grad():
    outputs = model.generate(
        **inputs,
        max_new_tokens=20,   # 키워드만 나오도록 제한
        temperature=0.0,     # deterministic
        do_sample=False,
        eos_token_id=tokenizer.eos_token_id
    )

raw_result = tokenizer.decode(outputs[0], skip_special_tokens=True)

cleaned_result = raw_result.replace(prompt, "")

# 3️⃣ 한글 단어만 추출 (키워드)
keywords = re.findall(r"[가-힣]+", cleaned_result)
keywords = json.dumps(keywords, ensure_ascii=False)
# 4️⃣ JSON 배열로 출력
print(json.dumps(keywords, ensure_ascii=False))

["이륜자동차", "보호모", "단속", "이륜자동차"]


# 크로마 세팅

In [65]:
file_path = "vector_chunks.json"
with open(file_path, "r", encoding="utf-8") as f:
    vector_chunks = json.load(f)

print(len(vector_chunks))

1059


In [66]:
# KURE-v1 로드
tokenizer = AutoTokenizer.from_pretrained("nlpai-lab/KURE-v1")
model = AutoModel.from_pretrained("nlpai-lab/KURE-v1")

def get_embedding(text: str):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    with torch.no_grad():
        outputs = model(**inputs)
    
    embeddings = outputs.last_hidden_state.mean(dim=1)

    return embeddings[0].tolist()

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/964 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/807 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.27G [00:00<?, ?B/s]

In [68]:
chroma_client = chromadb.PersistentClient(path="./law_db")

collection = chroma_client.get_or_create_collection(name="laws")

In [69]:
for chunk in vector_chunks:
    # 1) content 벡터
    collection.add(
        embeddings=[get_embedding(chunk["content"])],
        ids=[f"content_{chunk['id']}"],
        documents=[chunk["content"]],
        metadatas=[chunk["metadata"]]
    )

    # 2) title 벡터
    collection.add(
        embeddings=[get_embedding(chunk["metadata"]["article_title"])],
        ids=[f"title_{chunk['id']}"],
        documents=[chunk['id']],
        metadatas=[{"source_type": "title", "link_to": f"content_{chunk['id']}"}]
    )

KeyboardInterrupt: 

In [None]:
def hybrid_search(query_text, top_k=5, title_weight=0.7, content_weight=0.3):
    query_embedding = get_embedding(query_text)
    
    # title 검색
    title_results = collection.query(
        query_embeddings=[query_embedding],
        n_results=top_k,
        include=["metadatas", "documents", "distances"],
        where={"source_type":"title"}
    )

    # content 검색
    content_results = collection.query(
        query_embeddings=[query_embedding],
        n_results=top_k,
        include=["metadatas", "documents", "distances"],
        where={"source_type":"law"}
    )

    combined_scores = {}
    combined_docs = {}

    # score 계산 및 title/content 합산
    for results, weight in [(title_results, title_weight), (content_results, content_weight)]:
        docs = results["documents"][0]
        dists = results["distances"][0]
        metas = results["metadatas"][0]
        ids = results["ids"][0]

        for doc, dist, meta, vid in zip(docs, dists, metas, ids):
            similarity = 1 - dist / 2  # cosine distance → similarity
            score = similarity * weight

            if meta.get("source_type") == "title":
                vid = vid.replace('title_', '')
                content_id = meta.get("link_to")
                if content_id:
                    doc_data = collection.get(ids=[content_id])
                    if doc_data["documents"]:
                        doc = doc_data["documents"][0]
            else:
                vid = vid.replace('content_', '')

            combined_scores[vid] = combined_scores.get(vid, 0) + score
            combined_docs[vid] = doc

    # min–max 정규화 (0~1)
    scores = list(combined_scores.values())
    min_s, max_s = min(scores), max(scores)
    for vid in combined_scores:
        combined_scores[vid] = (combined_scores[vid] - min_s) / (max_s - min_s + 1e-8)

    # 정렬
    sorted_results = sorted(combined_scores.items(), key=lambda x: x[1], reverse=True)
    results = [{"Article_no": vid, "Score": combined_scores[vid], "Content": combined_docs[vid]} 
               for vid, _ in sorted_results[:top_k]]
    return results

# 아무튼 결과

In [None]:
prompt = (
    "다음 질문에서 법령 키워드만 JSON 배열 형태로 출력하라. "
    "추가 설명 없이 키워드만:\n"
    "가끔 이륜자동차를 탈 때 보호모를 안 하면 단속될 수 있나요?"
)

inputs = tokenizer(prompt, return_tensors="pt").to(device)

with torch.no_grad():
    outputs = model.generate(
        **inputs,
        max_new_tokens=20,   # 키워드만 나오도록 제한
        temperature=0.0,     # deterministic
        do_sample=False,
        eos_token_id=tokenizer.eos_token_id
    )

raw_result = tokenizer.decode(outputs[0], skip_special_tokens=True)

cleaned_result = raw_result.replace(prompt, "")

# 3️⃣ 한글 단어만 추출 (키워드)
keywords = re.findall(r"[가-힣]+", cleaned_result)
keywords = json.dumps(keywords, ensure_ascii=False)
# 4️⃣ JSON 배열로 출력
print(json.dumps(keywords, ensure_ascii=False))

In [None]:
keywords = json.dumps(keywords, ensure_ascii=False)

for k in keywords:
    results = hybrid_search(k)
    for r in results:
        print(f"Article_no: {r['Article_no']}, Score: {r['Score']:.4f}")
        print(f"Content: {r['Content']}\n")