In [None]:
!pip install -U transformers datasets peft accelerate bitsandbytes trl huggingface_hub

In [4]:
from huggingface_hub import notebook_login

# Hugging Face 로그인
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [5]:
import torch
import pandas as pd
from datasets import Dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

In [6]:
model_id = 'LGAI-EXAONE/EXAONE-3.5-7.8B-Instruct'
output_dir = "./exaone-customer-anlysis"

In [5]:
# 4-bit 양자화된 모델 로드를 위한 설정
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',             # 양자화 타입 지정
    bnb_4bit_use_double_quant=True,        # 이중 양자화(양자화된 가중치를 한번 더 양자화)
    bnb_4bit_compute_dtype=torch.bfloat16  # 16bit의 부동소수점 데이터 타입 사용
)

# 모델 및 토크나이저 로드
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=quant_config,
    device_map="auto",
    trust_remote_code=True
)

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/563 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

configuration_exaone.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/LGAI-EXAONE/EXAONE-3.5-7.8B-Instruct:
- configuration_exaone.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_exaone.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/LGAI-EXAONE/EXAONE-3.5-7.8B-Instruct:
- modeling_exaone.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 7 files:   0%|          | 0/7 [00:00<?, ?it/s]

model-00002-of-00007.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00005-of-00007.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00007-of-00007.safetensors:   0%|          | 0.00/1.68G [00:00<?, ?B/s]

model-00003-of-00007.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00001-of-00007.safetensors:   0%|          | 0.00/4.93G [00:00<?, ?B/s]

model-00006-of-00007.safetensors:   0%|          | 0.00/4.83G [00:00<?, ?B/s]

model-00004-of-00007.safetensors:   0%|          | 0.00/4.83G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/7 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/134 [00:00<?, ?B/s]

In [6]:
# 모델 학습 준비
model = prepare_model_for_kbit_training(model)

peft_config = LoraConfig(
    r=16, 
    lora_alpha=32,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, peft_config)

In [7]:
SYSTEM_PROMPT = """
상담 스크립트에서 고객의 성향을 분류하세요
전체적인 맥락을 참고하되 판단의 근거는 고객의 발화에 한정합니다

### 분류 규칙
1. 제시된 성향 키워드 중 가장 적절한 한가지만 선택
2. 설명이나 판단 근거는 절대 출력하지말고 오직 하나의 키워드만 출력한다
3. 여러 개의 성향을 가질 경우 S3 > S2 > S1 > N3 > N2 > N1 의 순서로 우선 순위를 가진다

### 성향 키워드 목록
- N1: 실용주의형. 불필요한 말 없이 바로 문의사항을 말함
- N2: 수다형. 사적인 이야기나 본인 상황을 길게 설명함
- N3: 신중형. 신중하고 의심을 보임
- S1: 급한성격형. 빠른 처리를 선호함
- S2: 이해부족형. 설명을 잘 이해하지 못하여 반복적으로 확인함
- S3: 불만형. 분노, 짜증을 드러냄
"""

In [12]:
def tokenize_function(examples):
    tokenized_inputs = {
        "input_ids": [],
        "labels": [],
        "attention_mask": []
    }
    
    for script, output in zip(examples['consulting_content'], examples['personality']):
        # EXAONE 전용 템플릿
        prompt = f"[|system|]\n{SYSTEM_PROMPT}[|user|]\n{script}[|assistant|]\n"
        answer = f"{output}[|end|]"
        
        # 프롬프트와 답변을 각각 토크나이즈 (패딩 없이)
        p_tokens = tokenizer(prompt, truncation=True, max_length=1024)
        a_tokens = tokenizer(answer, truncation=True, max_length=1024)
        
        # 합치기 (최대 길이 제한)
        input_ids = (p_tokens["input_ids"] + a_tokens["input_ids"])[:1024]
        # Labels 생성: 프롬프트 영역은 -100으로 마스킹 (Loss 계산 제외)
        labels = ([-100] * len(p_tokens["input_ids"]) + a_tokens["input_ids"])[:1024]
        
        # 수동 패딩 처리 (모든 시퀀스를 1024로 맞춤)
        padding_len = 1024 - len(input_ids)
        if padding_len > 0:
            input_ids += [tokenizer.pad_token_id] * padding_len
            labels += [-100] * padding_len # 패딩 영역도 Loss 계산 제외
            
        attention_mask = [1] * (1024 - padding_len) + [0] * padding_len
        
        tokenized_inputs["input_ids"].append(input_ids)
        tokenized_inputs["labels"].append(labels)
        tokenized_inputs["attention_mask"].append(attention_mask)
        
    return tokenized_inputs

In [13]:
df = pd.read_csv("train.csv")
dataset = Dataset.from_pandas(df)
tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=dataset.column_names)

Map:   0%|          | 0/6000 [00:00<?, ? examples/s]

In [14]:
dataset

Dataset({
    features: ['source_id', 'consulting_content', 'personality', 'evidence'],
    num_rows: 6000
})

In [15]:
training_args = TrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=8,
    learning_rate=2e-4,
    num_train_epochs=3,
    logging_steps=10,
    save_strategy="epoch",
    bf16=True,
    optim="paged_adamw_32bit",
    remove_unused_columns=False,
    warmup_ratio=0.1,
)

# Trainer 실행
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False),
)

trainer.train()

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Step,Training Loss
10,3.1142
20,2.9679
30,2.5326
40,2.2156
50,1.9823
60,1.6704
70,1.5122
80,1.4517
90,1.4124
100,1.3644


  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


TrainOutput(global_step=1125, training_loss=1.1411112234327527, metrics={'train_runtime': 20226.8112, 'train_samples_per_second': 0.89, 'train_steps_per_second': 0.056, 'total_flos': 8.1931593056256e+17, 'train_loss': 1.1411112234327527, 'epoch': 3.0})

In [16]:
# 모델 저장
trainer.model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

print(f"학습 완료. 모델이 {output_dir}에 저장되었습니다.")

학습 완료. 모델이 ./exaone-customer-anlysis에 저장되었습니다.


In [17]:
# 로컬에 저장된 모델과 토크나이저 업로드
repo_id = 'ansui/exaone-customer-analysis'

trainer.model.push_to_hub(repo_id)
tokenizer.push_to_hub(repo_id)

Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

README.md: 0.00B [00:00, ?B/s]

CommitInfo(commit_url='https://huggingface.co/ansui/exaone-customer-analysis/commit/5b61750e8b12f2af36f7a92892d821e0a261cb34', commit_message='Upload tokenizer', commit_description='', oid='5b61750e8b12f2af36f7a92892d821e0a261cb34', pr_url=None, repo_url=RepoUrl('https://huggingface.co/ansui/exaone-customer-analysis', endpoint='https://huggingface.co', repo_type='model', repo_id='ansui/exaone-customer-analysis'), pr_revision=None, pr_num=None)

In [8]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel

base_model_id = 'LGAI-EXAONE/EXAONE-3.5-7.8B-Instruct'
adapter_model_id = "ansui/exaone-customer-analysis"

# 토크나이저 로드
tokenizer = AutoTokenizer.from_pretrained(base_model_id, trust_remote_code=True)

# 베이스 모델 4-bit 로드
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

base_model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True
)

# LoRA 어댑터 입히기
model = PeftModel.from_pretrained(base_model, adapter_model_id)
model.eval()

# 테스트 함수 정의
def analyze_customer(script):
    prompt = f"[|system|]\n{SYSTEM_PROMPT}[|user|]\n{script}[|assistant|]\n"
    
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    
    with torch.no_grad():
        outputs = model.generate(
            **inputs, 
            max_new_tokens=10, 
            # do_sample=False일 때는 temperature를 넣지 않아야 경고가 안 납니다.
            do_sample=False, 
            eos_token_id=tokenizer.eos_token_id
        )
    
    # 생성된 텍스트만 디코딩 (skip_special_tokens=True를 해도 [|end|]가 남을 수 있음)
    full_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    
    # 1. assistant 뒤쪽 답변만 추출
    if "[|assistant|]" in full_text:
        result = full_text.split("[|assistant|]")[-1]
    else:
        result = full_text.replace(prompt, "")
        
    # 2. 특수 토큰 [|end|] 및 공백 깔끔하게 제거
    result = result.replace("[|end|]", "").strip()
    
    # 3. 혹시라도 남을 수 있는 잔여 태그 제거 (예: [|)
    if "[" in result:
        result = result.split("[")[0].strip()
        
    return result


test_script = """상담사: 상담원 ▲▲▲입니다.
손님: 네, 저 ▲▲카드 문의좀 드릴려고요.
상담사: 네, 고객님. 그럼 본인 확인 후 안내를 해드리겠습니다. 성함 말씀해 주시겠어요?
손님: ▲▲▲입니다.
상담사: 네, 확인 감사합니다. 어떤 점이 궁금하신가요?
손님: 제가 카드를 새로 받았는데, 기존 카드는 자동으로 해지가 되는 건지 궁금해서요.
상담사: 네, 고객님. 갱신 카드를 수령하셨다면 기존 카드는 잘라서 폐기해 주시면 되고요, 새 카드를 사용하시는 시점부터 기존 카드는 사용이 제한됩니다.
손님: 아, 그렇군요. 알겠습니다. 감사합니다.
상담사: 네, 다른 문의 사항 있으실까요? 감사합니다."""

print(f"분류 결과: {analyze_customer(test_script)}")

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/563 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

configuration_exaone.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/LGAI-EXAONE/EXAONE-3.5-7.8B-Instruct:
- configuration_exaone.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_exaone.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/LGAI-EXAONE/EXAONE-3.5-7.8B-Instruct:
- modeling_exaone.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 7 files:   0%|          | 0/7 [00:00<?, ?it/s]

model-00003-of-00007.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00004-of-00007.safetensors:   0%|          | 0.00/4.83G [00:00<?, ?B/s]

model-00001-of-00007.safetensors:   0%|          | 0.00/4.93G [00:00<?, ?B/s]

model-00002-of-00007.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00005-of-00007.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00007-of-00007.safetensors:   0%|          | 0.00/1.68G [00:00<?, ?B/s]

model-00006-of-00007.safetensors:   0%|          | 0.00/4.83G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/7 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/134 [00:00<?, ?B/s]

adapter_config.json: 0.00B [00:00, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/37.8M [00:00<?, ?B/s]

분류 결과: N1


In [21]:
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

base_model_id = 'LGAI-EXAONE/EXAONE-3.5-7.8B-Instruct' 
adapter_model_id = "ansui/exaone-customer-analysis"
new_repo_id = "ansui/exaone-customer-analysis-merged"

# 베이스 모델과 토크나이저 로드
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    torch_dtype=torch.bfloat16,
    device_map="cpu",
    trust_remote_code=True
)
tokenizer = AutoTokenizer.from_pretrained(adapter_model_id, trust_remote_code=True)

# 어댑터 연결
model = PeftModel.from_pretrained(base_model, adapter_model_id)

# 모델 병합
print("Merging layers...")
merged_model = model.merge_and_unload()

# 허깅페이스에 병합된 모델 업로드
print("Pushing to Hub...")
merged_model.push_to_hub(new_repo_id)
tokenizer.push_to_hub(new_repo_id)

print("업로드 완료")

`torch_dtype` is deprecated! Use `dtype` instead!


Loading checkpoint shards:   0%|          | 0/7 [00:00<?, ?it/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/457 [00:00<?, ?B/s]

chat_template.jinja:   0%|          | 0.00/339 [00:00<?, ?B/s]

Merging layers...
Pushing to Hub...


Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

README.md: 0.00B [00:00, ?B/s]

업로드 완료
