# KcELECTRA 모델 경량화 및 최적화

1. FP16으로 경량화
2. bitsandbytes 8bit

추론 속도 자체는 충분히 빨라서 굳이 최적화를 해야 할지 모르겠음. 일단 `time`으로 추론 시간을 측정해 볼 예정.

In [1]:
!pip show safetensors

Name: safetensors
Version: 0.6.2
Summary: 
Home-page: https://github.com/huggingface/safetensors
Author: 
Author-email: Nicolas Patry <patry.nicolas@protonmail.com>
License: 
Location: c:\users\ssafy\desktop\wang\kcvenv\lib\site-packages
Requires: 
Required-by: accelerate, peft, transformers




In [9]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer, BitsAndBytesConfig
import torch, os

model_path = r"C:\Users\SSAFY\Desktop\WANG\S13P31A106\ai\Classifier_Model"
dst_path = "./best_model_INT8"

# bitsandbytes configuration (8-bit quantization)
bnb_config = BitsAndBytesConfig(
    load_in_8bit=True,             # enable 8-bit quantization
    llm_int8_threshold=6.0,        # outlier threshold (default is 6.0)
    llm_int8_has_fp16_weight=False # store the weights fully as INT8
)

# Load the model from model_path and apply the quantization config while loading
model = AutoModelForSequenceClassification.from_pretrained(
    model_path,
    quantization_config=bnb_config,
    device_map="auto"   # automatically distribute across GPU/CPU
)

# Save the quantized model, then load the tokenizer from the source directory
# and save it next to the model so that both live in dst_path
model.save_pretrained(dst_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)
tokenizer.save_pretrained(dst_path)

print("✅ INT8 모델 변환 완료:", dst_path)

✅ INT8 모델 변환 완료: ./best_model_INT8


In [9]:
# Emotion categories, listed in the model's class order.
CATEGORIES = ["happy", "embarrass", "anger", "unrest", "damaged", "sadness"]

# Korean label -> class index; index i corresponds to CATEGORIES[i]
# (e.g. "기쁨" -> 0 -> "happy", "슬픔" -> 5 -> "sadness").
# TODO: this should be extracted from the actual CSV instead of hard-coded.
label_map = {"기쁨": 0, "당황": 1, "분노": 2, "불안": 3, "상처": 4, "슬픔": 5}

NUM_LABELS = len(CATEGORIES)

import re
import emoji
from soynlp.normalizer import repeat_normalize

# Matches every run of characters that is NOT in the whitelist: the
# \x00-\x7F code-point range, the symbols ％ · ∼, and the Hangul ranges
# ㄱ-ㅣ and 가-힣.
# Deliberately a plain string rather than an f-string: there is nothing to
# interpolate, and an f-string would treat any future '{' or '}' in the
# character class as a format field.
pattern = re.compile('[^ .,?!/@$%~％·∼()\x00-\x7Fㄱ-ㅣ가-힣]+')
# Matches http:// and https:// URLs, with an optional "www." prefix and an
# optional trailing path/query part.
url_pattern = re.compile(
    r'https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)')

def clean(x):
    """Return a normalized copy of the text *x*.

    Steps, in order: replace every run matched by ``pattern`` with a single
    space, delete emoji, delete URLs matched by ``url_pattern``, strip
    leading/trailing whitespace, and finally pass the text through soynlp's
    ``repeat_normalize`` with ``num_repeats=2``.
    """
    without_symbols = pattern.sub(' ', x)
    without_emoji = emoji.replace_emoji(without_symbols, replace='')
    without_urls = url_pattern.sub('', without_emoji)
    trimmed = without_urls.strip()
    return repeat_normalize(trimmed, num_repeats=2)

In [20]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch.nn.functional as F
import os
import time

# Load the model and the tokenizer.
# Alternative directory (presumably the INT8 export), currently disabled:
# MODEL_DIR = r"C:\Users\SSAFY\Desktop\WANG\S13P31A106\ai\best_model_INT8"
MODEL_DIR = r"C:\Users\SSAFY\Desktop\WANG\S13P31A106\ai\Classifier_Model"
tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_DIR)
# Switch to evaluation mode and place the model on the CPU.
model.eval().to("cpu")

# Alternative way of loading, currently disabled:
# model = AutoModelForSequenceClassification.from_pretrained(MODEL_DIR, device_map="cpu")

def predict_emotion(texts):
    """Classify the emotion of one sentence or of a batch of sentences.

    Args:
        texts: A single string, or an iterable of strings.

    Returns:
        For a single string, one dict with the keys ``"text"`` (the input
        after ``clean``), ``"pred_label"`` (the entry of ``CATEGORIES`` with
        the highest probability) and ``"probabilities"`` (a mapping from
        each category to its softmax probability, rounded to 4 decimals).
        For an iterable of strings, a list of such dicts in input order
        (an empty list for empty input).
    """
    single_input = isinstance(texts, str)
    batch = [texts] if single_input else list(texts)
    if not batch:
        return []

    # NOTE(review): the model receives the raw text; ``clean`` is applied only
    # to the text echoed back in the result. Confirm whether the input should
    # also be cleaned before tokenization.
    inputs = tokenizer(
        batch,
        padding=True,
        truncation=True,
        max_length=64,
        return_tensors="pt",
    ).to(model.device)  # keep the inputs on the same device as the model

    with torch.no_grad():
        outputs = model(**inputs)
        probs = F.softmax(outputs.logits, dim=-1)
        preds = probs.argmax(dim=-1)

    # One result per input sentence. Each sentence is taken from ``batch``
    # itself, never from a module-level variable, so the echoed text always
    # belongs to the prediction next to it.
    results = [
        {
            "text": clean(text),
            "pred_label": CATEGORIES[pred],
            "probabilities": {
                label: round(prob, 4) for label, prob in zip(CATEGORIES, row)
            },
        }
        for text, pred, row in zip(batch, preds.tolist(), probs.tolist())
    ]
    return results[0] if single_input else results

In [21]:
import time
import mlflow

sample = ["요즘 너무 무기력해요.", "불안해서 잠이 안 와요.", "오늘 집에 가다가 오토바이에 치여서 다칠 뻔했어. 너무 짜증나"]
# NOTE(review): this tracking URI uses backslashes after "file:"; it is kept
# unchanged so runs keep going to the same store -- confirm it resolves to the
# intended mlruns directory.
mlflow.set_tracking_uri(r"file:\\\C:\Users\SSAFY\Desktop\WANG\S13P31A106\ai\mlruns")

# The context manager ends the run when the block exits, so no explicit
# mlflow.end_run() is needed. Calling end_run() in a ``finally`` would close
# the run with the default status before the context manager could record a
# failure.
with mlflow.start_run(run_name="inference_time_last"):
    times = []
    for i, text in enumerate(sample):
        # perf_counter() is the clock intended for measuring short intervals;
        # time.time() can jump when the system clock is adjusted.
        start = time.perf_counter()
        result = predict_emotion(text)
        infer_time = time.perf_counter() - start
        times.append(infer_time)

        print(f'타겟 문장은 {text}, 감정 분석은 {result["probabilities"]}, 걸린 시간은{infer_time}')

        # One data point per sentence, indexed by its position in ``sample``.
        mlflow.log_metric("inference_time", infer_time, step=i)

    avg_time = sum(times) / len(times)
    mlflow.log_metric("avg_inference_time", avg_time)
    print(f"평균 추론 시간: {avg_time:.3f}초")

타겟 문장은 요즘 너무 무기력해요., 감정 분석은 {'happy': 0.0022, 'embarrass': 0.0375, 'anger': 0.1249, 'unrest': 0.4777, 'damaged': 0.141, 'sadness': 0.2167}, 걸린 시간은0.11554694175720215
타겟 문장은 불안해서 잠이 안 와요., 감정 분석은 {'happy': 0.0023, 'embarrass': 0.0365, 'anger': 0.1329, 'unrest': 0.0528, 'damaged': 0.4974, 'sadness': 0.2781}, 걸린 시간은0.07091546058654785
타겟 문장은 오늘 집에 가다가 오토바이에 치여서 다칠 뻔했어. 너무 짜증나, 감정 분석은 {'happy': 0.0012, 'embarrass': 0.0043, 'anger': 0.9781, 'unrest': 0.0033, 'damaged': 0.0076, 'sadness': 0.0055}, 걸린 시간은0.07818722724914551
평균 추론 시간: 0.088초


In [7]:
import torch
import bitsandbytes as bnb

# Print a short report of the CUDA and bitsandbytes environment.
cuda_available = torch.cuda.is_available()
print("Torch CUDA available:", cuda_available)
print("Torch CUDA version:", torch.version.cuda)
if cuda_available:
    gpu_name = torch.cuda.get_device_name(0)
else:
    gpu_name = "None"
print("GPU name:", gpu_name)
print("Bitsandbytes version:", bnb.__version__)


Torch CUDA available: True
Torch CUDA version: 12.1
GPU name: NVIDIA GeForce RTX 4050 Laptop GPU
Bitsandbytes version: 0.43.2


# 경량화 평가