In [2]:
# ============================================
# 저장된 VotingClassifier (.pkl) 불러오기 + 예측
# ============================================

import joblib
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModel

# -------------------------------
# Settings
# -------------------------------
# Hugging Face checkpoint used for both the tokenizer and the encoder below.
MODEL_NAME = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
# Passed as `max_length` to the tokenizer; longer inputs are truncated.
MAX_LEN = 256
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"[Device] {device}")

# -------------------------------
# 1) Load the saved pkl bundle
# -------------------------------
SAVE_PKL = "./models.pkl"
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code. Only load bundles that come from a trusted source.
data = joblib.load(SAVE_PKL)

# The bundle is indexed like a dict with these three keys.
clf = data["classifier"]  # exposes predict_proba (used by predict_multilingual)
mlb = data["mlb"]  # its `classes_` attribute supplies the label names
thresholds = data["thresholds"]  # queried with .get(label, 0.5) at predict time

print(f"[Loaded model from {SAVE_PKL}]")
print(f"Labels: {list(mlb.classes_)}")

# -------------------------------
# 2) Load MiniLM (used to extract embeddings)
# -------------------------------
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
base_model = AutoModel.from_pretrained(MODEL_NAME).to(device)
# Switch to evaluation mode; the encoder is only used for inference here.
base_model.eval()

def encode_texts(texts, batch_size=32):
    """Convert texts into mean-pooled MiniLM sentence embeddings.

    Args:
        texts: A string or an iterable of strings to embed. A bare string is
            treated as a single text.
        batch_size: Number of texts tokenized and encoded per forward pass.

    Returns:
        A 2-D NumPy array with one row per input text. Each row is the
        average of the encoder's last-hidden-state vectors over the real
        (non-padding) tokens of that text. Empty input yields an array of
        shape ``(0, hidden_size)``.
    """
    # A bare string would otherwise be sliced character-wise into batches.
    if isinstance(texts, str):
        texts = [texts]
    # The tokenizer expects a list of strings; this also lets callers pass
    # tuples, arrays, or generators.
    texts = list(texts)
    if not texts:
        # np.vstack([]) raises, so return a well-formed empty matrix instead.
        return np.empty((0, base_model.config.hidden_size), dtype=np.float32)

    all_embeddings = []
    for start in range(0, len(texts), batch_size):
        batch = texts[start:start + batch_size]
        enc = tokenizer(
            batch,
            padding=True,
            truncation=True,
            max_length=MAX_LEN,
            return_tensors="pt",
        ).to(device)
        with torch.no_grad():
            hidden = base_model(**enc).last_hidden_state
            # Weight each token by its attention mask so padding positions do
            # not dilute the mean. Without this, a text's embedding changes
            # depending on which longer texts share its batch. For a batch
            # with no padding (e.g. a single text) the result equals a plain
            # mean over the sequence dimension.
            mask = enc["attention_mask"].unsqueeze(-1).to(hidden.dtype)
            token_counts = mask.sum(dim=1).clamp(min=1e-9)
            emb = (hidden * mask).sum(dim=1) / token_counts
        all_embeddings.append(emb.cpu().numpy())
    return np.vstack(all_embeddings)

# -------------------------------
# 3) 예측 함수
# -------------------------------
def predict_multilingual(text: str, topk=3, thresholds=None):
    """Predict labels for a single text.

    The text is embedded with ``encode_texts`` and scored with
    ``clf.predict_proba``.

    Args:
        text: The input text to classify.
        topk: Number of highest-scoring labels to return when thresholds are
            not given, or when no label reaches its threshold.
        thresholds: Optional mapping from label name to minimum score; labels
            missing from the mapping use 0.5.

    Returns:
        A list of label names taken from ``mlb.classes_``. Labels selected by
        threshold come in class-index order; top-k labels come in descending
        score order.
    """
    scores = clf.predict_proba(encode_texts([text], batch_size=1))[0]
    label_names = mlb.classes_

    selected = []
    if thresholds is not None:
        for idx, score in enumerate(scores):
            if score >= thresholds.get(label_names[idx], 0.5):
                selected.append(idx)

    # Fall back to the top-k scores when thresholds are absent or when no
    # label cleared its threshold.
    if len(selected) == 0:
        selected = np.argsort(-scores)[:topk]

    return [label_names[idx] for idx in selected]


  from .autonotebook import tqdm as notebook_tqdm


[Device] cpu


  Loading from a raw memory buffer (like pickle in Python, RDS in R) on a CPU-only
  machine. Consider using `save_model/load_model` instead. See:

    https://xgboost.readthedocs.io/en/latest/tutorials/saving_model.html

  for more details about differences between saving model and serializing.  Changing `tree_method` to `hist`.
  setstate(state)
  setstate(state)
  setstate(state)
  setstate(state)


[Loaded model from ./models.pkl]
Labels: ['Amber', 'Aromatic', 'Blossom', 'Bouquet', 'Citrus', 'Classical', 'Crisp', 'Dry', 'Floral', 'Flower', 'Fougère', 'Fresh', 'Fresher', 'Fruity', 'Gourmand', 'Green', 'Iris', 'Jasmine', 'Lily', 'Mossy', 'Musk', 'Orange', 'Rich', 'Richer', 'Rose', 'Soft', 'Spicy', 'Tuberose', 'Valley', 'Violet', 'Water', 'White', 'Woods', 'Woody']


In [3]:

# -------------------------------
# 4) Run an example prediction
# -------------------------------
# Sample Korean query; the label-specific thresholds loaded from the pkl
# bundle decide which labels are returned.
example_text = "여자친구 달달한향좋아하는데 추천좀"
print("\n[Example Prediction]")
print(
    predict_multilingual(
        example_text,
        thresholds=thresholds,
        topk=3,
    )
)


[Example Prediction]
['Amber', 'Floral', 'Fresher', 'Fruity']
