In [None]:
# ============================================
# 설치 (Runpod A40 / 로컬)
# ============================================

# CPU 전용
# python -m pip install -U "torch>=2.2,<3.0" scikit-learn pandas numpy joblib sentence-transformers transformers

# GPU (CUDA 12.1, Runpod A40)
!python -m pip install -U "torch>=2.2,<3.0" torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
!python -m pip install -U scikit-learn pandas numpy joblib sentence-transformers transformers

Collecting pandas
  Using cached pandas-2.3.2-cp312-cp312-win_amd64.whl.metadata (19 kB)
Collecting numpy
  Using cached numpy-2.3.2-cp312-cp312-win_amd64.whl.metadata (60 kB)
Collecting joblib
  Downloading joblib-1.5.2-py3-none-any.whl.metadata (5.6 kB)
Collecting sentence-transformers
  Using cached sentence_transformers-5.1.0-py3-none-any.whl.metadata (16 kB)
Collecting transformers
  Downloading transformers-4.56.0-py3-none-any.whl.metadata (40 kB)
Collecting tokenizers<=0.23.0,>=0.22.0 (from transformers)
  Downloading tokenizers-0.22.0-cp39-abi3-win_amd64.whl.metadata (6.9 kB)
Collecting safetensors>=0.4.3 (from transformers)
  Using cached safetensors-0.6.2-cp38-abi3-win_amd64.whl.metadata (4.1 kB)
Using cached pandas-2.3.2-cp312-cp312-win_amd64.whl (11.0 MB)
Using cached numpy-2.3.2-cp312-cp312-win_amd64.whl (12.8 MB)
Downloading joblib-1.5.2-py3-none-any.whl (308 kB)
Using cached sentence_transformers-5.1.0-py3-none-any.whl (483 kB)
Downloading transformers-4.56.0-py3-none-an

  You can safely remove it manually.
  You can safely remove it manually.
ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
langchain-tests 0.3.20 requires httpx<1,>=0.25.0, but you have httpx 0.13.3 which is incompatible.


In [2]:
!pip uninstall transformers -y
!pip uninstall huggingface-hub -y
!pip uninstall tokenizers -y

!pip install --no-cache-dir transformers

Found existing installation: transformers 4.56.0
Uninstalling transformers-4.56.0:
  Successfully uninstalled transformers-4.56.0
[0mFound existing installation: huggingface-hub 0.34.4
Uninstalling huggingface-hub-0.34.4:
  Successfully uninstalled huggingface-hub-0.34.4
[0mFound existing installation: tokenizers 0.22.0
Uninstalling tokenizers-0.22.0:
  Successfully uninstalled tokenizers-0.22.0
[0mCollecting transformers
  Downloading transformers-4.56.0-py3-none-any.whl.metadata (40 kB)
Collecting huggingface-hub<1.0,>=0.34.0 (from transformers)
  Downloading huggingface_hub-0.34.4-py3-none-any.whl.metadata (14 kB)
Collecting tokenizers<=0.23.0,>=0.22.0 (from transformers)
  Downloading tokenizers-0.22.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.8 kB)
Downloading transformers-4.56.0-py3-none-any.whl (11.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.6/11.6 MB[0m [31m348.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading huggingfa

In [3]:
!pip install xgboost

Collecting xgboost
  Downloading xgboost-3.0.4-py3-none-manylinux_2_28_x86_64.whl.metadata (2.1 kB)
Downloading xgboost-3.0.4-py3-none-manylinux_2_28_x86_64.whl (94.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m94.9/94.9 MB[0m [31m231.2 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: xgboost
Successfully installed xgboost-3.0.4
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m


In [None]:
# ============================================
# MiniLM 임베딩 + VotingClassifier (앙상블) + Threshold 최적화
# Noise 라벨 제거 + 모델 저장 (.pt + .pkl)
# ============================================

import os, numpy as np, pandas as pd, joblib
from collections import Counter
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import f1_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier

import torch
from transformers import AutoTokenizer, AutoModel

# -------------------------------
# 설정
# -------------------------------
DATA_CSV = "perfumes_huggingface.csv"
MODEL_NAME = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
TOP_K = 3
RARE_MIN_COUNT = 7
MAX_LEN = 256
BATCH_SIZE = 16
NOISE_LABELS = {"$$$", "of", "the"}   # 불용 라벨 제거 대상
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"[Device] {device}")

# -------------------------------
# 유틸 함수
# -------------------------------
def split_labels(s: str):
    s = str(s)
    for sep in [",", "|", "/", ";"]:
        s = s.replace(sep, " ")
    return [t.strip() for t in s.split() if t.strip()]

# -------------------------------
# 1) 데이터 로드 & 전처리
# -------------------------------
df = pd.read_csv(DATA_CSV, sep="|", engine="python", on_bad_lines="skip")
df = df[~df["description"].isna()].copy()
df["labels"] = df["fragrances"].apply(split_labels)

# 희소 라벨 + 불용 라벨 제거
cnt = Counter([l for L in df["labels"] for l in L])
rare = {k for k, v in cnt.items() if v <= RARE_MIN_COUNT}
df["labels"] = df["labels"].apply(
    lambda L: [l for l in L if l not in rare and l not in NOISE_LABELS]
)
df = df[df["labels"].map(len) > 0].copy()

mlb = MultiLabelBinarizer()
Y = mlb.fit_transform(df["labels"])

X_train_text, X_val_text, y_train, y_val = train_test_split(
    df["description"].tolist(), Y, test_size=0.2, random_state=42
)

# -------------------------------
# 2) MiniLM 임베딩 추출
# -------------------------------
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
base_model = AutoModel.from_pretrained(MODEL_NAME).to(device)
base_model.eval()

def encode_texts(texts, batch_size=32):
    all_embeddings = []
    for i in range(0, len(texts), batch_size):
        batch = texts[i:i+batch_size]
        enc = tokenizer(batch, padding=True, truncation=True, max_length=MAX_LEN, return_tensors="pt").to(device)
        with torch.no_grad():
            model_out = base_model(**enc)
            emb = model_out.last_hidden_state.mean(dim=1)
        all_embeddings.append(emb.cpu().numpy())
    return np.vstack(all_embeddings)

print("[Encoding Train Texts]")
X_train_emb = encode_texts(X_train_text, batch_size=BATCH_SIZE)
print("[Encoding Validation Texts]")
X_val_emb = encode_texts(X_val_text, batch_size=BATCH_SIZE)

# -------------------------------
# 3) VotingClassifier 앙상블 학습
# -------------------------------
logreg = LogisticRegression(max_iter=200)
xgb = XGBClassifier(
    objective="binary:logistic",
    eval_metric="logloss",
    use_label_encoder=False,
    tree_method="gpu_hist" if device=="cuda" else "hist"
)

ensemble = VotingClassifier(
    estimators=[("lr", logreg), ("xgb", xgb)],
    voting="soft"   # soft 투표 (확률 평균)
)

clf = OneVsRestClassifier(ensemble, n_jobs=-1)
clf.fit(X_train_emb, y_train)

# -------------------------------
# 4) 검증 예측 & Threshold 최적화
# -------------------------------
y_val_proba = clf.predict_proba(X_val_emb)
y_val_proba = np.array(y_val_proba)

thresholds = {}
y_val_pred_opt = np.zeros_like(y_val)
for i, label in enumerate(mlb.classes_):
    best_thr, best_f1 = 0.5, -1
    for thr in np.linspace(0.2, 0.5, 16):
        pred = (y_val_proba[:, i] >= thr).astype(int)
        f1 = f1_score(y_val[:, i], pred, zero_division=0)
        if f1 > best_f1:
            best_thr, best_f1 = thr, f1
    thresholds[label] = best_thr
    y_val_pred_opt[:, i] = (y_val_proba[:, i] >= best_thr).astype(int)

print("\n[Best Thresholds per label]")
for k, v in thresholds.items():
    print(f"{k}: {v:.2f}")

# -------------------------------
# 5) 평가
# -------------------------------
print("\n=== Threshold-based ===")
print(f"Micro-F1: {f1_score(y_val, y_val_pred_opt, average='micro'):.4f}")
print(f"Macro-F1: {f1_score(y_val, y_val_pred_opt, average='macro'):.4f}")
print(f"Sample-F1: {f1_score(y_val, y_val_pred_opt, average='samples'):.4f}")
print("\n[classification_report @thr]")
print(classification_report(y_val, y_val_pred_opt, target_names=mlb.classes_, zero_division=0))

# -------------------------------
# 6) 예측 함수
# -------------------------------
def predict_multilingual(text: str, topk=3, thresholds=None):
    emb = encode_texts([text], batch_size=1)
    proba = clf.predict_proba(emb)[0]
    if thresholds is not None:
        pick = [i for i, p in enumerate(proba) if p >= thresholds.get(mlb.classes_[i], 0.5)]
        if not pick:
            pick = np.argsort(-proba)[:topk]
    else:
        pick = np.argsort(-proba)[:topk]
    return [mlb.classes_[i] for i in pick]

print("\n[Example Prediction]")
print(predict_multilingual("바닷가에서 느껴지는 시원하고 약간 달콤한 향이 좋아요", topk=3, thresholds=thresholds))

# -------------------------------
# 7) 모델 저장
# -------------------------------
SAVE_PT = "minilm_model.pt"
SAVE_PKL = "label_info.pkl"

# ① MiniLM Transformer 저장 (.pt)
torch.save(base_model.state_dict(), SAVE_PT)
print(f"[Saved MiniLM model to {SAVE_PT}]")

# ② Sklearn 분류기 + 라벨 인코더 + 임계값 저장 (.pkl)
joblib.dump({
    "classifier": clf,
    "mlb": mlb,
    "thresholds": thresholds
}, SAVE_PKL)
print(f"[Saved classifier & label info to {SAVE_PKL}]")

[Device] cuda
[Encoding Train Texts]
[Encoding Validation Texts]



    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)

    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)

    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)

    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)

    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)

    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.updat


[Best Thresholds per label]
Amber: 0.20
Aromatic: 0.22
Blossom: 0.20
Bouquet: 0.20
Citrus: 0.20
Classical: 0.20
Crisp: 0.20
Dry: 0.20
Floral: 0.32
Flower: 0.30
Fougère: 0.22
Fresh: 0.20
Fresher: 0.28
Fruity: 0.20
Gourmand: 0.20
Green: 0.28
Iris: 0.20
Jasmine: 0.20
Lily: 0.20
Mossy: 0.20
Musk: 0.20
Orange: 0.20
Rich: 0.20
Richer: 0.20
Rose: 0.22
Soft: 0.20
Spicy: 0.20
Tuberose: 0.20
Valley: 0.20
Violet: 0.20
Water: 0.20
White: 0.30
Woods: 0.20
Woody: 0.20

=== Threshold-based ===
Micro-F1: 0.4802
Macro-F1: 0.1860
Sample-F1: 0.4730

[classification_report @thr]
              precision    recall  f1-score   support

       Amber       0.43      0.72      0.54       316
    Aromatic       0.39      0.27      0.31        83
     Blossom       0.00      0.00      0.00         6
     Bouquet       0.00      0.00      0.00         8
      Citrus       0.34      0.29      0.32       156
   Classical       0.38      0.44      0.41       209
       Crisp       0.34      0.22      0.26       153


In [None]:
# ============================================
# 저장된 VotingClassifier (.pkl) 불러오기 + 예측
# ============================================

import joblib
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModel

# -------------------------------
# 설정
# -------------------------------
MODEL_NAME = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
MAX_LEN = 256
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"[Device] {device}")

# -------------------------------
# 1) 저장된 pkl 불러오기
# -------------------------------
SAVE_PKL = "label_info.pkl"
data = joblib.load(SAVE_PKL)

clf = data["classifier"]
mlb = data["mlb"]
thresholds = data["thresholds"]

print(f"[Loaded model from {SAVE_PKL}]")
print(f"Labels: {list(mlb.classes_)}")

# -------------------------------
# 2) MiniLM 로드 (임베딩 추출용)
# -------------------------------
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
base_model = AutoModel.from_pretrained(MODEL_NAME).to(device)
base_model.eval()

def encode_texts(texts, batch_size=32):
    """텍스트를 MiniLM 임베딩으로 변환"""
    all_embeddings = []
    for i in range(0, len(texts), batch_size):
        batch = texts[i:i+batch_size]
        enc = tokenizer(batch, padding=True, truncation=True, max_length=MAX_LEN, return_tensors="pt").to(device)
        with torch.no_grad():
            model_out = base_model(**enc)
            emb = model_out.last_hidden_state.mean(dim=1)
        all_embeddings.append(emb.cpu().numpy())
    return np.vstack(all_embeddings)

# -------------------------------
# 3) 예측 함수
# -------------------------------
def predict_multilingual(text: str, topk=3, thresholds=None):
    emb = encode_texts([text], batch_size=1)
    proba = clf.predict_proba(emb)[0]

    if thresholds is not None:
        pick = [i for i, p in enumerate(proba) if p >= thresholds.get(mlb.classes_[i], 0.5)]
        if not pick:  # 어떤 것도 threshold 못 넘으면 topk 선택
            pick = np.argsort(-proba)[:topk]
    else:
        pick = np.argsort(-proba)[:topk]

    return [mlb.classes_[i] for i in pick]

# -------------------------------
# 4) 예측 실행
# -------------------------------
example_text = "바닷가에서 느껴지는 시원하고 약간 달콤한 향이 좋아요"
print("\n[Example Prediction]")
print(predict_multilingual(example_text, topk=3, thresholds=thresholds))


[Device] cuda
[Loaded model from label_info.pkl]
Labels: ['Amber', 'Aromatic', 'Blossom', 'Bouquet', 'Citrus', 'Classical', 'Crisp', 'Dry', 'Floral', 'Flower', 'Fougère', 'Fresh', 'Fresher', 'Fruity', 'Gourmand', 'Green', 'Iris', 'Jasmine', 'Lily', 'Mossy', 'Musk', 'Orange', 'Rich', 'Richer', 'Rose', 'Soft', 'Spicy', 'Tuberose', 'Valley', 'Violet', 'Water', 'White', 'Woods', 'Woody']

[Example Prediction]
['Fresher', 'Water']



    E.g. tree_method = "hist", device = "cuda"

  if len(data.shape) != 1 and self.num_features() != data.shape[1]:

    E.g. tree_method = "hist", device = "cuda"

  if len(data.shape) != 1 and self.num_features() != data.shape[1]:

    E.g. tree_method = "hist", device = "cuda"

  if len(data.shape) != 1 and self.num_features() != data.shape[1]:

    E.g. tree_method = "hist", device = "cuda"

  if len(data.shape) != 1 and self.num_features() != data.shape[1]:

    E.g. tree_method = "hist", device = "cuda"

  if len(data.shape) != 1 and self.num_features() != data.shape[1]:

    E.g. tree_method = "hist", device = "cuda"

  if len(data.shape) != 1 and self.num_features() != data.shape[1]:

    E.g. tree_method = "hist", device = "cuda"

  if len(data.shape) != 1 and self.num_features() != data.shape[1]:

    E.g. tree_method = "hist", device = "cuda"

  if len(data.shape) != 1 and self.num_features() != data.shape[1]:

    E.g. tree_method = "hist", device = "cuda"

  if len(data.s