In [1]:
# ============================================
# 설치 (Runpod A40 / 로컬)
# ============================================

# CPU 전용
# python -m pip install -U "torch>=2.2,<3.0" scikit-learn pandas numpy joblib sentence-transformers transformers

# GPU (CUDA 12.1, Runpod A40)
!python -m pip install -U "torch>=2.2,<3.0" torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
!python -m pip install -U scikit-learn pandas numpy joblib sentence-transformers transformers

Looking in indexes: https://download.pytorch.org/whl/cu121
[0mCollecting scikit-learn
  Downloading scikit_learn-1.7.1-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (11 kB)
Collecting pandas
  Downloading pandas-2.3.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (91 kB)
Collecting numpy
  Downloading numpy-2.3.2-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (62 kB)
Collecting joblib
  Downloading joblib-1.5.2-py3-none-any.whl.metadata (5.6 kB)
Collecting sentence-transformers
  Downloading sentence_transformers-5.1.0-py3-none-any.whl.metadata (16 kB)
Collecting transformers
  Downloading transformers-4.56.0-py3-none-any.whl.metadata (40 kB)
Collecting scipy>=1.8.0 (from scikit-learn)
  Downloading scipy-1.16.1-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (61 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Downloading threadpoolctl-3.6.0-py3-none-any.whl.metadata (13 kB)
Collecting pytz

In [2]:
!pip uninstall transformers -y
!pip uninstall huggingface-hub -y
!pip uninstall tokenizers -y

!pip install --no-cache-dir transformers

Found existing installation: transformers 4.56.0
Uninstalling transformers-4.56.0:
  Successfully uninstalled transformers-4.56.0
[0mFound existing installation: huggingface-hub 0.34.4
Uninstalling huggingface-hub-0.34.4:
  Successfully uninstalled huggingface-hub-0.34.4
[0mFound existing installation: tokenizers 0.22.0
Uninstalling tokenizers-0.22.0:
  Successfully uninstalled tokenizers-0.22.0
[0mCollecting transformers
  Downloading transformers-4.56.0-py3-none-any.whl.metadata (40 kB)
Collecting huggingface-hub<1.0,>=0.34.0 (from transformers)
  Downloading huggingface_hub-0.34.4-py3-none-any.whl.metadata (14 kB)
Collecting tokenizers<=0.23.0,>=0.22.0 (from transformers)
  Downloading tokenizers-0.22.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.8 kB)
Downloading transformers-4.56.0-py3-none-any.whl (11.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.6/11.6 MB[0m [31m348.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading huggingfa

In [3]:
!pip install xgboost

Collecting xgboost
  Downloading xgboost-3.0.4-py3-none-manylinux_2_28_x86_64.whl.metadata (2.1 kB)
Downloading xgboost-3.0.4-py3-none-manylinux_2_28_x86_64.whl (94.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m94.9/94.9 MB[0m [31m231.2 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: xgboost
Successfully installed xgboost-3.0.4
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m


In [4]:
# ============================================
# MiniLM 임베딩 + VotingClassifier (앙상블) + Threshold 최적화
# 모델 저장 (.pt + .pkl)
# ============================================

import os, numpy as np, pandas as pd, joblib
from collections import Counter
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import f1_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier

import torch
from transformers import AutoTokenizer, AutoModel

# -------------------------------
# 설정
# -------------------------------
DATA_CSV = "perfumes_huggingface.csv"
MODEL_NAME = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
TOP_K = 3
RARE_MIN_COUNT = 7
MAX_LEN = 256
BATCH_SIZE = 16
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"[Device] {device}")

# -------------------------------
# 유틸 함수
# -------------------------------
def split_labels(s: str):
    s = str(s)
    for sep in [",", "|", "/", ";"]:
        s = s.replace(sep, " ")
    return [t.strip() for t in s.split() if t.strip()]

# -------------------------------
# 1) 데이터 로드 & 전처리
# -------------------------------
df = pd.read_csv(DATA_CSV, sep="|", engine="python", on_bad_lines="skip")
df = df[~df["description"].isna()].copy()
df["labels"] = df["fragrances"].apply(split_labels)

# 희소 라벨 제거
cnt = Counter([l for L in df["labels"] for l in L])
rare = {k for k, v in cnt.items() if v <= RARE_MIN_COUNT}
df["labels"] = df["labels"].apply(lambda L: [l for l in L if l not in rare])
df = df[df["labels"].map(len) > 0].copy()

mlb = MultiLabelBinarizer()
Y = mlb.fit_transform(df["labels"])

X_train_text, X_val_text, y_train, y_val = train_test_split(
    df["description"].tolist(), Y, test_size=0.2, random_state=42
)

# -------------------------------
# 2) MiniLM 임베딩 추출
# -------------------------------
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
base_model = AutoModel.from_pretrained(MODEL_NAME).to(device)
base_model.eval()

def encode_texts(texts, batch_size=32):
    all_embeddings = []
    for i in range(0, len(texts), batch_size):
        batch = texts[i:i+batch_size]
        enc = tokenizer(batch, padding=True, truncation=True, max_length=MAX_LEN, return_tensors="pt").to(device)
        with torch.no_grad():
            model_out = base_model(**enc)
            emb = model_out.last_hidden_state.mean(dim=1)
        all_embeddings.append(emb.cpu().numpy())
    return np.vstack(all_embeddings)

print("[Encoding Train Texts]")
X_train_emb = encode_texts(X_train_text, batch_size=BATCH_SIZE)
print("[Encoding Validation Texts]")
X_val_emb = encode_texts(X_val_text, batch_size=BATCH_SIZE)

# -------------------------------
# 3) VotingClassifier 앙상블 학습
# -------------------------------
logreg = LogisticRegression(max_iter=200)
xgb = XGBClassifier(
    objective="binary:logistic",
    eval_metric="logloss",
    use_label_encoder=False,
    tree_method="gpu_hist" if device=="cuda" else "hist"
)

ensemble = VotingClassifier(
    estimators=[("lr", logreg), ("xgb", xgb)],
    voting="soft"   # soft 투표 (확률 평균)
)

clf = OneVsRestClassifier(ensemble, n_jobs=-1)
clf.fit(X_train_emb, y_train)

# -------------------------------
# 4) 검증 예측 & Threshold 최적화
# -------------------------------
y_val_proba = clf.predict_proba(X_val_emb)
y_val_proba = np.array(y_val_proba)

thresholds = {}
y_val_pred_opt = np.zeros_like(y_val)
for i, label in enumerate(mlb.classes_):
    best_thr, best_f1 = 0.5, -1
    for thr in np.linspace(0.2, 0.5, 16):
        pred = (y_val_proba[:, i] >= thr).astype(int)
        f1 = f1_score(y_val[:, i], pred, zero_division=0)
        if f1 > best_f1:
            best_thr, best_f1 = thr, f1
    thresholds[label] = best_thr
    y_val_pred_opt[:, i] = (y_val_proba[:, i] >= best_thr).astype(int)

print("\n[Best Thresholds per label]")
for k, v in thresholds.items():
    print(f"{k}: {v:.2f}")

# -------------------------------
# 5) 평가
# -------------------------------
print("\n=== Threshold-based ===")
print(f"Micro-F1: {f1_score(y_val, y_val_pred_opt, average='micro'):.4f}")
print(f"Macro-F1: {f1_score(y_val, y_val_pred_opt, average='macro'):.4f}")
print(f"Sample-F1: {f1_score(y_val, y_val_pred_opt, average='samples'):.4f}")
print("\n[classification_report @thr]")
print(classification_report(y_val, y_val_pred_opt, target_names=mlb.classes_, zero_division=0))

# -------------------------------
# 6) 예측 함수
# -------------------------------
def predict_multilingual(text: str, topk=3, thresholds=None):
    emb = encode_texts([text], batch_size=1)
    proba = clf.predict_proba(emb)[0]
    if thresholds is not None:
        pick = [i for i, p in enumerate(proba) if p >= thresholds.get(mlb.classes_[i], 0.5)]
        if not pick:
            pick = np.argsort(-proba)[:topk]
    else:
        pick = np.argsort(-proba)[:topk]
    return [mlb.classes_[i] for i in pick]

print("\n[Example Prediction]")
print(predict_multilingual("바닷가에서 느껴지는 시원하고 약간 달콤한 향이 좋아요", topk=3, thresholds=thresholds))

# -------------------------------
# 7) 모델 저장
# -------------------------------
SAVE_PT = "minilm_model.pt"
SAVE_PKL = "voting_classifier.pkl"

# ① MiniLM Transformer 저장 (.pt)
torch.save(base_model.state_dict(), SAVE_PT)
print(f"[Saved MiniLM model to {SAVE_PT}]")

# ② Sklearn 분류기 + 라벨 인코더 + 임계값 저장 (.pkl)
joblib.dump({
    "classifier": clf,
    "mlb": mlb,
    "thresholds": thresholds
}, SAVE_PKL)
print(f"[Saved VotingClassifier pipeline to {SAVE_PKL}]")


[Device] cuda


tokenizer_config.json:   0%|          | 0.00/480 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/645 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/471M [00:00<?, ?B/s]

[Encoding Train Texts]
[Encoding Validation Texts]



    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)

    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)

    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)

    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)

    E.g. tree_method = "hist", device = "cuda"

  return super().dump(obj)

    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iterati


[Best Thresholds per label]
$$$: 0.20
Amber: 0.22
Aromatic: 0.20
Blossom: 0.28
Bouquet: 0.20
Carnation: 0.20
Citrus: 0.20
Classical: 0.24
Crisp: 0.20
Dry: 0.20
Floral: 0.30
Flower: 0.20
Fougère: 0.20
Fresh: 0.20
Fresher: 0.28
Fruity: 0.20
Gardenia: 0.20
Gourmand: 0.20
Green: 0.20
Honeysuckle: 0.20
Iris: 0.42
Jasmine: 0.20
Lilac: 0.20
Lily: 0.26
Magnolia: 0.20
Mimosa: 0.20
Mossy: 0.20
Musk: 0.20
Orange: 0.22
Oriental: 0.20
Rich: 0.40
Richer: 0.22
Rose: 0.20
Soft: 0.20
Spicy: 0.20
Tuberose: 0.20
Valley: 0.26
Violet: 0.40
Water: 0.20
White: 0.20
Woods: 0.20
Woody: 0.20
info: 0.20
of: 0.26
the: 0.26

=== Threshold-based ===
Micro-F1: 0.5001
Macro-F1: 0.2513
Sample-F1: 0.4985

[classification_report @thr]
              precision    recall  f1-score   support

         $$$       0.00      0.00      0.00         1
       Amber       0.41      0.84      0.55      1744
    Aromatic       0.32      0.31      0.32       420
     Blossom       0.60      0.12      0.19        26
     Bouquet      

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


[Saved MiniLM model to minilm_model.pt]
[Saved VotingClassifier pipeline to voting_classifier.pkl]
