In [16]:
# Install PyTorch from the CUDA 12.1 wheel index, then the remaining dependencies.
# %pip (instead of !pip / !python -m pip) installs into the environment of the
# running kernel rather than whichever `python` happens to be first on PATH.
%pip install -U "torch>=2.2,<3.0" torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
%pip install -U sentence-transformers scikit-learn pandas numpy joblib

Looking in indexes: https://download.pytorch.org/whl/cu121
[0m

In [17]:
# Upgrade scikit-learn in the kernel's own environment (%pip, not !pip).
# NOTE(review): the first install cell already runs `-U scikit-learn`, so this
# cell looks redundant — confirm before removing.
%pip install -U scikit-learn

[0m

In [18]:
# Replace whatever xgboost is installed with the pinned 1.7.6 release.
# %pip targets the running kernel's environment, so the version imported by the
# notebook is the one being pinned here.
%pip uninstall -y xgboost
%pip install xgboost==1.7.6

Found existing installation: xgboost 3.0.4
Uninstalling xgboost-3.0.4:
  Successfully uninstalled xgboost-3.0.4
[0mCollecting xgboost==1.7.6
  Downloading xgboost-1.7.6-py3-none-manylinux2014_x86_64.whl.metadata (1.9 kB)
Downloading xgboost-1.7.6-py3-none-manylinux2014_x86_64.whl (200.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m200.3/200.3 MB[0m [31m155.1 MB/s[0m  [33m0:00:01[0m0:00:01[0m00:01[0m
[?25hInstalling collected packages: xgboost
Successfully installed xgboost-1.7.6
[0m

In [28]:
# ============================================
# LaBSE embeddings + XGBoost OvR + threshold optimization
# ============================================

import os, time
import numpy as np
import pandas as pd
from collections import Counter

import torch
from sentence_transformers import SentenceTransformer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import f1_score, classification_report

import xgboost as xgb
import warnings
warnings.filterwarnings("ignore")

# -------------------------------
# Configuration
# -------------------------------
DATA_CSV = "perfumes_huggingface.csv"  # adjust to where the dataset lives
MODEL_NAME = "sentence-transformers/LaBSE"  # model id handed to SentenceTransformer
TOP_K = 3  # NOTE(review): not referenced in the visible code; calls pass topk=3 literally
RARE_MIN_COUNT = 10  # labels occurring this many times or fewer are discarded

# -------------------------------
# Utility functions
# -------------------------------
def split_labels(s: str):
    """Split a raw label string into a list of individual label tokens.

    The separators ``,``, ``|``, ``/`` and ``;`` are treated like whitespace and
    the text is then split on any run of whitespace. A multi-word label such as
    ``"Lily of the Valley"`` therefore yields four separate tokens.

    Args:
        s: Raw label text. Other non-missing values are converted with ``str``.
            ``None`` and float NaN are treated as "no labels".

    Returns:
        The non-empty tokens in their original order; ``[]`` for missing or
        blank input.
    """
    # A missing value must not be stringified into the bogus label "None"/"nan".
    # (NaN is the only float that is not equal to itself.)
    if s is None or (isinstance(s, float) and s != s):
        return []

    text = str(s)
    for sep in (",", "|", "/", ";"):
        text = text.replace(sep, " ")
    # str.split() without arguments already strips tokens and drops empty ones.
    return text.split()

def encode_with_auto_batch(embedder: "SentenceTransformer", texts, init_bs=1024, min_bs=64):
    """Encode ``texts`` chunk by chunk, shrinking the batch size on CUDA OOM.

    The texts are processed in consecutive chunks whose length equals the
    current batch size. If ``embedder.encode`` raises a ``RuntimeError`` whose
    message contains ``"CUDA out of memory"`` and the batch size is still above
    ``min_bs``, the CUDA cache is emptied, the batch size is halved (never
    below ``min_bs``) and the same chunk is retried. Any other error, or an OOM
    at ``min_bs``, is re-raised.

    Args:
        embedder: Object exposing ``encode(texts, batch_size=...,
            convert_to_numpy=..., show_progress_bar=...)``.
        texts: Sliceable sequence of input strings.
        init_bs: Starting batch size / chunk length; must be at least 1.
        min_bs: Smallest batch size the OOM back-off may shrink to.

    Returns:
        A 2-D array with one row per input text, in input order. For empty
        ``texts`` an array with zero rows is returned.

    Raises:
        ValueError: If ``init_bs`` is smaller than 1.
        RuntimeError: Propagated from ``embedder.encode`` when it cannot be
            handled by shrinking the batch.
    """
    # A non-positive chunk size would never advance the cursor below.
    if init_bs < 1:
        raise ValueError(f"init_bs must be >= 1, got {init_bs}")

    n = len(texts)
    if n == 0:
        # np.vstack([]) raises, so build the empty result explicitly. The width
        # comes from the embedder when it can report one, otherwise 0.
        dim_getter = getattr(embedder, "get_sentence_embedding_dimension", None)
        dim = (dim_getter() if callable(dim_getter) else None) or 0
        # assumes float32 embeddings — TODO confirm for the model in use
        return np.empty((0, dim), dtype=np.float32)

    bs = init_bs
    Xs = []
    i = 0
    while i < n:
        j = min(i + bs, n)
        try:
            emb = embedder.encode(
                texts[i:j], batch_size=bs, convert_to_numpy=True, show_progress_bar=False
            )
        except RuntimeError as e:
            if "CUDA out of memory" in str(e) and bs > min_bs:
                torch.cuda.empty_cache()
                bs = max(min_bs, bs // 2)
                print(f"[WARN] CUDA OOM → batch_size 축소: {bs}")
                # Retry from the same offset with the smaller batch.
                continue
            raise
        Xs.append(emb)
        i = j
    return np.vstack(Xs)

# -------------------------------
# 1) Load & preprocess the data
# -------------------------------
# NOTE: on_bad_lines="skip" silently discards rows that fail to parse.
df = pd.read_csv(DATA_CSV, sep="|", engine="python", on_bad_lines="skip")

# Keep only rows that have both a description and a fragrance string; a missing
# fragrance cell must not end up as a label of its own.
df = df.dropna(subset=["description", "fragrances"]).copy()
df["labels"] = df["fragrances"].apply(split_labels)

# Count how often each label occurs and discard the rare ones
# (RARE_MIN_COUNT occurrences or fewer).
cnt = Counter(label for labels in df["labels"] for label in labels)
rare = {k for k, v in cnt.items() if v <= RARE_MIN_COUNT}
df["labels"] = df["labels"].apply(
    lambda labels: [label for label in labels if label not in rare]
)
# Rows left without any label cannot be used for training.
df = df[df["labels"].map(len) > 0].copy()

# Multi-hot encode the label lists: one column per remaining label.
mlb = MultiLabelBinarizer()
Y = mlb.fit_transform(df["labels"])

# -------------------------------
# 2) Train / validation split
# -------------------------------
# Hold out 20% of the rows for validation; the fixed seed keeps the split stable.
descriptions = df["description"].tolist()
X_train_text, X_val_text, y_train, y_val = train_test_split(
    descriptions,
    Y,
    test_size=0.2,
    random_state=42,
)

# -------------------------------
# 3) Embeddings
# -------------------------------
# Prefer the GPU when one is available; the starting batch size is larger there.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"[Device] {device}")

embedder = SentenceTransformer(MODEL_NAME, device=device)
init_bs = 128 if device != "cuda" else 1024

# Both splits are encoded with the same batch settings.
encode_opts = {"init_bs": init_bs, "min_bs": 64}
X_train = encode_with_auto_batch(embedder, X_train_text, **encode_opts)
X_val = encode_with_auto_batch(embedder, X_val_text, **encode_opts)

# -------------------------------
# 4) Train XGBoost one-vs-rest
# -------------------------------
# Hyper-parameters of the binary classifier that is fitted once per label.
xgb_params = {
    "objective": "binary:logistic",
    "eval_metric": "logloss",
    "use_label_encoder": False,
    "n_estimators": 200,
    "learning_rate": 0.1,
    "max_depth": 6,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "n_jobs": -1,
    "tree_method": "hist",
}
clf = OneVsRestClassifier(xgb.XGBClassifier(**xgb_params))
clf.fit(X_train, y_train)

# Per-label probabilities on the validation split, used for threshold tuning.
y_val_proba = clf.predict_proba(X_val)

# -------------------------------
# 5) Per-label threshold optimization
# -------------------------------
# For every label, pick the cut-off from a fixed grid that maximizes F1 on the
# validation split.
# NOTE(review): the thresholds are tuned on the same validation data that is
# scored afterwards, so the reported F1 values are optimistic.
thresholds = {}
y_val_pred_opt = np.zeros_like(y_val)
thr_grid = np.linspace(0.2, 0.5, 16)

for col, label in enumerate(mlb.classes_):
    truth = y_val[:, col]
    scores = y_val_proba[:, col]
    f1_per_thr = [
        f1_score(truth, (scores >= thr).astype(int), zero_division=0)
        for thr in thr_grid
    ]
    # argmax returns the first maximum, i.e. the lowest threshold among ties.
    best_thr = thr_grid[int(np.argmax(f1_per_thr))]
    thresholds[label] = best_thr
    y_val_pred_opt[:, col] = (scores >= best_thr).astype(int)

print("\n[Best Thresholds per label]")
for label, thr in thresholds.items():
    print(f"{label}: {thr:.2f}")

# -------------------------------
# 6) Evaluation
# -------------------------------
# Aggregate F1 scores of the thresholded validation predictions, followed by
# the per-label report.
print("\n=== Threshold-based ===")
micro_f1 = f1_score(y_val, y_val_pred_opt, average="micro")
print(f"Micro-F1: {micro_f1:.4f}")
macro_f1 = f1_score(y_val, y_val_pred_opt, average="macro")
print(f"Macro-F1: {macro_f1:.4f}")
sample_f1 = f1_score(y_val, y_val_pred_opt, average="samples")
print(f"Sample-F1: {sample_f1:.4f}")

print("\n[classification_report @thr]")
report = classification_report(
    y_val,
    y_val_pred_opt,
    target_names=mlb.classes_,
    zero_division=0,
)
print(report)

# -------------------------------
# 7) Prediction function
# -------------------------------
def predict_multilingual(text: str, topk=3, thresholds=None):
    """Predict labels for a single free-text description.

    The text is embedded with the module-level ``embedder`` and scored by the
    module-level ``clf``.

    Args:
        text: Description to classify.
        topk: Number of labels returned when falling back to ranking.
        thresholds: Optional mapping from label to probability cut-off. Labels
            missing from the mapping use 0.5.

    Returns:
        With ``thresholds``: every label whose probability reaches its cut-off,
        in ``mlb.classes_`` order. If no label qualifies, or ``thresholds`` is
        ``None``, the ``topk`` most probable labels, most probable first.
    """
    start_bs = 256 if device != "cpu" else 64
    vec = encode_with_auto_batch(embedder, [text], init_bs=start_bs, min_bs=32)
    scores = clf.predict_proba(vec)[0]

    chosen = []
    if thresholds is not None:
        chosen = [
            idx
            for idx, score in enumerate(scores)
            if score >= thresholds.get(mlb.classes_[idx], 0.5)
        ]

    # No thresholds supplied, or nothing cleared its cut-off: rank instead.
    if len(chosen) == 0:
        chosen = np.argsort(-scores)[:topk]

    return [mlb.classes_[idx] for idx in chosen]

# -------------------------------
# Example run
# -------------------------------
print("\n[Example Prediction]")
# Korean query, roughly: "I like the cool, slightly sweet scent you feel at the seaside".
example_text = "바닷가에서 느껴지는 시원하고 약간 달콤한 향이 좋아요"
example_labels = predict_multilingual(example_text, topk=3, thresholds=thresholds)
print(example_labels)


[Device] cuda

[Best Thresholds per label]
Amber: 0.28
Aromatic: 0.20
Blossom: 0.20
Bouquet: 0.34
Carnation: 0.20
Citrus: 0.20
Classical: 0.20
Crisp: 0.20
Dry: 0.20
Floral: 0.30
Flower: 0.20
Fougère: 0.20
Fresh: 0.20
Fresher: 0.24
Fruity: 0.20
Gardenia: 0.20
Gourmand: 0.20
Green: 0.20
Iris: 0.20
Jasmine: 0.20
Lily: 0.26
Magnolia: 0.20
Mimosa: 0.20
Mossy: 0.20
Musk: 0.20
Orange: 0.20
Oriental: 0.20
Rich: 0.20
Richer: 0.44
Rose: 0.20
Soft: 0.20
Spicy: 0.24
Tuberose: 0.20
Valley: 0.20
Violet: 0.20
Water: 0.22
White: 0.20
Woods: 0.22
Woody: 0.20
of: 0.20
the: 0.20

=== Threshold-based ===
Micro-F1: 0.4856
Macro-F1: 0.1722
Sample-F1: 0.4819

[classification_report @thr]
              precision    recall  f1-score   support

       Amber       0.45      0.71      0.55      1738
    Aromatic       0.38      0.26      0.31       450
     Blossom       0.00      0.00      0.00        25
     Bouquet       0.50      0.04      0.08        47
   Carnation       0.00      0.00      0.00         2
 

In [29]:
print("\n[Example Prediction]")
# Korean query, roughly: "the fresh scent of cleanly washed and dried laundry".
laundry_query = "깨끗하게 빨래하고 말린 상쾌한 향"
laundry_labels = predict_multilingual(laundry_query, topk=3, thresholds=thresholds)
print(laundry_labels)


[Example Prediction]
['Citrus', 'Classical', 'Fresher']


In [30]:
print("\n[Example Prediction]")
# Korean query, roughly: "sea scent".
sea_query = "바다향"
sea_labels = predict_multilingual(sea_query, topk=3, thresholds=thresholds)
print(sea_labels)


[Example Prediction]
['Fresher', 'Water', 'Woods']
