In [1]:
# Install into the environment of the running kernel: `%pip` targets the
# kernel's interpreter, whereas `!python -m pip` uses whichever `python` is
# first on PATH.
%pip install -U "torch>=2.2,<3.0" torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
%pip install -U sentence-transformers scikit-learn pandas numpy joblib

Looking in indexes: https://download.pytorch.org/whl/cu121
[0mCollecting sentence-transformers
  Downloading sentence_transformers-5.1.0-py3-none-any.whl.metadata (16 kB)
Collecting scikit-learn
  Downloading scikit_learn-1.7.1-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (11 kB)
Collecting pandas
  Downloading pandas-2.3.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (91 kB)
Collecting numpy
  Downloading numpy-2.3.2-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (62 kB)
Collecting joblib
  Downloading joblib-1.5.2-py3-none-any.whl.metadata (5.6 kB)
Collecting transformers<5.0.0,>=4.41.0 (from sentence-transformers)
  Downloading transformers-4.55.4-py3-none-any.whl.metadata (41 kB)
Collecting tqdm (from sentence-transformers)
  Downloading tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Collecting scipy (from sentence-transformers)
  Downloading scipy-1.16.1-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.m

In [2]:
# Pinned to the version this notebook was last run with (see the install log
# below); `%pip` installs into the running kernel's environment.
%pip install xgboost==3.0.4

Collecting xgboost
  Downloading xgboost-3.0.4-py3-none-manylinux_2_28_x86_64.whl.metadata (2.1 kB)
Downloading xgboost-3.0.4-py3-none-manylinux_2_28_x86_64.whl (94.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m94.9/94.9 MB[0m [31m84.5 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0mm
[?25hInstalling collected packages: xgboost
Successfully installed xgboost-3.0.4
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m


In [6]:
# ============================================
# XGBoost OvR + 라벨별 최적 threshold 적용 + epoch 확장
# ============================================

import os, time
import numpy as np
import pandas as pd
from collections import Counter

import torch
from sentence_transformers import SentenceTransformer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import f1_score, classification_report

import xgboost as xgb
import warnings
warnings.filterwarnings("ignore")

# -------------------------------
# Configuration
# -------------------------------
DATA_CSV = "perfumes_huggingface.csv"  # adjust to the local path of the dataset
MODEL_NAME = "paraphrase-multilingual-MiniLM-L12-v2"
TOP_K = 3
RARE_MIN_COUNT = 10

# Per-label decision thresholds (values pasted from an earlier threshold search).
# Labels that are missing here fall back to 0.5 wherever the dict is consulted.
thresholds = {
    "Amber": 0.24,
    "Aromatic": 0.20,
    "Blossom": 0.20,
    "Bouquet": 0.20,
    "Carnation": 0.20,
    "Citrus": 0.20,
    "Classical": 0.22,
    "Crisp": 0.20,
    "Dry": 0.20,
    "Floral": 0.32,
    "Flower": 0.20,
    "Fougère": 0.20,
    "Fresh": 0.20,
    "Fresher": 0.30,
    "Fruity": 0.20,
    "Gardenia": 0.20,
    "Gourmand": 0.20,
    "Green": 0.20,
    "Iris": 0.46,
    "Jasmine": 0.24,
    "Lily": 0.26,
    "Magnolia": 0.20,
    "Mimosa": 0.20,
    "Mossy": 0.20,
    "Musk": 0.20,
    "Orange": 0.48,
    "Oriental": 0.20,
    "Rich": 0.20,
    "Richer": 0.32,
    "Rose": 0.22,
    "Soft": 0.20,
    "Spicy": 0.20,
    "Tuberose": 0.20,
    "Valley": 0.46,
    "Violet": 0.32,
    "Water": 0.20,
    "White": 0.20,
    "Woods": 0.20,
    "Woody": 0.20,
    "of": 0.46,
    "the": 0.46,
}

# -------------------------------
# 유틸
# -------------------------------
def split_labels(s: str):
    """Split a raw label string into a list of label tokens.

    The separators ``,``, ``|``, ``/`` and ``;`` are replaced by spaces and the
    result is split on whitespace. Because whitespace is itself a separator, a
    multi-word name produces one token per word.

    Args:
        s: Raw label text. Values that are not ``str`` are converted with
            ``str()``; ``None`` and NaN are treated as "no labels".

    Returns:
        The non-empty tokens in their original order (an empty list when there
        is nothing to split).
    """
    # A missing cell would otherwise be stringified into a bogus "None"/"nan" label.
    if s is None or (isinstance(s, float) and s != s):
        return []

    text = str(s)
    for sep in (",", "|", "/", ";"):
        text = text.replace(sep, " ")
    # str.split() without arguments already drops empty and padded tokens.
    return text.split()

def encode_with_auto_batch(embedder: "SentenceTransformer", texts, init_bs=1024, min_bs=64):
    """Embed ``texts`` chunk by chunk, halving the batch size on CUDA OOM.

    Each chunk holds at most ``bs`` texts and is passed to ``embedder.encode``
    with ``batch_size=bs``. When encoding raises an out-of-memory
    ``RuntimeError`` and ``bs`` is still above ``min_bs``, the batch size is
    halved (never below ``min_bs``) and the same chunk is retried.

    Args:
        embedder: Object exposing ``encode(texts, batch_size=...,
            convert_to_numpy=..., show_progress_bar=...)``.
        texts: Sequence (or other iterable) of strings. A single ``str`` is
            treated as one text rather than as a sequence of characters.
        init_bs: Batch size to start with; must be >= 1.
        min_bs: Lower bound for the batch size after OOM back-off; must be >= 1.

    Returns:
        2-D array with one embedding row per input text. For empty input the
        array has zero rows and as many columns as the embedder reports.

    Raises:
        ValueError: If ``init_bs`` or ``min_bs`` is smaller than 1.
        RuntimeError: Re-raised when the error is not an out-of-memory error,
            or when the batch size cannot be reduced any further.
    """
    if init_bs < 1 or min_bs < 1:
        # A zero batch size would never advance the cursor below.
        raise ValueError("init_bs and min_bs must be >= 1")

    # Slicing a bare string would split it into character runs.
    texts = [texts] if isinstance(texts, str) else list(texts)
    n = len(texts)
    if n == 0:
        # np.vstack([]) raises, so build the empty result explicitly.
        dim = embedder.get_sentence_embedding_dimension() or 0
        return np.empty((0, dim), dtype=np.float32)

    bs = init_bs
    Xs = []
    i = 0
    while i < n:
        j = min(i + bs, n)
        try:
            emb = embedder.encode(texts[i:j], batch_size=bs, convert_to_numpy=True, show_progress_bar=False)
        except RuntimeError as e:
            # Case-insensitive match so that both "CUDA out of memory" and
            # "CUDA error: out of memory" trigger the back-off.
            if "out of memory" in str(e).lower() and bs > min_bs:
                if torch.cuda.is_available():
                    torch.cuda.empty_cache()
                bs = max(min_bs, bs // 2)
                print(f"[WARN] CUDA OOM → batch_size 축소: {bs}")
                continue  # retry the same chunk with the smaller batch size
            raise
        Xs.append(emb)
        i = j
    return np.vstack(Xs)

# -------------------------------
# 1) Load & preprocess the data
# -------------------------------
# NOTE: on_bad_lines="skip" silently drops rows the parser cannot read.
df = pd.read_csv(DATA_CSV, sep="|", engine="python", on_bad_lines="skip")

# Keep only rows that have both the input text and the target. A missing
# "fragrances" cell would otherwise be stringified by split_labels and show up
# as a bogus "nan" label.
df = df.dropna(subset=["description", "fragrances"]).copy()
df["labels"] = df["fragrances"].apply(split_labels)

# Remove rare labels (those seen RARE_MIN_COUNT times or fewer), then drop the
# rows that are left without any label.
cnt = Counter(label for labels in df["labels"] for label in labels)
rare = {label for label, count in cnt.items() if count <= RARE_MIN_COUNT}
df["labels"] = df["labels"].apply(lambda labels: [label for label in labels if label not in rare])
df = df[df["labels"].map(len) > 0].copy()

# Encode the targets as a binary indicator matrix (one column per label).
mlb = MultiLabelBinarizer()
Y = mlb.fit_transform(df["labels"])

# -------------------------------
# 2) Train / validation split
# -------------------------------
X_train_text, X_val_text, y_train, y_val = train_test_split(
    df["description"].tolist(), Y, test_size=0.2, random_state=42
)

# -------------------------------
# 3) Embeddings
# -------------------------------
# Choose the device together with the starting batch size that goes with it.
if torch.cuda.is_available():
    device, init_bs = "cuda", 1024
else:
    device, init_bs = "cpu", 128
print(f"[Device] {device}")

embedder = SentenceTransformer(MODEL_NAME, device=device)

# Encode the training split first, then the validation split, with identical settings.
X_train, X_val = (
    encode_with_auto_batch(embedder, split_texts, init_bs=init_bs, min_bs=64)
    for split_texts in (X_train_text, X_val_text)
)

# -------------------------------
# 4) Train One-vs-Rest XGBoost (one binary classifier per label)
# -------------------------------
# Number of boosting rounds per binary classifier (raised from 200 to 1000).
N_ESTIMATORS = 1000

# `use_label_encoder` is intentionally not passed: it is an obsolete argument
# that recent XGBoost releases only report as an unused parameter.
# NOTE(review): no `device` is passed; XGBoost >= 2.0 accepts device="cuda" —
# consider it if training time matters.
clf = OneVsRestClassifier(
    xgb.XGBClassifier(
        objective="binary:logistic",
        eval_metric="logloss",
        n_estimators=N_ESTIMATORS,
        learning_rate=0.05,  # lowered slightly to go with the larger number of rounds
        max_depth=6,
        subsample=0.8,
        colsample_bytree=0.8,
        n_jobs=-1,
        tree_method="hist"
    )
)

t0 = time.perf_counter()
clf.fit(X_train, y_train)
t1 = time.perf_counter()
print(f"[Train] OvR-XGBoost ({N_ESTIMATORS} trees): {t1 - t0:.2f}s")

# Per-label positive-class probabilities for the validation split.
y_val_proba = clf.predict_proba(X_val)

# -------------------------------
# 5) Evaluation (threshold-based + Top-K-based)
# -------------------------------
def _print_eval_report(header, report_header, y_pred):
    """Print micro/macro/sample F1 and the per-label report for `y_pred` against `y_val`."""
    print(header)
    for metric_name, average in (("Micro-F1", "micro"), ("Macro-F1", "macro"), ("Sample-F1", "samples")):
        print(f"{metric_name}: {f1_score(y_val, y_pred, average=average):.4f}")
    print(report_header)
    print(classification_report(y_val, y_pred, target_names=mlb.classes_, zero_division=0))


# Threshold-based: compare every column with its label's threshold (0.5 when
# the label has no entry). The thresholds are cast to the probability dtype so
# the comparison is carried out exactly like a per-column scalar comparison.
label_thresholds = np.array(
    [thresholds.get(label, 0.5) for label in mlb.classes_], dtype=y_val_proba.dtype
)
y_val_thr = (y_val_proba >= label_thresholds).astype(y_val.dtype)

_print_eval_report("\n=== Threshold-based ===", "\n[classification_report @thr]", y_val_thr)

# Top-K-based: mark the TOP_K most probable labels of every sample.
top_idx = np.argsort(-y_val_proba, axis=1)[:, :TOP_K]
y_val_topk = np.zeros_like(y_val_proba, dtype=int)
sample_rows = np.arange(y_val_proba.shape[0])[:, None]
y_val_topk[sample_rows, top_idx] = 1

_print_eval_report("\n=== Top-K-based ===", "\n[classification_report @topK]", y_val_topk)

# -------------------------------
# 6) 예측 함수
# -------------------------------
def predict_multilingual(text: str, topk=3, thresholds=None):
    """Predict labels for a single free-text description.

    The text is embedded with the module-level ``embedder`` and scored with the
    module-level ``clf``; label names come from ``mlb.classes_``.

    Args:
        text: Description to classify.
        topk: Number of labels returned in Top-K mode, and by the fallback
            when no label reaches its threshold.
        thresholds: Optional mapping ``label -> threshold``. When given, every
            label whose probability is at or above its threshold is returned
            (labels without an entry use 0.5). When ``None``, the ``topk`` most
            probable labels are returned.

    Returns:
        List of label names ordered by descending probability in both modes.
    """
    v = encode_with_auto_batch(embedder, [text], init_bs=64 if device=="cpu" else 256, min_bs=32)
    proba = clf.predict_proba(v)[0]

    # Class indices, most probable first. Both branches pick from this ranking
    # so the output order does not depend on which mode produced it.
    ranked = np.argsort(-proba)

    if thresholds is not None:  # per-label thresholds
        pick = [i for i in ranked if proba[i] >= thresholds.get(mlb.classes_[i], 0.5)]
        if not pick:  # nothing passed its threshold -> Top-K fallback
            pick = ranked[:topk]
    else:  # Top-K
        pick = ranked[:topk]

    return [mlb.classes_[i] for i in pick]

# -------------------------------
# Example run
# -------------------------------
print("\n[Example Prediction]")
example_text = "바닷가에서 느껴지는 시원하고 약간 달콤한 향이 좋아요"
example_labels = predict_multilingual(example_text, topk=3, thresholds=thresholds)
print(example_labels)


[Device] cuda
[Train] OvR-XGBoost (1000 trees): 511.22s

=== Threshold-based ===
Micro-F1: 0.4910
Macro-F1: 0.2250
Sample-F1: 0.4801

[classification_report @thr]
              precision    recall  f1-score   support

       Amber       0.46      0.68      0.55      1738
    Aromatic       0.48      0.22      0.30       450
     Blossom       0.67      0.08      0.14        25
     Bouquet       0.50      0.06      0.11        47
   Carnation       0.00      0.00      0.00         2
      Citrus       0.41      0.30      0.35       981
   Classical       0.42      0.46      0.44      1313
       Crisp       0.36      0.16      0.22       858
         Dry       0.69      0.13      0.21       260
      Floral       0.67      0.75      0.71      2141
      Flower       0.33      0.02      0.04       329
     Fougère       0.48      0.22      0.30       450
       Fresh       0.00      0.00      0.00        22
     Fresher       0.59      0.91      0.71      2838
      Fruity       0.42   

In [None]:
# Example query; the header is printed before the prediction is computed.
print("\n[Example Prediction]")
example_text = "깨끗하게 빨래하고 말린 상쾌한 향"
example_labels = predict_multilingual(example_text, topk=3, thresholds=thresholds)
print(example_labels)


[Example Prediction]
['Fresher', 'Amber', 'Floral']


In [None]:
# Example with a very short query; the header is printed before the prediction is computed.
print("\n[Example Prediction]")
example_text = "바다향"
example_labels = predict_multilingual(example_text, topk=3, thresholds=thresholds)
print(example_labels)


[Example Prediction]
['Fresher', 'Woods', 'Classical']
