In [1]:
!python -m pip install -U "torch>=2.2,<3.0" torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
!python -m pip install -U sentence-transformers scikit-learn pandas numpy joblib

Looking in indexes: https://download.pytorch.org/whl/cu121
[0m

In [5]:
# ============================================
# Threshold Grid Search 적용 버전
# ============================================
import os, time
import numpy as np
import pandas as pd
from collections import Counter

import torch
from sentence_transformers import SentenceTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
import warnings
warnings.filterwarnings("ignore")

# -------------------------------
# 설정
# -------------------------------
DATA_CSV = "perfumes_huggingface.csv"
MODEL_NAME = "paraphrase-multilingual-MiniLM-L12-v2"
TOP_K = 3
RARE_MIN_COUNT = 10

# -------------------------------
# 유틸 함수
# -------------------------------
def split_labels(s: str):
    s = str(s)
    for sep in [",", "|", "/", ";"]:
        s = s.replace(sep, " ")
    return [t.strip() for t in s.split() if t.strip()]

def encode_with_auto_batch(embedder, texts, init_bs=1024, min_bs=64):
    bs = init_bs
    Xs = []
    i = 0
    n = len(texts)
    while i < n:
        j = min(i + bs, n)
        chunk = texts[i:j]
        try:
            emb = embedder.encode(chunk, batch_size=bs, convert_to_numpy=True, show_progress_bar=False)
            Xs.append(emb)
            i = j
        except RuntimeError as e:
            if "CUDA out of memory" in str(e) and bs > min_bs:
                torch.cuda.empty_cache()
                bs = max(min_bs, bs // 2)
                print(f"[WARN] CUDA OOM → batch_size 축소: {bs}")
                continue
            raise
    return np.vstack(Xs)

# -------------------------------
# 1) 데이터 로드 & 전처리
# -------------------------------
df = pd.read_csv(DATA_CSV, sep="|", engine="python", on_bad_lines="skip")
df = df[~df["description"].isna()].copy()
df["labels"] = df["fragrances"].apply(split_labels)

# 희소 라벨 제거
cnt = Counter([l for L in df["labels"] for l in L])
rare = {k for k, v in cnt.items() if v <= RARE_MIN_COUNT}
df["labels"] = df["labels"].apply(lambda L: [l for l in L if l not in rare])
df = df[df["labels"].map(len) > 0].copy()

# 타깃 인코딩
mlb = MultiLabelBinarizer()
Y = mlb.fit_transform(df["labels"])

# -------------------------------
# 2) 데이터 분할
# -------------------------------
X_train_text, X_val_text, y_train, y_val = train_test_split(
    df["description"].tolist(), Y, test_size=0.2, random_state=42
)

# -------------------------------
# 3) SentenceTransformer 임베딩
# -------------------------------
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"[Device] {device}")

embedder = SentenceTransformer(MODEL_NAME, device=device)
init_bs = 1024 if device == "cuda" else 128

X_train = encode_with_auto_batch(embedder, X_train_text, init_bs=init_bs, min_bs=64)
X_val = encode_with_auto_batch(embedder, X_val_text, init_bs=init_bs, min_bs=64)

# -------------------------------
# 4) Logistic Regression 학습
# -------------------------------
clf = OneVsRestClassifier(
    LogisticRegression(max_iter=2000, C=2.0, class_weight="balanced")
)
clf.fit(X_train, y_train)

# -------------------------------
# 5) Threshold Grid Search
# -------------------------------
try:
    y_val_proba = clf.predict_proba(X_val)
except Exception:
    scores = clf.decision_function(X_val)
    y_val_proba = 1 / (1 + np.exp(-scores))

best_thr, best_f1 = None, -1
for thr in np.arange(0.2, 0.51, 0.05):
    y_val_thr = (y_val_proba >= thr).astype(int)
    f1 = f1_score(y_val, y_val_thr, average="micro")
    print(f"Threshold={thr:.2f} | Micro-F1={f1:.4f}")
    if f1 > best_f1:
        best_f1, best_thr = f1, thr

print(f"\n[Best Threshold] {best_thr:.2f} → Micro-F1={best_f1:.4f}")


[Device] cuda


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/645 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/471M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/480 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Threshold=0.20 | Micro-F1=0.2821
Threshold=0.25 | Micro-F1=0.2964
Threshold=0.30 | Micro-F1=0.3104
Threshold=0.35 | Micro-F1=0.3235
Threshold=0.40 | Micro-F1=0.3350
Threshold=0.45 | Micro-F1=0.3426
Threshold=0.50 | Micro-F1=0.3497

[Best Threshold] 0.50 → Micro-F1=0.3497
