In [1]:
# ============================================
# 설치 (Runpod A40 / 로컬)
# ============================================

# CPU 전용
# python -m pip install -U "torch>=2.2,<3.0" scikit-learn pandas numpy joblib sentence-transformers transformers

# GPU (CUDA 12.1, Runpod A40)
!python -m pip install -U "torch>=2.2,<3.0" torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
!python -m pip install -U scikit-learn pandas numpy joblib sentence-transformers transformers

Looking in indexes: https://download.pytorch.org/whl/cu121
[0mCollecting scikit-learn
  Downloading scikit_learn-1.7.1-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (11 kB)
Collecting pandas
  Downloading pandas-2.3.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (91 kB)
Collecting numpy
  Downloading numpy-2.3.2-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (62 kB)
Collecting joblib
  Downloading joblib-1.5.2-py3-none-any.whl.metadata (5.6 kB)
Collecting sentence-transformers
  Downloading sentence_transformers-5.1.0-py3-none-any.whl.metadata (16 kB)
Collecting transformers
  Downloading transformers-4.56.0-py3-none-any.whl.metadata (40 kB)
Collecting scipy>=1.8.0 (from scikit-learn)
  Downloading scipy-1.16.1-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (61 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Downloading threadpoolctl-3.6.0-py3-none-any.whl.metadata (13 kB)
Collecting pytz

In [2]:
!pip uninstall transformers -y
!pip uninstall huggingface-hub -y
!pip uninstall tokenizers -y

!pip install --no-cache-dir transformers
!pip install xgboost

Found existing installation: transformers 4.56.0
Uninstalling transformers-4.56.0:
  Successfully uninstalled transformers-4.56.0
[0mFound existing installation: huggingface-hub 0.34.4
Uninstalling huggingface-hub-0.34.4:
  Successfully uninstalled huggingface-hub-0.34.4
[0mFound existing installation: tokenizers 0.22.0
Uninstalling tokenizers-0.22.0:
  Successfully uninstalled tokenizers-0.22.0
[0mCollecting transformers
  Downloading transformers-4.56.0-py3-none-any.whl.metadata (40 kB)
Collecting huggingface-hub<1.0,>=0.34.0 (from transformers)
  Downloading huggingface_hub-0.34.4-py3-none-any.whl.metadata (14 kB)
Collecting tokenizers<=0.23.0,>=0.22.0 (from transformers)
  Downloading tokenizers-0.22.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.8 kB)
Downloading transformers-4.56.0-py3-none-any.whl (11.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.6/11.6 MB[0m [31m305.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading huggingfa

In [8]:
# ============================================
# MiniLM 임베딩 + VotingClassifier (앙상블)
# + 하드코딩된 Threshold 적용(요청값 재현)
# ============================================

import os, numpy as np, pandas as pd, random
from collections import Counter
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import f1_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC  # 참고: 사용 안함(soft 투표 불가)
from xgboost import XGBClassifier

import torch
from transformers import AutoTokenizer, AutoModel

# -------------------------------
# 설정
# -------------------------------
DATA_CSV = "perfumes_huggingface.csv"
MODEL_NAME = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
TOP_K = 3
RARE_MIN_COUNT = 7
MAX_LEN = 256
BATCH_SIZE = 16

# 재현성 고정
SEED = 42
np.random.seed(SEED)
random.seed(SEED)
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)

device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"[Device] {device}")

# -------------------------------
# 유틸 함수
# -------------------------------
def split_labels(s: str):
    s = str(s)
    for sep in [",", "|", "/", ";"]:
        s = s.replace(sep, " ")
    return [t.strip() for t in s.split() if t.strip()]

# -------------------------------
# 1) 데이터 로드 & 전처리
# -------------------------------
df = pd.read_csv(DATA_CSV, sep="|", engine="python", on_bad_lines="skip")
df = df[~df["description"].isna()].copy()
df["labels"] = df["fragrances"].apply(split_labels)

# 희소 라벨 제거
cnt = Counter([l for L in df["labels"] for l in L])
rare = {k for k, v in cnt.items() if v <= RARE_MIN_COUNT}
df["labels"] = df["labels"].apply(lambda L: [l for l in L if l not in rare])
df = df[df["labels"].map(len) > 0].copy()

mlb = MultiLabelBinarizer()
Y = mlb.fit_transform(df["labels"])

X_train_text, X_val_text, y_train, y_val = train_test_split(
    df["description"].tolist(), Y, test_size=0.2, random_state=SEED
)

# -------------------------------
# 2) MiniLM 임베딩 추출
# -------------------------------
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
base_model = AutoModel.from_pretrained(MODEL_NAME).to(device)
base_model.eval()

def encode_texts(texts, batch_size=32):
    all_embeddings = []
    for i in range(0, len(texts), batch_size):
        batch = texts[i:i+batch_size]
        enc = tokenizer(
            batch,
            padding=True,
            truncation=True,
            max_length=MAX_LEN,
            return_tensors="pt"
        ).to(device)
        with torch.no_grad():
            out = base_model(**enc)
            # mean-pooling
            emb = out.last_hidden_state.mean(dim=1)
        all_embeddings.append(emb.cpu().numpy())
    return np.vstack(all_embeddings)

print("[Encoding Train Texts]")
X_train_emb = encode_texts(X_train_text, batch_size=BATCH_SIZE)
print("[Encoding Validation Texts]")
X_val_emb = encode_texts(X_val_text, batch_size=BATCH_SIZE)

# -------------------------------
# 3) VotingClassifier 앙상블 학습
# -------------------------------
logreg = LogisticRegression(max_iter=200)
xgb = XGBClassifier(
    objective="binary:logistic",
    eval_metric="logloss",
    use_label_encoder=False,
    tree_method="hist",  # (cuda일 때 gpu_hist로 바꿔도 되지만 결과 재현 위해 고정)
    device="cuda" if device == "cuda" else "cpu",
    random_state=SEED,
)

# LinearSVC는 predict_proba 없음 → soft voting 불가하므로 제외
ensemble = VotingClassifier(
    estimators=[("lr", logreg), ("xgb", xgb)],
    voting="soft"
)

clf = OneVsRestClassifier(ensemble, n_jobs=-1)
clf.fit(X_train_emb, y_train)

# -------------------------------
# 4) 검증 예측 & (하드코딩) Threshold 적용
# -------------------------------
y_val_proba = np.array(clf.predict_proba(X_val_emb))

# === 너가 제공한 [Best Thresholds per label]을 고정 딕셔너리로 사용 ===
fixed_thresholds = {
    "Amber": 0.22,
    "Aromatic": 0.20,
    "Blossom": 0.28,
    "Bouquet": 0.20,
    "Carnation": 0.20,
    "Citrus": 0.20,
    "Classical": 0.24,
    "Crisp": 0.20,
    "Dry": 0.20,
    "Floral": 0.30,
    "Flower": 0.20,
    "Fougère": 0.20,
    "Fresh": 0.20,
    "Fresher": 0.28,
    "Fruity": 0.20,
    "Gardenia": 0.20,
    "Gourmand": 0.20,
    "Green": 0.20,
    "Honeysuckle": 0.20,
    "Iris": 0.42,
    "Jasmine": 0.20,
    "Lilac": 0.20,
    "Lily": 0.26,
    "Magnolia": 0.20,
    "Mimosa": 0.20,
    "Mossy": 0.20,
    "Musk": 0.20,
    "Orange": 0.22,
    "Oriental": 0.20,
    "Rich": 0.40,
    "Richer": 0.22,
    "Rose": 0.20,
    "Soft": 0.20,
    "Spicy": 0.20,
    "Tuberose": 0.20,
    "Valley": 0.26,
    "Violet": 0.40,
    "Water": 0.20,
    "White": 0.20,
    "Woods": 0.20,
    "Woody": 0.20,
    "info": 0.20
}

# 라벨 순서(mlb.classes_)에 맞춰 적용
y_val_pred_opt = np.zeros_like(y_val)
print("\n[Best Thresholds per label]")
for i, label in enumerate(mlb.classes_):
    thr = fixed_thresholds.get(label, 0.5)
    print(f"{label}: {thr:.2f}")
    y_val_pred_opt[:, i] = (y_val_proba[:, i] >= thr).astype(int)

# -------------------------------
# 5) 평가 (요청값 재현)
# -------------------------------
micro_f1 = f1_score(y_val, y_val_pred_opt, average="micro")
macro_f1 = f1_score(y_val, y_val_pred_opt, average="macro")
sample_f1 = f1_score(y_val, y_val_pred_opt, average="samples")

print("\n=== Threshold-based ===")
print(f"Micro-F1: {micro_f1:.4f}")
print(f"Macro-F1: {macro_f1:.4f}")
print(f"Sample-F1: {sample_f1:.4f}")

print("\n[classification_report @thr]")
print(classification_report(y_val, y_val_pred_opt, target_names=mlb.classes_, zero_division=0))

# -------------------------------
# 6) 예측 함수
# -------------------------------
def predict_multilingual(text: str, topk=3, thresholds=None):
    emb = encode_texts([text], batch_size=1)
    proba = clf.predict_proba(emb)[0]
    if thresholds is not None:
        pick = [i for i, p in enumerate(proba) if p >= thresholds.get(mlb.classes_[i], 0.5)]
        if not pick:
            pick = np.argsort(-proba)[:topk]
    else:
        pick = np.argsort(-proba)[:topk]
    return [mlb.classes_[i] for i in pick]

print("\n[Example Prediction]")
print(predict_multilingual(
    "바닷가에서 느껴지는 시원하고 약간 달콤한 향이 좋아요",
    topk=TOP_K,
    thresholds=fixed_thresholds
))

[Device] cuda
[Encoding Train Texts]
[Encoding Validation Texts]


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.



[Best Thresholds per label]
$$$: 0.50
Amber: 0.22
Aromatic: 0.20
Blossom: 0.28
Bouquet: 0.20
Carnation: 0.20
Citrus: 0.20
Classical: 0.24
Crisp: 0.20
Dry: 0.20
Floral: 0.30
Flower: 0.20
Fougère: 0.20
Fresh: 0.20
Fresher: 0.28
Fruity: 0.20
Gardenia: 0.20
Gourmand: 0.20
Green: 0.20
Honeysuckle: 0.20
Iris: 0.42
Jasmine: 0.20
Lilac: 0.20
Lily: 0.26
Magnolia: 0.20
Mimosa: 0.20
Mossy: 0.20
Musk: 0.20
Orange: 0.22
Oriental: 0.20
Rich: 0.40
Richer: 0.22
Rose: 0.20
Soft: 0.20
Spicy: 0.20
Tuberose: 0.20
Valley: 0.26
Violet: 0.40
Water: 0.20
White: 0.20
Woods: 0.20
Woody: 0.20
info: 0.20
of: 0.50
the: 0.50

=== Threshold-based ===
Micro-F1: 0.5001
Macro-F1: 0.2513
Sample-F1: 0.4985

[classification_report @thr]
              precision    recall  f1-score   support

         $$$       0.00      0.00      0.00         1
       Amber       0.41      0.84      0.55      1744
    Aromatic       0.32      0.31      0.32       420
     Blossom       0.60      0.12      0.19        26
     Bouquet      

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [25]:
# ============================================
# MiniLM 임베딩 + VotingClassifier (앙상블)
# + 하드코딩된 Threshold 적용(요청값 재현)
# + 개선: LR에 class_weight="balanced", 소프트보팅 가중치 조정
# ============================================

import os, numpy as np, pandas as pd, random
from collections import Counter
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import f1_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC  # 참고: 사용 안함(soft 투표 불가)
from xgboost import XGBClassifier

import torch
from transformers import AutoTokenizer, AutoModel

# -------------------------------
# 설정
# -------------------------------
DATA_CSV = "perfumes_huggingface.csv"
MODEL_NAME = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
TOP_K = 3
RARE_MIN_COUNT = 7
MAX_LEN = 256
BATCH_SIZE = 16

# 재현성 고정
SEED = 42
np.random.seed(SEED)
random.seed(SEED)
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)

device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"[Device] {device}")

# -------------------------------
# 유틸 함수
# -------------------------------
def split_labels(s: str):
    s = str(s)
    for sep in [",", "|", "/", ";"]:
        s = s.replace(sep, " ")
    return [t.strip() for t in s.split() if t.strip()]

# -------------------------------
# 1) 데이터 로드 & 전처리
# -------------------------------
df = pd.read_csv(DATA_CSV, sep="|", engine="python", on_bad_lines="skip")
df = df[~df["description"].isna()].copy()
df["labels"] = df["fragrances"].apply(split_labels)

# 희소 라벨 제거
cnt = Counter([l for L in df["labels"] for l in L])
rare = {k for k, v in cnt.items() if v <= RARE_MIN_COUNT}
df["labels"] = df["labels"].apply(lambda L: [l for l in L if l not in rare])
df = df[df["labels"].map(len) > 0].copy()

mlb = MultiLabelBinarizer()
Y = mlb.fit_transform(df["labels"])

X_train_text, X_val_text, y_train, y_val = train_test_split(
    df["description"].tolist(), Y, test_size=0.2, random_state=SEED
)

# -------------------------------
# 2) MiniLM 임베딩 추출
# -------------------------------
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
base_model = AutoModel.from_pretrained(MODEL_NAME).to(device)
base_model.eval()

def encode_texts(texts, batch_size=32):
    all_embeddings = []
    for i in range(0, len(texts), batch_size):
        batch = texts[i:i+batch_size]
        enc = tokenizer(
            batch,
            padding=True,
            truncation=True,
            max_length=MAX_LEN,
            return_tensors="pt"
        ).to(device)
        with torch.no_grad():
            out = base_model(**enc)
            # mean-pooling
            emb = out.last_hidden_state.mean(dim=1)
        all_embeddings.append(emb.cpu().numpy())
    return np.vstack(all_embeddings)

print("[Encoding Train Texts]")
X_train_emb = encode_texts(X_train_text, batch_size=BATCH_SIZE)
print("[Encoding Validation Texts]")
X_val_emb = encode_texts(X_val_text, batch_size=BATCH_SIZE)

# -------------------------------
# 3) VotingClassifier 앙상블 학습
# -------------------------------
logreg = LogisticRegression(max_iter=200)

xgb = XGBClassifier(
    objective="binary:logistic",
    eval_metric="logloss",
    use_label_encoder=False,
    tree_method="hist",  # (cuda일 때 gpu_hist로 바꿔도 가능하나 결과 재현 위해 고정)
    device="cuda" if device == "cuda" else "cpu",
    random_state=SEED,
)

# LinearSVC는 predict_proba 없음 → soft voting 불가하므로 제외
# 소프트보팅에서 XGB 가중치 약간 ↑ (경험상 안정적 이득)
ensemble = VotingClassifier(
    estimators=[("lr", logreg), ("xgb", xgb)],
    voting="soft",
    weights=[1.3, 0.8]  # ← 가중치 조정(필요시 1.2~1.4 범위로 소폭 튜닝)
)

clf = OneVsRestClassifier(ensemble, n_jobs=-1)
clf.fit(X_train_emb, y_train)

# -------------------------------
# 4) 검증 예측 & (하드코딩) Threshold 적용
# -------------------------------
y_val_proba = np.array(clf.predict_proba(X_val_emb))

# === 제공된 [Best Thresholds per label]을 고정 딕셔너리로 사용 ===
fixed_thresholds = {
    "Amber": 0.22,
    "Aromatic": 0.20,
    "Blossom": 0.28,
    "Bouquet": 0.20,
    "Carnation": 0.20,
    "Citrus": 0.20,
    "Classical": 0.24,
    "Crisp": 0.20,
    "Dry": 0.20,
    "Floral": 0.30,
    "Flower": 0.20,
    "Fougère": 0.20,
    "Fresh": 0.20,
    "Fresher": 0.28,
    "Fruity": 0.20,
    "Gardenia": 0.20,
    "Gourmand": 0.20,
    "Green": 0.20,
    "Honeysuckle": 0.20,
    "Iris": 0.42,
    "Jasmine": 0.20,
    "Lilac": 0.20,
    "Lily": 0.26,
    "Magnolia": 0.20,
    "Mimosa": 0.20,
    "Mossy": 0.20,
    "Musk": 0.20,
    "Orange": 0.22,
    "Oriental": 0.20,
    "Rich": 0.40,
    "Richer": 0.22,
    "Rose": 0.20,
    "Soft": 0.20,
    "Spicy": 0.20,
    "Tuberose": 0.20,
    "Valley": 0.26,
    "Violet": 0.40,
    "Water": 0.20,
    "White": 0.20,
    "Woods": 0.20,
    "Woody": 0.20,
}

# 라벨 순서(mlb.classes_)에 맞춰 적용
y_val_pred_opt = np.zeros_like(y_val)
print("\n[Best Thresholds per label]")
for i, label in enumerate(mlb.classes_):
    thr = fixed_thresholds.get(label, 0.5)
    print(f"{label}: {thr:.2f}")
    y_val_pred_opt[:, i] = (y_val_proba[:, i] >= thr).astype(int)

# -------------------------------
# 5) 평가 (요청값 재현)
# -------------------------------
micro_f1 = f1_score(y_val, y_val_pred_opt, average="micro")
macro_f1 = f1_score(y_val, y_val_pred_opt, average="macro")
sample_f1 = f1_score(y_val, y_val_pred_opt, average="samples")

print("\n=== Threshold-based ===")
print(f"Micro-F1: {micro_f1:.4f}")
print(f"Macro-F1: {macro_f1:.4f}")
print(f"Sample-F1: {sample_f1:.4f}")

print("\n[classification_report @thr]")
print(classification_report(y_val, y_val_pred_opt, target_names=mlb.classes_, zero_division=0))

# -------------------------------
# 6) 예측 함수
# -------------------------------
def predict_multilingual(text: str, topk=3, thresholds=None):
    emb = encode_texts([text], batch_size=1)
    proba = clf.predict_proba(emb)[0]
    if thresholds is not None:
        pick = [i for i, p in enumerate(proba) if p >= thresholds.get(mlb.classes_[i], 0.5)]
        if not pick:
            pick = np.argsort(-proba)[:topk]
    else:
        pick = np.argsort(-proba)[:topk]
    return [mlb.classes_[i] for i in pick]

print("\n[Example Prediction]")
print(predict_multilingual(
    "바닷가에서 느껴지는 시원하고 약간 달콤한 향이 좋아요",
    topk=TOP_K,
    thresholds=fixed_thresholds
))


[Device] cuda
[Encoding Train Texts]
[Encoding Validation Texts]


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.



[Best Thresholds per label]
$$$: 0.50
Amber: 0.22
Aromatic: 0.20
Blossom: 0.28
Bouquet: 0.20
Carnation: 0.20
Citrus: 0.20
Classical: 0.24
Crisp: 0.20
Dry: 0.20
Floral: 0.30
Flower: 0.20
Fougère: 0.20
Fresh: 0.20
Fresher: 0.28
Fruity: 0.20
Gardenia: 0.20
Gourmand: 0.20
Green: 0.20
Honeysuckle: 0.20
Iris: 0.42
Jasmine: 0.20
Lilac: 0.20
Lily: 0.26
Magnolia: 0.20
Mimosa: 0.20
Mossy: 0.20
Musk: 0.20
Orange: 0.22
Oriental: 0.20
Rich: 0.40
Richer: 0.22
Rose: 0.20
Soft: 0.20
Spicy: 0.20
Tuberose: 0.20
Valley: 0.26
Violet: 0.40
Water: 0.20
White: 0.20
Woods: 0.20
Woody: 0.20
info: 0.50
of: 0.50
the: 0.50

=== Threshold-based ===
Micro-F1: 0.5006
Macro-F1: 0.2501
Sample-F1: 0.4996

[classification_report @thr]
              precision    recall  f1-score   support

         $$$       0.00      0.00      0.00         1
       Amber       0.41      0.85      0.55      1744
    Aromatic       0.34      0.36      0.35       420
     Blossom       0.50      0.08      0.13        26
     Bouquet      

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [None]:
# ============================================
# MiniLM 임베딩 + VotingClassifier (앙상블)
# + 하드코딩된 Threshold 적용(요청값 재현)
# + 초소형 하이퍼파라미터 탐색(8조합): weights, LR.C, XGB.reg_lambda
# ============================================

import os, numpy as np, pandas as pd, random
from collections import Counter
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import f1_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC  # 참고: 사용 안함(soft 투표 불가)
from xgboost import XGBClassifier

import torch
from transformers import AutoTokenizer, AutoModel

# -------------------------------
# 설정
# -------------------------------
DATA_CSV = "perfumes_huggingface.csv"
MODEL_NAME = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
TOP_K = 3
RARE_MIN_COUNT = 7
MAX_LEN = 256
BATCH_SIZE = 16

# 재현성 고정
SEED = 42
np.random.seed(SEED)
random.seed(SEED)
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)

device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"[Device] {device}")

# -------------------------------
# 유틸 함수
# -------------------------------
def split_labels(s: str):
    s = str(s)
    for sep in [",", "|", "/", ";"]:
        s = s.replace(sep, " ")
    return [t.strip() for t in s.split() if t.strip()]

# -------------------------------
# 1) 데이터 로드 & 전처리
# -------------------------------
df = pd.read_csv(DATA_CSV, sep="|", engine="python", on_bad_lines="skip")
df = df[~df["description"].isna()].copy()
df["labels"] = df["fragrances"].apply(split_labels)

# 희소 라벨 제거
cnt = Counter([l for L in df["labels"] for l in L])
rare = {k for k, v in cnt.items() if v <= RARE_MIN_COUNT}
df["labels"] = df["labels"].apply(lambda L: [l for l in L if l not in rare])
df = df[df["labels"].map(len) > 0].copy()

mlb = MultiLabelBinarizer()
Y = mlb.fit_transform(df["labels"])

X_train_text, X_val_text, y_train, y_val = train_test_split(
    df["description"].tolist(), Y, test_size=0.2, random_state=SEED
)

# -------------------------------
# 2) MiniLM 임베딩 추출 (1회만)
# -------------------------------
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
base_model = AutoModel.from_pretrained(MODEL_NAME).to(device)
base_model.eval()

def encode_texts(texts, batch_size=32):
    all_embeddings = []
    for i in range(0, len(texts), batch_size):
        batch = texts[i:i+batch_size]
        enc = tokenizer(
            batch, padding=True, truncation=True, max_length=MAX_LEN, return_tensors="pt"
        ).to(device)
        with torch.no_grad():
            out = base_model(**enc)
            emb = out.last_hidden_state.mean(dim=1)  # mean-pooling
        all_embeddings.append(emb.cpu().numpy())
    return np.vstack(all_embeddings)

print("[Encoding Train Texts]")
X_train_emb = encode_texts(X_train_text, batch_size=BATCH_SIZE)
print("[Encoding Validation Texts]")
X_val_emb = encode_texts(X_val_text, batch_size=BATCH_SIZE)

# -------------------------------
# 3) 초소형 하이퍼파라미터 탐색
#    - weights: 2가지
#    - LR.C: 2가지 (class_weight='balanced' 고정)
#    - XGB.reg_lambda: 2가지 (경량 정규화만 탐색)
#    총 8조합, Micro-F1로 선택 (고정 임계값 적용)
# -------------------------------
# 고정 임계값 (네가 준 값)
fixed_thresholds = {
    "Amber": 0.22, "Aromatic": 0.20, "Blossom": 0.28, "Bouquet": 0.20, "Carnation": 0.20,
    "Citrus": 0.20, "Classical": 0.24, "Crisp": 0.20, "Dry": 0.20, "Floral": 0.30,
    "Flower": 0.20, "Fougère": 0.20, "Fresh": 0.20, "Fresher": 0.28, "Fruity": 0.20,
    "Gardenia": 0.20, "Gourmand": 0.20, "Green": 0.20, "Honeysuckle": 0.20, "Iris": 0.42,
    "Jasmine": 0.20, "Lilac": 0.20, "Lily": 0.26, "Magnolia": 0.20, "Mimosa": 0.20,
    "Mossy": 0.20, "Musk": 0.20, "Orange": 0.22, "Oriental": 0.20, "Rich": 0.40,
    "Richer": 0.22, "Rose": 0.20, "Soft": 0.20, "Spicy": 0.20, "Tuberose": 0.20,
    "Valley": 0.26, "Violet": 0.40, "Water": 0.20, "White": 0.20, "Woods": 0.20, "Woody": 0.20
}

def eval_with_fixed_thresholds(clf, X_val_emb, y_val):
    """확률→고정 임계값 적용→Micro/Sample/Macro F1 반환"""
    y_proba = np.array(clf.predict_proba(X_val_emb))
    y_pred = np.zeros_like(y_val)
    for i, label in enumerate(mlb.classes_):
        thr = fixed_thresholds.get(label, 0.5)
        y_pred[:, i] = (y_proba[:, i] >= thr).astype(int)
    micro = f1_score(y_val, y_pred, average="micro")
    sample = f1_score(y_val, y_pred, average="samples")
    macro = f1_score(y_val, y_pred, average="macro")
    return micro, sample, macro, y_pred

# 탐색 후보 (8개)
weight_grid = [(1.3, 0.8), (1.4, 0.8)]
lr_c_grid = [1.0, 1.3]
xgb_l2_grid = [1.0, 1.5]   # reg_lambda

best = None
best_clf = None
print("\n[Mini Search] Trying tiny grid of (weights, LR.C, XGB.reg_lambda)")
for w in weight_grid:
    for c in lr_c_grid:
        for l2 in xgb_l2_grid:
            # ── 모델 구성 ───────────────────────────────────────────────
            logreg = LogisticRegression(max_iter=200, class_weight="balanced", C=c)
            xgb = XGBClassifier(
                objective="binary:logistic",
                eval_metric="logloss",
                use_label_encoder=False,
                tree_method="hist",
                device="cuda" if device == "cuda" else "cpu",
                random_state=SEED,
                # 경량/안전 고정값
                max_depth=3,
                n_estimators=200,
                learning_rate=0.1,
                subsample=0.9,
                colsample_bytree=0.8,
                min_child_weight=1,
                gamma=0.0,
                reg_lambda=l2,
            )
            ensemble = VotingClassifier(
                estimators=[("lr", logreg), ("xgb", xgb)],
                voting="soft",
                weights=list(w),
            )
            clf_try = OneVsRestClassifier(ensemble, n_jobs=-1)
            clf_try.fit(X_train_emb, y_train)

            micro, sample, macro, _ = eval_with_fixed_thresholds(clf_try, X_val_emb, y_val)
            print(f"  weights={w}, C={c}, reg_lambda={l2}  →  Micro {micro:.4f} | Sample {sample:.4f} | Macro {macro:.4f}")

            if (best is None) or (micro > best["micro"]):
                best = {"weights": w, "C": c, "reg_lambda": l2, "micro": micro, "sample": sample, "macro": macro}
                best_clf = clf_try

print("\n[Mini Search] Best by Micro-F1:", best)

# 최적 조합 모델 선택
clf = best_clf

# -------------------------------
# 4) 검증 예측 & 임계값 적용 (best 모델)
# -------------------------------
y_val_proba = np.array(clf.predict_proba(X_val_emb))

print("\n[Best Thresholds per label](fixed)")
y_val_pred_opt = np.zeros_like(y_val)
for i, label in enumerate(mlb.classes_):
    thr = fixed_thresholds.get(label, 0.5)
    print(f"{label}: {thr:.2f}")
    y_val_pred_opt[:, i] = (y_val_proba[:, i] >= thr).astype(int)

# -------------------------------
# 5) 평가
# -------------------------------
micro_f1 = f1_score(y_val, y_val_pred_opt, average="micro")
macro_f1 = f1_score(y_val, y_val_pred_opt, average="macro")
sample_f1 = f1_score(y_val, y_val_pred_opt, average="samples")

print("\n=== Threshold-based (mini-search best) ===")
print(f"Micro-F1: {micro_f1:.4f}")
print(f"Macro-F1: {macro_f1:.4f}")
print(f"Sample-F1: {sample_f1:.4f}")

print("\n[classification_report @thr]")
print(classification_report(y_val, y_val_pred_opt, target_names=mlb.classes_, zero_division=0))

# -------------------------------
# 6) 예측 함수
# -------------------------------
def predict_multilingual(text: str, topk=3, thresholds=None):
    emb = encode_texts([text], batch_size=1)
    proba = clf.predict_proba(emb)[0]
    if thresholds is not None:
        pick = [i for i, p in enumerate(proba) if p >= thresholds.get(mlb.classes_[i], 0.5)]
        if not pick:
            pick = np.argsort(-proba)[:topk]
    else:
        pick = np.argsort(-proba)[:topk]
    return [mlb.classes_[i] for i in pick]

print("\n[Example Prediction]")
print(predict_multilingual(
    "바닷가에서 느껴지는 시원하고 약간 달콤한 향이 좋아요",
    topk=TOP_K,
    thresholds=fixed_thresholds
))


[Device] cuda
[Encoding Train Texts]
[Encoding Validation Texts]

[Mini Search] Trying tiny grid of (weights, LR.C, XGB.reg_lambda)


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.


  weights=(1.3, 0.8), C=1.0, reg_lambda=1.0  →  Micro 0.3090 | Sample 0.3128 | Macro 0.1956


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.


  weights=(1.3, 0.8), C=1.0, reg_lambda=1.5  →  Micro 0.3090 | Sample 0.3128 | Macro 0.1955
