In [1]:
# ============================================
# Installation (Runpod A40 / local)
# ============================================

# CPU only
# python -m pip install -U "torch>=2.2,<3.0" scikit-learn pandas numpy joblib sentence-transformers transformers

# GPU (CUDA 12.1, Runpod A40)
# The first command pulls the torch stack from the cu121 wheel index; the second
# installs the remaining libraries from the default index.
!python -m pip install -U "torch>=2.2,<3.0" torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
!python -m pip install -U scikit-learn pandas numpy joblib sentence-transformers transformers

Looking in indexes: https://download.pytorch.org/whl/cu121
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m


In [5]:
# Remove transformers together with huggingface-hub and tokenizers, then
# reinstall transformers from a fresh download (--no-cache-dir). The recorded
# output shows pip collecting huggingface-hub and tokenizers again as
# dependencies of transformers.
# NOTE(review): the reason for the forced reinstall is not recorded here —
# presumably a broken or mismatched install; confirm, or drop this cell.
!pip uninstall transformers -y
!pip uninstall huggingface-hub -y
!pip uninstall tokenizers -y

!pip install --no-cache-dir transformers

Found existing installation: transformers 4.55.4
Uninstalling transformers-4.55.4:
  Successfully uninstalled transformers-4.55.4
[0mFound existing installation: huggingface-hub 0.34.4
Uninstalling huggingface-hub-0.34.4:
  Successfully uninstalled huggingface-hub-0.34.4
[0mFound existing installation: tokenizers 0.21.4
Uninstalling tokenizers-0.21.4:
  Successfully uninstalled tokenizers-0.21.4
[0mCollecting transformers
  Downloading transformers-4.55.4-py3-none-any.whl.metadata (41 kB)
Collecting huggingface-hub<1.0,>=0.34.0 (from transformers)
  Downloading huggingface_hub-0.34.4-py3-none-any.whl.metadata (14 kB)
Collecting tokenizers<0.22,>=0.21 (from transformers)
  Downloading tokenizers-0.21.4-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Downloading transformers-4.55.4-py3-none-any.whl (11.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.3/11.3 MB[0m [31m753.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading huggingface_hu

In [10]:
# ============================================
# MiniLM embeddings + AutoModelForSequenceClassification + threshold optimization + 8 epochs
# ============================================

import os, time
from collections import Counter

import numpy as np
import pandas as pd
from sklearn.metrics import f1_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer

import torch
from torch.nn import BCEWithLogitsLoss
from torch.optim import AdamW
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# -------------------------------
# Configuration
# -------------------------------
DATA_CSV = "perfumes_huggingface.csv"  # "|"-delimited input file
MODEL_NAME = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"  # checkpoint for tokenizer and model
TOP_K = 3            # number of labels kept per row in the Top-K evaluation
RARE_MIN_COUNT = 7   # labels occurring this many times or fewer are dropped
MAX_LEN = 384        # tokenizer truncation length
BATCH_SIZE = 16
EPOCHS = 8
LR = 1e-5
SEED = 42            # same value the train/validation split uses as random_state

# Seed NumPy and PyTorch so that classifier-head initialisation, dropout and
# batch shuffling repeat across kernel restarts. GPU kernels can still add
# small run-to-run differences.
np.random.seed(SEED)
torch.manual_seed(SEED)

device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"[Device] {device}")

# -------------------------------
# Utility functions
# -------------------------------
def split_labels(s: str):
    """Split a raw label cell into a list of label tokens.

    The separators ``,``, ``|``, ``/`` and ``;`` are treated like whitespace,
    and the text is then split on whitespace.

    Args:
        s: Raw cell value. Non-string values are stringified first; a missing
            value (``None`` or float NaN) yields an empty list.

    Returns:
        List of non-empty tokens in their original order.

    NOTE(review): because whitespace is a separator, multi-word label names are
    broken into single words (the recorded label list contains entries such as
    "of" and "the"). Confirm this is intended before relying on the classes.
    """
    # A missing cell arrives as None or float NaN. Stringifying it would create
    # a bogus "None"/"nan" label, so treat it as "no labels" instead.
    if s is None or (isinstance(s, float) and s != s):
        return []

    text = str(s)
    for sep in (",", "|", "/", ";"):
        text = text.replace(sep, " ")
    # str.split() without arguments already strips and drops empty pieces.
    return text.split()

# -------------------------------
# 1) Load data & preprocess
# -------------------------------
# "|"-delimited file; rows the parser cannot read are skipped instead of raising.
df = pd.read_csv(DATA_CSV, sep="|", engine="python", on_bad_lines="skip")

# Keep only rows with a description (model input) and a fragrances cell (label
# source). Rows with a missing fragrances cell have no real labels; dropping
# them here keeps them out of the label counts below.
df = df.dropna(subset=["description", "fragrances"])
df["labels"] = df["fragrances"].apply(split_labels)

# Remove labels that occur RARE_MIN_COUNT times or fewer, then drop rows that
# are left without any label.
# NOTE(review): `<=` also removes labels seen exactly RARE_MIN_COUNT times —
# confirm that is intended given the constant's name.
cnt = Counter([l for L in df["labels"] for l in L])
rare = {k for k, v in cnt.items() if v <= RARE_MIN_COUNT}
df["labels"] = df["labels"].apply(lambda L: [l for l in L if l not in rare])
df = df[df["labels"].map(len) > 0].copy()

# Multi-hot label matrix: one column per remaining label, in mlb.classes_ order.
mlb = MultiLabelBinarizer()
Y = mlb.fit_transform(df["labels"])

# 80/20 train/validation split with a fixed random_state.
# train_test_split is imported with the other imports at the top of the cell.
X_train_text, X_val_text, y_train, y_val = train_test_split(
    df["description"].tolist(), Y, test_size=0.2, random_state=42
)

# -------------------------------
# 2) Tokenizer and datasets
# -------------------------------
# Tokenizer for the MODEL_NAME checkpoint. PerfumeDataset and
# predict_multilingual both read this module-level name.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

class PerfumeDataset(Dataset):
    """Map-style dataset pairing pre-tokenized texts with their label rows.

    Every text is tokenized once in the constructor with the module-level
    ``tokenizer`` (padding enabled, truncated at ``MAX_LEN``), so indexing
    afterwards is plain tensor slicing.
    """

    def __init__(self, texts, labels):
        # Encode the whole corpus in a single call and keep the returned tensors.
        tokenizer_kwargs = dict(
            padding=True,
            truncation=True,
            max_length=MAX_LEN,
            return_tensors="pt",
        )
        self.encodings = tokenizer(texts, **tokenizer_kwargs)
        # Labels are stored as float32; the training loop passes them to
        # BCEWithLogitsLoss.
        self.labels = torch.tensor(labels, dtype=torch.float32)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        # One entry per tokenizer output field, plus the label row under "labels".
        sample = {}
        for field, tensor in self.encodings.items():
            sample[field] = tensor[idx]
        sample["labels"] = self.labels[idx]
        return sample

# Both datasets tokenize their texts eagerly inside the PerfumeDataset constructor.
train_dataset = PerfumeDataset(X_train_text, y_train)
val_dataset = PerfumeDataset(X_val_text, y_val)

# Only the training loader shuffles; the validation loader keeps dataset order.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)

# -------------------------------
# 3) Model & optimizer
# -------------------------------
# Sequence-classification model on top of the MODEL_NAME checkpoint with one
# output per label column of Y. The recorded load warning shows that the
# classifier weights are newly initialised, i.e. they are learned by the
# training loop below.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=Y.shape[1],
    problem_type="multi_label_classification"
).to(device)

# All model parameters are optimised with a single learning rate.
optimizer = AdamW(model.parameters(), lr=LR)
# The loss is applied to raw logits in the training loop (no sigmoid beforehand).
loss_fn = BCEWithLogitsLoss()

# -------------------------------
# 4) Training loop
# -------------------------------
for epoch in range(EPOCHS):
    model.train()
    total_loss = 0  # sum of per-batch losses for this epoch
    for batch in train_loader:
        # Everything except "labels" is forwarded to the model as keyword arguments.
        inputs = {k: v.to(device) for k, v in batch.items() if k != "labels"}
        labels = batch["labels"].to(device)

        # Labels are not passed to the model; the loss is computed here from the logits.
        outputs = model(**inputs)
        loss = loss_fn(outputs.logits, labels)

        loss.backward()
        optimizer.step()
        # Clear gradients after the step so the next batch starts from zero.
        optimizer.zero_grad()
        total_loss += loss.item()

    # Mean of the per-batch losses over the epoch.
    print(f"Epoch {epoch+1} | Train Loss: {total_loss/len(train_loader):.4f}")

# -------------------------------
# 5) Validation predictions & threshold optimization
# -------------------------------
model.eval()
all_logits, all_labels = [], []

# Collect logits and labels for the whole validation set, batch by batch, as NumPy arrays.
with torch.no_grad():
    for batch in val_loader:
        inputs = {k: v.to(device) for k, v in batch.items() if k != "labels"}
        labels = batch["labels"].cpu().numpy()
        outputs = model(**inputs).logits.cpu().numpy()
        all_logits.append(outputs)
        all_labels.append(labels)

# Per-label probabilities from the stacked logits.
y_val_proba = torch.sigmoid(torch.tensor(np.vstack(all_logits))).numpy()
# y_val is overwritten with the labels read back from val_loader (float32 rows
# in loader order).
y_val = np.vstack(all_labels)

thresholds = {}  # label name -> chosen probability threshold
y_val_pred_opt = np.zeros_like(y_val)

# For each label independently, pick the threshold with the best F1 from
# 16 evenly spaced candidates between 0.2 and 0.5.
# NOTE(review): thresholds are tuned and then scored on the same validation
# split, so the threshold-based F1 reported afterwards is optimistic.
# NOTE(review): in the recorded output most labels end at 0.20, the lower edge
# of the grid — consider whether the search range should be widened.
for i, label in enumerate(mlb.classes_):
    # best_f1 starts at -1, so the first candidate always replaces the 0.5 default.
    best_thr, best_f1 = 0.5, -1
    for thr in np.linspace(0.2, 0.5, 16):
        pred = (y_val_proba[:, i] >= thr).astype(int)
        f1 = f1_score(y_val[:, i], pred, zero_division=0)
        # Strict ">" means ties keep the lowest threshold tried so far.
        if f1 > best_f1:
            best_thr, best_f1 = thr, f1
    thresholds[label] = best_thr
    y_val_pred_opt[:, i] = (y_val_proba[:, i] >= best_thr).astype(int)

print("\n[Best Thresholds per label]")
for k, v in thresholds.items():
    print(f"{k}: {v:.2f}")

# -------------------------------
# 6) Evaluation
# -------------------------------
# F1 of the per-label-threshold predictions under three averaging schemes,
# followed by the per-label report.
print("\n=== Threshold-based ===")
print(f"Micro-F1: {f1_score(y_val, y_val_pred_opt, average='micro'):.4f}")
print(f"Macro-F1: {f1_score(y_val, y_val_pred_opt, average='macro'):.4f}")
print(f"Sample-F1: {f1_score(y_val, y_val_pred_opt, average='samples'):.4f}")
print("\n[classification_report @thr]")
print(classification_report(y_val, y_val_pred_opt, target_names=mlb.classes_, zero_division=0))

# Top-K baseline for comparison: mark the TOP_K highest-probability labels of
# every validation row and score that fixed-size prediction.
topk_preds = (-y_val_proba).argsort(axis=1)[:, :TOP_K]
topk_bin = np.zeros_like(y_val)
# A column vector of row numbers paired with the index matrix sets all
# selected (row, label) cells in one assignment.
topk_bin[np.arange(len(topk_preds))[:, None], topk_preds] = 1

print("\n=== Top-K-based ===")
print(f"Micro-F1: {f1_score(y_val, topk_bin, average='micro'):.4f}")
print(f"Macro-F1: {f1_score(y_val, topk_bin, average='macro'):.4f}")
print(f"Sample-F1: {f1_score(y_val, topk_bin, average='samples'):.4f}")

# -------------------------------
# 7) Prediction function
# -------------------------------
def predict_multilingual(text: str, topk=3, thresholds=None):
    """Predict label names for one free-text description.

    Uses the module-level ``tokenizer``, ``model``, ``mlb``, ``device`` and
    ``MAX_LEN``. The model is used in whatever mode it is currently in; this
    function does not call ``model.eval()`` itself.

    Args:
        text: Description to classify.
        topk: Number of labels returned when no threshold applies.
        thresholds: Optional mapping of label name to probability threshold.
            Labels missing from the mapping use 0.5.

    Returns:
        List of label names. With ``thresholds``, every label whose probability
        reaches its threshold is returned in class-index order (not capped by
        ``topk``). If no label qualifies, or ``thresholds`` is None, the
        ``topk`` most probable labels are returned, highest first.
    """
    encoded = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=MAX_LEN,
    ).to(device)
    with torch.no_grad():
        scores = torch.sigmoid(model(**encoded).logits).cpu().numpy()[0]

    chosen = []
    if thresholds is not None:
        for idx, score in enumerate(scores):
            if score >= thresholds.get(mlb.classes_[idx], 0.5):
                chosen.append(idx)

    # Fall back to ranking when thresholds are absent or nothing passed them.
    if not chosen:
        chosen = np.argsort(-scores)[:topk]

    return [mlb.classes_[idx] for idx in chosen]

# -------------------------------
# Example run
# -------------------------------
# Input (Korean): "I like the cool, slightly sweet scent you feel at the seaside".
print("\n[Example Prediction]")
print(predict_multilingual("바닷가에서 느껴지는 시원하고 약간 달콤한 향이 좋아요", topk=3, thresholds=thresholds))


[Device] cuda


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1 | Train Loss: 0.2803
Epoch 2 | Train Loss: 0.1823
Epoch 3 | Train Loss: 0.1771
Epoch 4 | Train Loss: 0.1702
Epoch 5 | Train Loss: 0.1642
Epoch 6 | Train Loss: 0.1581
Epoch 7 | Train Loss: 0.1529
Epoch 8 | Train Loss: 0.1479

[Best Thresholds per label]
$$$: 0.20
Amber: 0.28
Aromatic: 0.20
Blossom: 0.20
Bouquet: 0.20
Carnation: 0.20
Citrus: 0.24
Classical: 0.24
Crisp: 0.20
Dry: 0.20
Floral: 0.30
Flower: 0.20
Fougère: 0.22
Fresh: 0.20
Fresher: 0.30
Fruity: 0.20
Gardenia: 0.20
Gourmand: 0.20
Green: 0.20
Honeysuckle: 0.20
Iris: 0.20
Jasmine: 0.20
Lilac: 0.20
Lily: 0.20
Magnolia: 0.20
Mimosa: 0.20
Mossy: 0.20
Musk: 0.20
Orange: 0.20
Oriental: 0.20
Rich: 0.20
Richer: 0.20
Rose: 0.20
Soft: 0.20
Spicy: 0.20
Tuberose: 0.20
Valley: 0.20
Violet: 0.20
Water: 0.20
White: 0.20
Woods: 0.22
Woody: 0.20
info: 0.20
of: 0.20
the: 0.20

=== Threshold-based ===
Micro-F1: 0.4926
Macro-F1: 0.1344
Sample-F1: 0.4951

[classification_report @thr]
              precision    recall  f1-score   support

  

In [11]:
# Input (Korean): "a fresh scent of cleanly washed and dried laundry".
# Threshold mode returns every label at or above its threshold, so the result
# can hold more than topk entries (four in the recorded output).
print("\n[Example Prediction]")
print(predict_multilingual("깨끗하게 빨래하고 말린 상쾌한 향", topk=3, thresholds=thresholds))


[Example Prediction]
['Citrus', 'Floral', 'Fresher', 'Fruity']


In [12]:
# Input (Korean): "sea scent".
print("\n[Example Prediction]")
print(predict_multilingual("바다향", topk=3, thresholds=thresholds))


[Example Prediction]
['Citrus', 'Floral', 'Fresher']
