In [1]:
# ============================================
# Installation (Runpod A40 / local)
# ============================================

# CPU only
# python -m pip install -U "torch>=2.2,<3.0" scikit-learn pandas numpy joblib sentence-transformers transformers

# GPU (CUDA 12.1, Runpod A40)
# The PyTorch packages are pulled from the cu121 wheel index; the remaining
# packages are installed by a second command without a custom index URL.
# NOTE(review): apart from the torch range, versions are unpinned (-U), so a
# re-run may install different releases than the ones shown in the output.
!python -m pip install -U "torch>=2.2,<3.0" torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
!python -m pip install -U scikit-learn pandas numpy joblib sentence-transformers transformers

Looking in indexes: https://download.pytorch.org/whl/cu121
[0mCollecting scikit-learn
  Downloading scikit_learn-1.7.1-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (11 kB)
Collecting pandas
  Downloading pandas-2.3.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (91 kB)
Collecting numpy
  Downloading numpy-2.3.2-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (62 kB)
Collecting joblib
  Downloading joblib-1.5.2-py3-none-any.whl.metadata (5.6 kB)
Collecting sentence-transformers
  Downloading sentence_transformers-5.1.0-py3-none-any.whl.metadata (16 kB)
Collecting transformers
  Downloading transformers-4.56.0-py3-none-any.whl.metadata (40 kB)
Collecting scipy>=1.8.0 (from scikit-learn)
  Downloading scipy-1.16.1-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (61 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Downloading threadpoolctl-3.6.0-py3-none-any.whl.metadata (13 kB)
Collecting pytz

In [2]:
# Upgrade pip itself
!pip install --upgrade pip

# Install/upgrade bitsandbytes + accelerate to the latest release
!pip install -U bitsandbytes accelerate

# Upgrade the core Hugging Face packages
!pip install -U transformers peft safetensors

Collecting pip
  Downloading pip-25.2-py3-none-any.whl.metadata (4.7 kB)
Downloading pip-25.2-py3-none-any.whl (1.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m62.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 25.0.1
    Uninstalling pip-25.0.1:
      Successfully uninstalled pip-25.0.1
Successfully installed pip-25.2
[0mCollecting bitsandbytes
  Downloading bitsandbytes-0.47.0-py3-none-manylinux_2_24_x86_64.whl.metadata (11 kB)
Collecting accelerate
  Downloading accelerate-1.10.1-py3-none-any.whl.metadata (19 kB)
Downloading bitsandbytes-0.47.0-py3-none-manylinux_2_24_x86_64.whl (61.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.3/61.3 MB[0m [31m245.8 MB/s[0m  [33m0:00:00[0m eta [36m0:00:01[0m
[?25hDownloading accelerate-1.10.1-py3-none-any.whl (374 kB)
Installing collected packages: bitsandbytes, accelerate
[2K   [90m

In [5]:
# ============================================
# Unsloth LLaMA-3-8B-Instruct 기반
# 멀티라벨 분류 + Threshold 최적화
# ============================================

import os, numpy as np, pandas as pd
from collections import Counter
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import f1_score, classification_report, precision_recall_curve
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, Dataset
import torch
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm

# -------------------------------
# Configuration
# -------------------------------
# Input dataset path (loaded further down with pandas, "|" as the delimiter).
DATA_CSV = "perfumes_huggingface.csv"
MODEL_NAME = "unsloth/llama-3-8b-Instruct"   # plain decoder model
# Every text is padded/truncated to this many tokens by the tokenizer calls.
MAX_LEN = 256
# Batch size shared by the training and validation DataLoaders.
BATCH_SIZE = 8
# Number of passes over the training set.
EPOCHS = 1
# Learning rate handed to AdamW.
LR = 2e-5

# Use the GPU when PyTorch can see one, otherwise fall back to CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"[Device] {device}")

# -------------------------------
# 데이터셋 정의
# -------------------------------
def split_labels(s: str):
    """Convert a raw fragrance string into a list of label tokens.

    The separators ``,``, ``|``, ``/`` and ``;`` are treated like whitespace,
    and the string is then split on any run of whitespace. Multi-word names
    are therefore broken into one token per word.

    Args:
        s: Raw cell value. Non-string values are stringified; missing values
            (``None`` or float NaN) are treated as "no labels".

    Returns:
        List of non-empty label tokens, in order of appearance (duplicates
        are kept). Empty list for missing or blank input.
    """
    # A missing cell would otherwise be stringified into the bogus labels
    # "nan" / "None". `s != s` is true only for NaN.
    if s is None or (isinstance(s, float) and s != s):
        return []
    s = str(s)
    for sep in [",", "|", "/", ";"]:
        s = s.replace(sep, " ")
    # split() without arguments already drops empty pieces and surrounding
    # whitespace, so no extra strip/filter pass is needed.
    return s.split()

# Load the pipe-delimited dataset (rows the parser cannot read are skipped),
# keep only rows that have a description, and tokenize the fragrance column.
df = pd.read_csv(DATA_CSV, sep="|", engine="python", on_bad_lines="skip")
df = df.loc[df["description"].notna()].copy()
df["labels"] = df["fragrances"].apply(split_labels)

# Remove rare labels (seen 7 times or fewer across the dataset), then drop
# the rows that are left without any label.
cnt = Counter(label for row_labels in df["labels"] for label in row_labels)
rare = {name for name, freq in cnt.items() if freq <= 7}
df["labels"] = df["labels"].apply(
    lambda row_labels: [label for label in row_labels if label not in rare]
)
df = df.loc[df["labels"].map(len) > 0].copy()

# Multi-hot encode the label lists: one column per remaining label.
mlb = MultiLabelBinarizer()
Y = mlb.fit_transform(df["labels"])

# 80/20 train/validation split with a fixed random seed.
X_train, X_val, y_train, y_val = train_test_split(
    df["description"].tolist(),
    Y,
    test_size=0.2,
    random_state=42,
)

# -------------------------------
# Dataset / Dataloader
# -------------------------------
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Module-level tokenizer for MODEL_NAME; PerfumeDataset.__getitem__ and
# predict_multilingual both read this global.
# NOTE(review): both callers use padding="max_length", which requires the
# tokenizer to define a pad token — confirm this checkpoint provides one.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

class PerfumeDataset(Dataset):
    """Pairs description texts with their multi-hot label rows.

    Texts are tokenized lazily, one sample per ``__getitem__`` call, using the
    module-level ``tokenizer`` and ``MAX_LEN``.
    """

    def __init__(self, texts, labels):
        # texts and labels are indexed with the same position in __getitem__.
        self.texts, self.labels = texts, labels

    def __len__(self):
        """Number of samples, taken from the text collection."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the tokenizer fields plus a float ``labels`` tensor for one sample."""
        encoded = tokenizer(
            self.texts[idx],
            max_length=MAX_LEN,
            truncation=True,
            padding="max_length",
            return_tensors="pt",
        )
        sample = {}
        for field, tensor in encoded.items():
            # return_tensors="pt" adds a leading batch dimension; drop it so
            # the DataLoader can batch samples itself.
            sample[field] = tensor.squeeze(0)
        # Float targets, as consumed by the BCE-with-logits loss in training.
        sample["labels"] = torch.tensor(self.labels[idx], dtype=torch.float)
        return sample

# Wrap both splits in datasets; only the training loader shuffles, the
# validation loader keeps the original order.
train_ds = PerfumeDataset(X_train, y_train)
val_ds   = PerfumeDataset(X_val, y_val)
train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True)
val_loader   = DataLoader(val_ds, batch_size=BATCH_SIZE)

# -------------------------------
# Model definition
# -------------------------------
# Use LLaMA-3-8B-Instruct as a classifier (multi-label classification).
# One output per class in mlb.classes_; weights are loaded in bfloat16 on GPU
# and float32 on CPU. Per the load warning in the cell output, the
# classification head (`score.weight`) is newly initialized.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=len(mlb.classes_),
    problem_type="multi_label_classification",
    torch_dtype=torch.bfloat16 if device=="cuda" else torch.float32,
).to(device)

# The training loop calls the model without labels and applies this criterion
# to the returned logits itself.
criterion = nn.BCEWithLogitsLoss()
# Every model parameter is handed to the optimizer (nothing is frozen here).
optimizer = optim.AdamW(model.parameters(), lr=LR)

# -------------------------------
# Training loop
# -------------------------------
# Plain supervised fine-tuning: one optimizer step per batch, with the mean
# batch loss reported at the end of every epoch.
for epoch in range(EPOCHS):
    model.train()
    total_loss = 0.0
    for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}/{EPOCHS}"):
        optimizer.zero_grad()
        input_ids = batch["input_ids"].to(device)
        attn = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attn)
        # The model runs in bfloat16 on GPU while the targets are float32.
        # Upcast the logits so the loss is computed in float32, matching the
        # `logits.float()` already used at evaluation time.
        logits = outputs.logits.float()
        loss = criterion(logits, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    # max(..., 1) avoids a ZeroDivisionError if the loader yields no batches.
    print(f"[Train] Epoch {epoch+1} Loss: {total_loss/max(len(train_loader), 1):.4f}")

# -------------------------------
# Validation & threshold optimization
# -------------------------------
# Collect per-label probabilities and ground-truth rows for the whole
# validation loader, with dropout disabled and no gradient tracking.
model.eval()
all_probs, all_labels = [], []
with torch.no_grad():
    for batch in val_loader:
        input_ids = batch["input_ids"].to(device)
        attn = batch["attention_mask"].to(device)
        # Labels are only needed on the host for metric computation.
        labels = batch["labels"].cpu().numpy()
        logits = model(input_ids=input_ids, attention_mask=attn).logits
        # Upcast to float32 before the sigmoid, then move to NumPy.
        probs = torch.sigmoid(logits.float()).cpu().numpy()
        all_probs.append(probs)
        all_labels.append(labels)

# Stack the per-batch arrays row-wise; the columns are indexed by class
# position (same order as mlb.classes_) in the threshold search below.
y_val_proba = np.vstack(all_probs)
y_val_true  = np.vstack(all_labels)

# For every label, pick the probability cut-off that maximizes F1 on the
# validation set, then score the thresholded predictions.
# NOTE: the thresholds are tuned and evaluated on the same validation split,
# so the metrics printed below are optimistic.
thresholds = {}
y_val_pred_opt = np.zeros_like(y_val_true)
for i, label in enumerate(mlb.classes_):
    if y_val_true[:, i].sum() == 0:
        # No positive validation sample for this label: the PR curve carries
        # no signal and argmax would select the smallest probability as the
        # threshold (i.e. predict the label for every sample). Use the
        # neutral default instead.
        best_thr = 0.5
    else:
        precision, recall, thr = precision_recall_curve(y_val_true[:, i], y_val_proba[:, i])
        # Small epsilon keeps the division defined when precision+recall == 0.
        f1 = 2*precision*recall/(precision+recall+1e-8)
        best_idx = np.argmax(f1)
        # precision/recall have one more entry than thr; the final point has
        # no associated threshold, so fall back to 0.5 if it wins.
        best_thr = thr[best_idx] if best_idx < len(thr) else 0.5
    thresholds[label] = best_thr
    y_val_pred_opt[:, i] = (y_val_proba[:, i] >= best_thr).astype(int)

print("\n[Best Thresholds per label]")
for k, v in thresholds.items():
    print(f"{k}: {v:.2f}")

print("\n=== Threshold-Optimized Results ===")
print(f"Micro-F1: {f1_score(y_val_true, y_val_pred_opt, average='micro'):.4f}")
print(f"Macro-F1: {f1_score(y_val_true, y_val_pred_opt, average='macro'):.4f}")
print(f"Sample-F1: {f1_score(y_val_true, y_val_pred_opt, average='samples'):.4f}")

print("\n[classification_report @thr]")
print(classification_report(y_val_true, y_val_pred_opt, target_names=mlb.classes_, zero_division=0))

# -------------------------------
# 예측 함수
# -------------------------------
def predict_multilingual(text: str, topk=3, thresholds=None):
    """Predict fragrance labels for a single free-text description.

    Args:
        text: Input description to classify.
        topk: Number of highest-probability labels to return when no
            thresholds are given, or when no label clears its threshold.
        thresholds: Optional mapping from class name to probability cut-off.
            Classes missing from the mapping use 0.5.

    Returns:
        List of class names from ``mlb.classes_``. With thresholds, every
        label at or above its cut-off is returned in class-index order;
        otherwise the ``topk`` labels by descending probability.
    """
    encoded = tokenizer(
        text,
        max_length=MAX_LEN,
        truncation=True,
        padding="max_length",
        return_tensors="pt",
    ).to(device)
    with torch.no_grad():
        raw_logits = model(**encoded).logits
        scores = torch.sigmoid(raw_logits.float()).cpu().numpy()[0]

    # Indices of the top-k classes, used directly or as the fallback.
    ranked = np.argsort(-scores)[:topk]
    if thresholds is None:
        chosen = ranked
    else:
        chosen = [
            idx
            for idx, score in enumerate(scores)
            if score >= thresholds.get(mlb.classes_[idx], 0.5)
        ]
        # Nothing cleared its threshold: fall back to the top-k labels.
        if not chosen:
            chosen = ranked
    return [mlb.classes_[idx] for idx in chosen]

# Smoke test with a Korean query, roughly: "I like the cool, slightly sweet
# scent you can feel at the seaside", using the tuned per-label thresholds.
print("\n[Example Prediction]")
print(predict_multilingual("바닷가에서 느껴지는 시원하고 약간 달콤한 향이 좋아요", topk=3, thresholds=thresholds))


[Device] cuda


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at unsloth/llama-3-8b-Instruct and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1/1: 100%|██████████| 2627/2627 [11:54<00:00,  3.68it/s]


[Train] Epoch 1 Loss: 0.1567

[Best Thresholds per label]
$$$: 0.00
Amber: 0.27
Aromatic: 0.09
Blossom: 0.02
Bouquet: 0.02
Carnation: 0.00
Citrus: 0.27
Classical: 0.25
Crisp: 0.16
Dry: 0.11
Floral: 0.43
Flower: 0.10
Fougère: 0.08
Fresh: 0.01
Fresher: 0.32
Fruity: 0.17
Gardenia: 0.01
Gourmand: 0.21
Green: 0.15
Honeysuckle: 0.00
Iris: 0.01
Jasmine: 0.23
Lilac: 0.02
Lily: 0.00
Magnolia: 0.01
Mimosa: 0.00
Mossy: 0.13
Musk: 0.15
Orange: 0.02
Oriental: 0.00
Rich: 0.02
Richer: 0.06
Rose: 0.18
Soft: 0.15
Spicy: 0.06
Tuberose: 0.16
Valley: 0.00
Violet: 0.13
Water: 0.07
White: 0.11
Woods: 0.24
Woody: 0.22
info: 0.00
of: 0.00
the: 0.00

=== Threshold-Optimized Results ===
Micro-F1: 0.4357
Macro-F1: 0.3437
Sample-F1: 0.4397

[classification_report @thr]
              precision    recall  f1-score   support

         $$$       0.00      1.00      0.00         1
       Amber       0.51      0.75      0.61      1744
    Aromatic       0.38      0.58      0.46       420
     Blossom       0.39      0.

