In [1]:
# 라이브러리
import os, torch
import pandas as pd
import numpy as np
from PIL import Image
from tqdm import tqdm

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import f1_score
from transformers import CLIPModel, CLIPProcessor
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch

# 설정
DATA_DIR = r"D:\Project\PJT_10\shopee-product-matching"
CSV_PATH = os.path.join(DATA_DIR, "train.csv")
IMG_DIR = os.path.join(DATA_DIR, "train_images")

BATCH_SIZE = 32
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# 데이터 로딩 및 인코딩
df = pd.read_csv(CSV_PATH)
df["label_group"] = df["label_group"].astype("category").cat.codes

# Group 기반 데이터 분할
from sklearn.model_selection import GroupShuffleSplit
gss = GroupShuffleSplit(n_splits=1, test_size=0.4, random_state=42)
train_idx, temp_idx = next(gss.split(df, groups=df["label_group"]))
train_df = df.iloc[train_idx].reset_index(drop=True)
temp_df = df.iloc[temp_idx].reset_index(drop=True)

gss2 = GroupShuffleSplit(n_splits=1, test_size=0.5, random_state=42)
val_idx, test_idx = next(gss2.split(temp_df, groups=temp_df["label_group"]))
val_df = temp_df.iloc[val_idx].reset_index(drop=True)
test_df = temp_df.iloc[test_idx].reset_index(drop=True)

# Dataset 정의
class ShopeeDataset(Dataset):
    def __init__(self, df, img_dir, processor):
        self.df = df
        self.img_dir = img_dir
        self.processor = processor

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        image = Image.open(os.path.join(self.img_dir, row["image"])).convert("RGB")
        text = row["title"]
        return self.processor(text=[text], images=[image], return_tensors="pt", padding=True, truncation=True)

# Collate 함수
def collate_fn(batch):
    input_ids = torch.cat([item["input_ids"] for item in batch], dim=0)
    attention_mask = torch.cat([item["attention_mask"] for item in batch], dim=0)
    pixel_values = torch.cat([item["pixel_values"] for item in batch], dim=0)
    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "pixel_values": pixel_values
    }

# 모델 및 Processor
clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to(DEVICE)
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

# 임베딩 함수
def get_embeddings(model, loader):
    model.eval()
    all_embeddings = []
    with torch.no_grad():
        for batch in tqdm(loader, desc="Embedding"):
            inputs = {k: v.to(DEVICE) for k, v in batch.items()}
            image_embeds = model.get_image_features(pixel_values=inputs["pixel_values"])
            text_embeds = model.get_text_features(input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"])
            combined = torch.cat([image_embeds, text_embeds], dim=1)
            all_embeddings.append(combined.cpu())
    return torch.cat(all_embeddings, dim=0)

# Dataloader
train_loader = DataLoader(ShopeeDataset(train_df, IMG_DIR, clip_processor), batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)
val_loader = DataLoader(ShopeeDataset(val_df, IMG_DIR, clip_processor), batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)
test_loader = DataLoader(ShopeeDataset(test_df, IMG_DIR, clip_processor), batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)

# F1 & 정확도 평가 함수
def compute_row_wise_metrics(embeds, df, top_k=5):
    sim_matrix = cosine_similarity(embeds)
    np.fill_diagonal(sim_matrix, -1)  # 자기 자신 제외

    preds = []
    targets = []

    for i, row in df.iterrows():
        true_ids = df[df["label_group"] == row["label_group"]]["posting_id"].tolist()
        top_idx = np.argsort(sim_matrix[i])[::-1][:top_k]
        pred_ids = df.iloc[top_idx]["posting_id"].tolist()
        preds.append(set(pred_ids))
        targets.append(set(true_ids))

    # F1 Score (macro 평균)
    row_f1 = [f1_score(list(t), list(p), average='macro') for p, t in zip(preds, targets)]

    # Accuracy: 정확히 예측했는가 (set 기준 완전 일치)
    row_acc = [1.0 if p == t else 0.0 for p, t in zip(preds, targets)]

    return np.mean(row_f1), np.mean(row_acc)


def compute_loss_cosine(embeds, df, top_k=5):
    from sklearn.metrics.pairwise import cosine_similarity

    sim_matrix = cosine_similarity(embeds)
    np.fill_diagonal(sim_matrix, -1)

    # 정답 벡터
    y_true = []
    y_pred = []

    posting_ids = df["posting_id"].tolist()
    pid_to_label = dict(zip(df["posting_id"], df["label_group"]))

    for i, row in df.iterrows():
        anchor_label = row["label_group"]
        anchor_id = row["posting_id"]

        # 상위 top_k 후보 선택
        sim_row = sim_matrix[i]
        top_indices = np.argsort(sim_row)[::-1][:top_k]
        for j in top_indices:
            candidate_id = df.iloc[j]["posting_id"]
            candidate_label = df.iloc[j]["label_group"]
            similarity = sim_matrix[i][j]

            y_true.append(1.0 if anchor_label == candidate_label else 0.0)
            y_pred.append(similarity)

    # Normalize similarity to [0, 1] if needed
    y_pred = np.array(y_pred)
    y_pred = (y_pred - y_pred.min()) / (y_pred.max() - y_pred.min() + 1e-8)

    y_true_tensor = torch.tensor(y_true, dtype=torch.float32)
    y_pred_tensor = torch.tensor(y_pred, dtype=torch.float32)

    criterion = nn.BCELoss()
    loss = criterion(y_pred_tensor, y_true_tensor)

    return loss.item()


# 임베딩 추출
print("🔹 Train Embedding 중...")
train_embeddings = get_embeddings(clip_model, train_loader)

print("🔹 Validation Embedding 중...")
val_embeddings = get_embeddings(clip_model, val_loader)

print("🔹 Test Embedding 중...")
test_embeddings = get_embeddings(clip_model, test_loader)

# 평가
print("\n📊 [Train Metrics]")
train_f1, train_acc = compute_row_wise_metrics(train_embeddings, train_df)
train_loss = compute_loss_cosine(train_embeddings, train_df)
print(f"Mean F1 (train): {train_f1:.4f}")
print(f"Mean Accuracy (train): {train_acc:.4f}")
print(f"Loss (train): {train_loss:.4f}")

print("\n📊 [Val Metrics]")
val_f1, val_acc = compute_row_wise_metrics(val_embeddings, val_df)
val_loss = compute_loss_cosine(val_embeddings, val_df)
print(f"Mean F1 (val): {val_f1:.4f}")
print(f"Mean Accuracy (val): {val_acc:.4f}")
print(f"Loss (val): {val_loss:.4f}")

print("\n📊 [Test Metrics]")
test_f1, test_acc = compute_row_wise_metrics(test_embeddings, test_df)
test_loss = compute_loss_cosine(test_embeddings, test_df)
print(f"Mean F1 (test): {test_f1:.4f}")
print(f"Mean Accuracy (test): {test_acc:.4f}")
print(f"Loss (test): {test_loss:.4f}")

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


🔹 Train Embedding 중...


Embedding:   0%|                                                                        | 0/638 [00:00<?, ?it/s]


RuntimeError: Sizes of tensors must match except in dimension 0. Expected size 6 but got size 13 for tensor number 1 in the list.