In [3]:
# 라이브러리
import os, torch
import pandas as pd
import numpy as np
from PIL import Image
from tqdm import tqdm

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import f1_score
from transformers import CLIPModel, CLIPProcessor
from sentence_transformers import SentenceTransformer

from torch.utils.data import Dataset, DataLoader

# 설정
DATA_DIR = r"D:\Project\PJT_10\shopee-product-matching"
CSV_PATH = os.path.join(DATA_DIR, "train.csv")
IMG_DIR = os.path.join(DATA_DIR, "train_images")

BATCH_SIZE = 32
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# 데이터
df = pd.read_csv(CSV_PATH)
df["label_group"] = df["label_group"].astype("category").cat.codes

from sklearn.model_selection import GroupShuffleSplit
gss = GroupShuffleSplit(n_splits=1, test_size=0.4, random_state=42)
train_idx, temp_idx = next(gss.split(df, groups=df["label_group"]))
train_df = df.iloc[train_idx].reset_index(drop=True)
temp_df = df.iloc[temp_idx].reset_index(drop=True)

gss2 = GroupShuffleSplit(n_splits=1, test_size=0.5, random_state=42)
val_idx, test_idx = next(gss2.split(temp_df, groups=temp_df["label_group"]))
val_df = temp_df.iloc[val_idx].reset_index(drop=True)
test_df = temp_df.iloc[test_idx].reset_index(drop=True)

# Dataset
class ShopeeDataset(Dataset):
    def __init__(self, df, img_dir, processor):
        self.df = df
        self.img_dir = img_dir
        self.processor = processor

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        image = Image.open(os.path.join(self.img_dir, row["image"])).convert("RGB")
        return self.processor(images=image, return_tensors="pt", padding=True, truncation=True)

def collate_fn(batch):
    pixel_values = torch.cat([item["pixel_values"] for item in batch], dim=0)
    return {"pixel_values": pixel_values}

# 모델 불러오기
clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to(DEVICE)
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
text_model = SentenceTransformer("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")

# 텍스트 임베딩
def get_text_embeddings(df):
    texts = df["title"].tolist()
    return torch.tensor(text_model.encode(texts, batch_size=64, show_progress_bar=True, convert_to_numpy=True))

# 이미지 임베딩
def get_image_embeddings(model, processor, df):
    dataset = ShopeeDataset(df, IMG_DIR, processor)
    loader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)
    all_embeddings = []

    model.eval()
    with torch.no_grad():
        for batch in tqdm(loader, desc="Image Embedding"):
            pixel_values = batch["pixel_values"].to(DEVICE)
            image_features = model.get_image_features(pixel_values=pixel_values)
            all_embeddings.append(image_features.cpu())
    return torch.cat(all_embeddings, dim=0)

# 전체 posting_id 리스트와 매핑 딕셔너리 생성
all_posting_ids = test_df["posting_id"].tolist()
posting_id_to_idx = {pid: idx for idx, pid in enumerate(all_posting_ids)}

# posting_id 리스트를 0/1 벡터로 변환하는 함수
def ids_to_binary_vector(ids):
    vec = np.zeros(len(all_posting_ids), dtype=int)
    for id_ in ids:
        if id_ in posting_id_to_idx:
            vec[posting_id_to_idx[id_]] = 1
    return vec

# 평가 함수 (row-wise F1) - 수정됨
def compute_row_wise_f1(embeds, df, top_k=5):
    sim = cosine_similarity(embeds)
    np.fill_diagonal(sim, -1)
    
    preds = []
    targets = []

    for i, row in df.iterrows():
        true_ids = df[df["label_group"] == row["label_group"]]["posting_id"].tolist()
        top_idx = np.argsort(sim[i])[::-1][:top_k]
        pred_ids = df.iloc[top_idx]["posting_id"].tolist()

        preds.append(set(pred_ids))
        targets.append(set(true_ids))

    row_f1 = []
    for pred_set, true_set in zip(preds, targets):
        y_pred = ids_to_binary_vector(pred_set)
        y_true = ids_to_binary_vector(true_set)
        row_f1.append(f1_score(y_true, y_pred, average='macro'))

    return np.mean(row_f1)

# 임베딩 추출 및 평가 실행
print("🔹 텍스트 임베딩 중...")
text_embeds = get_text_embeddings(test_df)

print("🔹 이미지 임베딩 중...")
image_embeds = get_image_embeddings(clip_model, clip_processor, test_df)

combined_embeds = torch.cat([text_embeds, image_embeds], dim=1).numpy()

f1 = compute_row_wise_f1(combined_embeds, test_df)
print(f"\n📊 Mean Row-wise F1 (no fine-tuning): {f1:.4f}")

🔹 텍스트 임베딩 중...


Batches:   0%|          | 0/110 [00:00<?, ?it/s]

🔹 이미지 임베딩 중...


Image Embedding: 100%|████████████████████████████████████████████████████████| 220/220 [01:31<00:00,  2.40it/s]



📊 Mean Row-wise F1 (no fine-tuning): 0.6940
