# 임베딩을 NPY, CSV로 저장 -> 잘 됨

In [7]:
# 0. 라이브러리 임포트
import os, random, datetime
import pandas as pd
import numpy as np
from PIL import Image
from itertools import combinations
from tqdm import tqdm

import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader

from transformers import CLIPProcessor, CLIPModel

# 1. Configuration
# Dataset root; the training CSV and the image folder both live under it.
DATA_DIR = r"D:\Project\PJT_10\shopee-product-matching"
IMG_DIR = os.path.join(DATA_DIR, "train_images")
CSV_PATH = os.path.join(DATA_DIR, "train.csv")

# Output folder for checkpoints and exported embeddings; created up front so
# later save calls do not fail on a missing directory.
SAVE_DIR = "./saved_models"
os.makedirs(SAVE_DIR, exist_ok=True)

# Run parameters.
SEED = 42
EPOCHS = 10
BATCH_SIZE = 32

# Prefer the GPU when torch reports one, otherwise fall back to the CPU.
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

def set_seed(seed):
    """Seed the Python, NumPy and PyTorch random number generators.

    Args:
        seed: Integer seed applied to every generator.
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Query torch directly instead of reading the module-level DEVICE, so the
    # function works regardless of where (or whether) DEVICE has been defined.
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)
set_seed(SEED)

# 2. Load the data and encode the labels
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GroupShuffleSplit

df = pd.read_csv(CSV_PATH)
df = df.reset_index(drop=True)
label_encoder = LabelEncoder()
# Map each label_group value to an integer id used as the grouping key below.
df["label_encoded"] = label_encoder.fit_transform(df["label_group"])

# 3. Group-wise train/val/test split
# Splitting on the label group keeps every member of a group in one partition.
# First cut: 40% of the groups are held out into a temporary pool.
gss = GroupShuffleSplit(n_splits=1, test_size=0.4, random_state=SEED)
train_idx, temp_idx = next(gss.split(df, groups=df["label_encoded"]))
train_df, temp_df = (
    df.iloc[rows].reset_index(drop=True) for rows in (train_idx, temp_idx)
)

# Second cut: the held-out pool is halved (by group) into validation and test.
gss2 = GroupShuffleSplit(n_splits=1, test_size=0.5, random_state=SEED)
val_idx, test_idx = next(gss2.split(temp_df, groups=temp_df["label_encoded"]))
val_df, test_df = (
    temp_df.iloc[rows].reset_index(drop=True) for rows in (val_idx, test_idx)
)

In [14]:
# Row counts of the train / validation / test partitions, in that order.
print(*map(len, (train_df, val_df, test_df)))

20392 6820 7038


In [8]:
import torch
import torch.nn as nn
from transformers import CLIPModel

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# 7. Model definition (Siamese)
class CLIPSiameseModel(nn.Module):
    """Pair scorer built on a single shared CLIP encoder.

    Each item of a pair (title tokens + image) is embedded with the same CLIP
    model; the two item embeddings are concatenated and passed through a small
    MLP that emits one raw (un-activated) score per pair.

    Args:
        model_name: Identifier passed to ``CLIPModel.from_pretrained``.
    """

    def __init__(self, model_name):
        super().__init__()
        self.clip = CLIPModel.from_pretrained(model_name)
        # Each item contributes image + text features (2 * projection_dim) and
        # a pair holds two items, hence the factor of 4 on the input width.
        self.classifier = nn.Sequential(
            nn.Linear(self.clip.config.projection_dim * 4, 512),
            nn.ReLU(),
            nn.Linear(512, 1)
        )

    def _encode(self, input_ids, attention_mask, pixel_values):
        """Return the concatenated [image, text] CLIP features of one item batch."""
        text_features = self.clip.get_text_features(input_ids=input_ids, attention_mask=attention_mask)
        image_features = self.clip.get_image_features(pixel_values=pixel_values)
        return torch.cat([image_features, text_features], dim=1)

    def forward(self, input_ids1, attention_mask1, pixel_values1,
                      input_ids2, attention_mask2, pixel_values2):
        """Score a batch of item pairs.

        Args:
            input_ids1, attention_mask1, pixel_values1: Tokenized title and
                image tensor of the first item of each pair.
            input_ids2, attention_mask2, pixel_values2: Same for the second item.

        Returns:
            The classifier output with dimension 1 squeezed away, i.e. one
            score per pair.
        """
        feat1 = self._encode(input_ids1, attention_mask1, pixel_values1)
        feat2 = self._encode(input_ids2, attention_mask2, pixel_values2)

        combined = torch.cat([feat1, feat2], dim=1)
        return self.classifier(combined).squeeze(1)

In [9]:
import torch
import os
import numpy as np
from PIL import Image
from tqdm import tqdm
from transformers import CLIPProcessor

SAVE_DIR = "./saved_models"
# Output locations for the embedding matrix and the matching posting IDs
EMBED_SAVE_PATH = os.path.join(SAVE_DIR, "test_embeddings.npy")
ID_SAVE_PATH = os.path.join(SAVE_DIR, "test_posting_ids.npy")

# One constant for the CLIP checkpoint so the model and the processor can
# never be loaded from two different names by accident.
CLIP_MODEL_NAME = "openai/clip-vit-base-patch32"

# 1. Build the model and restore the fine-tuned weights
model = CLIPSiameseModel(CLIP_MODEL_NAME).to(DEVICE)
best_model_path = os.path.join(SAVE_DIR, "clip_pair_best_epoch2_20250716_025548.pth")
# NOTE(review): torch.load unpickles the file, so only load checkpoints from a
# trusted source (consider weights_only=True where the torch version has it).
model.load_state_dict(torch.load(best_model_path, map_location=DEVICE))
model.eval()
print(f"✅ Loaded model from {best_model_path}")

# 2. Load the processor
processor = CLIPProcessor.from_pretrained(CLIP_MODEL_NAME)

# 3. Embed every test item on its own (single input, no pairing)
embeddings = []
posting_ids = []

with torch.no_grad():
    # Walk only the three columns that are used; this avoids building a full
    # Series per row the way iterrows does.
    rows = zip(test_df["image"], test_df["title"], test_df["posting_id"])
    for image_name, text, posting_id in tqdm(rows, total=len(test_df), desc="Generating embeddings"):
        # The context manager releases the file handle as soon as the pixels
        # have been converted.
        with Image.open(os.path.join(IMG_DIR, image_name)) as raw_image:
            image = raw_image.convert("RGB")

        inputs = processor(text=[text], images=[image], return_tensors="pt", padding=True, truncation=True).to(DEVICE)

        # Only the CLIP encoder of the Siamese model is used; the pair
        # classifier head is not involved here.
        text_feat = model.clip.get_text_features(input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"])
        image_feat = model.clip.get_image_features(pixel_values=inputs["pixel_values"])

        # Same [image, text] ordering as inside CLIPSiameseModel.forward.
        combined_feat = torch.cat([image_feat, text_feat], dim=1)  # expected shape: [1, 1024]
        embeddings.append(combined_feat.squeeze(0).cpu().numpy())
        posting_ids.append(posting_id)

# 4. Save as numpy arrays
embeddings = np.stack(embeddings)  # expected shape: [N, 1024]
np.save(EMBED_SAVE_PATH, embeddings)
np.save(ID_SAVE_PATH, np.array(posting_ids))

print(f"💾 Saved test embeddings to {EMBED_SAVE_PATH}")
print(f"🆔 Saved posting IDs to {ID_SAVE_PATH}")

✅ Loaded model from ./saved_models\clip_pair_best_epoch2_20250716_025548.pth


Generating embeddings: 100%|█████████████████████████████| 7038/7038 [03:09<00:00, 37.15it/s]

💾 Saved test embeddings to ./saved_models\test_embeddings.npy
🆔 Saved posting IDs to ./saved_models\test_posting_ids.npy





In [11]:
# Save as CSV

import pandas as pd

csv_path = os.path.join(SAVE_DIR, "test_embeddings.csv")
# One row per item: the embedding dimensions become the leading columns and
# posting_id is appended as the last column.
df_embed = pd.DataFrame(embeddings).assign(posting_id=posting_ids)
df_embed.to_csv(csv_path, index=False)
print(f"🧾 Saved embeddings to CSV at {csv_path}")

🧾 Saved embeddings to CSV at ./saved_models\test_embeddings.csv


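# faiss, ANN -> currently failing: the `faiss` package is not installed (ModuleNotFoundError below), so the index-building cells never ran and the last cell fails with NameError on `index`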

In [1]:
# ✅ 1. Faiss 기반 최근접 검색 코드
# 🔧 설정: 라이브러리 및 파일 로딩

import numpy as np
import faiss

# Load the embeddings and their posting IDs
embedding_path = "./saved_models/test_embeddings.npy"
id_path = "./saved_models/test_posting_ids.npy"

# Cast to float32 right after loading; expected shape (N, 1024).
raw_embeddings = np.load(embedding_path)
embeddings = raw_embeddings.astype("float32")
# One posting ID per embedding row; expected shape (N,).
posting_ids = np.load(id_path)

print(f"✅ Embedding shape: {embeddings.shape}, IDs: {len(posting_ids)}")

ModuleNotFoundError: No module named 'faiss'

In [None]:
# ✅ 2. Build the Faiss index

# Flat index that compares vectors by L2 distance, sized to the embedding width
vector_count, dim = embeddings.shape  # dim is usually 1024
index = faiss.IndexFlatL2(dim)

# Register every embedding with the index
index.add(embeddings)
print(f"✅ Index populated with {index.ntotal} items.")

In [None]:
# ✅ 3. Nearest-neighbour search (Top-K)

# Query: the first embedding of the test set
query_vec = embeddings[0].reshape(1, -1)  # (1, 1024)

# Search the K nearest neighbours (the query item itself is among them)
k = 5
distances, indices = index.search(query_vec, k)

# Print the results
print("\n🔍 Top-K Nearest Neighbors:")
for rank, (idx, dist) in enumerate(zip(indices[0], distances[0]), start=1):
    if idx < 0:
        # Faiss pads the result with label -1 when the index holds fewer than
        # k vectors; indexing posting_ids with -1 would silently report the
        # last posting ID, so stop at the first padded slot.
        break
    print(f"{rank}. ID: {posting_ids[idx]}, Distance: {dist:.4f}")

In [2]:
# ✅ 4. (Optional) Store the Top-K nearest neighbours of every item

# Retrieve the top-K most similar posting IDs for the whole test set
k = 5
all_distances, all_indices = index.search(embeddings, k)

# Collect the results (one dict per anchor item)
results = []
for anchor_id, neighbors in zip(posting_ids, all_indices):
    neighbor_ids = [
        posting_ids[n]
        for n in neighbors
        # n < 0 is Faiss padding for "fewer than k neighbours found"; without
        # this guard posting_ids[-1] would be written as a false match.
        # The anchor itself is dropped from its own match list.
        if n >= 0 and posting_ids[n] != anchor_id
    ]
    results.append({"posting_id": anchor_id, "matches": " ".join(neighbor_ids)})

# Save with pandas
import pandas as pd
result_path = "./saved_models/faiss_topk_result.csv"
result_df = pd.DataFrame(results)
result_df.to_csv(result_path, index=False)
print(f"💾 Saved top-K retrieval result to {result_path}")

NameError: name 'index' is not defined