In [1]:
# 0. 라이브러리 임포트
import os, random, datetime
import pandas as pd
import numpy as np
from PIL import Image
from itertools import combinations
from tqdm import tqdm

import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader

from transformers import CLIPProcessor, CLIPModel

# 1. Configuration
# NOTE(review): machine-specific absolute Windows path; adjust for other environments.
DATA_DIR = r"D:\Project\PJT_10\shopee-product-matching"
CSV_PATH = os.path.join(DATA_DIR, "train.csv")
IMG_DIR = os.path.join(DATA_DIR, "train_images")

# Model checkpoints and the training-log CSV are written here; created up front if missing.
SAVE_DIR = "./saved_models"
os.makedirs(SAVE_DIR, exist_ok=True)

BATCH_SIZE = 8 # <- 16 (presumably the value used before)
EPOCHS = 30 # <- 1 (presumably the value used before)
SEED = 42
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def set_seed(seed):
    """Seed the Python, NumPy and PyTorch random generators with ``seed``.

    The CUDA generators are seeded as well, but only when the module-level
    ``DEVICE`` is a CUDA device.
    """
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed)
    if DEVICE.type != "cuda":
        return
    torch.cuda.manual_seed_all(seed)


# Seed everything once, before the data is split and pairs are sampled.
set_seed(SEED)

# 2. Load the data and encode the labels
df = pd.read_csv(CSV_PATH).reset_index(drop=True)
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
# Replace the raw label_group values with integer ids stored in a new column.
df["label_encoded"] = label_encoder.fit_transform(df["label_group"])

# 3. Group-wise train/val/test split
from sklearn.model_selection import GroupShuffleSplit

# First split: hold out test_size=0.4 for val+test, grouping by label so that
# all rows of one label group land on the same side.
gss = GroupShuffleSplit(n_splits=1, test_size=0.4, random_state=SEED)
train_idx, temp_idx = next(gss.split(df, groups=df["label_encoded"]))

# The split yields positional indices, hence iloc; indices are reset so each
# frame is labelled 0..len-1 again.
train_df = df.iloc[train_idx].reset_index(drop=True)
temp_df = df.iloc[temp_idx].reset_index(drop=True)

# Second split: divide the held-out part in half (again by label group) into val and test.
gss2 = GroupShuffleSplit(n_splits=1, test_size=0.5, random_state=SEED)
val_idx, test_idx = next(gss2.split(temp_df, groups=temp_df["label_encoded"]))

val_df = temp_df.iloc[val_idx].reset_index(drop=True)
test_df = temp_df.iloc[test_idx].reset_index(drop=True)

print(f"Train size: {len(train_df)}, Val size: {len(val_df)}, Test size: {len(test_df)}")

# 4. Positive / negative pair generation
def create_pairs(df, max_neg_per_pos=2):
    """Build labelled index pairs for Siamese training.

    Every unordered pair of rows sharing a ``label_encoded`` value becomes a
    positive ``(i, j, 1)``. Negatives ``(i, j, 0)`` are ordered pairs of rows
    with different labels, drawn by rejection sampling with the global
    ``random`` module until ``max_neg_per_pos`` negatives per positive exist.

    If more negatives are requested than distinct cross-group ordered pairs
    exist (for example when ``df`` holds a single label group), the request is
    capped at the number available and a warning is issued, instead of
    sampling forever.

    Args:
        df: DataFrame with a ``label_encoded`` column. The returned pairs hold
            its index labels, which are assumed to be unique.
        max_neg_per_pos: Number of negative pairs requested per positive pair.

    Returns:
        A list of ``(i, j, label)`` tuples: all positives (groups in order of
        first appearance), followed by the sampled negatives.
    """
    import warnings

    all_indices = df.index.tolist()
    # Index label -> encoded label, built once so that neither the grouping
    # nor the sampling loop needs a DataFrame lookup per access.
    label_of = dict(zip(all_indices, df["label_encoded"].tolist()))

    # Bucket index labels by group; dict insertion order keeps groups in order
    # of first appearance and members in row order.
    members = {}
    for idx, lab in label_of.items():
        members.setdefault(lab, []).append(idx)

    # combinations() yields nothing for single-member groups, so they drop out
    # without an explicit size check.
    pairs = [
        (i, j, 1)
        for idxs in members.values()
        for i, j in combinations(idxs, 2)
    ]

    neg_needed = len(pairs) * max_neg_per_pos

    # Number of distinct ordered (i, j) pairs whose labels differ: all ordered
    # pairs minus the ordered pairs that stay inside one group.
    n_rows = len(all_indices)
    max_neg = n_rows * (n_rows - 1) - sum(
        len(idxs) * (len(idxs) - 1) for idxs in members.values()
    )
    if neg_needed > max_neg:
        warnings.warn(
            f"create_pairs: requested {neg_needed} negative pairs but only "
            f"{max_neg} distinct ones exist; using {max_neg}.",
            stacklevel=2,
        )
        neg_needed = max_neg

    # Rejection sampling; the set removes repeated draws of the same ordered pair.
    neg_pairs = set()
    while len(neg_pairs) < neg_needed:
        i, j = random.sample(all_indices, 2)
        if label_of[i] != label_of[j]:
            neg_pairs.add((i, j))
    pairs.extend((i, j, 0) for i, j in neg_pairs)

    return pairs

# 5. Dataset class
class ShopeePairDataset(Dataset):
    """Dataset of product pairs, each product given as an RGB image plus title.

    ``pairs`` holds ``(index_a, index_b, label)`` tuples whose indices are
    looked up in ``df`` with ``.loc``; ``df`` must provide ``image`` (file name
    relative to ``img_dir``) and ``title`` columns. ``processor`` is stored on
    the instance but not used by this class itself.
    """

    def __init__(self, df, pairs, img_dir, processor):
        self.df, self.pairs = df, pairs
        self.img_dir, self.processor = img_dir, processor

    def __len__(self):
        # One sample per pair.
        return len(self.pairs)

    def _load_item(self, index_label):
        """Return ``(RGB image, title)`` for the row labelled ``index_label``."""
        record = self.df.loc[index_label]
        image_path = os.path.join(self.img_dir, record["image"])
        return Image.open(image_path).convert("RGB"), record["title"]

    def __getitem__(self, idx):
        first, second, target = self.pairs[idx]
        image1, text1 = self._load_item(first)
        image2, text2 = self._load_item(second)

        sample = {"image1": image1, "text1": text1}
        sample.update({"image2": image2, "text2": text2})
        sample["label"] = target
        return sample

# 6. Collate function
def collate_fn(batch):
    """Turn a list of pair samples into one dict of batched tensors.

    Each side of the pair (titles + images) is run through the module-level
    ``processor`` with padding and truncation enabled; the labels become a
    float tensor. Keys ending in ``1`` belong to the first item of each pair,
    keys ending in ``2`` to the second.
    """
    def column(key):
        # Gather one field across all samples of the batch.
        return [sample[key] for sample in batch]

    targets = torch.tensor(column("label"), dtype=torch.float)

    first = processor(
        text=column("text1"),
        images=column("image1"),
        return_tensors="pt",
        padding=True,
        truncation=True,
    )
    second = processor(
        text=column("text2"),
        images=column("image2"),
        return_tensors="pt",
        padding=True,
        truncation=True,
    )

    collated = {}
    collated["input_ids1"] = first["input_ids"]
    collated["attention_mask1"] = first["attention_mask"]
    collated["pixel_values1"] = first["pixel_values"]
    collated["input_ids2"] = second["input_ids"]
    collated["attention_mask2"] = second["attention_mask"]
    collated["pixel_values2"] = second["pixel_values"]
    collated["label"] = targets
    return collated

# 7. Model definition (Siamese)
class CLIPSiameseModel(nn.Module):
    """Pair classifier built on one shared CLIP backbone.

    Each item of a pair is embedded as the concatenation of its CLIP image and
    text features; both item embeddings are concatenated and passed to a small
    MLP head that outputs one raw logit per pair (no sigmoid is applied here).
    """

    def __init__(self, model_name):
        super().__init__()
        self.clip = CLIPModel.from_pretrained(model_name)
        # Head input: (image + text features) for each of the two items,
        # i.e. four projection-sized vectors.
        head_in = self.clip.config.projection_dim * 4
        head_layers = [
            nn.Linear(head_in, 512),
            nn.ReLU(),
            nn.Linear(512, 1),
        ]
        self.classifier = nn.Sequential(*head_layers)

    def _encode_item(self, input_ids, attention_mask, pixel_values):
        """Embed one item as ``[image features, text features]`` along dim 1."""
        text_emb = self.clip.get_text_features(
            input_ids=input_ids, attention_mask=attention_mask
        )
        image_emb = self.clip.get_image_features(pixel_values=pixel_values)
        return torch.cat([image_emb, text_emb], dim=1)

    def forward(self, input_ids1, attention_mask1, pixel_values1,
                      input_ids2, attention_mask2, pixel_values2):
        left = self._encode_item(input_ids1, attention_mask1, pixel_values1)
        right = self._encode_item(input_ids2, attention_mask2, pixel_values2)

        logits = self.classifier(torch.cat([left, right], dim=1))
        # Drop the trailing size-1 dimension produced by the last Linear layer.
        return logits.squeeze(1)

# 8. Prepare the datasets and data loaders
# collate_fn reads this module-level processor when it builds a batch.
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

# Pairs are generated separately for each split, from that split's own frame.
train_pairs = create_pairs(train_df)
val_pairs = create_pairs(val_df)
test_pairs = create_pairs(test_df)

train_dataset = ShopeePairDataset(train_df, train_pairs, IMG_DIR, processor)
val_dataset = ShopeePairDataset(val_df, val_pairs, IMG_DIR, processor)
test_dataset = ShopeePairDataset(test_df, test_pairs, IMG_DIR, processor)

# Only the training loader shuffles; validation and test keep a fixed order.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)

# 9. Training loop and evaluation
model = CLIPSiameseModel("openai/clip-vit-base-patch32").to(DEVICE)
# The model returns raw logits, so the sigmoid is folded into the loss.
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)

best_val_loss = float('inf')
log_list = []  # one dict of metrics per finished epoch

patience = 5  # maximum number of epochs allowed without improvement
counter = 0   # early-stopping counter

for epoch in range(EPOCHS):
    model.train()
    
    train_loss = 0
    correct = 0
    total = 0

    for batch in tqdm(train_loader, desc=f"Train Epoch {epoch+1}"):
        optimizer.zero_grad()
        # Everything except the label is forwarded to the model as keyword arguments.
        inputs = {k: v.to(DEVICE) for k, v in batch.items() if k != "label"}
        labels = batch["label"].to(DEVICE)

        outputs = model(**inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        
        probs = torch.sigmoid(outputs)        # logits -> probabilities
        preds = (probs >= 0.5).float()        # apply the decision threshold
        
        # Weight the batch-mean loss by batch size so the epoch value is a per-sample mean.
        train_loss += loss.item() * labels.size(0)
        
        correct += (preds == labels).sum().item()
        total += labels.size(0)

    train_loss /= len(train_loader.dataset)
    train_acc = correct / total

    # Validation pass: same metrics, no gradient tracking and no optimizer step.
    model.eval()
    val_loss = 0
    correct = 0
    total = 0

    with torch.no_grad():
        for batch in tqdm(val_loader, desc=f"Validation Epoch {epoch+1}"):
            inputs = {k: v.to(DEVICE) for k, v in batch.items() if k != "label"}
            labels = batch["label"].to(DEVICE)

            outputs = model(**inputs)
            loss = criterion(outputs, labels)
            val_loss += loss.item() * labels.size(0)
            
            probs = torch.sigmoid(outputs)
            preds = (probs >= 0.5).float()
            
            correct += (preds == labels).sum().item()
            total += labels.size(0)

    val_loss /= len(val_loader.dataset)
    val_acc = correct / total

    print(f"Epoch {epoch+1} | Train Loss: {train_loss:.4f} | Train Acc: {train_acc:.4f} | Val Loss: {val_loss:.4f} | Val Acc: {val_acc:.4f}")

    log_list.append({
        "epoch": epoch+1,
        "train_loss": train_loss,
        "train_acc": train_acc,
        "val_loss": val_loss,
        "val_acc": val_acc
    })

    # Early-stopping logic: checkpoint on every validation-loss improvement,
    # otherwise count the epoch and stop once `patience` is reached.
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        counter = 0
        timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
        save_path = os.path.join(SAVE_DIR, f"clip_pair_best_epoch{epoch+1}_{timestamp}.pth")
        torch.save(model.state_dict(), save_path)
        print(f"✅ Saved best model at epoch {epoch+1} to {save_path}")
    else:
        counter += 1
        print(f"⏳ EarlyStopping counter: {counter} out of {patience}")
        if counter >= patience:
            print(f"⚠️ Early stopping triggered at epoch {epoch+1}")
            break

# Write the per-epoch metrics to a timestamped CSV in SAVE_DIR.
log_df = pd.DataFrame(log_list)
log_csv_path = os.path.join(SAVE_DIR, f"training_log_{datetime.datetime.now().strftime('%Y%m%d_%H%M%S')}.csv")
log_df.to_csv(log_csv_path, index=False)
print(f"\n📊 Training log saved to {log_csv_path}")
    
def evaluate(model, loader, split_name="Test"):
    """Compute mean BCE-with-logits loss and accuracy of ``model`` on ``loader``.

    The model is switched to eval mode and run without gradient tracking.
    Every batch must be a dict containing a ``"label"`` tensor; all other
    entries are moved to ``DEVICE`` and passed to the model as keyword
    arguments. A pair is predicted positive when ``sigmoid(logit) >= 0.5``.

    Args:
        model: Module returning one logit per sample.
        loader: Iterable of batch dicts (the passed-in loader is the one used).
        split_name: Label used in the printed summary line.

    Returns:
        ``(avg_loss, accuracy)``, both averaged over the number of samples.

    Raises:
        ValueError: If ``loader`` yields no samples, since the averages would
            otherwise be a division by zero.
    """
    model.eval()
    criterion = nn.BCEWithLogitsLoss()

    correct = 0
    total = 0
    total_loss = 0.0

    with torch.no_grad():
        for batch in tqdm(loader, desc="Evaluating"):
            inputs = {k: v.to(DEVICE) for k, v in batch.items() if k != "label"}
            labels = batch["label"].to(DEVICE)
            batch_size = labels.size(0)

            outputs = model(**inputs)
            # The criterion returns a batch mean; re-weight it by batch size so
            # a smaller final batch does not skew the overall average.
            total_loss += criterion(outputs, labels).item() * batch_size

            preds = (torch.sigmoid(outputs) >= 0.5).float()
            correct += (preds == labels).sum().item()
            total += batch_size

    if total == 0:
        raise ValueError("evaluate() received a loader that yielded no samples")

    acc = correct / total
    avg_loss = total_loss / total

    print(f"\n🧪 {split_name} Loss: {avg_loss:.4f} | {split_name} Accuracy: {acc:.4f}")
    return avg_loss, acc

# Reload the best checkpoint and report metrics on the test split.
best_models = [
    f for f in os.listdir(SAVE_DIR)
    if f.startswith("clip_pair_best") and f.endswith(".pth")
]
if not best_models:
    raise FileNotFoundError(
        f"No 'clip_pair_best*.pth' checkpoint found in {SAVE_DIR}"
    )
# Checkpoints are written only when the validation loss improves, so the most
# recently written file is the best one. Sorting by file name is not reliable
# here: "epoch10_..." sorts before "epoch9_...", so a name sort can pick an
# older, worse checkpoint.
best_model_path = max(
    (os.path.join(SAVE_DIR, f) for f in best_models),
    key=os.path.getmtime,
)
model.load_state_dict(torch.load(best_model_path, map_location=DEVICE))
print(f"✅ Loaded best model from {best_model_path}")

evaluate(model, test_loader)

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


Train size: 20392, Val size: 6820, Test size: 7038


Train Epoch 1: 100%|██████████████████████████████████████████████████████| 9312/9312 [1:25:12<00:00,  1.82it/s]


Epoch 1 | Train Loss: 0.2787 | Val Loss: 0.2871 | Val Acc: 0.8884
✅ Saved best model at epoch 1 to ./saved_models\clip_pair_best_epoch1_20250715_225123.pth

📊 Training log saved to ./saved_models\training_log_20250715_225123.csv
✅ Loaded best model from ./saved_models\clip_pair_best_epoch1_20250715_225123.pth

🧪 Test Accuracy: 0.8905
