In [1]:
import os
import random
import pandas as pd
import numpy as np
from PIL import Image
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import shuffle
from tqdm import tqdm

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, models

from transformers import BertTokenizer, BertModel
from torchvision.models import efficientnet_b0, EfficientNet_B0_Weights

In [2]:
# 0. 환경 설정
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
DATA_DIR = r"D:\Project\PJT_10\shopee-product-matching"
IMG_DIR = os.path.join(DATA_DIR, "train_images")
CSV_PATH = os.path.join(DATA_DIR, "train.csv")
MODEL_SAVE_PATH = "./model_best.pth"
BATCH_SIZE = 32
EPOCHS = 1
SEED = 42

In [3]:
def set_seed(seed=SEED):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if DEVICE.type=='cuda':
        torch.cuda.manual_seed_all(seed)

In [4]:
class ShopeeDataset(Dataset):
    def __init__(self, df, img_dir, tokenizer, max_len=32, transform=None):
        self.df = df
        self.img_dir = img_dir
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.transform = transform
        
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        img_path = os.path.join(self.img_dir, row['image'])
        try:
            image = Image.open(img_path).convert('RGB')
        except Exception as e:
            print(f"Error loading image: {img_path}, error: {e}")
            image = Image.new('RGB', (224,224), (0,0,0))
        
        if self.transform:
            image = self.transform(image)
        
        text = row['title']
        encoding = self.tokenizer.encode_plus(
            text,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        label = row['label_encoded']
        
        return {
            'image': image,
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'label': torch.tensor(label, dtype=torch.long)
        }

In [5]:
class ImageTextClassifier(nn.Module):
    def __init__(self, num_classes):
        super().__init__()
        weights = EfficientNet_B0_Weights.DEFAULT
        self.image_model = models.efficientnet_b0(weights=weights)
        self.image_model.classifier = nn.Identity()  # feature extractor
        
        self.text_model = BertModel.from_pretrained('bert-base-uncased')
        for param in self.text_model.parameters():
            param.requires_grad = False
        
        self.classifier = nn.Sequential(
            nn.Linear(1280 + 768, 512),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(512, num_classes)
        )
    
    def forward(self, image, input_ids, attention_mask):
        img_feat = self.image_model(image)
        txt_feat = self.text_model(input_ids=input_ids, attention_mask=attention_mask).pooler_output
        combined = torch.cat((img_feat, txt_feat), dim=1)
        output = self.classifier(combined)
        return output

In [6]:
def train_one_epoch(model, loader, optimizer, criterion):
    model.train()
    total_loss, total_correct = 0, 0
    loop = tqdm(loader, desc="Train", leave=True)
    
    for batch in loop:
        optimizer.zero_grad()
        image = batch['image'].to(DEVICE)
        input_ids = batch['input_ids'].to(DEVICE)
        attention_mask = batch['attention_mask'].to(DEVICE)
        labels = batch['label'].to(DEVICE)
        
        outputs = model(image, input_ids, attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item() * image.size(0)
        preds = outputs.argmax(dim=1)
        total_correct += (preds == labels).sum().item()
        
        loop.set_postfix(loss=loss.item())
    
    return total_loss / len(loader.dataset), total_correct / len(loader.dataset)

In [7]:
def validate_one_epoch(model, loader, criterion):
    model.eval()
    total_loss, total_correct = 0, 0
    loop = tqdm(loader, desc="Val", leave=True)
    
    with torch.no_grad():
        for batch in loop:
            image = batch['image'].to(DEVICE)
            input_ids = batch['input_ids'].to(DEVICE)
            attention_mask = batch['attention_mask'].to(DEVICE)
            labels = batch['label'].to(DEVICE)
            
            outputs = model(image, input_ids, attention_mask)
            loss = criterion(outputs, labels)
            
            total_loss += loss.item() * image.size(0)
            preds = outputs.argmax(dim=1)
            total_correct += (preds == labels).sum().item()
            
            loop.set_postfix(loss=loss.item())
    
    return total_loss / len(loader.dataset), total_correct / len(loader.dataset)

In [8]:
def test_model(model, loader):
    model.eval()
    total_correct = 0
    with torch.no_grad():
        for batch in loader:
            image = batch['image'].to(DEVICE)
            input_ids = batch['input_ids'].to(DEVICE)
            attention_mask = batch['attention_mask'].to(DEVICE)
            labels = batch['label'].to(DEVICE)
            
            outputs = model(image, input_ids, attention_mask)
            preds = outputs.argmax(dim=1)
            total_correct += (preds == labels).sum().item()
    acc = total_correct / len(loader.dataset)
    print(f"Test Accuracy: {acc:.4f}")

In [10]:
if __name__ == "__main__":
    set_seed()
    
    print("CUDA available:", torch.cuda.is_available())
    if torch.cuda.is_available():
        print("Device name:", torch.cuda.get_device_name(0))
        print("Current device:", torch.cuda.current_device())
    
    # 1. 데이터 로딩 및 그룹 단위 분할 함수
    df = pd.read_csv(CSV_PATH)
    
    # LabelEncoder 전체 데이터셋 기준으로 한 번만 학습
    le = LabelEncoder()
    df['label_encoded'] = le.fit_transform(df['label_group'])
    
    label_groups = df['label_group'].unique()
    train_groups, test_groups = train_test_split(label_groups, test_size=0.2, random_state=SEED)
    train_groups, val_groups = train_test_split(train_groups, test_size=0.25, random_state=SEED)  # 60/20/20
    
    def filter_by_groups(df, groups):
        return df[df['label_group'].isin(groups)].reset_index(drop=True)
    
    train_df = filter_by_groups(df, train_groups)
    val_df = filter_by_groups(df, val_groups)
    test_df = filter_by_groups(df, test_groups)
    
    print(f"Train size: {len(train_df)}, Val size: {len(val_df)}, Test size: {len(test_df)}")
    
    # 클래스 분포 확인
    def print_class_distribution(df, name="Dataset"):
        dist = df['label_group'].value_counts()
        print(f"{name} class distribution: {len(dist)} classes")
        print(dist.describe())
    
    print_class_distribution(train_df, "Train")
    print_class_distribution(val_df, "Val")
    print_class_distribution(test_df, "Test")
    
    # 2. Dataset, DataLoader 생성
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    
    train_transform = transforms.Compose([
        transforms.Resize((224,224)),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485,0.456,0.406], std=[0.229,0.224,0.225])
    ])
    val_transform = transforms.Compose([
        transforms.Resize((224,224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485,0.456,0.406], std=[0.229,0.224,0.225])
    ])
    
    train_dataset = ShopeeDataset(train_df, IMG_DIR, tokenizer, transform=train_transform)
    val_dataset = ShopeeDataset(val_df, IMG_DIR, tokenizer, transform=val_transform)
    test_dataset = ShopeeDataset(test_df, IMG_DIR, tokenizer, transform=val_transform)
    
    # num_workers 테스트 목적상 우선 0으로 시작하는 것을 추천합니다.
    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=0)
    val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=0)
    test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=0)
    
    # 3. 모델 정의
    num_classes = len(le.classes_)
    model = ImageTextClassifier(num_classes).to(DEVICE)
    
    # 4. 학습 설정
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
    
    # 6. 학습 루프 + 모델 저장 + epoch 기록
    best_val_acc = 0
    history = {'train_loss': [], 'train_acc': [], 'val_loss': [], 'val_acc': []}
    
    for epoch in range(EPOCHS):
        train_loss, train_acc = train_one_epoch(model, train_loader, optimizer, criterion)
        val_loss, val_acc = validate_one_epoch(model, val_loader, criterion)
        
        history['train_loss'].append(train_loss)
        history['train_acc'].append(train_acc)
        history['val_loss'].append(val_loss)
        history['val_acc'].append(val_acc)
        
        print(f"Epoch {epoch+1}/{EPOCHS} - Train Loss: {train_loss:.4f}, Acc: {train_acc:.4f} | Val Loss: {val_loss:.4f}, Acc: {val_acc:.4f}")
        
        # 무조건 첫 epoch에는 모델 저장
        if epoch == 0 or val_acc > best_val_acc:
            best_val_acc = val_acc
            torch.save(model.state_dict(), MODEL_SAVE_PATH)
            print(f"Model saved with val_acc: {best_val_acc:.4f}")
        
        if val_acc > best_val_acc:
            best_val_acc = val_acc
            torch.save(model.state_dict(), MODEL_SAVE_PATH)
            print(f"Model saved with val_acc: {best_val_acc:.4f}")
    
    # 7. 테스트 평가
    model.load_state_dict(torch.load(MODEL_SAVE_PATH))
    test_model(model, test_loader)

CUDA available: True
Device name: NVIDIA GeForce RTX 4070 Laptop GPU
Current device: 0
Train size: 20376, Val size: 6839, Test size: 7035
Train class distribution: 6608 classes
count    6608.000000
mean        3.083535
std         2.999922
min         2.000000
25%         2.000000
50%         2.000000
75%         3.000000
max        51.000000
Name: count, dtype: float64
Val class distribution: 2203 classes
count    2203.000000
mean        3.104403
std         2.531005
min         2.000000
25%         2.000000
50%         2.000000
75%         3.000000
max        41.000000
Name: count, dtype: float64
Test class distribution: 2203 classes
count    2203.000000
mean        3.193373
std         3.136294
min         2.000000
25%         2.000000
50%         2.000000
75%         3.000000
max        51.000000
Name: count, dtype: float64


Train: 100%|███████████████████████████████████████████████████████| 637/637 [03:20<00:00,  3.17it/s, loss=7.17]
Val: 100%|█████████████████████████████████████████████████████████| 214/214 [01:01<00:00,  3.47it/s, loss=10.3]


Epoch 1/1 - Train Loss: 8.8878, Acc: 0.0213 | Val Loss: 10.2685, Acc: 0.0000
Model saved with val_acc: 0.0000
Test Accuracy: 0.0000
