# 2회차 (1회차에서 이어서 훈련)

In [5]:
# 0. 라이브러리 임포트
import os, random
import pandas as pd
import numpy as np
from PIL import Image
from tqdm import tqdm

import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader

from transformers import CLIPProcessor, CLIPModel

# 1. 설정
# 1. 설정
DATA_DIR = r"D:\Project\PJT_10\shopee-product-matching"
CSV_PATH = os.path.join(DATA_DIR, "train.csv")
IMG_DIR = os.path.join(DATA_DIR, "train_images")
MODEL_LOAD_PATH = "./clip_best0.pth"
MODEL_SAVE_PATH = "./clip_best.pth"

model = CLIPForClassification(clip_model_name, num_classes).to(DEVICE)

# ✅ 이전 체크포인트에서 가중치 불러오기
model.load_state_dict(torch.load(MODEL_LOAD_PATH))
print(f"✅ Loaded pretrained model from {MODEL_LOAD_PATH}")


BATCH_SIZE = 32
EPOCHS = 1
SEED = 42
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def set_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if DEVICE.type == "cuda":
        torch.cuda.manual_seed_all(seed)

set_seed(SEED)

# 2. 데이터 로딩 및 레이블 인코딩
df = pd.read_csv(CSV_PATH).reset_index(drop=True)
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
df["label_encoded"] = label_encoder.fit_transform(df["label_group"])
num_classes = df["label_encoded"].nunique()

# 3. 데이터셋 및 전처리 정의
clip_model_name = "openai/clip-vit-base-patch32"
processor = CLIPProcessor.from_pretrained(clip_model_name)

class ShopeeCLIPDataset(Dataset):
    def __init__(self, dataframe, img_dir):
        self.df = dataframe
        self.img_dir = img_dir

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        image_path = os.path.join(self.img_dir, row["image"])
        image = Image.open(image_path).convert("RGB")
        text = row["title"]
        label = row["label_encoded"]
        return {"image": image, "text": text, "label": label}

# 4. 모델 정의
class CLIPForClassification(nn.Module):
    def __init__(self, model_name, num_classes):
        super().__init__()
        self.clip = CLIPModel.from_pretrained(model_name)
        self.classifier = nn.Linear(self.clip.config.projection_dim * 2, num_classes)

    def forward(self, input_ids, attention_mask, pixel_values):
        text_features = self.clip.get_text_features(input_ids=input_ids, attention_mask=attention_mask)
        image_features = self.clip.get_image_features(pixel_values=pixel_values)
        combined = torch.cat([image_features, text_features], dim=1)
        logits = self.classifier(combined)
        return logits

model = CLIPForClassification(clip_model_name, num_classes).to(DEVICE)

def collate_fn(batch):
    texts = [item["text"] for item in batch]
    images = [item["image"] for item in batch]
    labels = torch.tensor([item["label"] for item in batch], dtype=torch.long)
    inputs = processor(
        text=texts,
        images=images,
        return_tensors="pt",
        padding=True,
        truncation=True
    )
    return {
        "input_ids": inputs["input_ids"],
        "attention_mask": inputs["attention_mask"],
        "pixel_values": inputs["pixel_values"],
        "label": labels
    }

# 5. Train/Val/Test DataLoader 구성
from sklearn.model_selection import train_test_split
train_df, temp_df = train_test_split(df, test_size=0.4, shuffle=True, random_state=SEED)
val_df, test_df = train_test_split(temp_df, test_size=0.5, shuffle=True, random_state=SEED)

train_loader = DataLoader(ShopeeCLIPDataset(train_df, IMG_DIR), batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)
val_loader   = DataLoader(ShopeeCLIPDataset(val_df, IMG_DIR), batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)
test_loader  = DataLoader(ShopeeCLIPDataset(test_df, IMG_DIR), batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)

# 6. 학습 루프
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
best_val_acc = 0

for epoch in range(EPOCHS):
    model.train()
    train_loss, train_correct = 0, 0
    for batch in tqdm(train_loader, desc=f"Train Epoch {epoch+1}"):
        input_ids = batch["input_ids"].to(DEVICE)
        attention_mask = batch["attention_mask"].to(DEVICE)
        pixel_values = batch["pixel_values"].to(DEVICE)
        labels = batch["label"].to(DEVICE)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask, pixel_values)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        train_loss += loss.item() * labels.size(0)
        preds = outputs.argmax(dim=1)
        train_correct += (preds == labels).sum().item()

    train_acc = train_correct / len(train_loader.dataset)
    train_loss /= len(train_loader.dataset)

    # 검증
    model.eval()
    val_loss, val_correct = 0, 0
    with torch.no_grad():
        for batch in tqdm(val_loader, desc=f"Val Epoch {epoch+1}"):
            input_ids = batch["input_ids"].to(DEVICE)
            attention_mask = batch["attention_mask"].to(DEVICE)
            pixel_values = batch["pixel_values"].to(DEVICE)
            labels = batch["label"].to(DEVICE)

            outputs = model(input_ids, attention_mask, pixel_values)
            loss = criterion(outputs, labels)

            val_loss += loss.item() * labels.size(0)
            preds = outputs.argmax(dim=1)
            val_correct += (preds == labels).sum().item()

    val_acc = val_correct / len(val_loader.dataset)
    val_loss /= len(val_loader.dataset)

    print(f"Epoch {epoch+1} | Train Loss: {train_loss:.4f}, Acc: {train_acc:.4f} | Val Loss: {val_loss:.4f}, Acc: {val_acc:.4f}")

    if val_acc > best_val_acc:
        best_val_acc = val_acc
        torch.save(model.state_dict(), MODEL_SAVE_PATH)
        print(f"✅ Saved new best model (Val Acc: {val_acc:.4f})")

# 7. 테스트 평가
def evaluate(model, loader):
    model.eval()
    correct = 0
    with torch.no_grad():
        for batch in loader:
            input_ids = batch["input_ids"].to(DEVICE)
            attention_mask = batch["attention_mask"].to(DEVICE)
            pixel_values = batch["pixel_values"].to(DEVICE)
            labels = batch["label"].to(DEVICE)

            outputs = model(input_ids, attention_mask, pixel_values)
            preds = outputs.argmax(dim=1)
            correct += (preds == labels).sum().item()
    acc = correct / len(loader.dataset)
    print(f"\n🧪 Test Accuracy: {acc:.4f}")

# 8. 최종 테스트
model.load_state_dict(torch.load(MODEL_SAVE_PATH))
evaluate(model, test_loader)

✅ Loaded pretrained model from ./clip_best0.pth


Train Epoch 1: 100%|██████████████████████████████████████████████████████████| 643/643 [05:22<00:00,  1.99it/s]
Val Epoch 1: 100%|████████████████████████████████████████████████████████████| 215/215 [01:17<00:00,  2.76it/s]


Epoch 1 | Train Loss: 9.1839, Acc: 0.0319 | Val Loss: 8.9730, Acc: 0.0724
✅ Saved new best model (Val Acc: 0.0724)

🧪 Test Accuracy: 0.0689


# 1-2회차 (실행 안 함. 처음부터 훈련.)

In [None]:
# 0. 라이브러리 임포트
import os, random
import pandas as pd
import numpy as np
from PIL import Image
from tqdm import tqdm

import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader

from transformers import CLIPProcessor, CLIPModel

# 1. 설정
DATA_DIR = r"D:\Project\PJT_10\shopee-product-matching"
CSV_PATH = os.path.join(DATA_DIR, "train.csv")
IMG_DIR = os.path.join(DATA_DIR, "train_images")
MODEL_SAVE_PATH = "./clip_best.pth"

BATCH_SIZE = 32
EPOCHS = 1
SEED = 42
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def set_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if DEVICE.type == "cuda":
        torch.cuda.manual_seed_all(seed)

set_seed(SEED)

# 2. 데이터 로딩 및 레이블 인코딩
df = pd.read_csv(CSV_PATH).reset_index(drop=True)
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
df["label_encoded"] = label_encoder.fit_transform(df["label_group"])
num_classes = df["label_encoded"].nunique()

# 3. 데이터셋 및 전처리 정의
clip_model_name = "openai/clip-vit-base-patch32"
processor = CLIPProcessor.from_pretrained(clip_model_name)

class ShopeeCLIPDataset(Dataset):
    def __init__(self, dataframe, img_dir):
        self.df = dataframe
        self.img_dir = img_dir

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        image_path = os.path.join(self.img_dir, row["image"])
        image = Image.open(image_path).convert("RGB")
        text = row["title"]
        label = row["label_encoded"]
        return {"image": image, "text": text, "label": label}

# 4. 모델 정의
class CLIPForClassification(nn.Module):
    def __init__(self, model_name, num_classes):
        super().__init__()
        self.clip = CLIPModel.from_pretrained(model_name)
        self.classifier = nn.Linear(self.clip.config.projection_dim * 2, num_classes)

    def forward(self, input_ids, attention_mask, pixel_values):
        text_features = self.clip.get_text_features(input_ids=input_ids, attention_mask=attention_mask)
        image_features = self.clip.get_image_features(pixel_values=pixel_values)
        combined = torch.cat([image_features, text_features], dim=1)
        logits = self.classifier(combined)
        return logits

model = CLIPForClassification(clip_model_name, num_classes).to(DEVICE)

def collate_fn(batch):
    texts = [item["text"] for item in batch]
    images = [item["image"] for item in batch]
    labels = torch.tensor([item["label"] for item in batch], dtype=torch.long)
    inputs = processor(
        text=texts,
        images=images,
        return_tensors="pt",
        padding=True,
        truncation=True
    )
    return {
        "input_ids": inputs["input_ids"],
        "attention_mask": inputs["attention_mask"],
        "pixel_values": inputs["pixel_values"],
        "label": labels
    }

# 5. Train/Val/Test DataLoader 구성
from sklearn.model_selection import train_test_split
train_df, temp_df = train_test_split(df, test_size=0.4, shuffle=True, random_state=SEED)
val_df, test_df = train_test_split(temp_df, test_size=0.5, shuffle=True, random_state=SEED)

train_loader = DataLoader(ShopeeCLIPDataset(train_df, IMG_DIR), batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)
val_loader   = DataLoader(ShopeeCLIPDataset(val_df, IMG_DIR), batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)
test_loader  = DataLoader(ShopeeCLIPDataset(test_df, IMG_DIR), batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)

# 6. 학습 루프
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
best_val_acc = 0

for epoch in range(EPOCHS):
    model.train()
    train_loss, train_correct = 0, 0
    for batch in tqdm(train_loader, desc=f"Train Epoch {epoch+1}"):
        input_ids = batch["input_ids"].to(DEVICE)
        attention_mask = batch["attention_mask"].to(DEVICE)
        pixel_values = batch["pixel_values"].to(DEVICE)
        labels = batch["label"].to(DEVICE)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask, pixel_values)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        train_loss += loss.item() * labels.size(0)
        preds = outputs.argmax(dim=1)
        train_correct += (preds == labels).sum().item()

    train_acc = train_correct / len(train_loader.dataset)
    train_loss /= len(train_loader.dataset)

    # 검증
    model.eval()
    val_loss, val_correct = 0, 0
    with torch.no_grad():
        for batch in tqdm(val_loader, desc=f"Val Epoch {epoch+1}"):
            input_ids = batch["input_ids"].to(DEVICE)
            attention_mask = batch["attention_mask"].to(DEVICE)
            pixel_values = batch["pixel_values"].to(DEVICE)
            labels = batch["label"].to(DEVICE)

            outputs = model(input_ids, attention_mask, pixel_values)
            loss = criterion(outputs, labels)

            val_loss += loss.item() * labels.size(0)
            preds = outputs.argmax(dim=1)
            val_correct += (preds == labels).sum().item()

    val_acc = val_correct / len(val_loader.dataset)
    val_loss /= len(val_loader.dataset)

    print(f"Epoch {epoch+1} | Train Loss: {train_loss:.4f}, Acc: {train_acc:.4f} | Val Loss: {val_loss:.4f}, Acc: {val_acc:.4f}")

    if val_acc > best_val_acc:
        best_val_acc = val_acc
        torch.save(model.state_dict(), MODEL_SAVE_PATH)
        print(f"✅ Saved new best model (Val Acc: {val_acc:.4f})")

# 7. 테스트 평가
def evaluate(model, loader):
    model.eval()
    correct = 0
    with torch.no_grad():
        for batch in loader:
            input_ids = batch["input_ids"].to(DEVICE)
            attention_mask = batch["attention_mask"].to(DEVICE)
            pixel_values = batch["pixel_values"].to(DEVICE)
            labels = batch["label"].to(DEVICE)

            outputs = model(input_ids, attention_mask, pixel_values)
            preds = outputs.argmax(dim=1)
            correct += (preds == labels).sum().item()
    acc = correct / len(loader.dataset)
    print(f"\n🧪 Test Accuracy: {acc:.4f}")

# 8. 최종 테스트
model.load_state_dict(torch.load(MODEL_SAVE_PATH))
evaluate(model, test_loader)

# 1회차

In [3]:
# 0. 라이브러리 임포트
import os, random
import pandas as pd
import numpy as np
from PIL import Image
from tqdm import tqdm

import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader

from transformers import CLIPProcessor, CLIPModel, default_data_collator

# 1. 설정
DATA_DIR = r"D:\Project\PJT_10\shopee-product-matching"
CSV_PATH = os.path.join(DATA_DIR, "train.csv")
IMG_DIR = os.path.join(DATA_DIR, "train_images")
MODEL_SAVE_PATH = "./clip_best.pth"

BATCH_SIZE = 32
EPOCHS = 5
SEED = 42
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def set_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if DEVICE.type == "cuda":
        torch.cuda.manual_seed_all(seed)

set_seed(SEED)

# 2. 데이터 로딩 및 레이블 인코딩
df = pd.read_csv(CSV_PATH)
df = df.reset_index(drop=True)
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
df["label_encoded"] = label_encoder.fit_transform(df["label_group"])
num_classes = df["label_encoded"].nunique()

# 3. 데이터셋 및 전처리 정의
clip_model_name = "openai/clip-vit-base-patch32"
processor = CLIPProcessor.from_pretrained(clip_model_name)

class ShopeeCLIPDataset(Dataset):
    def __init__(self, dataframe, img_dir):
        self.df = dataframe
        self.img_dir = img_dir

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        image_path = os.path.join(self.img_dir, row["image"])
        image = Image.open(image_path).convert("RGB")
        text = row["title"]
        label = row["label_encoded"]
        return {"image": image, "text": text, "label": label}

# 4. 모델 정의
class CLIPForClassification(nn.Module):
    def __init__(self, model_name, num_classes):
        super().__init__()
        self.clip = CLIPModel.from_pretrained(model_name)
        self.classifier = nn.Linear(self.clip.config.projection_dim * 2, num_classes)

    def forward(self, input_ids, attention_mask, pixel_values):
        text_features = self.clip.get_text_features(input_ids=input_ids, attention_mask=attention_mask)
        image_features = self.clip.get_image_features(pixel_values=pixel_values)
        combined = torch.cat([image_features, text_features], dim=1)
        logits = self.classifier(combined)
        return logits

model = CLIPForClassification(clip_model_name, num_classes).to(DEVICE)


def collate_fn(batch):
    texts = [item["text"] for item in batch]
    images = [item["image"] for item in batch]
    labels = torch.tensor([item["label"] for item in batch], dtype=torch.long)

    inputs = processor(
        text=texts,
        images=images,
        return_tensors="pt",
        padding=True,
        truncation=True
    )

    return {
        "input_ids": inputs["input_ids"],
        "attention_mask": inputs["attention_mask"],
        "pixel_values": inputs["pixel_values"],
        "label": labels
    }


# 5. DataLoader 구성
from sklearn.model_selection import train_test_split
train_df, val_df = train_test_split(df, test_size=0.2, shuffle=True, random_state=SEED)

train_dataset = ShopeeCLIPDataset(train_df, IMG_DIR)
val_dataset = ShopeeCLIPDataset(val_df, IMG_DIR)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=0, collate_fn=collate_fn)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=0, collate_fn=collate_fn)
# 6. 학습 루프
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
best_val_acc = 0

for epoch in range(EPOCHS):
    model.train()
    train_loss, train_correct = 0, 0
    for batch in tqdm(train_loader, desc=f"Train Epoch {epoch+1}"):
        input_ids = batch["input_ids"].to(DEVICE)
        attention_mask = batch["attention_mask"].to(DEVICE)
        pixel_values = batch["pixel_values"].to(DEVICE)
        labels = batch["label"].to(DEVICE)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask, pixel_values)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        train_loss += loss.item() * labels.size(0)
        preds = outputs.argmax(dim=1)
        train_correct += (preds == labels).sum().item()

    train_acc = train_correct / len(train_dataset)
    train_loss /= len(train_dataset)

    # 검증
    model.eval()
    val_loss, val_correct = 0, 0
    with torch.no_grad():
        for batch in tqdm(val_loader, desc=f"Val Epoch {epoch+1}"):
            input_ids = batch["input_ids"].to(DEVICE)
            attention_mask = batch["attention_mask"].to(DEVICE)
            pixel_values = batch["pixel_values"].to(DEVICE)
            labels = batch["label"].to(DEVICE)

            outputs = model(input_ids, attention_mask, pixel_values)
            loss = criterion(outputs, labels)

            val_loss += loss.item() * labels.size(0)
            preds = outputs.argmax(dim=1)
            val_correct += (preds == labels).sum().item()

    val_acc = val_correct / len(val_dataset)
    val_loss /= len(val_dataset)

    print(f"Epoch {epoch+1} | Train Loss: {train_loss:.4f}, Acc: {train_acc:.4f} | Val Loss: {val_loss:.4f}, Acc: {val_acc:.4f}")

    if val_acc > best_val_acc:
        best_val_acc = val_acc
        torch.save(model.state_dict(), MODEL_SAVE_PATH)
        print(f"✅ Saved new best model (Val Acc: {val_acc:.4f})")

# 7. 테스트 평가
def evaluate(model, loader):
    model.eval()
    correct = 0
    with torch.no_grad():
        for batch in loader:
            input_ids = batch["input_ids"].to(DEVICE)
            attention_mask = batch["attention_mask"].to(DEVICE)
            pixel_values = batch["pixel_values"].to(DEVICE)
            labels = batch["label"].to(DEVICE)

            outputs = model(input_ids, attention_mask, pixel_values)
            preds = outputs.argmax(dim=1)
            correct += (preds == labels).sum().item()
    acc = correct / len(loader.dataset)
    print(f"Test Accuracy: {acc:.4f}")

# Load and test
model.load_state_dict(torch.load(MODEL_SAVE_PATH))

Train Epoch 1: 100%|██████████████████████████████████████████████████████████| 857/857 [07:13<00:00,  1.98it/s]
Val Epoch 1: 100%|████████████████████████████████████████████████████████████| 215/215 [01:17<00:00,  2.79it/s]


Epoch 1 | Train Loss: 9.1306, Acc: 0.0455 | Val Loss: 8.8701, Acc: 0.0717
✅ Saved new best model (Val Acc: 0.0717)


Train Epoch 2: 100%|██████████████████████████████████████████████████████████| 857/857 [07:58<00:00,  1.79it/s]
Val Epoch 2: 100%|████████████████████████████████████████████████████████████| 215/215 [01:26<00:00,  2.48it/s]


Epoch 2 | Train Loss: 8.2295, Acc: 0.0673 | Val Loss: 8.3418, Acc: 0.0775
✅ Saved new best model (Val Acc: 0.0775)


Train Epoch 3: 100%|██████████████████████████████████████████████████████████| 857/857 [07:31<00:00,  1.90it/s]
Val Epoch 3: 100%|████████████████████████████████████████████████████████████| 215/215 [01:19<00:00,  2.69it/s]


Epoch 3 | Train Loss: 7.1459, Acc: 0.1465 | Val Loss: 7.7568, Acc: 0.1476
✅ Saved new best model (Val Acc: 0.1476)


Train Epoch 4: 100%|██████████████████████████████████████████████████████████| 857/857 [07:36<00:00,  1.88it/s]
Val Epoch 4: 100%|████████████████████████████████████████████████████████████| 215/215 [01:21<00:00,  2.65it/s]


Epoch 4 | Train Loss: 6.0046, Acc: 0.2861 | Val Loss: 7.0959, Acc: 0.2209
✅ Saved new best model (Val Acc: 0.2209)


Train Epoch 5: 100%|██████████████████████████████████████████████████████████| 857/857 [07:33<00:00,  1.89it/s]
Val Epoch 5: 100%|████████████████████████████████████████████████████████████| 215/215 [01:17<00:00,  2.76it/s]


Epoch 5 | Train Loss: 4.7684, Acc: 0.4612 | Val Loss: 6.3750, Acc: 0.2867
✅ Saved new best model (Val Acc: 0.2867)


TypeError: ShopeeCLIPDataset.__init__() takes 3 positional arguments but 4 were given

In [4]:
# 테스트용 DataLoader도 collate_fn 통일
val_loader = DataLoader(
    ShopeeCLIPDataset(val_df, IMG_DIR),
    batch_size=BATCH_SIZE,
    shuffle=False,
    collate_fn=collate_fn
)

evaluate(model, val_loader)

Test Accuracy: 0.2867
