### 1. 초기 설정 및 라이브러리 import

In [29]:
import os
import time
import random

import torch
import pandas as pd
import numpy as np
from torch.optim import Adam
from torch.utils.data import Dataset, DataLoader, Subset
from PIL import Image
from tqdm import tqdm
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import StratifiedKFold

from dotenv import load_dotenv
from datetime import datetime
from zoneinfo import ZoneInfo
import wandb

from transformers import LayoutLMv3ForSequenceClassification, LayoutLMv3Processor
import pytesseract

import matplotlib.pyplot as plt
from PIL import Image

In [30]:
# wandb 연동
load_dotenv()
api_key = os.getenv('WANDB_API_KEY')

wandb.login(key=api_key)

train_time = datetime.fromtimestamp(time.time(), tz=ZoneInfo("Asia/Seoul")).strftime("%Y%m%d-%H%M%S")
wandb.init(project="competition2-cv", name=f"layoutlmv3-{train_time}")

print(train_time)





VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.01128888888876342, max=1.0)…

20240803-213638


In [31]:
# 시드를 고정합니다.
SEED = 42
os.environ['PYTHONHASHSEED'] = str(SEED)
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)
torch.backends.cudnn.benchmark = True

### 2. 데이터셋 및 유틸리티 함수 정의

In [32]:
# 데이터셋 클래스를 정의합니다.
class ImageDataset(Dataset):
    def __init__(self, csv, path, processor, transform=None): 
        self.df = pd.read_csv(csv)
        self.path = path
        self.processor = processor
        self.transform = transform

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        name, target = self.df.iloc[idx]
        image = Image.open(os.path.join(self.path, name)).convert("RGB")
        if self.transform:
            image = self.transform(image=image)["image"]

        encoded_inputs = self.processor(image, return_tensors="pt", padding="max_length", truncation=True)
        input_ids = encoded_inputs["input_ids"].squeeze()
        attention_mask = encoded_inputs["attention_mask"].squeeze()
        bbox = encoded_inputs["bbox"].squeeze()

        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "bbox": bbox,
            "labels": torch.tensor(target, dtype=torch.long)
        }

In [33]:
# one epoch 학습을 위한 함수입니다.
def train_one_epoch(loader, model, optimizer, loss_fn, device):
    model.train()
    train_loss = 0
    preds_list = []
    targets_list = []

    pbar = tqdm(loader)
    for batch in pbar:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        bbox = batch["bbox"].to(device)
        labels = batch["labels"].to(device)

        optimizer.zero_grad()

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, bbox=bbox, labels=labels)
        loss = outputs.loss
        preds = outputs.logits

        loss.backward()
        optimizer.step()

        train_loss += loss.item()
        preds_list.extend(preds.argmax(dim=1).detach().cpu().numpy())
        targets_list.extend(labels.detach().cpu().numpy())

        pbar.set_description(f"Loss: {loss.item():.4f}")

    train_loss /= len(loader)
    train_acc = accuracy_score(targets_list, preds_list)
    train_f1 = f1_score(targets_list, preds_list, average='macro')

    ret = {
        "train_loss": train_loss,
        "train_acc": train_acc,
        "train_f1": train_f1,
    }

    wandb.log(ret)
    
    return ret

In [34]:
# 검증을 위한 함수입니다.
def validate(loader, model, device):
    model.eval()
    val_loss = 0
    preds_list = []
    targets_list = []

    with torch.no_grad():
        for batch in loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            bbox = batch["bbox"].to(device)
            labels = batch["labels"].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, bbox=bbox, labels=labels)
            loss = outputs.loss
            preds = outputs.logits

            val_loss += loss.item()
            preds_list.extend(preds.argmax(dim=1).detach().cpu().numpy())
            targets_list.extend(labels.detach().cpu().numpy())

    val_loss /= len(loader)
    val_acc = accuracy_score(targets_list, preds_list)
    val_f1 = f1_score(targets_list, preds_list, average='macro')

    ret = {
        "val_loss": val_loss,
        "val_acc": val_acc,
        "val_f1": val_f1,
    }

    wandb.log(ret)
    
    return ret

### 3. 설정 및 데이터 준비

In [35]:
# device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# data config
data_path = '../data/'

# model config
model_name = 'microsoft/layoutlmv3-base'

# training config
LR = 1e-3
EPOCHS = 1
BATCH_SIZE = 4
num_workers = 0
N_FOLDS = 2  # 추가: fold 수
PATIENCE = 3  # 얼리스탑핑을 위한 patience

wandb.config.update({
    "learning_rate": LR,
    "architecture": model_name,
    "dataset": "custom-dataset",
    "epochs": EPOCHS,
    "batch_size": BATCH_SIZE,
    "n_folds": N_FOLDS,  # 추가
    "patience": PATIENCE  # 추가
})


In [36]:
# processor 정의
processor = LayoutLMv3Processor.from_pretrained(model_name)
print(processor.image_processor.size)

{'height': 224, 'width': 224}


In [37]:
# Dataset 정의
trn_dataset = ImageDataset(
    f"{data_path}train.csv",
    f"{data_path}train/",
    processor=processor
)

tst_dataset = ImageDataset(
    f"{data_path}sample_submission.csv",
    f"{data_path}test/",
    processor=processor
)

print(len(trn_dataset), len(tst_dataset))

# Stratified K-Fold 정의
skf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=SEED)


1570 3140


### 4. 학습 및 검증 (K-Fold Cross Validation)

In [38]:
# 각 fold의 결과를 저장할 리스트
fold_results = []

# K-Fold Cross Validation
for fold, (train_idx, val_idx) in enumerate(skf.split(trn_dataset.df, trn_dataset.df['target'])):
    print(f"Fold {fold+1}/{N_FOLDS}")
    
    # 학습 및 검증 데이터셋 생성
    train_subset = Subset(trn_dataset, train_idx)
    val_subset = Subset(trn_dataset, val_idx)
    
    # DataLoader 정의
    train_loader = DataLoader(
        train_subset,
        batch_size=BATCH_SIZE,
        shuffle=True,
        num_workers=num_workers,
        pin_memory=True,
        drop_last=False
    )
    val_loader = DataLoader(
        val_subset,
        batch_size=BATCH_SIZE,
        shuffle=False,
        num_workers=num_workers,
        pin_memory=True
    )
    
    # model 및 optimizer 정의
    model = LayoutLMv3ForSequenceClassification.from_pretrained(model_name, num_labels=17).to(device)
    optimizer = Adam(model.parameters(), lr=LR)
    
    # 학습 및 평가 루프
    best_val_f1 = 0
    epochs_no_improve = 0
    
    for epoch in range(EPOCHS):
        train_ret = train_one_epoch(train_loader, model, optimizer, loss_fn=None, device=device) # 분류 작업을 위해 이미 loss function이 내장되어 있음(Cross Entropy Loss)
        val_ret = validate(val_loader, model, device)
        
        print(f"Epoch {epoch+1}/{EPOCHS}")
        print(f"Train Loss: {train_ret['train_loss']:.4f}, Train Acc: {train_ret['train_acc']:.4f}, Train F1: {train_ret['train_f1']:.4f}")
        print(f"Val Loss: {val_ret['val_loss']:.4f}, Val Acc: {val_ret['val_acc']:.4f}, Val F1: {val_ret['val_f1']:.4f}")
        
        if val_ret['val_f1'] > best_val_f1:
            best_val_f1 = val_ret['val_f1']
            epochs_no_improve = 0
            torch.save(model.state_dict(), f"best_model_fold{fold+1}.pth")
        else:
            epochs_no_improve += 1
            
        if epochs_no_improve >= PATIENCE:
            print(f"Early stopping at epoch {epoch+1}")
            break
    
    fold_results.append(best_val_f1)

Fold 1/2


Some weights of LayoutLMv3ForSequenceClassification were not initialized from the model checkpoint at microsoft/layoutlmv3-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Loss: 2.9309: 100%|██████████| 197/197 [04:15<00:00,  1.30s/it]


Epoch 1/1
Train Loss: 2.9699, Train Acc: 0.0586, Train F1: 0.0513
Val Loss: 2.8491, Val Acc: 0.0637, Val F1: 0.0070
Fold 2/2


Some weights of LayoutLMv3ForSequenceClassification were not initialized from the model checkpoint at microsoft/layoutlmv3-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Loss: 2.5966: 100%|██████████| 197/197 [04:21<00:00,  1.33s/it]


Epoch 1/1
Train Loss: 3.0025, Train Acc: 0.0522, Train F1: 0.0462
Val Loss: 2.8516, Val Acc: 0.0637, Val F1: 0.0070


In [39]:
# 전체 fold의 평균 성능 계산
mean_f1 = np.mean(fold_results)
print(f"Mean F1 score across {N_FOLDS} folds: {mean_f1:.4f}")

Mean F1 score across 2 folds: 0.0070


### 5. 테스트 데이터 예측 및 결과 저장

In [40]:
# 테스트 데이터에 대한 예측
tst_loader = DataLoader(
    tst_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    num_workers=0,
    pin_memory=True
)

preds_list = []

for fold in range(N_FOLDS):
    model.load_state_dict(torch.load(f"best_model_fold{fold+1}.pth"))
    model.eval()
    fold_preds = []
    
    for batch in tqdm(tst_loader, desc=f"Predicting fold {fold+1}"):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        bbox = batch["bbox"].to(device)

        with torch.no_grad():
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, bbox=bbox)
            preds = outputs.logits

        fold_preds.extend(preds.argmax(dim=1).detach().cpu().numpy())
    
    preds_list.append(fold_preds)

Predicting fold 1: 100%|██████████| 785/785 [11:30<00:00,  1.14it/s]
Predicting fold 2: 100%|██████████| 785/785 [11:40<00:00,  1.12it/s]


In [41]:
# 각 fold의 예측을 평균내어 최종 예측 생성
final_preds = np.mean(preds_list, axis=0).round().astype(int)

pred_df = pd.DataFrame(tst_dataset.df, columns=['ID', 'target'])
pred_df['target'] = final_preds

sample_submission_df = pd.read_csv(f"{data_path}sample_submission.csv")
assert (sample_submission_df['ID'] == pred_df['ID']).all()

pred_df.to_csv("pred.csv", index=False)

pred_df.head()

# wandb 실행 종료
wandb.finish()

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
train_acc,█▁
train_f1,█▁
train_loss,▁█
val_acc,▁▁
val_f1,▁▁
val_loss,▁█

0,1
train_acc,0.05223
train_f1,0.0462
train_loss,3.00255
val_acc,0.06369
val_f1,0.00704
val_loss,2.85159
