# **📄 Document type classification baseline code**
> 문서 타입 분류 대회에 오신 여러분 환영합니다! 🎉     
> 아래 baseline에서는 ResNet 모델을 로드하여 학습하고, 예측 파일을 생성하는 프로세스에 대해 알아보겠습니다.

## Contents
- Prepare Environments
- Import Library & Define Functions
- Hyper-parameters
- Load Data
- Train Model
- Inference & Save File


## 1. Prepare Environments

* 데이터 로드를 위한 구글 드라이브를 마운트합니다.
* 필요한 라이브러리를 설치합니다.

In [85]:
# Mount Google Drive; skip this cell when not running on Colab.
from google.colab import drive
# NOTE(review): Drive is mounted at two mount points, but base_path further
# down only reads from /content/drive — the /gdrive mount looks redundant;
# confirm before removing it.
drive.mount('/gdrive', force_remount=True)
drive.mount('/content/drive')

Mounted at /gdrive
Mounted at /content/drive


In [86]:
# Install the required library (timm is used below to create the model).
!pip install timm



In [87]:
import os
# Expose only GPU index 0 to CUDA for this process.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

## 2. Import Library & Define Functions
* 학습 및 추론에 필요한 라이브러리를 로드합니다.
* 학습 및 추론에 필요한 함수와 클래스를 정의합니다.

In [88]:
import os
import time
import random
import timm
import torch
import albumentations as A
import pandas as pd
import numpy as np
import torch.nn as nn
from albumentations.pytorch import ToTensorV2
from torch.optim import Adam
from torchvision import transforms
from torch.utils.data import Dataset, DataLoader
from PIL import Image
from tqdm import tqdm
from sklearn.metrics import accuracy_score, f1_score

In [89]:
# Fix every random seed so that runs are repeatable.
SEED = 42
# NOTE: setting PYTHONHASHSEED at runtime does not change the hash seed of the
# already-running interpreter; it is only inherited by child processes.
os.environ['PYTHONHASHSEED'] = str(SEED)
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)
# cuDNN autotuning (benchmark=True) may select different convolution kernels
# from run to run, which defeats the seeding above. Ask for deterministic
# kernels and disable autotuning instead.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

In [90]:
# Dataset that yields (image, target) pairs listed in a CSV file.
class ImageDataset(Dataset):
    """Image dataset backed by a two-column CSV of (file name, target).

    Args:
        csv: Path to a CSV file; each row is unpacked as ``(name, target)``.
        path: Directory that contains the image files named in the CSV.
        transform: Optional callable invoked as ``transform(image=img)``;
            the ``'image'`` entry of its result replaces the raw array.
    """

    def __init__(self, csv, path, transform=None):
        # Keep the rows as a plain array so they can be indexed by position.
        self.df = pd.read_csv(csv).values
        self.path = path
        self.transform = transform

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        name, target = self.df[idx]
        # Open inside a context manager so the file handle is released
        # promptly, and force 3-channel RGB: grayscale, palette, RGBA or CMYK
        # files would otherwise yield arrays that do not match the 3-channel
        # normalization statistics used by the transforms.
        with Image.open(os.path.join(self.path, name)) as im:
            img = np.array(im.convert('RGB'))
        if self.transform:
            img = self.transform(image=img)['image']
        return img, target

In [91]:
# Run a single training epoch and report its metrics.
def train_one_epoch(loader, model, optimizer, loss_fn, device):
    """Train ``model`` for one pass over ``loader``.

    Args:
        loader: Iterable yielding ``(image, targets)`` batches.
        model: Network to train; switched to training mode here.
        optimizer: Optimizer stepped once per batch.
        loss_fn: Callable invoked as ``loss_fn(preds, targets)``.
        device: Device each batch is moved to before the forward pass.

    Returns:
        Dict with ``train_loss`` (mean of the per-batch losses),
        ``train_acc`` and ``train_f1`` (macro-averaged) for the epoch.
    """
    model.train()
    batch_losses = []
    all_preds = []
    all_targets = []

    progress = tqdm(loader)
    for image, targets in progress:
        image, targets = image.to(device), targets.to(device)

        # Clear the gradients left over from the previous step.
        model.zero_grad(set_to_none=True)

        logits = model(image)
        loss = loss_fn(logits, targets)
        loss.backward()
        optimizer.step()

        loss_value = loss.item()
        batch_losses.append(loss_value)
        all_preds.extend(logits.argmax(dim=1).detach().cpu().numpy())
        all_targets.extend(targets.detach().cpu().numpy())

        # Show the loss of the most recent batch on the progress bar.
        progress.set_description(f"Loss: {loss_value:.4f}")

    return {
        "train_loss": sum(batch_losses) / len(loader),
        "train_acc": accuracy_score(all_targets, all_preds),
        "train_f1": f1_score(all_targets, all_preds, average='macro'),
    }

## 3. Hyper-parameters
* 학습 및 추론에 필요한 하이퍼파라미터들을 정의합니다.

In [92]:


# data config
# NOTE(review): data_path is not referenced by any later cell shown here (the
# datasets are built from base_path) — confirm whether it is still needed.
data_path = 'datasets_fin/'

# model config
model_name = 'resnet34' # other timm model names work too, e.g. 'resnet50', 'efficientnet_b0', ...

# training config
img_size = 32  # side length given to A.Resize (images become img_size x img_size)
LR = 1e-3  # learning rate passed to Adam
EPOCHS = 10  # number of epochs in each training loop
BATCH_SIZE = 32  # batch size used by the DataLoaders
num_workers = 0  # worker processes for the training DataLoader

## 4. Load Data
* 학습, 테스트 데이터셋과 로더를 정의합니다.

In [93]:
# Baseline preprocessing shared by the training and test pipelines.
def _make_base_transform():
    """Build the resize -> normalize -> tensor pipeline."""
    return A.Compose([
        # Resize every image to img_size x img_size.
        A.Resize(height=img_size, width=img_size),
        # Normalize each channel with the mean/std below (the values commonly
        # used with ImageNet-pretrained models).
        A.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
        # Convert the numpy image into a PyTorch tensor.
        ToTensorV2(),
    ])


# Transform for training images (no augmentation at this stage).
trn_transform = _make_base_transform()

# Transform for test images.
tst_transform = _make_base_transform()

In [94]:
# Folder on the mounted Drive that holds the CSV files and image folders.
base_path = "/content/drive/MyDrive/Colab Notebooks/Document type classification/data"

# Training set: rows from train.csv, images under train/.
trn_dataset = ImageDataset(
    csv=f"{base_path}/train.csv",
    path=f"{base_path}/train/",
    transform=trn_transform,
)
# Test set: rows from sample_submission.csv, images under test/.
tst_dataset = ImageDataset(
    csv=f"{base_path}/sample_submission.csv",
    path=f"{base_path}/test/",
    transform=tst_transform,
)
print(len(trn_dataset), len(tst_dataset))

1570 3140


In [95]:
# Build the DataLoaders.
# Arguments that are identical for the training and the test loader.
_shared_loader_args = {"batch_size": BATCH_SIZE, "pin_memory": True}

# Training batches are shuffled; the final short batch is kept.
trn_loader = DataLoader(
    trn_dataset,
    shuffle=True,
    num_workers=num_workers,
    drop_last=False,
    **_shared_loader_args,
)
# Test batches keep the dataset order (no shuffling).
tst_loader = DataLoader(
    tst_dataset,
    shuffle=False,
    num_workers=0,
    **_shared_loader_args,
)

## 5. Train Model
* 모델을 로드하고, 학습을 진행합니다.

In [96]:
# Pick the device once. The training loops pass `device` to train_one_epoch,
# so it has to exist before they run; fall back to CPU when CUDA is absent.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load the pretrained model with a 17-class output layer and move it to the
# selected device.
model = timm.create_model(
    model_name,
    pretrained=True,
    num_classes=17
).to(device)
loss_fn = nn.CrossEntropyLoss()
optimizer = Adam(model.parameters(), lr=LR)

In [97]:
# Check whether a CUDA device is visible to PyTorch.
torch.cuda.is_available()

True

In [98]:
# First training run: EPOCHS passes over trn_loader, printing the metrics
# returned by train_one_epoch after each pass.
for epoch in range(EPOCHS):
    ret = train_one_epoch(trn_loader, model, optimizer, loss_fn, device=device)
    ret['epoch'] = epoch

    # One "name: value" line per entry, each value formatted to four decimals.
    log = "".join(f"{k}: {v:.4f}\n" for k, v in ret.items())
    print(log)

Loss: 2.5085: 100%|██████████| 50/50 [00:16<00:00,  3.09it/s]


train_loss: 2.4864
train_acc: 0.2637
train_f1: 0.2335
epoch: 0.0000



Loss: 1.6583: 100%|██████████| 50/50 [00:17<00:00,  2.91it/s]


train_loss: 1.5864
train_acc: 0.5242
train_f1: 0.4704
epoch: 1.0000



Loss: 1.8392: 100%|██████████| 50/50 [00:10<00:00,  4.57it/s]


train_loss: 1.1561
train_acc: 0.6401
train_f1: 0.5994
epoch: 2.0000



Loss: 3.7517: 100%|██████████| 50/50 [00:13<00:00,  3.68it/s]


train_loss: 0.9003
train_acc: 0.7248
train_f1: 0.6987
epoch: 3.0000



Loss: 4.6384: 100%|██████████| 50/50 [00:12<00:00,  3.93it/s]


train_loss: 0.7343
train_acc: 0.7809
train_f1: 0.7581
epoch: 4.0000



Loss: 3.6839: 100%|██████████| 50/50 [00:11<00:00,  4.54it/s]


train_loss: 0.5520
train_acc: 0.8255
train_f1: 0.8030
epoch: 5.0000



Loss: 3.3268: 100%|██████████| 50/50 [00:11<00:00,  4.48it/s]


train_loss: 0.4796
train_acc: 0.8611
train_f1: 0.8483
epoch: 6.0000



Loss: 2.4960: 100%|██████████| 50/50 [00:15<00:00,  3.32it/s]


train_loss: 0.3812
train_acc: 0.8777
train_f1: 0.8696
epoch: 7.0000



Loss: 2.7015: 100%|██████████| 50/50 [00:13<00:00,  3.82it/s]


train_loss: 0.3433
train_acc: 0.8981
train_f1: 0.8871
epoch: 8.0000



Loss: 2.2501: 100%|██████████| 50/50 [00:11<00:00,  4.39it/s]

train_loss: 0.2533
train_acc: 0.9306
train_f1: 0.9234
epoch: 9.0000






In [99]:
import albumentations as A
from albumentations.pytorch import ToTensorV2

# Augmented transform for training images.
# Geometric augmentations, each applied with probability 0.5.
_geometric_augs = [
    A.HorizontalFlip(p=0.5),   # horizontal flip
    A.VerticalFlip(p=0.5),     # vertical flip
    A.RandomRotate90(p=0.5),   # rotation by a multiple of 90 degrees
]
# Pixel-level augmentations.
_photometric_augs = [
    A.Blur(blur_limit=3, p=0.2),   # blur, 20% of the time
    A.CLAHE(p=0.2),                # CLAHE, 20% of the time
    A.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.2, p=0.3),  # color changes
]
# Order matters: resize first, augment, then normalize and convert to tensor.
trn_transform = A.Compose([
    A.Resize(height=img_size, width=img_size),
    *_geometric_augs,
    *_photometric_augs,
    A.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ToTensorV2(),
])

In [100]:
# Rebuild the training dataset so that it applies data augmentation.
trn_dataset = ImageDataset(
    csv=f"{base_path}/train.csv",
    path=f"{base_path}/train/",
    transform=trn_transform  # the newly defined transform that includes augmentation
)

In [101]:
# Rebuild the training DataLoader on top of the re-defined dataset.
trn_loader = DataLoader(
    trn_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    num_workers=num_workers,
    pin_memory=True,
    drop_last=True  # drop the last mini-batch when it is smaller than the rest (optional)
)


In [102]:
# Second training run on the augmented loader. The loop is the same as before
# and keeps using the existing model and optimizer objects.
for epoch in range(EPOCHS):
    ret = train_one_epoch(trn_loader, model, optimizer, loss_fn, device=device)
    ret['epoch'] = epoch
    # Print one "name: value" line per entry, formatted to four decimals.
    log = "".join(f"{k}: {v:.4f}\n" for k, v in ret.items())
    print(log)


Loss: 1.2447: 100%|██████████| 49/49 [00:20<00:00,  2.40it/s]


train_loss: 1.7816
train_acc: 0.4764
train_f1: 0.4536
epoch: 0.0000



Loss: 1.2686: 100%|██████████| 49/49 [00:19<00:00,  2.52it/s]


train_loss: 1.3083
train_acc: 0.5497
train_f1: 0.5219
epoch: 1.0000



Loss: 0.8368: 100%|██████████| 49/49 [00:11<00:00,  4.40it/s]


train_loss: 1.1540
train_acc: 0.6167
train_f1: 0.5876
epoch: 2.0000



Loss: 1.1162: 100%|██████████| 49/49 [00:11<00:00,  4.28it/s]


train_loss: 1.0444
train_acc: 0.6575
train_f1: 0.6261
epoch: 3.0000



Loss: 0.6834: 100%|██████████| 49/49 [00:11<00:00,  4.27it/s]


train_loss: 1.0248
train_acc: 0.6486
train_f1: 0.6211
epoch: 4.0000



Loss: 0.8992: 100%|██████████| 49/49 [00:11<00:00,  4.22it/s]


train_loss: 0.9678
train_acc: 0.6716
train_f1: 0.6410
epoch: 5.0000



Loss: 1.3073: 100%|██████████| 49/49 [00:11<00:00,  4.19it/s]


train_loss: 0.8595
train_acc: 0.7073
train_f1: 0.6803
epoch: 6.0000



Loss: 1.0928: 100%|██████████| 49/49 [00:11<00:00,  4.12it/s]


train_loss: 0.8896
train_acc: 0.6945
train_f1: 0.6704
epoch: 7.0000



Loss: 1.0957: 100%|██████████| 49/49 [00:12<00:00,  3.92it/s]


train_loss: 0.8539
train_acc: 0.7047
train_f1: 0.6753
epoch: 8.0000



Loss: 1.0840: 100%|██████████| 49/49 [00:12<00:00,  4.05it/s]

train_loss: 0.7885
train_acc: 0.7443
train_f1: 0.7192
epoch: 9.0000






## 6. Inference & Save File
* 테스트 이미지에 대한 추론을 진행하고, 결과 파일을 저장합니다.

In [103]:
# Predict a class index for every test image, in loader order.
preds_list = []

model.eval()
# Run on whatever device the model lives on instead of hard-coding CUDA, so
# this cell cannot disagree with where the model was placed.
infer_device = next(model.parameters()).device
# No gradients are needed for inference; disable them for the whole loop.
with torch.no_grad():
    for image, _ in tqdm(tst_loader):
        preds = model(image.to(infer_device))
        # Keep only the arg-max class per image, as CPU numpy values.
        preds_list.extend(preds.argmax(dim=1).cpu().numpy())

100%|██████████| 99/99 [00:21<00:00,  4.61it/s]


In [104]:
# Start from the test rows (ID, placeholder target) and overwrite the target
# column with the model's predictions.
pred_df = pd.DataFrame(tst_dataset.df, columns=['ID', 'target']).assign(target=preds_list)

In [105]:
# Sanity check: the prediction rows must have the same IDs, in the same order,
# as the sample submission file.
sample_submission_df = pd.read_csv(f"{base_path}/sample_submission.csv")
assert (sample_submission_df['ID'] == pred_df['ID']).all()

In [106]:
# Save the predictions as pred.csv under base_path, without the index column.
pred_df.to_csv(f"{base_path}/pred.csv", index=False)

In [107]:
# Preview the first rows of the prediction table.
pred_df.head()

Unnamed: 0,ID,target
0,0008fdb22ddce0ce.jpg,16
1,00091bffdffd83de.jpg,6
2,00396fbc1f6cc21d.jpg,5
3,00471f8038d9c4b6.jpg,10
4,00901f504008d884.jpg,2
