# **📄 Document type classification baseline code**
> 문서 타입 분류 대회에 오신 여러분 환영합니다! 🎉     
> 아래 baseline에서는 timm의 사전 학습 모델(현재 설정: `tf_efficientnet_b4`)을 로드하여, 모델을 학습하고 예측 파일을 생성하는 프로세스에 대해 알아보겠습니다.

## Contents
- Prepare Environments
- Import Library & Define Functions
- Hyper-parameters
- Load Data
- Train Model
- Inference & Save File


## 1. Prepare Environments

* 데이터 로드를 위한 구글 드라이브를 마운트합니다.
* 필요한 라이브러리를 설치합니다.

In [1]:
# 구글 드라이브 마운트, Colab을 이용하지 않는다면 패스해도 됩니다.
# from google.colab import drive
# drive.mount('/gdrive', force_remount=True)
# drive.mount('/content/drive')

Mounted at /gdrive
Mounted at /content/drive


In [2]:
# 구글 드라이브에 업로드된 대회 데이터를 압축 해제하고 로컬에 저장합니다.
# !tar -xvf drive/MyDrive/datasets_fin.tar > /dev/null

In [28]:
# Install the required library (timm) into the notebook environment.
!pip install timm

[0m

## 2. Import Library & Define Functions
* 학습 및 추론에 필요한 라이브러리를 로드합니다.
* 학습 및 추론에 필요한 함수와 클래스를 정의합니다.

In [1]:
import os
import time
import random

import timm
import torch
import albumentations as A
import pandas as pd
import numpy as np
import torch.nn as nn
from albumentations.pytorch import ToTensorV2
from torch.optim import Adam
from torchvision import transforms
from torch.utils.data import Dataset, DataLoader
from PIL import Image
from tqdm import tqdm
from sklearn.metrics import accuracy_score, f1_score

In [2]:
# Fix every random seed used by the notebook so that runs are repeatable.
SEED = 42
# NOTE(review): setting PYTHONHASHSEED here is only expected to affect
# processes started afterwards, not the running interpreter - confirm.
os.environ['PYTHONHASHSEED'] = str(SEED)
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)
# cuDNN auto-tuning (benchmark=True) may select different convolution
# algorithms from run to run, which defeats the seeding above. Disable it and
# request deterministic kernels instead; this can cost some training speed.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

In [3]:
# Load the training labels (columns: ID, target) and preview the first rows.
train_df = pd.read_csv("data/train.csv")
train_df.head()

Unnamed: 0,ID,target
0,002f99746285dfdd.jpg,16
1,008ccd231e1fea5d.jpg,10
2,008f5911bfda7695.jpg,10
3,009235e4c9c07af5.jpg,4
4,00b2f44967580c74.jpg,16


In [4]:
# Manual label corrections: image file name -> corrected target class.
label_fixes = {
    "8646f2c3280a4f49.jpg": 3,
    "0583254a73b48ece.jpg": 6,
    "38d1796b6ad99ddd.jpg": 6,
    "45f0d2dfc7e47c03.jpg": 7,
    "7100c5c67aecadc5.jpg": 7,
    "1ec14a14bbe633db.jpg": 7,
    "aec62dced7af97cd.jpg": 14,
    "c5182ab809478f12.jpg": 14,
}

# Overwrite the target of every row whose ID matches a corrected file name.
for image_id, fixed_target in label_fixes.items():
    train_df.loc[train_df["ID"] == image_id, "target"] = fixed_target

In [5]:
# Persist the corrected labels. The target path is the same file the labels
# were read from, so the original CSV is overwritten in place.
train_df.to_csv("data/train.csv", index=False)

In [6]:
# Reload the labels from disk so the following checks read the saved file.
train_df = pd.read_csv("data/train.csv")

In [7]:
# Print the ID/target pair of each corrected row to confirm the new labels.
for checked_id in (
    "8646f2c3280a4f49.jpg",
    "0583254a73b48ece.jpg",
    "38d1796b6ad99ddd.jpg",
    "45f0d2dfc7e47c03.jpg",
    "1ec14a14bbe633db.jpg",
    "7100c5c67aecadc5.jpg",
    "aec62dced7af97cd.jpg",
    "c5182ab809478f12.jpg",
):
    print(train_df.loc[train_df["ID"] == checked_id, ["ID", "target"]])

                       ID  target
862  8646f2c3280a4f49.jpg       3
                      ID  target
38  0583254a73b48ece.jpg       6
                       ID  target
340  38d1796b6ad99ddd.jpg       6
                       ID  target
428  45f0d2dfc7e47c03.jpg       7
                       ID  target
192  1ec14a14bbe633db.jpg       7
                       ID  target
723  7100c5c67aecadc5.jpg       7
                        ID  target
1095  aec62dced7af97cd.jpg      14
                        ID  target
1237  c5182ab809478f12.jpg      14


In [7]:
# Dataset that serves (image, target) pairs listed in a CSV file.
class ImageDataset(Dataset):
    """Image dataset backed by a two-column CSV of (file name, target).

    Args:
        csv: Path of a CSV file; each row is unpacked as ``(name, target)``.
        path: Directory that contains the image files named in the CSV.
        transform: Optional callable invoked as ``transform(image=array)``
            whose result is indexed with ``'image'`` (albumentations style).
    """

    def __init__(self, csv, path, transform=None):
        # Stored as a plain array because rows are unpacked positionally.
        self.df = pd.read_csv(csv).values
        self.path = path
        self.transform = transform

    def __len__(self):
        """Return the number of rows in the CSV."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return ``(image, target)`` for row ``idx``.

        The image is an HxWx3 uint8 array, or whatever ``transform`` returns
        under its ``'image'`` key when a transform is set.
        """
        name, target = self.df[idx]
        # Use a context manager so the file handle is released immediately,
        # and force three channels: grayscale, palette or RGBA files would
        # otherwise not match the 3-channel mean/std used for normalization.
        with Image.open(os.path.join(self.path, name)) as image_file:
            img = np.array(image_file.convert("RGB"))
        if self.transform is not None:
            img = self.transform(image=img)['image']
        return img, target

In [8]:
# Runs a single training epoch and reports its aggregate metrics.
def train_one_epoch(loader, model, optimizer, loss_fn, device):
    """Train ``model`` for one pass over ``loader``.

    Args:
        loader: Iterable yielding ``(image, targets)`` batches.
        model: Network to optimize; it is switched to training mode.
        optimizer: Optimizer whose ``step()`` is called once per batch.
        loss_fn: Callable invoked as ``loss_fn(preds, targets)``.
        device: Device the batches are moved to before the forward pass.

    Returns:
        A dict with ``train_loss`` (sum of per-batch losses divided by the
        number of batches), ``train_acc`` and ``train_f1`` (macro-averaged),
        both computed over every prediction made during the epoch.
    """
    model.train()
    running_loss = 0
    epoch_preds, epoch_targets = [], []

    progress = tqdm(loader)
    for image, targets in progress:
        image, targets = image.to(device), targets.to(device)

        # Clear the gradients left over from the previous batch.
        model.zero_grad(set_to_none=True)

        logits = model(image)
        loss = loss_fn(logits, targets)
        loss.backward()
        optimizer.step()

        batch_loss = loss.item()
        running_loss += batch_loss
        epoch_preds.extend(logits.argmax(dim=1).detach().cpu().numpy())
        epoch_targets.extend(targets.detach().cpu().numpy())

        # Show the latest batch loss on the progress bar.
        progress.set_description(f"Loss: {batch_loss:.4f}")

    return {
        "train_loss": running_loss / len(loader),
        "train_acc": accuracy_score(epoch_targets, epoch_preds),
        "train_f1": f1_score(epoch_targets, epoch_preds, average='macro'),
    }

## 3. Hyper-parameters
* 학습 및 추론에 필요한 하이퍼파라미터들을 정의합니다.

In [9]:
# Device: use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Data config.
# NOTE(review): data_path is not referenced by the cells visible in this
# notebook, which hard-code their "data/..." paths instead.
data_path = 'data/'

# Model config: the name passed to timm.create_model.
model_name = 'tf_efficientnet_b4' # other candidates: 'resnet34', 'resnet50', 'efficientnet_b0', ...

# Training config.
img_size = 256 # side length every image is resized to
crop_size = 224 # side length of the crop taken after resizing (added)
LR = 1e-3
EPOCHS = 20 # original note next to this value: 1
BATCH_SIZE = 64 # original note next to this value: 32
num_workers = 0

In [27]:
# Ask timm for the model names it lists with pretrained=True and display them.
all_pretrained_models_available = timm.list_models(pretrained=True)
all_pretrained_models_available

# if 'bat_resnext26ts.ch_in1k' in all_pretrained_models_available:
#     print("Exist!")

['bat_resnext26ts.ch_in1k',
 'beit_base_patch16_224.in22k_ft_in22k',
 'beit_base_patch16_224.in22k_ft_in22k_in1k',
 'beit_base_patch16_384.in22k_ft_in22k_in1k',
 'beit_large_patch16_224.in22k_ft_in22k',
 'beit_large_patch16_224.in22k_ft_in22k_in1k',
 'beit_large_patch16_384.in22k_ft_in22k_in1k',
 'beit_large_patch16_512.in22k_ft_in22k_in1k',
 'beitv2_base_patch16_224.in1k_ft_in1k',
 'beitv2_base_patch16_224.in1k_ft_in22k',
 'beitv2_base_patch16_224.in1k_ft_in22k_in1k',
 'beitv2_large_patch16_224.in1k_ft_in1k',
 'beitv2_large_patch16_224.in1k_ft_in22k',
 'beitv2_large_patch16_224.in1k_ft_in22k_in1k',
 'botnet26t_256.c1_in1k',
 'caformer_b36.sail_in1k',
 'caformer_b36.sail_in1k_384',
 'caformer_b36.sail_in22k',
 'caformer_b36.sail_in22k_ft_in1k',
 'caformer_b36.sail_in22k_ft_in1k_384',
 'caformer_m36.sail_in1k',
 'caformer_m36.sail_in1k_384',
 'caformer_m36.sail_in22k',
 'caformer_m36.sail_in22k_ft_in1k',
 'caformer_m36.sail_in22k_ft_in1k_384',
 'caformer_s18.sail_in1k',
 'caformer_s18.s

## 4. Load Data
* 학습, 테스트 데이터셋과 로더를 정의합니다.

In [7]:
# # augmentation을 위한 transform 코드
# trn_transform = A.Compose([
#     # 이미지 크기 조정
#     A.Resize(height=img_size, width=img_size),
#     # images normalization
#     A.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
#     # numpy 이미지나 PIL 이미지를 PyTorch 텐서로 변환
#     ToTensorV2(),
# ])

# # test image 변환을 위한 transform 코드
# tst_transform = A.Compose([
#     A.Resize(height=img_size, width=img_size),
#     A.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
#     ToTensorV2(),
# ])

In [10]:
# Training-time augmentation pipeline, assembled from named stages.

# Geometric stage: a flip or a 90-degree rotation (OneOf built with p=1).
geometric_aug = A.OneOf(
    [A.HorizontalFlip(p=1), A.RandomRotate90(p=1), A.VerticalFlip(p=1)],
    p=1,
)

# Degradation stage: motion blur, optical distortion or Gaussian noise.
degradation_aug = A.OneOf(
    [A.MotionBlur(p=1), A.OpticalDistortion(p=1), A.GaussNoise(p=1)],
    p=1,
)

# Mixed stage: another flip, a Gaussian blur or coarse dropout.
# NOTE(review): how OneOf uses the unequal child p values is not visible
# here - confirm against the albumentations documentation.
mixed_aug = A.OneOf(
    [
        A.HorizontalFlip(p=0.5),
        A.VerticalFlip(p=0.5),
        A.GaussianBlur(p=0.5),
        A.CoarseDropout(p=0.6),
    ],
    p=1,
)

trn_transform = A.Compose([
    # Resize to a square, then take a random crop of the training size.
    A.Resize(height=img_size, width=img_size),
    A.RandomCrop(height=crop_size, width=crop_size),
    geometric_aug,
    degradation_aug,
    mixed_aug,
    # Normalize the image channels with the given mean/std.
    A.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    # Convert the numpy image into a PyTorch tensor.
    ToTensorV2(),
])


# Inference-time preprocessing. It must be deterministic: the previous version
# reused the random training augmentations (random crop, flips, rotation,
# blur, noise, dropout), so every test image was randomly distorted and the
# predictions changed from run to run.
tst_transform = A.Compose([
    # Same geometry as training: resize to a square, then crop to the
    # training input size - but always from the center.
    A.Resize(height=img_size, width=img_size),
    A.CenterCrop(height=crop_size, width=crop_size),
    # Same normalization constants as the training pipeline.
    A.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    # Convert the numpy image into a PyTorch tensor.
    ToTensorV2(),
])

In [11]:
# Build the train and test datasets and report their sizes.
trn_dataset = ImageDataset(csv="data/train.csv", path="data/train/", transform=trn_transform)
tst_dataset = ImageDataset(csv="data/sample_submission.csv", path="data/test/", transform=tst_transform)
print(*map(len, (trn_dataset, tst_dataset)))

1570 3140


In [12]:
# Build the data loaders; options common to both are declared once.
shared_loader_options = dict(batch_size=BATCH_SIZE, pin_memory=True)

# Training batches are reshuffled every epoch and the last partial batch is kept.
trn_loader = DataLoader(
    trn_dataset,
    shuffle=True,
    num_workers=num_workers,
    drop_last=False,
    **shared_loader_options,
)
# Test batches keep the CSV order so predictions line up with the IDs.
tst_loader = DataLoader(
    tst_dataset,
    shuffle=False,
    num_workers=0,
    **shared_loader_options,
)

## 5. Train Model
* 모델을 로드하고, 학습을 진행합니다.

In [13]:
# Create the timm model with a 17-class head and move it to the chosen device.
model = timm.create_model(model_name, pretrained=True, num_classes=17)
model = model.to(device)

# Loss and optimizer used by the training loop.
loss_fn = nn.CrossEntropyLoss()
optimizer = Adam(model.parameters(), lr=LR)

In [14]:
# Train for EPOCHS epochs and print the metrics of each one.
for epoch in range(EPOCHS):
    ret = train_one_epoch(trn_loader, model, optimizer, loss_fn, device=device)
    ret['epoch'] = epoch

    # Float metrics keep four decimals; the integer epoch index is printed
    # as-is (it used to be logged as e.g. "epoch: 0.0000").
    log = "".join(
        f"{k}: {v}\n" if isinstance(v, int) else f"{k}: {v:.4f}\n"
        for k, v in ret.items()
    )
    print(log)

Loss: 0.9487: 100%|██████████| 25/25 [00:17<00:00,  1.47it/s]


train_loss: 1.2264
train_acc: 0.6153
train_f1: 0.5880
epoch: 0.0000



Loss: 0.4517: 100%|██████████| 25/25 [00:14<00:00,  1.77it/s]


train_loss: 0.4912
train_acc: 0.8248
train_f1: 0.8087
epoch: 1.0000



Loss: 0.3437: 100%|██████████| 25/25 [00:14<00:00,  1.77it/s]


train_loss: 0.3727
train_acc: 0.8732
train_f1: 0.8580
epoch: 2.0000



Loss: 0.2255: 100%|██████████| 25/25 [00:14<00:00,  1.77it/s]


train_loss: 0.2797
train_acc: 0.8981
train_f1: 0.8873
epoch: 3.0000



Loss: 0.2594: 100%|██████████| 25/25 [00:14<00:00,  1.76it/s]


train_loss: 0.2162
train_acc: 0.9210
train_f1: 0.9151
epoch: 4.0000



Loss: 0.3619: 100%|██████████| 25/25 [00:14<00:00,  1.76it/s]


train_loss: 0.2736
train_acc: 0.9032
train_f1: 0.8965
epoch: 5.0000



Loss: 0.4466: 100%|██████████| 25/25 [00:14<00:00,  1.75it/s]


train_loss: 0.2087
train_acc: 0.9376
train_f1: 0.9329
epoch: 6.0000



Loss: 0.2060: 100%|██████████| 25/25 [00:14<00:00,  1.76it/s]


train_loss: 0.1530
train_acc: 0.9427
train_f1: 0.9394
epoch: 7.0000



Loss: 0.0737: 100%|██████████| 25/25 [00:14<00:00,  1.76it/s]


train_loss: 0.1727
train_acc: 0.9389
train_f1: 0.9341
epoch: 8.0000



Loss: 0.0493: 100%|██████████| 25/25 [00:14<00:00,  1.75it/s]


train_loss: 0.1188
train_acc: 0.9567
train_f1: 0.9550
epoch: 9.0000



Loss: 0.2165: 100%|██████████| 25/25 [00:14<00:00,  1.75it/s]


train_loss: 0.0998
train_acc: 0.9675
train_f1: 0.9652
epoch: 10.0000



Loss: 0.0442: 100%|██████████| 25/25 [00:14<00:00,  1.76it/s]


train_loss: 0.1444
train_acc: 0.9497
train_f1: 0.9470
epoch: 11.0000



Loss: 0.2897: 100%|██████████| 25/25 [00:14<00:00,  1.75it/s]


train_loss: 0.1044
train_acc: 0.9637
train_f1: 0.9604
epoch: 12.0000



Loss: 0.2628: 100%|██████████| 25/25 [00:14<00:00,  1.76it/s]


train_loss: 0.1321
train_acc: 0.9567
train_f1: 0.9545
epoch: 13.0000



Loss: 0.1308: 100%|██████████| 25/25 [00:14<00:00,  1.76it/s]


train_loss: 0.0886
train_acc: 0.9682
train_f1: 0.9660
epoch: 14.0000



Loss: 0.0578: 100%|██████████| 25/25 [00:14<00:00,  1.74it/s]


train_loss: 0.1168
train_acc: 0.9605
train_f1: 0.9580
epoch: 15.0000



Loss: 0.0153: 100%|██████████| 25/25 [00:14<00:00,  1.76it/s]


train_loss: 0.0673
train_acc: 0.9809
train_f1: 0.9800
epoch: 16.0000



Loss: 0.0102: 100%|██████████| 25/25 [00:14<00:00,  1.75it/s]


train_loss: 0.0818
train_acc: 0.9739
train_f1: 0.9723
epoch: 17.0000



Loss: 0.0445: 100%|██████████| 25/25 [00:14<00:00,  1.76it/s]


train_loss: 0.1177
train_acc: 0.9656
train_f1: 0.9657
epoch: 18.0000



Loss: 0.0170: 100%|██████████| 25/25 [00:14<00:00,  1.75it/s]

train_loss: 0.0851
train_acc: 0.9739
train_f1: 0.9733
epoch: 19.0000






## 6. Inference & Save File
* 테스트 이미지에 대한 추론을 진행하고, 결과 파일을 저장합니다.

In [15]:
# Predict a class index for every test image, in loader order.
preds_list = []

model.eval()
# Gradients are not needed for inference, so the whole loop runs under no_grad.
with torch.no_grad():
    for image, _ in tqdm(tst_loader):
        batch_logits = model(image.to(device))
        preds_list.extend(batch_logits.argmax(dim=1).cpu().numpy())

100%|██████████| 50/50 [00:21<00:00,  2.30it/s]


In [16]:
# Reuse the test CSV rows (ID plus placeholder target) and replace the
# placeholder targets with the model predictions.
pred_df = pd.DataFrame(tst_dataset.df, columns=['ID', 'target']).assign(target=preds_list)

In [17]:
# Sanity check: the predictions must list the same IDs, in the same order,
# as the sample submission file.
sample_submission_df = pd.read_csv("data/sample_submission.csv")
assert (sample_submission_df['ID'] == pred_df['ID']).all()

In [18]:
# Save the submission file. The name is derived from the model name and the
# epoch count so it cannot go stale when either setting changes; with the
# current settings it is still "pred_tf_efficientnet_b4_20.csv".
pred_df.to_csv(f"pred_{model_name}_{EPOCHS}.csv", index=False)

In [16]:
# Preview the first 10 predictions.
pred_df.head(10)

Unnamed: 0,ID,target
0,0008fdb22ddce0ce.jpg,2
1,00091bffdffd83de.jpg,12
2,00396fbc1f6cc21d.jpg,5
3,00471f8038d9c4b6.jpg,3
4,00901f504008d884.jpg,2
5,009b22decbc7220c.jpg,15
6,00b33e0ee6d59427.jpg,0
7,00bbdcfbbdb3e131.jpg,8
8,00c03047e0fbef40.jpg,15
9,00c0dabb63ca7a16.jpg,11


In [19]:
# Preview the first 20 predictions.
pred_df.head(20)

Unnamed: 0,ID,target
0,0008fdb22ddce0ce.jpg,2
1,00091bffdffd83de.jpg,12
2,00396fbc1f6cc21d.jpg,5
3,00471f8038d9c4b6.jpg,3
4,00901f504008d884.jpg,2
5,009b22decbc7220c.jpg,15
6,00b33e0ee6d59427.jpg,0
7,00bbdcfbbdb3e131.jpg,8
8,00c03047e0fbef40.jpg,15
9,00c0dabb63ca7a16.jpg,11


In [17]:
# Print every model name known to timm.
# NOTE: timm is already imported in the import cell near the top of the
# notebook; this repeated import is harmless.
import timm
print(timm.list_models())

['bat_resnext26ts', 'beit_base_patch16_224', 'beit_base_patch16_384', 'beit_large_patch16_224', 'beit_large_patch16_384', 'beit_large_patch16_512', 'beitv2_base_patch16_224', 'beitv2_large_patch16_224', 'botnet26t_256', 'botnet50ts_256', 'caformer_b36', 'caformer_m36', 'caformer_s18', 'caformer_s36', 'cait_m36_384', 'cait_m48_448', 'cait_s24_224', 'cait_s24_384', 'cait_s36_384', 'cait_xs24_384', 'cait_xxs24_224', 'cait_xxs24_384', 'cait_xxs36_224', 'cait_xxs36_384', 'coat_lite_medium', 'coat_lite_medium_384', 'coat_lite_mini', 'coat_lite_small', 'coat_lite_tiny', 'coat_mini', 'coat_small', 'coat_tiny', 'coatnet_0_224', 'coatnet_0_rw_224', 'coatnet_1_224', 'coatnet_1_rw_224', 'coatnet_2_224', 'coatnet_2_rw_224', 'coatnet_3_224', 'coatnet_3_rw_224', 'coatnet_4_224', 'coatnet_5_224', 'coatnet_bn_0_rw_224', 'coatnet_nano_cc_224', 'coatnet_nano_rw_224', 'coatnet_pico_rw_224', 'coatnet_rmlp_0_rw_224', 'coatnet_rmlp_1_rw2_224', 'coatnet_rmlp_1_rw_224', 'coatnet_rmlp_2_rw_224', 'coatnet_rmlp_2

In [20]:
# Load the class metadata (columns: target, class_name) and display it.
meta_df = pd.read_csv("data/meta.csv")
meta_df

Unnamed: 0,target,class_name
0,0,account_number
1,1,application_for_payment_of_pregnancy_medical_e...
2,2,car_dashboard
3,3,confirmation_of_admission_and_discharge
4,4,diagnosis
5,5,driver_lisence
6,6,medical_bill_receipts
7,7,medical_outpatient_certificate
8,8,national_id_card
9,9,passport


In [21]:
# Record the current row position so the submission order can be restored
# with a sort after the predictions are merged with the class metadata.
pred_df['order'] = pred_df.index

In [22]:
pred_df

Unnamed: 0,ID,target,order
0,0008fdb22ddce0ce.jpg,2,0
1,00091bffdffd83de.jpg,12,1
2,00396fbc1f6cc21d.jpg,5,2
3,00471f8038d9c4b6.jpg,3,3
4,00901f504008d884.jpg,2,4
...,...,...,...
3135,ffb4b6f619fb60ea.jpg,6,3135
3136,ffb54299b1ad4159.jpg,10,3136
3137,ffc2c91dff8cf2c0.jpg,8,3137
3138,ffc4e330a5353a2a.jpg,0,3138


In [23]:
# Attach the human-readable class name to every prediction.
# how='left' keeps every prediction row even when its target has no entry in
# meta_df (the default inner join would silently drop such rows), and
# validate='many_to_one' raises if meta_df lists a target more than once,
# which would otherwise duplicate prediction rows.
pred_df_label = pred_df.merge(meta_df, on='target', how='left', validate='many_to_one')

# Restore the original submission order recorded in the 'order' column.
pred_df_label_ordered = pred_df_label.sort_values(by='order')

In [24]:
# Save the predictions together with their class names, without the index.
pred_df_label_ordered.to_csv("pred_df_label_ordered.csv", index=False)