# **📄 Document type classification baseline code**
> 문서 타입 분류 대회에 오신 여러분 환영합니다! 🎉     
> 아래 baseline에서는 ResNet 모델을 로드하여 모델을 학습하고, 예측 파일을 생성하는 프로세스에 대해 알아보겠습니다.

## Contents
- Prepare Environments
- Import Library & Define Functions
- Load Data
- Delete Data

## 1. Prepare Environments

* 데이터 로드를 위한 구글 드라이브를 마운트합니다.
* 필요한 라이브러리를 설치합니다.

## 2. Import Library & Define Functions
* 학습 및 추론에 필요한 라이브러리를 로드합니다.
* 학습 및 추론에 필요한 함수와 클래스를 정의합니다.

In [23]:
# base
import os
import time
import random
import warnings
warnings.filterwarnings('ignore')

# image torch 
import shutil
import timm
import torch
import albumentations as A
import pandas as pd
import numpy as np
import torch.nn as nn
from albumentations.pytorch import ToTensorV2
from torch.optim import Adam
from torchvision import transforms
from torch.utils.data import Dataset, DataLoader
from PIL import Image
from tqdm import tqdm
from sklearn.metrics import accuracy_score, f1_score

In [24]:
# Fix every random seed so that repeated runs produce the same results.
SEED = 42
# NOTE: PYTHONHASHSEED is read at interpreter start-up, so setting it here only
# affects child processes launched after this point, not the running kernel.
os.environ['PYTHONHASHSEED'] = str(SEED)
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)
# cuDNN autotuning (benchmark=True) may select different convolution algorithms
# from run to run, which defeats the seeding above. Disable it and request
# deterministic kernels instead; this can cost some training speed.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

In [25]:
# Dataset that pairs each image file with the target stored in a CSV file.
class ImageDataset(Dataset):
    """Serve ``(image, target)`` pairs listed in a CSV file.

    Args:
        csv: Path of a CSV file. Every row must hold exactly two values,
            the image file name followed by its target, because each row is
            unpacked into ``name, target``.
        path: Directory that contains the image files named in the CSV.
        transform: Optional callable invoked as ``transform(image=img)``
            that returns a mapping with an ``'image'`` entry. ``None``
            (the default) returns the raw array.
    """

    def __init__(self, csv, path, transform=None):
        # Keep the rows as a plain NumPy array for cheap positional access.
        self.df = pd.read_csv(csv).values
        self.path = path
        self.transform = transform

    def __len__(self):
        """Return the number of rows read from the CSV file."""
        return len(self.df)

    def __getitem__(self, idx):
        """Load the image for row ``idx`` and return ``(image, target)``.

        The image is always decoded as 3-channel RGB so that grayscale,
        palette, RGBA or CMYK files yield arrays of a consistent
        ``(height, width, 3)`` shape.
        """
        name, target = self.df[idx]
        # The context manager releases the file handle as soon as the pixels
        # have been copied into the array.
        with Image.open(os.path.join(self.path, name)) as image_file:
            img = np.array(image_file.convert("RGB"))
        if self.transform is not None:
            img = self.transform(image=img)['image']
        return img, target

## 3. Load Data
* 학습, 테스트 데이터셋과 로더를 정의합니다.

In [26]:
# Build the training and test datasets (no transform is attached here).
# NOTE(review): the training CSV is read at this point, while a later cell
# rewrites the same CSV and deletes image files from ../data/train; this
# object keeps the rows it loaded here, so re-create it after that clean-up.
trn_dataset = ImageDataset(
    csv="../data/train_label_adj.csv",
    path="../data/train/",
)

# Test images used for the server submission; the file names come from the
# sample submission file (the original note described it as the ~3500-image set).
tst_dataset = ImageDataset(
    csv="../data/sample_submission.csv",
    path="../data/test/",
)
print(f"{len(trn_dataset)} {len(tst_dataset)}")

1570 3140


## 4. Delete Data
* 필요하지 않거나 성능을 저해할 것 같은 파일을 데이터셋에서 제거합니다.

In [27]:
# Directory that holds the training images; files are deleted from here below.
train_folder = "../data/train"

In [28]:
# Names of the training images to remove from disk and from the label CSV.
# The numeric headers group the files; they appear to be the class label each
# image was filed under (TODO confirm). A name that had been listed more than
# once is kept only at its first position, and dict.fromkeys() guards against
# duplicates being re-introduced while preserving the order.
files_to_delete = list(dict.fromkeys([

    # 0
    "da39b6ee680e0bb4.jpg",

    # 1
    "e887b53c900b4133.jpg",

    # 2
    "3e74aa6ae952d1a3.jpg", "3af0108e01850ce2.jpg", "4ddc51bca63ac1ff.jpg",
    "4e2a575b25836964.jpg", "a1ab865095b2d312.jpg", "a75e8a8662d20656.jpg",

    # 3
    "4f67b1e9ba0fe848.jpg", "ade3fec2ce27a123.jpg", "be53872196b3ae1d.jpg",
    "297bbf30d4be0d50.jpg", "4620f6e53442f3b6.jpg", "e894c83f1cb6efac.jpg",

    # 4
    "89d9f15b59eab688.jpg", "39a4f1b94db5e879.jpg", "7ef2b4e1897c690e.jpg",
    "4c037c35216fdf7e.jpg", "299a9d5cd6894ad7.jpg", "b1ff3463ccf4d719.jpg",
    "7a5c7b2d02f2dca6.jpg",

    # 5
    "5cab99cabced8333.jpg", "d5da20ecbe8e1f20.jpg", "26b5a257eea69adb.jpg",
    "8705d822732b6be7.jpg", "984980789f8aade3.jpg",

    # 7
    "2b04e4a315457bc0.jpg", "06e3ae9dce375043.jpg", "8392719a0503153f.jpg",
    "3d298ee084f43b5e.jpg", "02dad82a9420ae86.jpg", "36e1647d484b88b7.jpg",
    "3c5e27755a69ce2c.jpg", "feeade617aa68c45.jpg", "f66cc08a628bc668.jpg",
    "4e5848a4ad483009.jpg", "71b4afcf511ef770.jpg", "159f76320c144a6a.jpg",
    "8646f2c3280a4f49.jpg", "a376fbdb67bc4a92.jpg", "77e5c96da6c81a73.jpg",
    "aab8ed28854b23e2.jpg", "81ca8f684b6aee30.jpg",

    # 8
    "1be2e37e46da7808.jpg", "76a643f0d997792f.jpg", "a0d422140ba03bcb.jpg",
    "db0ad34838522238.jpg", "3ae93b2b566b6701.jpg",

    # 9
    "91504eef74a841c0.jpg", "8071934820f18a2e.jpg", "5f58e166f410eae4.jpg",

    # 10
    "2db21acd1d1402a7.jpg", "7c55b18fd58721bb.jpg", "57f3341203558d9d.jpg",

    # 11
    "5a289c094c813d4a.jpg", "7dbf8f668b479f2f.jpg", "e8c0c381c02201c9.jpg",

    # 12
    "05806b384996854a.jpg", "457cd32cb351d570.jpg", "638449335fbd1759.jpg",
    "bcce67ca3fec6158.jpg", "cb866948ad41d6e8.jpg", "e525e58288d02d1d.jpg",
    "8034f6c5674487ff.jpg", "bcf9041be7ebbe73.jpg", "f8e30abea6d91d16.jpg",
    "c6370d4f08c1783d.jpg", "81e5282c3c19a13b.jpg", "2e9848b9a0ccbf72.jpg",
    "4826a34b61ae4e98.jpg", "3c99358b35b7e8fa.jpg", "5b6de2f58985b9a4.jpg",
    "3f3667f946bde615.jpg", "6ab9dd9b5f52f39c.jpg", "2b1076abe3e4338d.jpg",
    "0787007d9064369b.jpg", "a94927812fc5d31e.jpg", "d8ebc588e0c30142.jpg",

    # 13 ("81e5282c3c19a13b.jpg" was also listed here; it is kept under 12)
    "870e37a56286f13c.jpg", "e1df80668c20548f.jpg",

    # 14
    "65b191b1c0b79bbc.jpg", "93964b61ead8e03e.jpg", "5ba19592cee8212c.jpg",
    "f3d4cabc480bfc06.jpg", "4bdec47234b71ffb.jpg", "d17e596e8baca59b.jpg",
    "80f998b8dd043ab3.jpg", "2e3550bead1a72cb.jpg",

    # 16
    "7f264f1599b1a740.jpg", "fe8093a9eab697de.jpg", "f21d717d6df9c74d.jpg",
    "e4b25bf0fc41b317.jpg", "785860603a570231.jpg", "0604225f0353cb7c.jpg",
    "555398e0765220c1.jpg", "6a5e59ff5eca67e7.jpg", "2af50704a985252c.jpg",
    "1dc625f3328476d7.jpg", "1aeb7ac256febd27.jpg",
]))

In [29]:
# Directory the listed files are removed from.
folder_path = train_folder

# Delete every listed file and report the outcome per name.
# dict.fromkeys() visits each name once (in list order), so a name that is
# listed twice is not reported as missing right after it was deleted.
for filename in dict.fromkeys(files_to_delete):
    file_path = os.path.join(folder_path, filename)
    # Attempt the removal directly instead of checking for existence first;
    # only a missing file is treated as a non-error, anything else propagates.
    try:
        os.remove(file_path)
    except FileNotFoundError:
        print(f"{filename} does not exist.")
    else:
        print(f"{filename} deleted successfully.")

da39b6ee680e0bb4.jpg does not exist.
e887b53c900b4133.jpg does not exist.
3e74aa6ae952d1a3.jpg does not exist.
3af0108e01850ce2.jpg does not exist.
4ddc51bca63ac1ff.jpg does not exist.
4e2a575b25836964.jpg does not exist.
a1ab865095b2d312.jpg does not exist.
a75e8a8662d20656.jpg does not exist.
4f67b1e9ba0fe848.jpg does not exist.
ade3fec2ce27a123.jpg does not exist.
be53872196b3ae1d.jpg does not exist.
297bbf30d4be0d50.jpg does not exist.
4620f6e53442f3b6.jpg does not exist.
be53872196b3ae1d.jpg does not exist.
e894c83f1cb6efac.jpg does not exist.
89d9f15b59eab688.jpg does not exist.
39a4f1b94db5e879.jpg does not exist.
7ef2b4e1897c690e.jpg does not exist.
4c037c35216fdf7e.jpg does not exist.
299a9d5cd6894ad7.jpg does not exist.
b1ff3463ccf4d719.jpg does not exist.
7a5c7b2d02f2dca6.jpg does not exist.
5cab99cabced8333.jpg does not exist.
d5da20ecbe8e1f20.jpg does not exist.
26b5a257eea69adb.jpg does not exist.
8705d822732b6be7.jpg does not exist.
984980789f8aade3.jpg does not exist.
2

In [30]:
# Drop the deleted images from the label CSV as well, then overwrite the file
# in place so the labels match what is left on disk.
data = pd.read_csv('../data/train_label_adj.csv')
keep_mask = ~data['ID'].isin(files_to_delete)  # True for rows that stay
data2 = data.loc[keep_mask]
data2.to_csv('../data/train_label_adj.csv', index=False)