## 패키지 설치

In [None]:
#!pip install numpy

# Pytorch 10.2 GPU install
#!pip install torch

# Pytorch 10.2 CPU install
#!pip install torch==1.7.1+cpu torchvision==0.8.2+cpu torchaudio==0.7.2 -f https://download.pytorch.org/whl/torch_stable.html

#!pip install torchvision
#!pip install torchinfo
#!pip install tqdm

## 패키지 불러오기

In [None]:
# CPU 코어 개수를 확인
import os
n_cores = os.cpu_count()

In [2]:
import numpy as np

import torch, torchvision
import torch.nn as nn

from torch.utils.data import DataLoader
from torchvision import transforms
import torch.optim.lr_scheduler as lr_scheduler

from Model import UNet
from Dataset import Dataset, ToTensor, Normalization, RandomFlip
from Utils import IOU_Numpy
from EarlyStopping import EarlyStopping

from tqdm import tqdm

## 설치 패키지 확인

In [None]:
print('python version:',sys.version)
print('numpy version:', np.__version__)
print('torch version:', torch.__version__)
print('torchvision version:', torchvision.__version__)

## 모델 파라미터 (Parameter)

In [3]:
# Learning rate
lr = 1e-3
# 배치 사이즈
batch_size = 24
# Iteration 수
num_epoch = 100

# 데이터를 읽어올 Path
data_dir = "../Dataset/preprocessed/segmentation/"
# 모델을 저장할 Path
ckpt_dir = "./Models/"

# 모델 이름
Model_name = "chicken_segmentation"
# 모델을 연산할 장비 설정 (가능하면 GPU, 안되면 CPU에서 계산함)
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

## 데이터 셋 구성

In [4]:
# Test 데이터를 읽어옴
test_dataset = Dataset(data_dir=data_dir, train=False)
# 갯수를 저장함
n_test = len(test_dataset)

# transform 적용해서 Train 데이터 셋 불러오기
train_transform = transforms.Compose([Normalization(mean=0.5, std=0.5), RandomFlip(), ToTensor()])
train_dataset = Dataset(data_dir=data_dir, train=True, transform=train_transform)
# 전체 Train 데이터 셋 개수를 저장
n_whole_train = len(train_dataset)

# Train 개수를 Test 갯수만큼 Train와 Vlidation set으루 나눔
train_ds, valid_ds = torch.utils.data.random_split(train_dataset, [int(n_whole_train - n_test), n_test])

# Data Loader를 구성함
train_loader = DataLoader(dataset=train_ds, batch_size=batch_size, shuffle=True, num_workers=int(n_cores/2), pin_memory=False)
valid_loader = DataLoader(dataset=valid_ds, batch_size=batch_size, shuffle=True, num_workers=int(n_cores/2), pin_memory=False)

print("The Number of Each Set")
print('Train set: %d, Validation set: %d, Test set: %d' %(len(train_ds), len(valid_ds), n_test))

The Number of Each Set
Train set: 23975, Validation set: 3304, Test set: 3304


## 모델 학습

### 모델 학습 세팅

In [5]:
fn_classifier = lambda x :  1.0 * (x > 0.5)  # threshold 0.5 기준으로 indicator function으로 classifier 구현

# initialize the early_stopping object
early_stopping = EarlyStopping(patience=15, verbose=True, path=ckpt_dir)

# 네트워크 불러오기
net = UNet().to(device) # device : cpu or gpu

# 모델을 여러 GPU에 돌리기
if torch.cuda.device_count() > 1:
    print(torch.cuda.device_count(), "GPUs are available!!")
    net = nn.DataParallel(net)

# loss 정의
fn_loss = nn.BCEWithLogitsLoss().to(device)

# Optimizer 정의
optim = torch.optim.Adam(net.parameters(), lr = lr)
optimizer_scheduler = lr_scheduler.ReduceLROnPlateau(optimizer=optim,
                                                     mode="min",
                                                     factor=0.5,
                                                     patience=5,
                                                     verbose=True)

# 모델 확인
print(net)

2 GPUs are available!!
DataParallel(
  (module): UNet(
    (enc1_1): Sequential(
      (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): ReLU()
    )
    (enc1_2): Sequential(
      (0): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): ReLU()
    )
    (pool1): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (enc2_1): Sequential(
      (0): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (1): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): ReLU()
    )
    (enc2_2): Sequential(
      (0): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (1): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
     

### 모델 학습

In [6]:
# Automatic Mixed-Precision(AMP)를 위한 Gradscaler 선언
scaler = torch.cuda.amp.GradScaler(enabled=True) 

# Iteration 수 만큼 학습 시킴
for epoch in range(num_epoch):
    net.train()
    train_loss_arr = []
    
    # Train set을 Batch size 만큼 가져옴
    for data in tqdm(train_loader):
        # forward
        inputs = data['input'].to(device) # 데이터 device로 올리기
        label = data['label'].to(device) 
        
        # Casts operations to mixed precision 
        with torch.cuda.amp.autocast():     
            output = net(inputs)
            train_loss = fn_loss(output, label)  # output과 label 사이의 loss 계산
        
        scaler.scale(train_loss).backward()  # gradient backpropagation with AMP
        scaler.step(optim)  # backpropa 된 gradient를 이용해서 각 layer의 parameters update with AMP
        
        # Updates the scale for next iteration 
        scaler.update()
        
        # backward
        optim.zero_grad()  # gradient 초기화
        
        # loss를 저장함
        train_loss_arr += [train_loss.item()]
    
    # Validation을 진행하여 Early stopping을 적용함
    with torch.no_grad():  # validation 이기 때문에 backpropa 진행 x, 학습된 네트워크가 정답과 얼마나 가까운지 loss만 계산
        net.eval()  # 네트워크를 evaluation 용으로 선언
        val_loss_arr = []
        val_iou_arr = []
        
        for batch, data in enumerate(valid_loader):
            # forward
            inputs = data['input'].to(device)
            label = data['label'].to(device)
            output = net(inputs)
            
            # loss
            val_loss = fn_loss(output, label)
            val_loss_arr += [val_loss.item()]
            val_iou_arr += [IOU_Numpy(fn_classifier(output), label)]
    
    # Validation이 끝나면 Iteration 후 정보를 출력
    print('epoch %03d / %03d | train loss %.4f | valid loss %.4f | vallid iou %.4f' % (
            epoch+1, num_epoch, np.mean(train_loss_arr), np.mean(val_loss_arr), np.mean(val_iou_arr)))  
    
    # Optimizaer를 Update함
    optimizer_scheduler.step(1 - np.mean(val_iou_arr))
    
    # Validation IOU에 따라 Early stopping을 판단함
    models_dict = dict()
    models_dict[Model_name] = net.state_dict()
    early_stopping(1 - np.mean(val_iou_arr), models_dict)

    # Early stopping patient가 초과되면 멈춤
    if early_stopping.early_stop:
        break

print("All works done!!!")

100%|██████████| 999/999 [25:01<00:00,  1.50s/it]
  0%|          | 0/999 [00:00<?, ?it/s]

epoch 001 / 100 | train loss 0.1218 | valid loss 0.0959 | vallid iou 0.8942
Validation loss decreased (inf --> 0.105816).  Saving model ...


100%|██████████| 999/999 [24:46<00:00,  1.49s/it]


epoch 002 / 100 | train loss 0.0924 | valid loss 0.0896 | vallid iou 0.9053
Validation loss decreased (0.105816 --> 0.094731).  Saving model ...


100%|██████████| 999/999 [24:52<00:00,  1.49s/it]
  0%|          | 0/999 [00:00<?, ?it/s]

epoch 003 / 100 | train loss 0.0874 | valid loss 0.0860 | vallid iou 0.9041
EarlyStopping counter: 1 out of 15


100%|██████████| 999/999 [24:43<00:00,  1.48s/it]


epoch 004 / 100 | train loss 0.0855 | valid loss 0.0848 | vallid iou 0.9054
Validation loss decreased (0.094731 --> 0.094648).  Saving model ...


100%|██████████| 999/999 [24:46<00:00,  1.49s/it]


epoch 005 / 100 | train loss 0.0843 | valid loss 0.0833 | vallid iou 0.9065
Validation loss decreased (0.094648 --> 0.093459).  Saving model ...


100%|██████████| 999/999 [24:49<00:00,  1.49s/it]


epoch 006 / 100 | train loss 0.0831 | valid loss 0.0824 | vallid iou 0.9102
Validation loss decreased (0.093459 --> 0.089800).  Saving model ...


100%|██████████| 999/999 [24:53<00:00,  1.49s/it]
  0%|          | 0/999 [00:00<?, ?it/s]

epoch 007 / 100 | train loss 0.0824 | valid loss 0.0819 | vallid iou 0.9089
EarlyStopping counter: 1 out of 15


100%|██████████| 999/999 [24:48<00:00,  1.49s/it]


epoch 008 / 100 | train loss 0.0817 | valid loss 0.0817 | vallid iou 0.9112
Validation loss decreased (0.089800 --> 0.088850).  Saving model ...


100%|██████████| 999/999 [24:58<00:00,  1.50s/it]
  0%|          | 0/999 [00:00<?, ?it/s]

epoch 009 / 100 | train loss 0.0812 | valid loss 0.0806 | vallid iou 0.9082
EarlyStopping counter: 1 out of 15


100%|██████████| 999/999 [24:45<00:00,  1.49s/it]
  0%|          | 0/999 [00:00<?, ?it/s]

epoch 010 / 100 | train loss 0.0805 | valid loss 0.0813 | vallid iou 0.9105
EarlyStopping counter: 2 out of 15


100%|██████████| 999/999 [24:53<00:00,  1.49s/it]
  0%|          | 0/999 [00:00<?, ?it/s]

epoch 011 / 100 | train loss 0.0801 | valid loss 0.0884 | vallid iou 0.9004
EarlyStopping counter: 3 out of 15


100%|██████████| 999/999 [24:51<00:00,  1.49s/it]
  0%|          | 0/999 [00:00<?, ?it/s]

epoch 012 / 100 | train loss 0.0797 | valid loss 0.0797 | vallid iou 0.9107
EarlyStopping counter: 4 out of 15


100%|██████████| 999/999 [25:03<00:00,  1.51s/it]
  0%|          | 0/999 [00:00<?, ?it/s]

epoch 013 / 100 | train loss 0.0793 | valid loss 0.0792 | vallid iou 0.9085
EarlyStopping counter: 5 out of 15


100%|██████████| 999/999 [24:59<00:00,  1.50s/it]
  0%|          | 0/999 [00:00<?, ?it/s]

epoch 014 / 100 | train loss 0.0789 | valid loss 0.0797 | vallid iou 0.9089
Epoch    14: reducing learning rate of group 0 to 5.0000e-04.
EarlyStopping counter: 6 out of 15


100%|██████████| 999/999 [24:50<00:00,  1.49s/it]


epoch 015 / 100 | train loss 0.0774 | valid loss 0.0789 | vallid iou 0.9116
Validation loss decreased (0.088850 --> 0.088392).  Saving model ...


100%|██████████| 999/999 [24:58<00:00,  1.50s/it]
  0%|          | 0/999 [00:00<?, ?it/s]

epoch 016 / 100 | train loss 0.0771 | valid loss 0.0785 | vallid iou 0.9115
EarlyStopping counter: 1 out of 15


100%|██████████| 999/999 [24:56<00:00,  1.50s/it]


epoch 017 / 100 | train loss 0.0768 | valid loss 0.0784 | vallid iou 0.9118
Validation loss decreased (0.088392 --> 0.088226).  Saving model ...


100%|██████████| 999/999 [24:56<00:00,  1.50s/it]
  0%|          | 0/999 [00:00<?, ?it/s]

epoch 018 / 100 | train loss 0.0766 | valid loss 0.0783 | vallid iou 0.9112
EarlyStopping counter: 1 out of 15


100%|██████████| 999/999 [24:53<00:00,  1.50s/it]
  0%|          | 0/999 [00:00<?, ?it/s]

epoch 019 / 100 | train loss 0.0763 | valid loss 0.0783 | vallid iou 0.9110
EarlyStopping counter: 2 out of 15


100%|██████████| 999/999 [25:00<00:00,  1.50s/it]
  0%|          | 0/999 [00:00<?, ?it/s]

epoch 020 / 100 | train loss 0.0761 | valid loss 0.0787 | vallid iou 0.9107
EarlyStopping counter: 3 out of 15


100%|██████████| 999/999 [24:53<00:00,  1.50s/it]
  0%|          | 0/999 [00:00<?, ?it/s]

epoch 021 / 100 | train loss 0.0759 | valid loss 0.0784 | vallid iou 0.9114
EarlyStopping counter: 4 out of 15


100%|██████████| 999/999 [24:49<00:00,  1.49s/it]
  0%|          | 0/999 [00:00<?, ?it/s]

epoch 022 / 100 | train loss 0.0756 | valid loss 0.0783 | vallid iou 0.9114
EarlyStopping counter: 5 out of 15


100%|██████████| 999/999 [24:54<00:00,  1.50s/it]


epoch 023 / 100 | train loss 0.0754 | valid loss 0.0785 | vallid iou 0.9135
Validation loss decreased (0.088226 --> 0.086460).  Saving model ...


100%|██████████| 999/999 [24:57<00:00,  1.50s/it]
  0%|          | 0/999 [00:00<?, ?it/s]

epoch 024 / 100 | train loss 0.0751 | valid loss 0.0785 | vallid iou 0.9114
EarlyStopping counter: 1 out of 15


100%|██████████| 999/999 [24:55<00:00,  1.50s/it]
  0%|          | 0/999 [00:00<?, ?it/s]

epoch 025 / 100 | train loss 0.0749 | valid loss 0.0785 | vallid iou 0.9121
EarlyStopping counter: 2 out of 15


100%|██████████| 999/999 [24:50<00:00,  1.49s/it]
  0%|          | 0/999 [00:00<?, ?it/s]

epoch 026 / 100 | train loss 0.0746 | valid loss 0.0791 | vallid iou 0.9133
EarlyStopping counter: 3 out of 15


100%|██████████| 999/999 [24:52<00:00,  1.49s/it]
  0%|          | 0/999 [00:00<?, ?it/s]

epoch 027 / 100 | train loss 0.0744 | valid loss 0.0790 | vallid iou 0.9134
EarlyStopping counter: 4 out of 15


100%|██████████| 999/999 [24:54<00:00,  1.50s/it]
  0%|          | 0/999 [00:00<?, ?it/s]

epoch 028 / 100 | train loss 0.0739 | valid loss 0.0790 | vallid iou 0.9114
EarlyStopping counter: 5 out of 15


100%|██████████| 999/999 [25:05<00:00,  1.51s/it]
  0%|          | 0/999 [00:00<?, ?it/s]

epoch 029 / 100 | train loss 0.0737 | valid loss 0.0794 | vallid iou 0.9131
Epoch    29: reducing learning rate of group 0 to 2.5000e-04.
EarlyStopping counter: 6 out of 15


100%|██████████| 999/999 [25:08<00:00,  1.51s/it]
  0%|          | 0/999 [00:00<?, ?it/s]

epoch 030 / 100 | train loss 0.0724 | valid loss 0.0793 | vallid iou 0.9118
EarlyStopping counter: 7 out of 15


100%|██████████| 999/999 [24:54<00:00,  1.50s/it]
  0%|          | 0/999 [00:00<?, ?it/s]

epoch 031 / 100 | train loss 0.0719 | valid loss 0.0798 | vallid iou 0.9120
EarlyStopping counter: 8 out of 15


100%|██████████| 999/999 [24:56<00:00,  1.50s/it]
  0%|          | 0/999 [00:00<?, ?it/s]

epoch 032 / 100 | train loss 0.0715 | valid loss 0.0803 | vallid iou 0.9115
EarlyStopping counter: 9 out of 15


100%|██████████| 999/999 [25:04<00:00,  1.51s/it]
  0%|          | 0/999 [00:00<?, ?it/s]

epoch 033 / 100 | train loss 0.0713 | valid loss 0.0798 | vallid iou 0.9118
EarlyStopping counter: 10 out of 15


100%|██████████| 999/999 [25:07<00:00,  1.51s/it]
  0%|          | 0/999 [00:00<?, ?it/s]

epoch 034 / 100 | train loss 0.0708 | valid loss 0.0803 | vallid iou 0.9116
EarlyStopping counter: 11 out of 15


100%|██████████| 999/999 [24:56<00:00,  1.50s/it]
  0%|          | 0/999 [00:00<?, ?it/s]

epoch 035 / 100 | train loss 0.0705 | valid loss 0.0812 | vallid iou 0.9123
Epoch    35: reducing learning rate of group 0 to 1.2500e-04.
EarlyStopping counter: 12 out of 15


100%|██████████| 999/999 [24:52<00:00,  1.49s/it]
  0%|          | 0/999 [00:00<?, ?it/s]

epoch 036 / 100 | train loss 0.0695 | valid loss 0.0810 | vallid iou 0.9108
EarlyStopping counter: 13 out of 15


100%|██████████| 999/999 [24:57<00:00,  1.50s/it]
  0%|          | 0/999 [00:00<?, ?it/s]

epoch 037 / 100 | train loss 0.0692 | valid loss 0.0817 | vallid iou 0.9108
EarlyStopping counter: 14 out of 15


100%|██████████| 999/999 [25:02<00:00,  1.50s/it]


epoch 038 / 100 | train loss 0.0688 | valid loss 0.0823 | vallid iou 0.9116
EarlyStopping counter: 15 out of 15
All works done!!!
