## 패키지 설치

In [1]:
#!pip install numpy

# Pytorch 10.2 GPU install
#!pip install torch

# Pytorch 10.2 CPU install
#!pip install torch==1.7.1+cpu torchvision==0.8.2+cpu torchaudio==0.7.2 -f https://download.pytorch.org/whl/torch_stable.html

#!pip install torchvision
#!pip install torchinfo
#!pip install tqdm

## 패키지 불러오기

In [2]:
# CPU 코어 개수를 확인
import os
n_cores = os.cpu_count()

In [5]:
import sys
import numpy as np

import torch
import torch.nn as nn

from torch.utils.data import DataLoader, ConcatDataset
import torchvision
from torchvision import transforms
import torch.optim.lr_scheduler as lr_scheduler

from Model import UNet
from Dataset import Dataset, ToTensor, Normalization, RandomFlip
from Utils import IOU_Numpy
from EarlyStopping import EarlyStopping

from tqdm import tqdm

## 설치 패키지 확인

In [8]:
print('python version:',sys.version)
print('numpy version:', np.__version__)
print('torch version:', torch.__version__)
print('torchvision version:', torchvision.__version__)

python version: 3.7.10 (default, Feb 26 2021, 18:47:35) 
[GCC 7.3.0]
numpy version: 1.19.2
torch version: 1.7.1
torchvision version: 0.8.2


## 모델 파라미터 (Parameter)

In [9]:
# Learning rate
lr = 1e-3
# 배치 사이즈
batch_size = 24
# Iteration
num_epoch = 100

# 데이터를 읽어올 Path
top_data_dir = "../Dataset/preprocessed/segmentation/top_result/"
side_data_dir = "../Dataset/preprocessed/segmentation/side_result/"
# 모델을 저장할 Path
ckpt_dir = "./Models/"

# 모델 이름
Model_name = "egg_segmentation"
# 모델을 연산할 장비 설정 (가능하면 GPU, 안되면 CPU에서 계산함)
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

## 데이터 셋 구성

In [10]:
# 각 Top과 Side 데이터 별 Test set을 읽어옴
test_top_dataset = Dataset(data_dir=top_data_dir, train=False)
n_top_test = len(test_top_dataset)
test_side_dataset = Dataset(data_dir=side_data_dir, train=False)
n_side_test = len(test_side_dataset)

print("The number of each test set")
print("Top test set: %d, Side test set: %d" % (n_top_test, n_side_test))

The number of each test set
Top test set: 1411, Side test set: 1497


In [11]:
# transform 적용해서 데이터 셋 불러오기
train_transform = transforms.Compose([Normalization(mean=0.5, std=0.5), RandomFlip(), ToTensor()])
train_top_dataset = Dataset(data_dir=top_data_dir, train=True, transform=train_transform)
n_top_train = len(train_top_dataset)
train_side_dataset = Dataset(data_dir=side_data_dir, train=True, transform=train_transform)
n_side_train = len(train_side_dataset)

# Train 개수를 Test 갯수만큼 Train와 Vlidation set으루 나눔
train_top_ds, valid_top_ds = torch.utils.data.random_split(train_top_dataset, [int(n_top_train - n_top_test), n_top_test])
train_side_ds, valid_side_ds = torch.utils.data.random_split(train_side_dataset, [int(n_side_train - n_side_test), n_side_test])

# Top과 Side 데이터들을 Concatenation 시킴
train_ds = ConcatDataset([train_top_ds, train_side_ds])
valid_ds = ConcatDataset([valid_top_ds, valid_side_ds])

# Data Loader를 구성함
train_loader = DataLoader(dataset=train_ds, batch_size=batch_size, shuffle=True, num_workers=int(n_cores/2), pin_memory=False)
valid_loader = DataLoader(dataset=valid_ds, batch_size=batch_size, shuffle=True, num_workers=int(n_cores/2), pin_memory=False)

print("The Number of Each Set")
print('Train set: %d, Validation set: %d, Test set: %d' %(len(train_ds), len(valid_ds), n_top_test+n_side_test))

The Number of Each Set
Train set: 23204, Validation set: 2908, Test set: 2908


## 모델 학습

### 모델 학습 세팅

In [12]:
fn_classifier = lambda x :  1.0 * (x > 0.5)  # threshold 0.5 기준으로 indicator function으로 classifier 구현

# initialize the early_stopping object
early_stopping = EarlyStopping(patience=15, verbose=True, path=ckpt_dir)

# 네트워크 불러오기
net = UNet().to(device) # device : cpu or gpu

# 모델을 여러 GPU에 돌리기
if torch.cuda.device_count() > 1:
    print(torch.cuda.device_count(), "GPUs are available!!")
    net = nn.DataParallel(net)

# loss 정의
fn_loss = nn.BCEWithLogitsLoss().to(device)

# Optimizer 정의
optim = torch.optim.Adam(net.parameters(), lr = lr)
optimizer_scheduler = lr_scheduler.ReduceLROnPlateau(optimizer=optim,
                                                     mode="min",
                                                     factor=0.5,
                                                     patience=5,
                                                     verbose=True)
# 모델 확인
print(net)

4 GPUs are available!!
DataParallel(
  (module): UNet(
    (enc1_1): Sequential(
      (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): ReLU()
    )
    (enc1_2): Sequential(
      (0): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): ReLU()
    )
    (pool1): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (enc2_1): Sequential(
      (0): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (1): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): ReLU()
    )
    (enc2_2): Sequential(
      (0): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (1): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
     

### 모델 학습

In [14]:
# Automatic Mixed-Precision(AMP)를 위한 Gradscaler 선언
scaler = torch.cuda.amp.GradScaler(enabled=True) 

# Iteration 수 만큼 학습 시킴
for epoch in range(num_epoch):
    net.train()
    train_loss_arr = []
    
    # Train set을 Batch size 만큼 가져옴
    for data in tqdm(train_loader):
        # forward
        inputs = data['input'].to(device) # 데이터 device로 올리기
        label = data['label'].to(device) 
        
        # Casts operations to mixed precision 
        with torch.cuda.amp.autocast(): 
            output = net(inputs)
            train_loss = fn_loss(output, label)  # output과 label 사이의 loss 계산
            
        scaler.scale(train_loss).backward()  # gradient backpropagation with AMP
        scaler.step(optim)  # backpropa 된 gradient를 이용해서 각 layer의 parameters update with AMP
        
        # Updates the scale for next iteration 
        scaler.update()
        
        # backward
        optim.zero_grad()  # gradient 초기화
        
        # loss를 저장함
        train_loss_arr += [train_loss.item()]
    
    # Validation을 진행하여 Early stopping을 적용함
    with torch.no_grad():  # validation 이기 때문에 backpropa 진행 x, 학습된 네트워크가 정답과 얼마나 가까운지 loss만 계산
        net.eval()  # 네트워크를 evaluation 용으로 선언
        val_loss_arr = []
        val_iou_arr = []
        
        for batch, data in enumerate(valid_loader):
            # forward
            inputs = data['input'].to(device)
            label = data['label'].to(device)
            output = net(inputs)
            
            # loss
            val_loss = fn_loss(output, label)
            val_loss_arr += [val_loss.item()]
            val_iou_arr += [IOU_Numpy(fn_classifier(output), label)]
    
    # Validation이 끝나면 Iteration 후 정보를 출력
    print('epoch %03d / %03d | train loss %.4f | valid loss %.4f | vallid iou %.4f' % (
            epoch+1, num_epoch, np.mean(train_loss_arr), np.mean(val_loss_arr), np.mean(val_iou_arr)))  
    
    # Optimizaer를 Update함
    optimizer_scheduler.step(1 - np.mean(val_iou_arr))
    
    # Validation IOU에 따라 Early stopping을 판단함
    models_dict = dict()
    models_dict[Model_name] = net.state_dict()
    early_stopping(1 - np.mean(val_iou_arr), models_dict)

    # Early stopping patient가 초과되면 멈춤
    if early_stopping.early_stop:
        break

print("All works done!!!")

100%|██████████| 967/967 [06:31<00:00,  2.47it/s]


epoch 001 / 100 | train loss 0.0162 | valid loss 0.0093 | vallid iou 0.7022
Validation loss decreased (inf --> 0.297816).  Saving model ...


100%|██████████| 967/967 [06:32<00:00,  2.46it/s]


epoch 002 / 100 | train loss 0.0051 | valid loss 0.0067 | vallid iou 0.8108
Validation loss decreased (0.297816 --> 0.189161).  Saving model ...


100%|██████████| 967/967 [06:35<00:00,  2.44it/s]


epoch 003 / 100 | train loss 0.0044 | valid loss 0.0055 | vallid iou 0.8670
Validation loss decreased (0.189161 --> 0.133027).  Saving model ...


100%|██████████| 967/967 [06:34<00:00,  2.45it/s]


epoch 004 / 100 | train loss 0.0043 | valid loss 0.0039 | vallid iou 0.8961
Validation loss decreased (0.133027 --> 0.103886).  Saving model ...


100%|██████████| 967/967 [06:37<00:00,  2.43it/s]
  0%|          | 0/967 [00:00<?, ?it/s]

epoch 005 / 100 | train loss 0.0040 | valid loss 0.0048 | vallid iou 0.8525
EarlyStopping counter: 1 out of 15


100%|██████████| 967/967 [06:36<00:00,  2.44it/s]
  0%|          | 0/967 [00:00<?, ?it/s]

epoch 006 / 100 | train loss 0.0040 | valid loss 0.0049 | vallid iou 0.8853
EarlyStopping counter: 2 out of 15


100%|██████████| 967/967 [06:37<00:00,  2.43it/s]
  0%|          | 0/967 [00:00<?, ?it/s]

epoch 007 / 100 | train loss 0.0038 | valid loss 0.0039 | vallid iou 0.8947
EarlyStopping counter: 3 out of 15


100%|██████████| 967/967 [06:38<00:00,  2.43it/s]


epoch 008 / 100 | train loss 0.0038 | valid loss 0.0039 | vallid iou 0.9002
Validation loss decreased (0.103886 --> 0.099771).  Saving model ...


100%|██████████| 967/967 [06:39<00:00,  2.42it/s]


epoch 009 / 100 | train loss 0.0039 | valid loss 0.0036 | vallid iou 0.9018
Validation loss decreased (0.099771 --> 0.098201).  Saving model ...


100%|██████████| 967/967 [06:38<00:00,  2.43it/s]
  0%|          | 0/967 [00:00<?, ?it/s]

epoch 010 / 100 | train loss 0.0036 | valid loss 0.0037 | vallid iou 0.9009
EarlyStopping counter: 1 out of 15


100%|██████████| 967/967 [06:39<00:00,  2.42it/s]


epoch 011 / 100 | train loss 0.0036 | valid loss 0.0035 | vallid iou 0.9022
Validation loss decreased (0.098201 --> 0.097769).  Saving model ...


100%|██████████| 967/967 [06:40<00:00,  2.42it/s]
  0%|          | 0/967 [00:00<?, ?it/s]

epoch 012 / 100 | train loss 0.0037 | valid loss 0.0036 | vallid iou 0.9017
EarlyStopping counter: 1 out of 15


100%|██████████| 967/967 [06:38<00:00,  2.43it/s]


epoch 013 / 100 | train loss 0.0036 | valid loss 0.0035 | vallid iou 0.9067
Validation loss decreased (0.097769 --> 0.093290).  Saving model ...


100%|██████████| 967/967 [06:38<00:00,  2.43it/s]
  0%|          | 0/967 [00:00<?, ?it/s]

epoch 014 / 100 | train loss 0.0036 | valid loss 0.0057 | vallid iou 0.8651
EarlyStopping counter: 1 out of 15


100%|██████████| 967/967 [06:40<00:00,  2.42it/s]
  0%|          | 0/967 [00:00<?, ?it/s]

epoch 015 / 100 | train loss 0.0036 | valid loss 0.0034 | vallid iou 0.9047
EarlyStopping counter: 2 out of 15


100%|██████████| 967/967 [06:40<00:00,  2.42it/s]


epoch 016 / 100 | train loss 0.0035 | valid loss 0.0034 | vallid iou 0.9080
Validation loss decreased (0.093290 --> 0.091998).  Saving model ...


100%|██████████| 967/967 [06:41<00:00,  2.41it/s]
  0%|          | 0/967 [00:00<?, ?it/s]

epoch 017 / 100 | train loss 0.0035 | valid loss 0.0036 | vallid iou 0.9049
EarlyStopping counter: 1 out of 15


100%|██████████| 967/967 [06:39<00:00,  2.42it/s]
  0%|          | 0/967 [00:00<?, ?it/s]

epoch 018 / 100 | train loss 0.0034 | valid loss 0.0034 | vallid iou 0.9075
EarlyStopping counter: 2 out of 15


100%|██████████| 967/967 [06:40<00:00,  2.42it/s]
  0%|          | 0/967 [00:00<?, ?it/s]

epoch 019 / 100 | train loss 0.0034 | valid loss 0.0034 | vallid iou 0.9041
EarlyStopping counter: 3 out of 15


100%|██████████| 967/967 [06:40<00:00,  2.41it/s]
  0%|          | 0/967 [00:00<?, ?it/s]

epoch 020 / 100 | train loss 0.0034 | valid loss 0.0048 | vallid iou 0.8898
EarlyStopping counter: 4 out of 15


100%|██████████| 967/967 [06:40<00:00,  2.42it/s]


epoch 021 / 100 | train loss 0.0034 | valid loss 0.0035 | vallid iou 0.9118
Validation loss decreased (0.091998 --> 0.088201).  Saving model ...


100%|██████████| 967/967 [06:39<00:00,  2.42it/s]
  0%|          | 0/967 [00:00<?, ?it/s]

epoch 022 / 100 | train loss 0.0034 | valid loss 0.0034 | vallid iou 0.9107
EarlyStopping counter: 1 out of 15


100%|██████████| 967/967 [06:40<00:00,  2.42it/s]
  0%|          | 0/967 [00:00<?, ?it/s]

epoch 023 / 100 | train loss 0.0034 | valid loss 0.0034 | vallid iou 0.9033
EarlyStopping counter: 2 out of 15


100%|██████████| 967/967 [06:41<00:00,  2.41it/s]
  0%|          | 0/967 [00:00<?, ?it/s]

epoch 024 / 100 | train loss 0.0034 | valid loss 0.0033 | vallid iou 0.9092
EarlyStopping counter: 3 out of 15


100%|██████████| 967/967 [06:38<00:00,  2.43it/s]
  0%|          | 0/967 [00:00<?, ?it/s]

epoch 025 / 100 | train loss 0.0033 | valid loss 0.0034 | vallid iou 0.9114
EarlyStopping counter: 4 out of 15


100%|██████████| 967/967 [06:38<00:00,  2.42it/s]
  0%|          | 0/967 [00:00<?, ?it/s]

epoch 026 / 100 | train loss 0.0033 | valid loss 0.0034 | vallid iou 0.9094
EarlyStopping counter: 5 out of 15


100%|██████████| 967/967 [06:40<00:00,  2.41it/s]
  0%|          | 0/967 [00:00<?, ?it/s]

epoch 027 / 100 | train loss 0.0033 | valid loss 0.0033 | vallid iou 0.9094
Epoch    27: reducing learning rate of group 0 to 5.0000e-04.
EarlyStopping counter: 6 out of 15


100%|██████████| 967/967 [06:40<00:00,  2.41it/s]
  0%|          | 0/967 [00:00<?, ?it/s]

epoch 028 / 100 | train loss 0.0033 | valid loss 0.0032 | vallid iou 0.9110
EarlyStopping counter: 7 out of 15


100%|██████████| 967/967 [06:40<00:00,  2.41it/s]
  0%|          | 0/967 [00:00<?, ?it/s]

epoch 029 / 100 | train loss 0.0032 | valid loss 0.0032 | vallid iou 0.9117
EarlyStopping counter: 8 out of 15


100%|██████████| 967/967 [06:40<00:00,  2.41it/s]


epoch 030 / 100 | train loss 0.0032 | valid loss 0.0032 | vallid iou 0.9127
Validation loss decreased (0.088201 --> 0.087301).  Saving model ...


100%|██████████| 967/967 [06:40<00:00,  2.42it/s]
  0%|          | 0/967 [00:00<?, ?it/s]

epoch 031 / 100 | train loss 0.0032 | valid loss 0.0033 | vallid iou 0.9109
EarlyStopping counter: 1 out of 15


100%|██████████| 967/967 [06:40<00:00,  2.41it/s]
  0%|          | 0/967 [00:00<?, ?it/s]

epoch 032 / 100 | train loss 0.0032 | valid loss 0.0032 | vallid iou 0.9108
EarlyStopping counter: 2 out of 15


100%|██████████| 967/967 [06:40<00:00,  2.41it/s]
  0%|          | 0/967 [00:00<?, ?it/s]

epoch 033 / 100 | train loss 0.0032 | valid loss 0.0032 | vallid iou 0.9124
EarlyStopping counter: 3 out of 15


100%|██████████| 967/967 [06:41<00:00,  2.41it/s]


epoch 034 / 100 | train loss 0.0032 | valid loss 0.0032 | vallid iou 0.9129
Validation loss decreased (0.087301 --> 0.087070).  Saving model ...


100%|██████████| 967/967 [06:40<00:00,  2.41it/s]
  0%|          | 0/967 [00:00<?, ?it/s]

epoch 035 / 100 | train loss 0.0032 | valid loss 0.0032 | vallid iou 0.9118
EarlyStopping counter: 1 out of 15


100%|██████████| 967/967 [06:41<00:00,  2.41it/s]
  0%|          | 0/967 [00:00<?, ?it/s]

epoch 036 / 100 | train loss 0.0032 | valid loss 0.0032 | vallid iou 0.9073
EarlyStopping counter: 2 out of 15


100%|██████████| 967/967 [06:40<00:00,  2.41it/s]
  0%|          | 0/967 [00:00<?, ?it/s]

epoch 037 / 100 | train loss 0.0032 | valid loss 0.0032 | vallid iou 0.9106
EarlyStopping counter: 3 out of 15


100%|██████████| 967/967 [06:40<00:00,  2.41it/s]
  0%|          | 0/967 [00:00<?, ?it/s]

epoch 038 / 100 | train loss 0.0032 | valid loss 0.0032 | vallid iou 0.9101
EarlyStopping counter: 4 out of 15


100%|██████████| 967/967 [06:40<00:00,  2.41it/s]


epoch 039 / 100 | train loss 0.0032 | valid loss 0.0032 | vallid iou 0.9143
Validation loss decreased (0.087070 --> 0.085735).  Saving model ...


100%|██████████| 967/967 [06:39<00:00,  2.42it/s]


epoch 040 / 100 | train loss 0.0032 | valid loss 0.0032 | vallid iou 0.9146
Validation loss decreased (0.085735 --> 0.085450).  Saving model ...


100%|██████████| 967/967 [06:39<00:00,  2.42it/s]
  0%|          | 0/967 [00:00<?, ?it/s]

epoch 041 / 100 | train loss 0.0031 | valid loss 0.0032 | vallid iou 0.9108
EarlyStopping counter: 1 out of 15


100%|██████████| 967/967 [06:38<00:00,  2.43it/s]


epoch 042 / 100 | train loss 0.0031 | valid loss 0.0032 | vallid iou 0.9152
Validation loss decreased (0.085450 --> 0.084802).  Saving model ...


100%|██████████| 967/967 [06:39<00:00,  2.42it/s]
  0%|          | 0/967 [00:00<?, ?it/s]

epoch 043 / 100 | train loss 0.0031 | valid loss 0.0032 | vallid iou 0.9120
EarlyStopping counter: 1 out of 15


100%|██████████| 967/967 [06:39<00:00,  2.42it/s]
  0%|          | 0/967 [00:00<?, ?it/s]

epoch 044 / 100 | train loss 0.0031 | valid loss 0.0032 | vallid iou 0.9110
EarlyStopping counter: 2 out of 15


100%|██████████| 967/967 [06:37<00:00,  2.43it/s]
  0%|          | 0/967 [00:00<?, ?it/s]

epoch 045 / 100 | train loss 0.0031 | valid loss 0.0032 | vallid iou 0.9151
EarlyStopping counter: 3 out of 15


100%|██████████| 967/967 [06:38<00:00,  2.42it/s]
  0%|          | 0/967 [00:00<?, ?it/s]

epoch 046 / 100 | train loss 0.0031 | valid loss 0.0032 | vallid iou 0.9126
EarlyStopping counter: 4 out of 15


100%|██████████| 967/967 [06:36<00:00,  2.44it/s]
  0%|          | 0/967 [00:00<?, ?it/s]

epoch 047 / 100 | train loss 0.0031 | valid loss 0.0032 | vallid iou 0.9130
EarlyStopping counter: 5 out of 15


100%|██████████| 967/967 [06:38<00:00,  2.43it/s]
  0%|          | 0/967 [00:00<?, ?it/s]

epoch 048 / 100 | train loss 0.0031 | valid loss 0.0032 | vallid iou 0.9108
Epoch    48: reducing learning rate of group 0 to 2.5000e-04.
EarlyStopping counter: 6 out of 15


100%|██████████| 967/967 [06:38<00:00,  2.42it/s]
  0%|          | 0/967 [00:00<?, ?it/s]

epoch 049 / 100 | train loss 0.0031 | valid loss 0.0031 | vallid iou 0.9107
EarlyStopping counter: 7 out of 15


100%|██████████| 967/967 [06:36<00:00,  2.44it/s]
  0%|          | 0/967 [00:00<?, ?it/s]

epoch 050 / 100 | train loss 0.0030 | valid loss 0.0031 | vallid iou 0.9128
EarlyStopping counter: 8 out of 15


100%|██████████| 967/967 [06:38<00:00,  2.43it/s]
  0%|          | 0/967 [00:00<?, ?it/s]

epoch 051 / 100 | train loss 0.0030 | valid loss 0.0032 | vallid iou 0.9116
EarlyStopping counter: 9 out of 15


100%|██████████| 967/967 [06:38<00:00,  2.43it/s]
  0%|          | 0/967 [00:00<?, ?it/s]

epoch 052 / 100 | train loss 0.0030 | valid loss 0.0031 | vallid iou 0.9124
EarlyStopping counter: 10 out of 15


100%|██████████| 967/967 [06:35<00:00,  2.44it/s]
  0%|          | 0/967 [00:00<?, ?it/s]

epoch 053 / 100 | train loss 0.0030 | valid loss 0.0031 | vallid iou 0.9122
EarlyStopping counter: 11 out of 15


100%|██████████| 967/967 [06:38<00:00,  2.43it/s]
  0%|          | 0/967 [00:00<?, ?it/s]

epoch 054 / 100 | train loss 0.0030 | valid loss 0.0032 | vallid iou 0.9120
Epoch    54: reducing learning rate of group 0 to 1.2500e-04.
EarlyStopping counter: 12 out of 15


100%|██████████| 967/967 [06:37<00:00,  2.43it/s]
  0%|          | 0/967 [00:00<?, ?it/s]

epoch 055 / 100 | train loss 0.0030 | valid loss 0.0031 | vallid iou 0.9132
EarlyStopping counter: 13 out of 15


100%|██████████| 967/967 [06:39<00:00,  2.42it/s]
  0%|          | 0/967 [00:00<?, ?it/s]

epoch 056 / 100 | train loss 0.0030 | valid loss 0.0031 | vallid iou 0.9130
EarlyStopping counter: 14 out of 15


100%|██████████| 967/967 [06:38<00:00,  2.42it/s]


epoch 057 / 100 | train loss 0.0030 | valid loss 0.0031 | vallid iou 0.9117
EarlyStopping counter: 15 out of 15
All works done!!!
