In [4]:
import os
import random
import sys
import time
from datetime import datetime
from zoneinfo import ZoneInfo

import albumentations as A
import numpy as np
import pandas as pd
import timm
import torch
import torch.nn as nn
import wandb
from albumentations.pytorch import ToTensorV2
from dotenv import load_dotenv
from PIL import Image
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split
from torch.optim import Adam
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from tqdm import tqdm

import common

In [5]:
# Log in to Weights & Biases through the project helper, passing the label for
# this run. NOTE(review): presumably this also calls wandb.init(), since
# wandb.config is updated in a later cell — confirm in common.py.
common.wandb_login_init('SKF_tf_efficientnet_b7.ns_jft_in1k')

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mkerneld82[0m ([33mkerneld[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /data/ephemeral/home/.netrc


train_time = SKF_tf_efficientnet_b7.ns_jft_in1k


In [6]:
# Compute device: CUDA when it is available, otherwise the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Root directory of the dataset.
data_path = 'datasets_fin/'

# Backbone architecture name handed to the training helpers
# (other candidates noted by the author: 'resnet50', 'efficientnet-b0', ...).
model_name = 'tf_efficientnet_b7.ns_jft_in1k'

# Training hyperparameters.
trn_img_size = tst_img_size = 600  # train and test images use the same size
LR = 1e-3
EPOCHS = 15
FOLDS = 2
BATCH_SIZE = 5
num_workers = 12
augment_ratio = 200

# Record the hyperparameters of this run in the wandb run configuration.
wandb.config.update(dict(
    learning_rate=LR,
    architecture=model_name,
    dataset="custom-dataset",
    epochs=EPOCHS,
    folds=FOLDS,
    batch_size=BATCH_SIZE,
    train_image_size=trn_img_size,
    test_image_size=tst_img_size,
    num_workers=num_workers,
    augment_ratio=augment_ratio,
))

In [7]:
# Fix the random seed. The same SEED is also passed to the fold-generation and
# training helpers in later cells.
SEED = 42

common.set_seed(SEED)

In [8]:
# Display the selected device to confirm whether CUDA is being used.
device

device(type='cuda')

### Load Data
* 학습, 테스트 데이터셋과 로더를 정의합니다.

In [6]:
# First create the per-fold CSV files used for training and validation.
# They are generated up front so that training can be resumed later.
common.generate_fold_train_valid_csv_files(SEED, FOLDS)

Fold 1/2, train_idx: <class 'numpy.ndarray'> 867, <class 'numpy.ndarray'> 867
Fold 2/2, train_idx: <class 'numpy.ndarray'> 867, <class 'numpy.ndarray'> 867


In [7]:
# Train each fold in turn; folds are numbered 1..FOLDS.
# NOTE(review): in the recorded run, a BREAK stop requested during fold 1 did
# not keep fold 2 from starting — confirm whether this loop should exit too.
for fold in range(1, FOLDS + 1):
    print(f"Fold {fold}/{FOLDS}")

    # Per-fold training resources. The returned mapping is read below for its
    # 'trn_loader', 'val_loader', 'model', 'optimizer' and 'loss_fn' entries.
    supplies = common.get_supplies_for_train_and_valid_with_fold(
        seed=SEED,
        model_name=model_name,
        lr=LR,
        batch_size=BATCH_SIZE,
        num_workers=num_workers,
        fold=fold,
        folds=FOLDS,
        augment_ratio=augment_ratio,
        trn_img_size=trn_img_size,
        tst_img_size=tst_img_size,
        device=device,
    )

    # Train from epoch 1 through EPOCHS (the end bound is exclusive), with
    # checkpoint saving and train/validation evaluation switched on.
    common.train_with_start_end_epoch(
        seed=SEED,
        tst_img_size=tst_img_size,
        batch_size=BATCH_SIZE,
        start_epoch_inclusive=1,
        end_epoch_exclusive=EPOCHS + 1,
        augment_ratio=augment_ratio,
        trn_loader=supplies['trn_loader'],
        val_loader=supplies['val_loader'],
        model=supplies['model'],
        model_name=model_name,
        optimizer=supplies['optimizer'],
        loss_fn=supplies['loss_fn'],
        device=device,
        is_save_model_checkpoint=True,
        is_evaluate_train_valid=True,
        fold=fold,
        folds=FOLDS,
    )
    

Fold 1/2

epoch: 1
다음 epoch 학습을 중단 예약하고 싶으면, 빈 BREAK 파일을 생성하세요.


Loss: 0.1157: 100%|██████████| 34680/34680 [2:58:07<00:00,  3.25it/s]  


train_loss: 0.1424
train_acc: 0.9520
train_f1: 0.9521



Evaluating: 100%|██████████| 174/174 [00:20<00:00,  8.51it/s]



final_valid_loss: 0.339094966437471
final_valid_accuracy: 0.9400230680507498
final_valid_f1: 0.9379700103827973
valid's error preds count: 52/867, {7: 22, 3: 17, 14: 8, 9: 3, 11: 1, 6: 1}
Model checkpoint saved. filename: cp-tf_efficientnet_b7.ns_jft_in1k_sd_42_epc_1_aug_200_vl_0.3391_va_0.9400_vf1_0.9380_fold_1_folds_2.pt
5초 동안 잠시 sleep 합니다. 다음 epoch 학습을 중단 예약하고 싶으면, 빈 BREAK 파일을 생성하세요.
BREAK 파일이 있는지 확인합니다.
BREAK 파일이 존재합니다. 학습을 중단합니다.
Fold 2/2

epoch: 1
다음 epoch 학습을 중단 예약하고 싶으면, 빈 BREAK 파일을 생성하세요.


Loss: 0.5245:   1%|          | 363/34680 [01:53<2:58:17,  3.21it/s]


KeyboardInterrupt: 

In [None]:
# End the wandb run.
wandb.finish()

In [None]:
# Deliberate stop for "Run All": sys.exit raises SystemExit here so the cells
# below are only ever executed by hand. `sys` is imported in the import cell
# at the top of the notebook rather than mid-notebook.
sys.exit('아래 셀은 수동으로 실행하기 위해서 여기서 실행 멈춤.')

: 

# 수동으로 특정 체크포인트부터 이어서 학습하기 위한 부분


In [9]:
# Fold to train manually in the cells below (1-based, like the fold loop above).
fold = 2

In [10]:
# Build the training resources for the manually selected fold. The returned
# mapping is read later for its 'trn_loader', 'val_loader', 'model',
# 'optimizer' and 'loss_fn' entries.
supplies = common.get_supplies_for_train_and_valid_with_fold(
    seed=SEED,
    model_name=model_name,
    lr=LR,
    batch_size=BATCH_SIZE,
    num_workers=num_workers,
    fold=fold,
    folds=FOLDS,
    augment_ratio=augment_ratio,
    trn_img_size=trn_img_size,
    tst_img_size=tst_img_size,
    device=device,
)

In [7]:
# Load a saved checkpoint, handing the helper the current model and optimizer
# (presumably restored in place — TODO confirm in common.py).
# NOTE(review): the file name below is a fold-1 checkpoint while `fold` is set
# to 2 in this section, and this cell's execution count is older than the
# cells around it — confirm the intended checkpoint before running.
checkpoint = common.load_model_checkpoint(
    "cp-tf_efficientnet_b7.ns_jft_in1k_sd_42_epc_1_aug_200_vl_0.3391_va_0.9400_vf1_0.9380_fold_1_folds_2.pt",
    supplies['model'],
    supplies['optimizer'],
    device,
)

In [11]:
# Epoch at which the manual training run starts. To continue from a loaded
# checkpoint, use the commented-out line instead; it is currently overridden
# so that training starts from epoch 1.
#next_epoch = checkpoint['epoch'] + 1
next_epoch = 1
next_epoch

1

In [12]:
# Train the manually selected fold from `next_epoch` through EPOCHS (the end
# bound is exclusive), with checkpoint saving and train/validation evaluation
# switched on.
common.train_with_start_end_epoch(
    seed=SEED,
    tst_img_size=tst_img_size,
    batch_size=BATCH_SIZE,
    start_epoch_inclusive=next_epoch,
    end_epoch_exclusive=EPOCHS + 1,
    augment_ratio=augment_ratio,
    trn_loader=supplies['trn_loader'],
    val_loader=supplies['val_loader'],
    model=supplies['model'],
    model_name=model_name,
    optimizer=supplies['optimizer'],
    loss_fn=supplies['loss_fn'],
    device=device,
    is_save_model_checkpoint=True,
    is_evaluate_train_valid=True,
    fold=fold,
    folds=FOLDS,
)


epoch: 1
다음 epoch 학습을 중단 예약하고 싶으면, 빈 BREAK 파일을 생성하세요.


Loss: 0.0000: 100%|██████████| 34680/34680 [2:57:46<00:00,  3.25it/s]  


train_loss: 0.1324
train_acc: 0.9558
train_f1: 0.9556



Evaluating: 100%|██████████| 174/174 [00:20<00:00,  8.51it/s]



final_valid_loss: 1.4655953593859978
final_valid_accuracy: 0.9181084198385236
final_valid_f1: 0.9177720589422292
valid's error preds count: 71/867, {7: 23, 3: 15, 2: 13, 14: 10, 11: 3, 4: 3, 5: 1, 0: 1, 6: 1, 10: 1}
Model checkpoint saved. filename: cp-tf_efficientnet_b7.ns_jft_in1k_sd_42_epc_1_aug_200_vl_1.4656_va_0.9181_vf1_0.9178_fold_2_folds_2.pt
5초 동안 잠시 sleep 합니다. 다음 epoch 학습을 중단 예약하고 싶으면, 빈 BREAK 파일을 생성하세요.
BREAK 파일이 있는지 확인합니다.

epoch: 2
다음 epoch 학습을 중단 예약하고 싶으면, 빈 BREAK 파일을 생성하세요.


Loss: 0.0000: 100%|██████████| 34680/34680 [2:57:38<00:00,  3.25it/s]  


train_loss: 0.0248
train_acc: 0.9924
train_f1: 0.9923



Evaluating: 100%|██████████| 174/174 [00:19<00:00,  8.91it/s]



final_valid_loss: 0.3922620132624839
final_valid_accuracy: 0.9400230680507498
final_valid_f1: 0.9407609870829331
valid's error preds count: 52/867, {7: 20, 3: 18, 4: 5, 11: 3, 13: 2, 14: 2, 12: 1, 10: 1}
Model checkpoint saved. filename: cp-tf_efficientnet_b7.ns_jft_in1k_sd_42_epc_2_aug_200_vl_0.3923_va_0.9400_vf1_0.9408_fold_2_folds_2.pt
5초 동안 잠시 sleep 합니다. 다음 epoch 학습을 중단 예약하고 싶으면, 빈 BREAK 파일을 생성하세요.
BREAK 파일이 있는지 확인합니다.

epoch: 3
다음 epoch 학습을 중단 예약하고 싶으면, 빈 BREAK 파일을 생성하세요.


Loss: 0.0000: 100%|██████████| 34680/34680 [2:57:44<00:00,  3.25it/s]  


train_loss: 0.0146
train_acc: 0.9956
train_f1: 0.9956



Evaluating: 100%|██████████| 174/174 [00:19<00:00,  8.95it/s]



final_valid_loss: 1.1131826662192394
final_valid_accuracy: 0.9238754325259516
final_valid_f1: 0.9243923415835792
valid's error preds count: 66/867, {7: 21, 3: 20, 13: 5, 5: 5, 8: 4, 11: 2, 4: 2, 14: 2, 9: 1, 16: 1, 12: 1, 2: 1, 10: 1}
Model checkpoint saved. filename: cp-tf_efficientnet_b7.ns_jft_in1k_sd_42_epc_3_aug_200_vl_1.1132_va_0.9239_vf1_0.9244_fold_2_folds_2.pt
5초 동안 잠시 sleep 합니다. 다음 epoch 학습을 중단 예약하고 싶으면, 빈 BREAK 파일을 생성하세요.
BREAK 파일이 있는지 확인합니다.

epoch: 4
다음 epoch 학습을 중단 예약하고 싶으면, 빈 BREAK 파일을 생성하세요.


Loss: 0.0000: 100%|██████████| 34680/34680 [2:57:40<00:00,  3.25it/s]  


train_loss: 0.0121
train_acc: 0.9963
train_f1: 0.9963



Evaluating: 100%|██████████| 174/174 [00:19<00:00,  8.90it/s]



final_valid_loss: 2.489793256409262
final_valid_accuracy: 0.9238754325259516
final_valid_f1: 0.9249927587362112
valid's error preds count: 66/867, {7: 25, 3: 12, 14: 8, 5: 4, 8: 3, 2: 3, 4: 3, 16: 2, 0: 2, 9: 1, 12: 1, 13: 1, 11: 1}
Model checkpoint saved. filename: cp-tf_efficientnet_b7.ns_jft_in1k_sd_42_epc_4_aug_200_vl_2.4898_va_0.9239_vf1_0.9250_fold_2_folds_2.pt
5초 동안 잠시 sleep 합니다. 다음 epoch 학습을 중단 예약하고 싶으면, 빈 BREAK 파일을 생성하세요.
BREAK 파일이 있는지 확인합니다.

epoch: 5
다음 epoch 학습을 중단 예약하고 싶으면, 빈 BREAK 파일을 생성하세요.


Loss: 0.0000:  22%|██▏       | 7641/34680 [39:10<2:18:36,  3.25it/s]


KeyboardInterrupt: 