In [1]:
import common

import os
import time
import random

import timm
import torch
import albumentations as A
import pandas as pd
import numpy as np
import torch.nn as nn
from albumentations.pytorch import ToTensorV2
from torch.optim import Adam
from torchvision import transforms
from torch.utils.data import Dataset, DataLoader
from PIL import Image
from tqdm import tqdm
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split

from dotenv import load_dotenv
from datetime import datetime
from zoneinfo import ZoneInfo
import wandb

In [2]:
# Project helper call; presumably logs in to Weights & Biases and initialises the run
# (the captured cell output shows the wandb login and echoes this name back) — confirm in common.py.
# NOTE(review): the name repeats the model identifier assigned to `model_name` in the config cell;
# keep the two in sync when switching models.
common.wandb_login_init('SKF_densenet121.tv_in1k')

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
wandb: Currently logged in as: kerneld82 (kerneld). Use `wandb login --relogin` to force relogin
wandb: Appending key for api.wandb.ai to your netrc file: /data/ephemeral/home/.netrc


train_time = SKF_densenet121.tv_in1k


In [3]:
# Compute device: the GPU when CUDA is available, otherwise the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Data location (relative directory).
data_path = 'datasets_fin/'

# Model name handed to the project helpers in the training cell.
# Other candidates noted by the author: 'resnet50', 'efficientnet-b0', ...
model_name = 'densenet121.tv_in1k'

# Training hyperparameters.
trn_img_size = 224      # logged below as "train_image_size"
tst_img_size = 224      # logged below as "test_image_size"
LR = 1e-3               # learning rate
EPOCHS = 20             # epochs trained per fold
FOLDS = 2               # number of folds
BATCH_SIZE = 32
num_workers = 12
augment_ratio = 200

# Record the settings above in the wandb run configuration.
wandb.config.update(
    {
        "learning_rate": LR,
        "architecture": model_name,
        "dataset": "custom-dataset",
        "epochs": EPOCHS,
        "folds": FOLDS,
        "batch_size": BATCH_SIZE,
        "train_image_size": trn_img_size,
        "test_image_size": tst_img_size,
        "num_workers": num_workers,
        "augment_ratio": augment_ratio,
    }
)

In [4]:
# Fix the random seed.
# common.set_seed is a project helper; presumably it seeds the RNGs used during training — confirm in common.py.
SEED = 42

common.set_seed(SEED)

In [5]:
# Display the selected device to confirm whether CUDA was detected.
device

device(type='cuda')

### Load Data
* 학습, 테스트 데이터셋과 로더를 정의합니다.

In [6]:
# First create the per-fold CSV files that will be used for training and validation.
# They are written up front so they can be reused later when resuming training.
common.generate_fold_train_valid_csv_files(SEED, FOLDS)

Fold 1/2, train_idx: <class 'numpy.ndarray'> 867, <class 'numpy.ndarray'> 867
Fold 2/2, train_idx: <class 'numpy.ndarray'> 867, <class 'numpy.ndarray'> 867


In [7]:
# Train every fold in turn; fold numbers are 1-based (1..FOLDS).
for fold in range(1, FOLDS + 1):
    print(f"Fold {fold}/{FOLDS}")

    # Per-fold training resources from the project helper. The result is indexed below by
    # 'trn_loader', 'val_loader', 'model', 'optimizer' and 'loss_fn'.
    supplies = common.get_supplies_for_train_and_valid_with_fold(
        seed=SEED,
        model_name=model_name,
        lr=LR,
        batch_size=BATCH_SIZE,
        num_workers=num_workers,
        fold=fold,
        folds=FOLDS,
        augment_ratio=augment_ratio,
        trn_img_size=trn_img_size,
        tst_img_size=tst_img_size,
        device=device,
    )

    # Train epochs 1..EPOCHS for this fold; the end bound is exclusive, hence EPOCHS + 1.
    common.train_with_start_end_epoch(
        seed=SEED,
        tst_img_size=tst_img_size,
        batch_size=BATCH_SIZE,
        start_epoch_inclusive=1,
        end_epoch_exclusive=EPOCHS + 1,
        augment_ratio=augment_ratio,
        trn_loader=supplies['trn_loader'],
        val_loader=supplies['val_loader'],
        model=supplies['model'],
        model_name=model_name,
        optimizer=supplies['optimizer'],
        loss_fn=supplies['loss_fn'],
        device=device,
        is_save_model_checkpoint=True,
        is_evaluate_train_valid=True,
        fold=fold,
        folds=FOLDS,
    )
    

Fold 1/2


model.safetensors:   0%|          | 0.00/32.3M [00:00<?, ?B/s]


epoch: 1
다음 epoch 학습을 중단 예약하고 싶으면, 빈 BREAK 파일을 생성하세요.


Loss: 0.0302: 100%|██████████| 5419/5419 [07:52<00:00, 11.46it/s]


train_loss: 0.2197
train_acc: 0.9231
train_f1: 0.9231



Evaluating: 100%|██████████| 28/28 [00:04<00:00,  6.39it/s]



final_valid_loss: 0.36366497473708087
final_valid_accuracy: 0.9238754325259516
final_valid_f1: 0.9193524990863016
valid's error preds count: 66/867, {3: 37, 7: 9, 4: 6, 14: 10, 13: 1, 11: 1, 6: 1, 12: 1}
Model checkpoint saved. filename: cp-densenet121.tv_in1k_sd_42_epc_1_aug_200_vl_0.3637_va_0.9239_vf1_0.9194_fold_1_folds_2.pt
5초 동안 잠시 sleep 합니다. 다음 epoch 학습을 중단 예약하고 싶으면, 빈 BREAK 파일을 생성하세요.
BREAK 파일이 있는지 확인합니다.

epoch: 2
다음 epoch 학습을 중단 예약하고 싶으면, 빈 BREAK 파일을 생성하세요.


Loss: 0.0138: 100%|██████████| 5419/5419 [07:44<00:00, 11.65it/s]


train_loss: 0.0753
train_acc: 0.9746
train_f1: 0.9747



Evaluating: 100%|██████████| 28/28 [00:03<00:00,  7.13it/s]



final_valid_loss: 0.35838045738367846
final_valid_accuracy: 0.9284890426758939
final_valid_f1: 0.9266130764302293
valid's error preds count: 62/867, {7: 24, 3: 21, 14: 8, 12: 3, 11: 3, 6: 1, 4: 1, 2: 1}
Model checkpoint saved. filename: cp-densenet121.tv_in1k_sd_42_epc_2_aug_200_vl_0.3584_va_0.9285_vf1_0.9266_fold_1_folds_2.pt
5초 동안 잠시 sleep 합니다. 다음 epoch 학습을 중단 예약하고 싶으면, 빈 BREAK 파일을 생성하세요.
BREAK 파일이 있는지 확인합니다.

epoch: 3
다음 epoch 학습을 중단 예약하고 싶으면, 빈 BREAK 파일을 생성하세요.


Loss: 0.0002: 100%|██████████| 5419/5419 [07:46<00:00, 11.62it/s]


train_loss: 0.0476
train_acc: 0.9844
train_f1: 0.9844



Evaluating: 100%|██████████| 28/28 [00:03<00:00,  7.26it/s]



final_valid_loss: 0.43309364267861383
final_valid_accuracy: 0.9273356401384083
final_valid_f1: 0.9234897569637383
valid's error preds count: 63/867, {13: 3, 3: 30, 7: 15, 5: 2, 14: 8, 10: 1, 11: 1, 6: 1, 12: 1, 4: 1}
Model checkpoint saved. filename: cp-densenet121.tv_in1k_sd_42_epc_3_aug_200_vl_0.4331_va_0.9273_vf1_0.9235_fold_1_folds_2.pt
5초 동안 잠시 sleep 합니다. 다음 epoch 학습을 중단 예약하고 싶으면, 빈 BREAK 파일을 생성하세요.
BREAK 파일이 있는지 확인합니다.

epoch: 4
다음 epoch 학습을 중단 예약하고 싶으면, 빈 BREAK 파일을 생성하세요.


Loss: 0.0002: 100%|██████████| 5419/5419 [07:44<00:00, 11.67it/s]


train_loss: 0.0347
train_acc: 0.9883
train_f1: 0.9883



Evaluating: 100%|██████████| 28/28 [00:03<00:00,  7.10it/s]



final_valid_loss: 0.39889156405375353
final_valid_accuracy: 0.9377162629757786
final_valid_f1: 0.9363379162452474
valid's error preds count: 54/867, {7: 18, 3: 21, 14: 6, 10: 1, 11: 1, 12: 2, 6: 1, 4: 4}
Model checkpoint saved. filename: cp-densenet121.tv_in1k_sd_42_epc_4_aug_200_vl_0.3989_va_0.9377_vf1_0.9363_fold_1_folds_2.pt
5초 동안 잠시 sleep 합니다. 다음 epoch 학습을 중단 예약하고 싶으면, 빈 BREAK 파일을 생성하세요.
BREAK 파일이 있는지 확인합니다.

epoch: 5
다음 epoch 학습을 중단 예약하고 싶으면, 빈 BREAK 파일을 생성하세요.


Loss: 0.0043: 100%|██████████| 5419/5419 [07:45<00:00, 11.65it/s]


train_loss: 0.0279
train_acc: 0.9906
train_f1: 0.9906



Evaluating: 100%|██████████| 28/28 [00:03<00:00,  7.21it/s]



final_valid_loss: 0.43905272560985714
final_valid_accuracy: 0.9319492502883506
final_valid_f1: 0.9302822106021034
valid's error preds count: 59/867, {13: 5, 7: 21, 3: 17, 14: 8, 12: 3, 11: 2, 6: 1, 4: 2}
Model checkpoint saved. filename: cp-densenet121.tv_in1k_sd_42_epc_5_aug_200_vl_0.4391_va_0.9319_vf1_0.9303_fold_1_folds_2.pt
5초 동안 잠시 sleep 합니다. 다음 epoch 학습을 중단 예약하고 싶으면, 빈 BREAK 파일을 생성하세요.
BREAK 파일이 있는지 확인합니다.

epoch: 6
다음 epoch 학습을 중단 예약하고 싶으면, 빈 BREAK 파일을 생성하세요.


Loss: 0.0008:  45%|████▌     | 2459/5419 [03:33<04:16, 11.54it/s]


KeyboardInterrupt: 

In [None]:
# End the wandb run.
wandb.finish()

In [None]:
# Deliberately halt execution here (e.g. during "Run All"): the cells below are meant to be run by hand.
# The exit message says, in Korean, that execution stops here because the cells below are run manually.
import sys
sys.exit('아래 셀은 수동으로 실행하기 위해서 여기서 실행 멈춤.')

SystemExit: 아래 셀은 수동으로 실행하기 위해서 여기서 실행 멈춤.

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


# 수동으로 특정 체크포인트부터 이어서 학습하기 위한 부분


In [None]:
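# NOTE(review): `model` and `optimizer` are not bound at notebook scope; the training cell keeps them in
# `supplies['model']` / `supplies['optimizer']`, so bind them before enabling this line.
# The filename below is a stale example: this notebook saves checkpoints named
# `cp-densenet121.tv_in1k_sd_42_epc_<n>_..._fold_<k>_folds_<m>.pt`.
# checkpoint = common.load_model_checkpoint("checkpoint-resnet34_seed_42_epoch_0_isFull_False.pt", model, optimizer, device)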

In [None]:
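# NOTE(review): before enabling this cell, bind trn_loader / val_loader / model / optimizer / loss_fn
# from `supplies` (they are not defined at notebook scope). The per-fold training cell also passes
# `fold` and `folds`; this call omits them — confirm whether common.train_with_start_end_epoch needs them.
# next_epoch = checkpoint['epoch'] + 1

# common.train_with_start_end_epoch(seed = checkpoint['seed'],
#                            tst_img_size = checkpoint['tst_img_size'],
#                            batch_size = checkpoint['batch_size'],
#                            start_epoch_inclusive = next_epoch, 
#                            end_epoch_exclusive = next_epoch + 2, 
#                            augment_ratio = augment_ratio,
#                            trn_loader = trn_loader,
#                            val_loader = val_loader,
#                            model = model,
#                            model_name = model_name,
#                            optimizer = optimizer,
#                            loss_fn = loss_fn,
#                            device = device,
#                            is_save_model_checkpoint = True,
#                            is_evaluate_train_valid = True)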