In [58]:
import time
import datetime
import os

import matplotlib.pyplot as plt

import torch
import torch.nn as nn
from torchvision import datasets, transforms
from torch.utils.data import DataLoader, random_split
from torch import optim as optim
import torch.distributed as dist

from timm.scheduler.cosine_lr import CosineLRScheduler
from timm.utils import accuracy, AverageMeter

from utils import NativeScalerWithGradNormCount, reduce_tensor
from models.swin_transformer import SwinTransformer
# import train_val_func as tvfunc

In [59]:
# 데이터 전처리 및 변환 정의
transform = transforms.Compose([
    transforms.RandomHorizontalFlip(),
    transforms.RandomCrop(32, padding=4),
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
])

In [60]:
train_dataset = datasets.CIFAR10(root='./data', train=True, download=True, transform=transform)
test_dataset = datasets.CIFAR10(root='./data', train=False, download=True, transform=transform)

# 학습 및 검증 데이터셋으로 분할
train_length = int(0.8 * len(train_dataset))
val_length = len(train_dataset) - train_length
train_subset, val_subset = random_split(train_dataset, [train_length, val_length])

Files already downloaded and verified
Files already downloaded and verified


In [61]:
batch_size = 64
epochs = 100
lr = 1e-3

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print("Device check:", device)

Device check: cuda


In [62]:
train_iterator = DataLoader(train_subset, batch_size=batch_size, shuffle=True)
val_iterator = DataLoader(val_subset, batch_size=batch_size, shuffle=True)
test_iterator = DataLoader(test_dataset, batch_size=batch_size, shuffle=True)

In [63]:
model = SwinTransformer(img_size=32,
                        patch_size=2,
                        in_chans=3,
                        num_classes=10,
                        embed_dim=64, # 잠재 공간의 차원
                        depths=[2, 2, 2, 2], # 각 스테이지에서의 블록 수
                        num_heads=[2, 4, 8, 16], # 각 스테이지에서의 어텐션 헤드 수
                        window_size=4, # 로컬 윈도우 크기
                        mlp_ratio=3, # MLP 레이어의 확장 비율
                        qkv_bias=True, # default, 자가 어텐션 메커니즘의 선형 변환에 편향(bias) 항 추가 여부
                        qk_scale=None, # default, Query (Q)와 Key (K) 행렬의 내적(dot product)을 정규화하는 데 사용되는 스케일링 계수
                        drop_rate=0.1, # 드롭아웃 비율
                        drop_path_rate=0.1, # 드롭 경로 비율
                        ape=False, # default, absolute position embedding이 patch embedding에 추가
                        norm_layer=nn.LayerNorm, # default
                        patch_norm=True, # default
                        use_checkpoint=False, # defult
                        fused_window_process=False # deafult
                        )

In [64]:
model.to(device)

optimizer = optim.AdamW(model.parameters(), eps=1e-8, betas=(0.9, 0.999), lr=lr, weight_decay=1e-2) # 개발자 설정 weight_decay=0.05
criterion = nn.CrossEntropyLoss()
loss_scaler = NativeScalerWithGradNormCount()

In [65]:
num_steps = int(epochs * len(train_iterator))
warmup_steps = int(3 * len(train_iterator))

lr_scheduler = CosineLRScheduler(
            optimizer,
            t_initial=(num_steps - warmup_steps) if True else num_steps, # config.TRAIN.LR_SCHEDULER.WARMUP_PREFIX
            cycle_mul=1.,
            lr_min=1e-5, # config.TRAIN.MIN_LR
            warmup_lr_init=1e-4, # config.TRAIN.WARMUP_LR
            warmup_t=warmup_steps,
            cycle_limit=1,
            t_in_epochs=False,
            warmup_prefix=True, # TRAIN.LR_SCHEDULER.WARMUP_PREFIX
        )

In [66]:
output = './model_save'

def save_checkpoint(epoch, model, max_accuracy, optimizer, lr_scheduler, loss_scaler):
    save_state = {'model': model.state_dict(),
                  'optimizer': optimizer.state_dict(),
                  'lr_scheduler': lr_scheduler.state_dict(),
                  'max_accuracy': max_accuracy,
                  'scaler': loss_scaler.state_dict(),
                  'epoch': epoch,
                  }

    save_path = os.path.join(output, f'ckpt_epoch_{epoch}.pth')
    print(f"{save_path} saving......")
    torch.save(save_state, save_path)
    print(f"{save_path} saved !!!")

In [67]:
def train_one_epoch(epochs, model, criterion, data_loader, optimizer, epoch, lr_scheduler, loss_scaler): # mixedup_fn 제외
    model.train()
    optimizer.zero_grad()

    num_steps = len(data_loader)
    batch_time = AverageMeter()
    loss_meter = AverageMeter()
    norm_meter = AverageMeter()
    scaler_meter = AverageMeter()

    start = time.time()
    end = time.time()
    for idx, (samples, targets) in enumerate(data_loader):
        samples = samples.cuda(non_blocking=True)
        targets = targets.cuda(non_blocking=True)

        with torch.cuda.amp.autocast(enabled=True): # Enable Pytorch automatic mixed precision (amp)
            outputs = model(samples)
        loss = criterion(outputs, targets)
        loss = loss / 1 # config.TRAIN.ACCUMULATION_STEPS - could be overwritten by command line argument

        # this attribute is added by timm on one optimizer (adahessian)
        is_second_order = hasattr(optimizer, 'is_second_order') and optimizer.is_second_order
        grad_norm = loss_scaler(loss, optimizer, clip_grad=5.0, # gradient 크기 제한
                                parameters=model.parameters(), create_graph=is_second_order,
                                update_grad=(idx + 1) % 1 == 0)
        if (idx + 1) % 1 == 0:
            optimizer.zero_grad()
            lr_scheduler.step_update((epoch * num_steps + idx) // 1)
        loss_scale_value = loss_scaler.state_dict()["scale"]

        torch.cuda.synchronize()

        loss_meter.update(loss.item(), targets.size(0))
        if grad_norm is not None:  # loss_scaler return None if not update
            norm_meter.update(grad_norm)
        scaler_meter.update(loss_scale_value)
        batch_time.update(time.time() - end)
        end = time.time()

        if idx % 10 == 0:
            lr = optimizer.param_groups[0]['lr']
            wd = optimizer.param_groups[0]['weight_decay']
            memory_used = torch.cuda.max_memory_allocated() / (1024.0 * 1024.0)
            etas = batch_time.avg * (num_steps - idx)
            print(
                f'Train: [{epoch}/{epochs}][{idx}/{num_steps}]\t'
                f'eta {datetime.timedelta(seconds=int(etas))} lr {lr:.6f}\t wd {wd:.4f}\t'
                f'time {batch_time.val:.4f} ({batch_time.avg:.4f})\t'
                f'loss {loss_meter.val:.4f} ({loss_meter.avg:.4f})\t'
                f'grad_norm {norm_meter.val:.4f} ({norm_meter.avg:.4f})\t'
                f'loss_scale {scaler_meter.val:.4f} ({scaler_meter.avg:.4f})\t'
                f'mem {memory_used:.0f}MB')
    epoch_time = time.time() - start
    print(f"EPOCH {epoch} training takes {datetime.timedelta(seconds=int(epoch_time))}")

In [69]:
@torch.no_grad()
def validate(data_loader, model):
    criterion = torch.nn.CrossEntropyLoss()
    model.eval()

    batch_time = AverageMeter()
    loss_meter = AverageMeter()
    acc1_meter = AverageMeter()
    acc5_meter = AverageMeter()

    end = time.time()
    for idx, (images, target) in enumerate(data_loader):
        images = images.cuda(non_blocking=True)
        target = target.cuda(non_blocking=True)

        # compute output
        with torch.cuda.amp.autocast(enabled=True): # config.AMP_ENABLE
            output = model(images)

        # measure accuracy and record loss
        loss = criterion(output, target)
        acc1, acc5 = accuracy(output, target, topk=(1, 5))

        # acc1 = reduce_tensor(acc1)
        # acc5 = reduce_tensor(acc5)
        # loss = reduce_tensor(loss)

        loss_meter.update(loss.item(), target.size(0))
        acc1_meter.update(acc1.item(), target.size(0))
        acc5_meter.update(acc5.item(), target.size(0))

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        if idx % 10 == 0: # config.PRINT_FREQ
            memory_used = torch.cuda.max_memory_allocated() / (1024.0 * 1024.0)
            print(
                f'Test: [{idx}/{len(data_loader)}]\t'
                f'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                f'Loss {loss_meter.val:.4f} ({loss_meter.avg:.4f})\t'
                f'Acc@1 {acc1_meter.val:.3f} ({acc1_meter.avg:.3f})\t'
                f'Acc@5 {acc5_meter.val:.3f} ({acc5_meter.avg:.3f})\t'
                f'Mem {memory_used:.0f}MB')
    print(f' * Acc@1 {acc1_meter.avg:.3f} Acc@5 {acc5_meter.avg:.3f}')
    return acc1_meter.avg, acc5_meter.avg, loss_meter.avg

### Train

In [70]:
max_accuracy = 0.0

print("Start training")
start_time = time.time()

for epoch in range(0, epochs):
    # train_iterator.sampler.set_epoch(epoch) # 분산 학습 시에만!

    train_one_epoch(epochs, model, criterion, train_iterator, optimizer, epoch, lr_scheduler, loss_scaler)
    
    if epoch % 1 == 0 or epoch == (epochs - 1): # config.SAVE_FREQ / dist.get_rank() == 0 and (epoch % 1 == 0 or epoch == (epochs - 1))
        save_checkpoint(epoch, model, max_accuracy, optimizer, lr_scheduler, loss_scaler)

        acc1, acc5, loss = validate(val_iterator, model)
        print(f"Accuracy of the network on the {len(val_subset)} test images: {acc1:.1f}%")
        max_accuracy = max(max_accuracy, acc1)
        print(f'Max accuracy: {max_accuracy:.2f}%')

total_time = time.time() - start_time
total_time_str = str(datetime.timedelta(seconds=int(total_time)))
print('Training time {}'.format(total_time_str))

Start training
Train: [0/100][0/625]	eta 0:00:30 lr 0.000100	 wd 0.0100	time 0.0491 (0.0491)	loss 2.2734 (2.2734)	grad_norm nan (nan)	loss_scale 32768.0000 (32768.0000)	mem 370MB
Train: [0/100][10/625]	eta 0:00:23 lr 0.000105	 wd 0.0100	time 0.0392 (0.0380)	loss 2.0879 (2.2360)	grad_norm 4.1512 (nan)	loss_scale 32768.0000 (32768.0000)	mem 455MB
Train: [0/100][20/625]	eta 0:00:22 lr 0.000110	 wd 0.0100	time 0.0374 (0.0372)	loss 2.1484 (2.2181)	grad_norm 6.4588 (nan)	loss_scale 32768.0000 (32768.0000)	mem 455MB
Train: [0/100][30/625]	eta 0:00:22 lr 0.000114	 wd 0.0100	time 0.0337 (0.0372)	loss 2.2031 (2.1926)	grad_norm 4.6669 (nan)	loss_scale 32768.0000 (32768.0000)	mem 455MB
Train: [0/100][40/625]	eta 0:00:21 lr 0.000119	 wd 0.0100	time 0.0327 (0.0371)	loss 2.1211 (2.1562)	grad_norm 5.0497 (nan)	loss_scale 32768.0000 (32768.0000)	mem 455MB
Train: [0/100][50/625]	eta 0:00:21 lr 0.000124	 wd 0.0100	time 0.0392 (0.0371)	loss 2.1152 (2.1419)	grad_norm 6.1506 (nan)	loss_scale 32768.0000 (327