In [1]:
import cv2, wandb, os, random
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

import torch
import torch.nn as nn
import torchmetrics
import pytorch_lightning as pl
from torchvision import datasets, transforms
from torch.utils.data import DataLoader, Subset
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.loggers import WandbLogger
from torch.optim.lr_scheduler import CosineAnnealingLR
import multiprocessing

# GPU 2와 4만 사용하도록 환경 변수 설정
os.environ["CUDA_VISIBLE_DEVICES"]= "2,4"
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print('Device:', device)
print('Current cuda device:', torch.cuda.current_device())
print('Count of using GPUs:', torch.cuda.device_count())

# 사용 가능한 CPU 코어 수 확인
num_cpus = multiprocessing.cpu_count()
print("Available CPU cores:", num_cpus)

CFG = {
    'IMG_SIZE':28,
    'EPOCHS':10,
    'LEARNING_RATE':5e-4,
    'BATCH_SIZE':32,
    'SEED':41,
    'WORKERS': min(128, os.cpu_count()),
    'NUM_SAMPLES':32,
    'NUM_CLASSES':10,
}
CFG

  from .autonotebook import tqdm as notebook_tqdm


Device: cuda
Current cuda device: 0
Count of using GPUs: 2
Available CPU cores: 64


{'IMG_SIZE': 28,
 'EPOCHS': 10,
 'LEARNING_RATE': 0.0005,
 'BATCH_SIZE': 32,
 'SEED': 41,
 'WORKERS': 64,
 'NUM_SAMPLES': 32,
 'NUM_CLASSES': 10}

## Seed 및 Data Preprocessing 

In [2]:
# for. 실험할 때마다 동일한 시드로 난수를 생성
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

seed_everything(CFG['SEED']) # Seed 고정




class MNISTDataModule(pl.LightningDataModule):
    def __init__(self, batch_size, data_dir: str = './data'):
        super().__init__()
        self.data_dir = data_dir
        self.batch_size = batch_size

        self.transform = transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize((0.5,), (0.5,)) # 1-channel용 정규화
            #transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)) # 3-channel용 정규화
        ])        
        self.dims = (1, 28, 28)
        self.num_classes = 10
        
    def prepare_data(self):
        # download 
        datasets.MNIST(self.data_dir, train=True, download=True)
        datasets.MNIST(self.data_dir, train=False, download=True)
        
    def setup(self, stage=None):
        # Assign train/val datasets for use in dataloaders
        if stage == 'fit' or stage is None:
            self.mnist_full = datasets.MNIST(self.data_dir, train=True, transform=self.transform)
            self.train_idx, self.val_idx = train_test_split(list(range(len(self.mnist_full))), 
                                                            test_size=0.1, 
                                                            random_state=CFG['SEED'])
            self.train_subset = Subset(self.mnist_full, self.train_idx)
            self.val_subset = Subset(self.mnist_full, self.val_idx)
    
        # Assign test dataset for use in dataloader(s)
        if stage == 'test' or stage is None:
            self.mnist_test = datasets.MNIST(self.data_dir, train=False, transform=self.transform)


    def train_dataloader(self):
        return DataLoader(self.train_subset, batch_size=self.batch_size, shuffle=True, num_workers=CFG['WORKERS'])

    def val_dataloader(self):
        return DataLoader(self.val_subset, batch_size=self.batch_size, num_workers=CFG['WORKERS'])

    def test_dataloader(self):
        return DataLoader(self.mnist_test, batch_size=self.batch_size, num_workers=CFG['WORKERS'])

# Model 정의

In [3]:
class RNN(pl.LightningModule):
    def __init__(self, input_size=28, hidden_size=128, output_size=10, num_layers=1):
        super(RNN, self).__init__()
        self.save_hyperparameters()
        self.accuracy = torchmetrics.Accuracy("multiclass", num_classes=CFG['NUM_CLASSES'])
        
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.rnn = nn.RNN(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        x = x.view(x.size(0), 28, 28)
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(x.device)
        out, _ = self.rnn(x, h0)
        out = self.fc(out[:, -1, :])
        return out

    def training_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self(x)
        loss = nn.CrossEntropyLoss()(y_hat, y)
        acc = self.accuracy(y_hat, y)       
        self.log('train_loss', loss, on_step=True, on_epoch=True, logger=True)
        self.log('train_acc', acc, on_step=True, on_epoch=True, logger=True)
        return {'loss': loss, 'train_accuracy': acc}

    def validation_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self(x)
        loss = nn.CrossEntropyLoss()(y_hat, y)
        acc = self.accuracy(y_hat, y)   
        self.log('val_loss', loss, prog_bar=True)
        self.log('val_acc', acc, prog_bar=True)
        return {'val_loss': loss, 'val_accuracy': acc}

    def test_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self(x)
        loss = nn.CrossEntropyLoss()(y_hat, y)
        acc = self.accuracy(y_hat, y)   
        self.log('test_loss', loss)
        self.log('test_acc', acc)
        return loss

    def configure_optimizers(self):
        optimizer = torch.optim.AdamW(self.parameters(), lr=CFG['LEARNING_RATE'])
        scheduler = CosineAnnealingLR(optimizer, T_max=CFG['EPOCHS'], eta_min=0)
        return [optimizer], [scheduler]



# Image Prediction Callback

In [4]:
class ImagePredictionLogger(pl.Callback):
    """
    Args:
        val_samples (tuple): img와 label 튜플
        num_samples (int): W&B Dashboard에 표시할 이미지 개수
    """
    def __init__(self, val_samples, num_samples=CFG['NUM_SAMPLES']): 
        super(ImagePredictionLogger, self).__init__()
        self.num_samples = num_samples
        self.val_imgs, self.val_labels = val_samples
        
    def on_validation_epoch_end(self, trainer, pl_module):
        # Bring the tensors to CPU
        val_imgs = self.val_imgs.to(device=pl_module.device)
        val_labels = self.val_labels.to(device=pl_module.device)
        # Get model prediction
        logits = pl_module(val_imgs)
        preds = torch.argmax(logits, -1)
        # Log the images as wandb Image
        trainer.logger.experiment.log({
            "examples":[wandb.Image(x, caption=f"Pred:{pred}, Label:{y}") 
                           for x, pred, y in zip(val_imgs[:self.num_samples], 
                                                 preds[:self.num_samples], 
                                                 val_labels[:self.num_samples])]
            })
        
        
dm = MNISTDataModule(batch_size=CFG['BATCH_SIZE'])
dm.prepare_data() # MNIST 데이터 다운로드
dm.setup()

val_samples = next(iter(dm.val_dataloader()))
val_imgs, val_labels = val_samples[0], val_samples[1]
val_imgs.shape, val_labels.shape   

(torch.Size([32, 1, 28, 28]), torch.Size([32]))

In [5]:
model = RNN()
wandb.login(key='eed81e1c0a41dd8dd67a4ca90cea1be5a06d4eb0')
wandb_logger = WandbLogger(project='MNIST', entity='hcim', name='RNN')

checkpoint_callback = ModelCheckpoint(
    monitor='val_loss',  
    dirpath='./model/',  # 모델 저장 경로
    filename='best-model-{epoch:02d}-{val_acc:.2f}',
    save_top_k=1,  # 최고 성능의 모델 1개만 저장
    mode='max',  # 'max'는 정확도를 최대화 할 때 사용
)


trainer = pl.Trainer(
    max_epochs=CFG['EPOCHS'],
    logger=wandb_logger,
    callbacks=[checkpoint_callback,
               ImagePredictionLogger(val_samples)],
    accelerator='gpu',
    #devices=[1]
)

trainer.fit(model, dm)

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mchan4im[0m ([33mcargo_transport[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
Trainer will use only 1 of 2 GPUs because it is running inside an interactive / notebook environment. You may try to set `Trainer(devices=2)` but please note that multi-GPU inside interactive / notebook environments is considered experimental and unstable. Your mileage may vary.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
You are using a CUDA device ('NVIDIA RTX A6000') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision f

/usr/local/lib/python3.8/dist-packages/pytorch_lightning/callbacks/model_checkpoint.py:653: Checkpoint directory /root/model exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [2,4]

  | Name     | Type               | Params
------------------------------------------------
0 | accuracy | MulticlassAccuracy | 0     
1 | rnn      | RNN                | 20.2 K
2 | fc       | Linear             | 1.3 K 
------------------------------------------------
21.5 K    Trainable params
0         Non-trainable params
21.5 K    Total params
0.086     Total estimated model params size (MB)


Epoch 9: 100%|██████████| 1688/1688 [00:27<00:00, 62.43it/s, v_num=dy30, val_loss=0.110, val_acc=0.968]

`Trainer.fit` stopped: `max_epochs=10` reached.


Epoch 9: 100%|██████████| 1688/1688 [00:27<00:00, 62.42it/s, v_num=dy30, val_loss=0.110, val_acc=0.968]


In [7]:
best_model_path = checkpoint_callback.best_model_path
print("Best model path:", best_model_path)  # 경로 확인

#best_model = RNN.load_from_checkpoint(best_model_path)
trainer.test(model, dm.test_dataloader())
wandb.finish()

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [2,4]


Best model path: /root/model/best-model-epoch=00-val_acc=0.88.ckpt
Testing DataLoader 0: 100%|██████████| 313/313 [00:03<00:00, 100.96it/s]


0,1
epoch,▁▁▁▁▂▂▂▂▃▃▃▃▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▆▆▆▆▇▇▇▇████
test_acc,▁
test_loss,▁
train_acc_epoch,▁▆▇▇▇▇████
train_acc_step,▁▂▅▆▆██▇▇▇▆█▇█▇█▇▆▇▇██▆▇██████▇█████████
train_loss_epoch,█▃▃▂▂▂▁▁▁▁
train_loss_step,█▇▄▃▅▂▂▂▃▃▃▂▄▁▂▂▂▃▂▂▁▂▂▂▁▁▁▁▁▂▂▁▁▂▁▁▁▁▁▁
trainer/global_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
val_acc,▁▅▅▆▆▇▇███
val_loss,█▄▅▃▃▂▂▁▁▁

0,1
epoch,10.0
test_acc,0.9677
test_loss,0.11056
train_acc_epoch,0.97524
train_acc_step,1.0
train_loss_epoch,0.08759
train_loss_step,0.02482
trainer/global_step,16880.0
val_acc,0.96833
val_loss,0.10953
