In [8]:
import cv2, wandb, os, random
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

import torch
import torch.nn as nn
import torch.nn.functional as F
import torchmetrics
import pytorch_lightning as pl
from torchvision import datasets, transforms
from torch.utils.data import DataLoader, Subset
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.loggers import WandbLogger
from torch.optim.lr_scheduler import CosineAnnealingLR
import multiprocessing

# GPU 2와 4만 사용하도록 환경 변수 설정
os.environ["CUDA_VISIBLE_DEVICES"]= "2,4"
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print('Device:', device)
print('Current cuda device:', torch.cuda.current_device())
print('Count of using GPUs:', torch.cuda.device_count())

# 사용 가능한 CPU 코어 수 확인
num_cpus = multiprocessing.cpu_count()
print("Available CPU cores:", num_cpus)

CFG = {
    'IMG_SIZE':28,
    'EPOCHS':5,
    'SEED':41,
    'WORKERS': min(128, os.cpu_count()),
    'NUM_SAMPLES':32,
    'NUM_CLASSES':10,
    'LR':1e-3
}
CFG

Device: cuda
Current cuda device: 0
Count of using GPUs: 2
Available CPU cores: 64


{'IMG_SIZE': 28,
 'EPOCHS': 5,
 'SEED': 41,
 'WORKERS': 64,
 'NUM_SAMPLES': 32,
 'NUM_CLASSES': 10,
 'LR': 0.001}

## Seed 및 Data Preprocessing 

In [9]:
# for. 실험할 때마다 동일한 시드로 난수를 생성
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

seed_everything(CFG['SEED']) # Seed 고정




class MNISTDataModule(pl.LightningDataModule):
    def __init__(self, batch_size, data_dir: str = './data'):
        super().__init__()
        self.data_dir = data_dir
        self.batch_size = batch_size

        self.transform = transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize((0.5,), (0.5,)) # 1-channel용 정규화
            #transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)) # 3-channel용 정규화
        ])        
        self.dims = (1, 28, 28)
        self.num_classes = 10
        
    def prepare_data(self):
        # download 
        datasets.MNIST(self.data_dir, train=True, download=True)
        datasets.MNIST(self.data_dir, train=False, download=True)
        
    def setup(self, stage=None):
        # Assign train/val datasets for use in dataloaders
        if stage == 'fit' or stage is None:
            self.mnist_full = datasets.MNIST(self.data_dir, train=True, transform=self.transform)
            self.train_idx, self.val_idx = train_test_split(list(range(len(self.mnist_full))), 
                                                            test_size=0.1, 
                                                            random_state=CFG['SEED'])
            self.train_subset = Subset(self.mnist_full, self.train_idx)
            self.val_subset = Subset(self.mnist_full, self.val_idx)
    
        # Assign test dataset for use in dataloader(s)
        if stage == 'test' or stage is None:
            self.mnist_test = datasets.MNIST(self.data_dir, train=False, transform=self.transform)


    def train_dataloader(self):
        return DataLoader(self.train_subset, batch_size=self.batch_size, shuffle=True, num_workers=CFG['WORKERS'])

    def val_dataloader(self):
        return DataLoader(self.val_subset, batch_size=self.batch_size, num_workers=CFG['WORKERS'])

    def test_dataloader(self):
        return DataLoader(self.mnist_test, batch_size=self.batch_size, num_workers=CFG['WORKERS'])

# Model 정의

In [10]:
class MLP(pl.LightningModule):
    def __init__(self, n_classes=10, n_layer_1=128, n_layer_2=256, lr=CFG['LR'],):
        super(MLP, self).__init__()
        self.layer_1 = nn.Linear(28 * 28, n_layer_1)
        self.layer_2 = nn.Linear(n_layer_1, n_layer_2)
        self.layer_3 = nn.Linear(n_layer_2, n_classes)
        
        self.save_hyperparameters()
        self.lr = lr
        self.accuracy = torchmetrics.Accuracy("multiclass", num_classes=CFG['NUM_CLASSES'])

    def forward(self, x):
        batch_size, channels, width, height = x.size() # (b, 1, 28, 28) -> (b, 1*28*28)
        x = x.view(batch_size, -1)
        x = self.layer_1(x)
        x = F.relu(x)
        x = self.layer_2(x)
        x = F.relu(x)
        x = self.layer_3(x)

        x = F.log_softmax(x, dim=1)
        return x

    def training_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self(x)
        loss = nn.CrossEntropyLoss()(y_hat, y)
        acc = self.accuracy(y_hat, y)       
        self.log('train_loss', loss, on_step=True, on_epoch=True, logger=True)
        self.log('train_acc', acc, on_step=True, on_epoch=True, logger=True)
        return {'loss': loss, 'train_accuracy': acc}

    def validation_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self(x)
        loss = nn.CrossEntropyLoss()(y_hat, y)
        acc = self.accuracy(y_hat, y)   
        self.log('val_loss', loss, prog_bar=True)
        self.log('val_acc', acc, prog_bar=True)
        return {'val_loss': loss, 'val_accuracy': acc}

    def test_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self(x)
        loss = nn.CrossEntropyLoss()(y_hat, y)
        acc = self.accuracy(y_hat, y)   
        self.log('test_loss', loss)
        self.log('test_acc', acc)
        return loss

    def configure_optimizers(self):
        optimizer = torch.optim.AdamW(self.parameters(), lr=CFG['LR'])
        scheduler = CosineAnnealingLR(optimizer, T_max=CFG['EPOCHS'], eta_min=0)
        return [optimizer], [scheduler]



# Image Prediction Callback

In [11]:
class ImagePredictionLogger(pl.Callback):
    """
    Args:
        val_samples (tuple): img와 label 튜플
        num_samples (int): W&B Dashboard에 표시할 이미지 개수
    """
    def __init__(self, val_samples, num_samples=CFG['NUM_SAMPLES']): 
        super(ImagePredictionLogger, self).__init__()
        self.num_samples = num_samples
        self.val_imgs, self.val_labels = val_samples
        
    def on_validation_epoch_end(self, trainer, pl_module):
        # Bring the tensors to CPU
        val_imgs = self.val_imgs.to(device=pl_module.device)
        val_labels = self.val_labels.to(device=pl_module.device)
        # Get model prediction
        logits = pl_module(val_imgs)
        preds = torch.argmax(logits, -1)
        # Log the images as wandb Image
        trainer.logger.experiment.log({
            "examples":[wandb.Image(x, caption=f"Pred:{pred}, Label:{y}") 
                           for x, pred, y in zip(val_imgs[:self.num_samples], 
                                                 preds[:self.num_samples], 
                                                 val_labels[:self.num_samples])]
            })

# 🧹Sweep!

In [12]:
sweep_config = {
  "method": "random",   # Random search
  "metric": {           # want to maximize val_acc
      "name": "valid_loss",
      "goal": "minimize"
  },
  "parameters": {
        "n_layer_1": {
            # Choose from pre-defined values
            "values": [32, 64, 128, 256, 512]
        },
        "n_layer_2": {
            # Choose from pre-defined values
            "values": [32, 64, 128, 256, 512, 1024]
        },
        "lr": {
            # log uniform distribution between exp(min) and exp(max)
            "distribution": "log_uniform",
            "min": -9.21,   # exp(-9.21) = 1e-4
            "max": -4.61    # exp(-4.61) = 1e-2
        },
        'batch_size': {
            # integers between 32 and 256
            # with evenly-distributed logarithms 
            'distribution': 'q_log_uniform_values',
            'q': 8,
            'min': 16,
            'max': 64,
      }
    }
}

sweep_id = wandb.sweep(sweep_config, project="MNIST")


def sweep_iteration():
    with wandb.init() as run:
    # set up W&B logger
        wandb.login(key='eed81e1c0a41dd8dd67a4ca90cea1be5a06d4eb0')
        wandb_logger = WandbLogger(project='MNIST', entity='hcim', name='MLP', log_model='all')
        
        checkpoint_callback = ModelCheckpoint(
            monitor='val_loss',  
            dirpath='./model/',  # 모델 저장 경로
            filename='best-model-{epoch:02d}-{val_acc:.2f}',
            save_top_k=1,  # 최고 성능의 모델 1개만 저장
            mode='min',  # 'min'는 va_loss를 최소화 할 때 사용
        )
        

        # setup model - note how we refer to sweep parameters with wandb.config
        model = MLP(
            n_layer_1=wandb.config.n_layer_1,
            n_layer_2=wandb.config.n_layer_2,
            lr=wandb.config.lr
        )
        
        dm = MNISTDataModule(batch_size=wandb.config.batch_size)
        dm.prepare_data() # MNIST 데이터 다운로드
        dm.setup()

        val_samples = next(iter(dm.val_dataloader()))
        val_imgs, val_labels = val_samples[0], val_samples[1]
        val_imgs.shape, val_labels.shape   

        trainer = pl.Trainer(
            max_epochs=CFG['EPOCHS'],
            logger=wandb_logger,
            callbacks=[checkpoint_callback,
                    ImagePredictionLogger(val_samples)],
            devices=[0]
        )
        # train
        trainer.fit(model, dm)
        trainer.test(model, dm.test_dataloader())
        
        # Log the best model as a W&B artifact
        artifact = wandb.Artifact('model-artifact', type='model')
        artifact.add_file(checkpoint_callback.best_model_path)
        run.log_artifact(artifact)



Create sweep with ID: 4djk259o
Sweep URL: https://wandb.ai/cargo_transport/MNIST/sweeps/4djk259o


In [13]:
sweep_id = wandb.sweep(sweep_config, project="MNIST")
wandb.agent(sweep_id, function=sweep_iteration)



Create sweep with ID: d0w4w88k
Sweep URL: https://wandb.ai/cargo_transport/MNIST/sweeps/d0w4w88k


[34m[1mwandb[0m: Agent Starting Run: j19w57qj with config:
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	lr: 0.0006030322466231752
[34m[1mwandb[0m: 	n_layer_1: 128
[34m[1mwandb[0m: 	n_layer_2: 256
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
/usr/local/lib/python3.8/dist-packages/pytorch_lightning/loggers/wandb.py:391: There is a wandb run already in progress and newly created instances of `WandbLogger` will reuse this run. If this is not desired, call `wandb.finish()` before instantiating `WandbLogger`.
/usr/local/lib/python3.8/dist-packages/pytorch_lightning/callbacks/model_checkpoint.py:653: Checkpoint directory /model exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [2,4]

  | Name     | Type               | Params
------------------------------------------------
0 | layer_1  | Linear             | 100 K 
1 | layer_2  | Linear             | 33.0 K
2 | layer_3  | Linear             | 2.6 K 
3 | accuracy | MulticlassAccuracy | 0     
------------------------------------------------
136 K     Trainable params
0         Non-trainable params
136 K     Total par

Epoch 4: 100%|██████████| 3375/3375 [00:41<00:00, 82.05it/s, v_num=57qj, val_loss=0.0843, val_acc=0.974]

`Trainer.fit` stopped: `max_epochs=5` reached.


Epoch 4: 100%|██████████| 3375/3375 [00:41<00:00, 81.51it/s, v_num=57qj, val_loss=0.0843, val_acc=0.974]


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [2,4]


Testing DataLoader 0: 100%|██████████| 625/625 [00:04<00:00, 134.68it/s]


0,1
epoch,▁▁▁▁▁▁▁▁▂▂▂▂▂▂▂▂▄▄▄▄▄▄▄▄▅▅▅▅▅▅▅▅▇▇▇▇▇▇▇█
test_acc,▁
test_loss,▁
train_acc_epoch,▁▅▆▇█
train_acc_step,▁▆█▆█▅▆█▆█▆▆▆▅▃▆▆█▆█▆▆██▆██▃██████████▆█
train_loss_epoch,█▄▃▂▁
train_loss_step,█▃▂▃▁▄▅▂▂▁▂▂▃▄▅▃▂▁▂▂▂▄▁▁▃▁▁▄▁▁▂▁▁▁▁▁▁▁▂▁
trainer/global_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
val_acc,▁▃▅██
val_loss,█▆▄▂▁

0,1
epoch,5.0
test_acc,0.9763
test_loss,0.07506
train_acc_epoch,0.98791
train_acc_step,1.0
train_loss_epoch,0.03935
train_loss_step,0.0049
trainer/global_step,16875.0
val_acc,0.97433
val_loss,0.08433


[34m[1mwandb[0m: Agent Starting Run: omr3ez8f with config:
[34m[1mwandb[0m: 	batch_size: 24
[34m[1mwandb[0m: 	lr: 0.00039294972351128904
[34m[1mwandb[0m: 	n_layer_1: 512
[34m[1mwandb[0m: 	n_layer_2: 32
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
/usr/local/lib/python3.8/dist-packages/pytorch_lightning/loggers/wandb.py:391: There is a wandb run already in progress and newly created instances of `WandbLogger` will reuse this run. If this is not desired, call `wandb.finish()` before instantiating `WandbLogger`.
/usr/local/lib/python3.8/dist-packages/pytorch_lightning/callbacks/model_checkpoint.py:653: Checkpoint directory /model exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [2,4]

  | Name     | Type               | Params
------------------------------------------------
0 | layer_1  | Linear             | 401 K 
1 | layer_2  | Linear             | 16.4 K
2 | layer_3  | Linear             | 330   
3 | accuracy | MulticlassAccuracy | 0     
------------------------------------------------
418 K     Trainable params
0         Non-trainable params
418 K     Total par

Epoch 4: 100%|██████████| 2250/2250 [00:29<00:00, 75.76it/s, v_num=ez8f, val_loss=0.0828, val_acc=0.975]

`Trainer.fit` stopped: `max_epochs=5` reached.


Epoch 4: 100%|██████████| 2250/2250 [00:30<00:00, 74.79it/s, v_num=ez8f, val_loss=0.0828, val_acc=0.975]


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [2,4]


Testing DataLoader 0: 100%|██████████| 417/417 [00:03<00:00, 129.04it/s]


0,1
epoch,▁▁▁▁▁▁▁▁▂▂▂▂▂▂▂▂▄▄▄▄▄▄▄▄▅▅▅▅▅▅▅▅▇▇▇▇▇▇▇█
test_acc,▁
test_loss,▁
train_acc_epoch,▁▅▇▇█
train_acc_step,▁▅▅▇▁▅▇▇▇▅▇█▇█▄▇██▇▇█▇▇▇███▇█▇▅█▇█▇▇███▇
train_loss_epoch,█▄▂▂▁
train_loss_step,█▄▃▂▇▅▃▃▂▅▃▁▄▂▄▄▁▁▂▂▁▂▄▂▁▁▁▂▁▃▂▁▂▁▂▂▁▁▁▂
trainer/global_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇████
val_acc,▁▄▆▇█
val_loss,█▅▃▂▁

0,1
epoch,5.0
test_acc,0.9786
test_loss,0.07183
train_acc_epoch,0.9867
train_acc_step,1.0
train_loss_epoch,0.04452
train_loss_step,0.003
trainer/global_step,11250.0
val_acc,0.97533
val_loss,0.08281


[34m[1mwandb[0m: Agent Starting Run: g0coymle with config:
[34m[1mwandb[0m: 	batch_size: 56
[34m[1mwandb[0m: 	lr: 0.0029153204335138063
[34m[1mwandb[0m: 	n_layer_1: 64
[34m[1mwandb[0m: 	n_layer_2: 64
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
/usr/local/lib/python3.8/dist-packages/pytorch_lightning/loggers/wandb.py:391: There is a wandb run already in progress and newly created instances of `WandbLogger` will reuse this run. If this is not desired, call `wandb.finish()` before instantiating `WandbLogger`.
/usr/local/lib/python3.8/dist-packages/pytorch_lightning/callbacks/model_checkpoint.py:653: Checkpoint directory /model exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [2,4]

  | Name     | Type               | Params
------------------------------------------------
0 | layer_1  | Linear             | 50.2 K
1 | layer_2  | Linear             | 4.2 K 
2 | layer_3  | Linear             | 650   
3 | accuracy | MulticlassAccuracy | 0     
------------------------------------------------
55.1 K    Trainable params
0         Non-trainable params
55.1 K    Total par

Epoch 4: 100%|██████████| 965/965 [00:15<00:00, 62.58it/s, v_num=ymle, val_loss=0.145, val_acc=0.959]

`Trainer.fit` stopped: `max_epochs=5` reached.


Epoch 4: 100%|██████████| 965/965 [00:15<00:00, 61.57it/s, v_num=ymle, val_loss=0.145, val_acc=0.959]


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [2,4]


Testing DataLoader 0: 100%|██████████| 179/179 [00:01<00:00, 120.94it/s]


0,1
epoch,▁▁▁▁▁▁▁▁▂▂▂▂▂▂▂▂▄▄▄▄▄▄▄▄▅▅▅▅▅▅▅▅▇▇▇▇▇▇▇█
test_acc,▁
test_loss,▁
train_acc_epoch,▁▅▇██
train_acc_step,▁▂▅▅▆▇▆▆▅▇▇▇▆▅▆▇▇▆▆▆▆▇▆▇▇▆▇▇▆▆▇▇▆▆▇▇▆▇█▇
train_loss_epoch,█▃▂▁▁
train_loss_step,█▅▃▂▃▁▂▂▃▂▂▂▂▃▂▂▂▂▃▂▂▁▂▁▂▂▂▃▂▂▂▁▂▂▁▁▂▁▁▁
trainer/global_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇████
val_acc,▁▄▆▇█
val_loss,█▆▃▂▁

0,1
epoch,5.0
test_acc,0.9617
test_loss,0.12794
train_acc_epoch,0.96585
train_acc_step,0.91071
train_loss_epoch,0.11336
train_loss_step,0.259
trainer/global_step,4825.0
val_acc,0.95933
val_loss,0.1452


[34m[1mwandb[0m: Agent Starting Run: se6qgw5x with config:
[34m[1mwandb[0m: 	batch_size: 24
[34m[1mwandb[0m: 	lr: 0.009186443884718235
[34m[1mwandb[0m: 	n_layer_1: 512
[34m[1mwandb[0m: 	n_layer_2: 256
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
/usr/local/lib/python3.8/dist-packages/pytorch_lightning/loggers/wandb.py:391: There is a wandb run already in progress and newly created instances of `WandbLogger` will reuse this run. If this is not desired, call `wandb.finish()` before instantiating `WandbLogger`.
/usr/local/lib/python3.8/dist-packages/pytorch_lightning/callbacks/model_checkpoint.py:653: Checkpoint directory /model exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [2,4]

  | Name     | Type               | Params
------------------------------------------------
0 | layer_1  | Linear             | 401 K 
1 | layer_2  | Linear             | 131 K 
2 | layer_3  | Linear             | 2.6 K 
3 | accuracy | MulticlassAccuracy | 0     
------------------------------------------------
535 K     Trainable params
0         Non-trainable params
535 K     Total par

Epoch 4: 100%|██████████| 2250/2250 [00:29<00:00, 76.18it/s, v_num=gw5x, val_loss=0.0798, val_acc=0.979]

`Trainer.fit` stopped: `max_epochs=5` reached.


Epoch 4: 100%|██████████| 2250/2250 [00:29<00:00, 75.48it/s, v_num=gw5x, val_loss=0.0798, val_acc=0.979]


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [2,4]


Testing DataLoader 0: 100%|██████████| 417/417 [00:03<00:00, 130.88it/s]


0,1
epoch,▁▁▁▁▁▁▁▁▂▂▂▂▂▂▂▂▄▄▄▄▄▄▄▄▅▅▅▅▅▅▅▅▇▇▇▇▇▇▇█
test_acc,▁
test_loss,▁
train_acc_epoch,▁▅▆▇█
train_acc_step,▅▁█▅▆██▆██▆█▅▆██▅█▆▅██▆███████▅█████████
train_loss_epoch,█▄▃▂▁
train_loss_step,▄█▂▄▂▁▂▂▂▁▃▂▂▂▁▁▄▁▅▂▂▁▃▁▁▁▁▁▁▁▂▁▁▁▁▁▁▁▁▁
trainer/global_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇████
val_acc,▁▃▇▆█
val_loss,█▆▂▃▁

0,1
epoch,5.0
test_acc,0.98
test_loss,0.06551
train_acc_epoch,0.99004
train_acc_step,1.0
train_loss_epoch,0.03201
train_loss_step,0.0147
trainer/global_step,11250.0
val_acc,0.97867
val_loss,0.07985


[34m[1mwandb[0m: Agent Starting Run: q9oudyzi with config:
[34m[1mwandb[0m: 	batch_size: 40
[34m[1mwandb[0m: 	lr: 0.00012410187032061465
[34m[1mwandb[0m: 	n_layer_1: 256
[34m[1mwandb[0m: 	n_layer_2: 256
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
/usr/local/lib/python3.8/dist-packages/pytorch_lightning/loggers/wandb.py:391: There is a wandb run already in progress and newly created instances of `WandbLogger` will reuse this run. If this is not desired, call `wandb.finish()` before instantiating `WandbLogger`.
/usr/local/lib/python3.8/dist-packages/pytorch_lightning/callbacks/model_checkpoint.py:653: Checkpoint directory /model exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [2,4]

  | Name     | Type               | Params
------------------------------------------------
0 | layer_1  | Linear             | 200 K 
1 | layer_2  | Linear             | 65.8 K
2 | layer_3  | Linear             | 2.6 K 
3 | accuracy | MulticlassAccuracy | 0     
------------------------------------------------
269 K     Trainable params
0         Non-trainable params
269 K     Total par

Epoch 4: 100%|██████████| 1350/1350 [00:19<00:00, 68.67it/s, v_num=dyzi, val_loss=0.0843, val_acc=0.977]

`Trainer.fit` stopped: `max_epochs=5` reached.


Epoch 4: 100%|██████████| 1350/1350 [00:19<00:00, 67.81it/s, v_num=dyzi, val_loss=0.0843, val_acc=0.977]


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [2,4]


Testing DataLoader 0: 100%|██████████| 250/250 [00:01<00:00, 126.95it/s]


0,1
epoch,▁▁▁▁▁▁▁▁▂▂▂▂▂▂▂▂▄▄▄▄▄▄▄▄▅▅▅▅▅▅▅▅▇▇▇▇▇▇▇█
test_acc,▁
test_loss,▁
train_acc_epoch,▁▅▇▇█
train_acc_step,▃▁▃▇▇▆▅▇▆▂▅▆▅▄█▇▇█▇▄▆▇██▇▇███▆▇███▆█▆███
train_loss_epoch,█▄▂▂▁
train_loss_step,▆█▆▂▃▃▄▂▃▇▃▂▅▅▂▂▂▁▂▄▃▂▁▁▃▂▁▁▂▂▂▁▁▁▃▁▂▁▁▁
trainer/global_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇████
val_acc,▁▄▇▇█
val_loss,█▅▃▂▁

0,1
epoch,5.0
test_acc,0.9775
test_loss,0.07223
train_acc_epoch,0.98817
train_acc_step,0.975
train_loss_epoch,0.03904
train_loss_step,0.03474
trainer/global_step,6750.0
val_acc,0.97683
val_loss,0.08434


[34m[1mwandb[0m: Agent Starting Run: 3y7b15mo with config:
[34m[1mwandb[0m: 	batch_size: 24
[34m[1mwandb[0m: 	lr: 0.0010815730280771796
[34m[1mwandb[0m: 	n_layer_1: 64
[34m[1mwandb[0m: 	n_layer_2: 32
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
/usr/local/lib/python3.8/dist-packages/pytorch_lightning/loggers/wandb.py:391: There is a wandb run already in progress and newly created instances of `WandbLogger` will reuse this run. If this is not desired, call `wandb.finish()` before instantiating `WandbLogger`.
/usr/local/lib/python3.8/dist-packages/pytorch_lightning/callbacks/model_checkpoint.py:653: Checkpoint directory /model exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [2,4]

  | Name     | Type               | Params
------------------------------------------------
0 | layer_1  | Linear             | 50.2 K
1 | layer_2  | Linear             | 2.1 K 
2 | layer_3  | Linear             | 330   
3 | accuracy | MulticlassAccuracy | 0     
------------------------------------------------
52.6 K    Trainable params
0         Non-trainable params
52.6 K    Total par

Epoch 4: 100%|██████████| 2250/2250 [00:30<00:00, 74.96it/s, v_num=15mo, val_loss=0.146, val_acc=0.958]

`Trainer.fit` stopped: `max_epochs=5` reached.


Epoch 4: 100%|██████████| 2250/2250 [00:30<00:00, 74.18it/s, v_num=15mo, val_loss=0.146, val_acc=0.958]


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [2,4]


Testing DataLoader 0: 100%|██████████| 417/417 [00:03<00:00, 128.37it/s]


0,1
epoch,▁▁▁▁▁▁▁▁▂▂▂▂▂▂▂▂▄▄▄▄▄▄▄▄▅▅▅▅▅▅▅▅▇▇▇▇▇▇▇█
test_acc,▁
test_loss,▁
train_acc_epoch,▁▅▇▇█
train_acc_step,▁▂▅▄▅▃▅▅▇▇▅▆▇▇▇▆██▇▇██▇▆██████▇▇██▇▇▇▆██
train_loss_epoch,█▄▂▁▁
train_loss_step,█▆▄▄▅▅▃▄▃▂▃▃▂▂▂▃▁▁▁▂▁▁▂▂▁▁▁▁▁▁▁▂▁▁▁▂▂▂▁▁
trainer/global_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇████
val_acc,▁▄▇▇█
val_loss,█▆▃▂▁

0,1
epoch,5.0
test_acc,0.9638
test_loss,0.12221
train_acc_epoch,0.96583
train_acc_step,1.0
train_loss_epoch,0.11207
train_loss_step,0.05011
trainer/global_step,11250.0
val_acc,0.95783
val_loss,0.14603


[34m[1mwandb[0m: Agent Starting Run: rcc141gd with config:
[34m[1mwandb[0m: 	batch_size: 48
[34m[1mwandb[0m: 	lr: 0.0002507580771176963
[34m[1mwandb[0m: 	n_layer_1: 256
[34m[1mwandb[0m: 	n_layer_2: 1024
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
/usr/local/lib/python3.8/dist-packages/pytorch_lightning/loggers/wandb.py:391: There is a wandb run already in progress and newly created instances of `WandbLogger` will reuse this run. If this is not desired, call `wandb.finish()` before instantiating `WandbLogger`.
/usr/local/lib/python3.8/dist-packages/pytorch_lightning/callbacks/model_checkpoint.py:653: Checkpoint directory /model exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [2,4]

  | Name     | Type               | Params
------------------------------------------------
0 | layer_1  | Linear             | 200 K 
1 | layer_2  | Linear             | 263 K 
2 | layer_3  | Linear             | 10.2 K
3 | accuracy | MulticlassAccuracy | 0     
------------------------------------------------
474 K     Trainable params
0         Non-trainable params
474 K     Total par

Epoch 4: 100%|██████████| 1125/1125 [00:17<00:00, 63.50it/s, v_num=41gd, val_loss=0.0752, val_acc=0.978]

`Trainer.fit` stopped: `max_epochs=5` reached.


Epoch 4: 100%|██████████| 1125/1125 [00:17<00:00, 62.57it/s, v_num=41gd, val_loss=0.0752, val_acc=0.978]


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [2,4]


Testing DataLoader 0: 100%|██████████| 209/209 [00:01<00:00, 122.95it/s]


0,1
epoch,▁▁▁▁▁▁▁▁▂▂▂▂▂▂▂▂▄▄▄▄▄▄▄▄▅▅▅▅▅▅▅▅▇▇▇▇▇▇▇█
test_acc,▁
test_loss,▁
train_acc_epoch,▁▅▆▇█
train_acc_step,▁▂▄▆▆▅▆▆▅▆▇▆█▇▅▇▇▇▇▆▇█▆█▇▇█▆█▇▆████▇▆▇▇▇
train_loss_epoch,█▄▃▂▁
train_loss_step,█▇▅▃▄▃▄▄▄▂▁▆▁▁▅▂▃▂▂▃▁▁▂▂▁▁▁▃▁▃▂▁▁▁▁▁▂▂▁▂
trainer/global_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇████
val_acc,▁▅▆▇█
val_loss,█▅▃▂▁

0,1
epoch,5.0
test_acc,0.9802
test_loss,0.06151
train_acc_epoch,0.99159
train_acc_step,0.97917
train_loss_epoch,0.02821
train_loss_step,0.03953
trainer/global_step,5625.0
val_acc,0.97833
val_loss,0.0752


[34m[1mwandb[0m: Agent Starting Run: tk4zwa0s with config:
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	lr: 0.000498420571637257
[34m[1mwandb[0m: 	n_layer_1: 64
[34m[1mwandb[0m: 	n_layer_2: 64
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
/usr/local/lib/python3.8/dist-packages/pytorch_lightning/loggers/wandb.py:391: There is a wandb run already in progress and newly created instances of `WandbLogger` will reuse this run. If this is not desired, call `wandb.finish()` before instantiating `WandbLogger`.
/usr/local/lib/python3.8/dist-packages/pytorch_lightning/callbacks/model_checkpoint.py:653: Checkpoint directory /model exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [2,4]

  | Name     | Type               | Params
------------------------------------------------
0 | layer_1  | Linear             | 50.2 K
1 | layer_2  | Linear             | 4.2 K 
2 | layer_3  | Linear             | 650   
3 | accuracy | MulticlassAccuracy | 0     
------------------------------------------------
55.1 K    Trainable params
0         Non-trainable params
55.1 K    Total par

Epoch 4: 100%|██████████| 3375/3375 [00:42<00:00, 78.83it/s, v_num=wa0s, val_loss=0.118, val_acc=0.967]

`Trainer.fit` stopped: `max_epochs=5` reached.


Epoch 4: 100%|██████████| 3375/3375 [00:43<00:00, 78.39it/s, v_num=wa0s, val_loss=0.118, val_acc=0.967]


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [2,4]


Testing DataLoader 0: 100%|██████████| 625/625 [00:04<00:00, 138.24it/s]


0,1
epoch,▁▁▁▁▁▁▁▁▂▂▂▂▂▂▂▂▄▄▄▄▄▄▄▄▅▅▅▅▅▅▅▅▇▇▇▇▇▇▇█
test_acc,▁
test_loss,▁
train_acc_epoch,▁▅▇▇█
train_acc_step,▂▅▆█▅▁▇█▇▆▇▆▇▆█▇▇██▇▆██▇█▇█▇██▇█▇███▇▇█▇
train_loss_epoch,█▄▂▁▁
train_loss_step,▆▄▃▁▆█▂▁▃▂▂▅▂▂▂▂▂▁▁▂▃▁▁▄▁▂▁▂▂▁▄▁▂▁▁▁▄▃▁▂
trainer/global_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
val_acc,▁▄▅▇█
val_loss,█▅▄▂▁

0,1
epoch,5.0
test_acc,0.9691
test_loss,0.10006
train_acc_epoch,0.97694
train_acc_step,1.0
train_loss_epoch,0.07574
train_loss_step,0.04065
trainer/global_step,16875.0
val_acc,0.96717
val_loss,0.11762


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: ka8sui1s with config:
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	lr: 0.00331067028577384
[34m[1mwandb[0m: 	n_layer_1: 256
[34m[1mwandb[0m: 	n_layer_2: 512
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
/usr/local/lib/python3.8/dist-packages/pytorch_lightning/loggers/wandb.py:391: There is a wandb run already in progress and newly created instances of `WandbLogger` will reuse this run. If this is not desired, call `wandb.finish()` before instantiating `WandbLogger`.
/usr/local/lib/python3.8/dist-packages/pytorch_lightning/callbacks/model_checkpoint.py:653: Checkpoint directory /model exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [2,4]

  | Name     | Type               | Params
------------------------------------------------
0 | layer_1  | Linear             | 200 K 
1 | layer_2  | Linear             | 131 K 
2 | layer_3  | Linear             | 5.1 K 
3 | accuracy | MulticlassAccuracy | 0     
------------------------------------------------
337 K     Trainable params
0         Non-trainable params
337 K     Total par

Epoch 4: 100%|██████████| 3375/3375 [00:42<00:00, 79.62it/s, v_num=ui1s, val_loss=0.0811, val_acc=0.976]

`Trainer.fit` stopped: `max_epochs=5` reached.


Epoch 4: 100%|██████████| 3375/3375 [00:42<00:00, 79.12it/s, v_num=ui1s, val_loss=0.0811, val_acc=0.976]


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [2,4]


Testing DataLoader 0: 100%|██████████| 625/625 [00:04<00:00, 129.91it/s]


0,1
epoch,▁▁▁▁▁▁▁▁▂▂▂▂▂▂▂▂▄▄▄▄▄▄▄▄▅▅▅▅▅▅▅▅▇▇▇▇▇▇▇█
test_acc,▁
test_loss,▁
train_acc_epoch,▁▅▆▇█
train_acc_step,▁▆▁█▆▃██▆█▃▆▆▆█▁██████████████▆█████████
train_loss_epoch,█▄▃▂▁
train_loss_step,▆▂█▂▃▃▂▁▄▂▃▂▂▄▁▇▁▁▁▂▁▁▁▁▁▁▁▂▁▁▂▁▁▁▁▁▁▁▁▁
trainer/global_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
val_acc,▁▅▅██
val_loss,█▄▄▁▁

0,1
epoch,5.0
test_acc,0.9792
test_loss,0.07195
train_acc_epoch,0.99046
train_acc_step,1.0
train_loss_epoch,0.03058
train_loss_step,0.00596
trainer/global_step,16875.0
val_acc,0.97633
val_loss,0.08107


[34m[1mwandb[0m: Agent Starting Run: wokc8hiq with config:
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	lr: 0.0010287041276271936
[34m[1mwandb[0m: 	n_layer_1: 256
[34m[1mwandb[0m: 	n_layer_2: 32
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
/usr/local/lib/python3.8/dist-packages/pytorch_lightning/loggers/wandb.py:391: There is a wandb run already in progress and newly created instances of `WandbLogger` will reuse this run. If this is not desired, call `wandb.finish()` before instantiating `WandbLogger`.
/usr/local/lib/python3.8/dist-packages/pytorch_lightning/callbacks/model_checkpoint.py:653: Checkpoint directory /model exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [2,4]

  | Name     | Type               | Params
------------------------------------------------
0 | layer_1  | Linear             | 200 K 
1 | layer_2  | Linear             | 8.2 K 
2 | layer_3  | Linear             | 330   
3 | accuracy | MulticlassAccuracy | 0     
------------------------------------------------
209 K     Trainable params
0         Non-trainable params
209 K     Total par

Epoch 0:  68%|██████▊   | 1149/1688 [00:12<00:05, 94.49it/s, v_num=8hiq]   

[34m[1mwandb[0m: Ctrl + C detected. Stopping sweep.


Traceback (most recent call last):
  File "/tmp/ipykernel_1711725/1270799488.py", line 74, in sweep_iteration
    trainer.fit(model, dm)
  File "/usr/local/lib/python3.8/dist-packages/pytorch_lightning/trainer/trainer.py", line 544, in fit
    call._call_and_handle_interrupt(
  File "/usr/local/lib/python3.8/dist-packages/pytorch_lightning/trainer/call.py", line 44, in _call_and_handle_interrupt
    return trainer_fn(*args, **kwargs)
  File "/usr/local/lib/python3.8/dist-packages/pytorch_lightning/trainer/trainer.py", line 580, in _fit_impl
    self._run(model, ckpt_path=ckpt_path)
  File "/usr/local/lib/python3.8/dist-packages/pytorch_lightning/trainer/trainer.py", line 987, in _run
    results = self._run_stage()
  File "/usr/local/lib/python3.8/dist-packages/pytorch_lightning/trainer/trainer.py", line 1033, in _run_stage
    self.fit_loop.run()
  File "/usr/local/lib/python3.8/dist-packages/pytorch_lightning/loops/fit_loop.py", line 205, in run
    self.advance()
  File "/usr/local/

![Artifacts/.png](image.png)

In [15]:
import wandb
run = wandb.init()
artifact = run.use_artifact('cargo_transport/MNIST/model-rcc141gd:v4', type='model')
artifact_dir = artifact.download()

[34m[1mwandb[0m:   1 of 1 files downloaded.  
