In [22]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms

import plotly.graph_objects as go

import random
import pandas as pd
import numpy as np

!pip install torchinfo
from torchinfo import summary



In [23]:
# Random Seed 고정 (학습 반복 시행 시에도 동일한 결과가 나오도록)

seed = 20250301

random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)

**1. 데이터셋 로딩 및 데이터 분석**

In [24]:
# 데이터셋 로딩

transform = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.5,), (0.5,))])

train_dataset = torchvision.datasets.MNIST(root='./data',
                                           train=True,
                                           transform=transform,
                                           download=True)

test_dataset = torchvision.datasets.MNIST(root='./data',
                                          train=False,
                                          transform=transform,
                                          download=True)


In [25]:
# 시간 절약을 위해, 학습 데이터에서 랜덤하게 일부 샘플만 추출

from torch.utils.data import Subset, DataLoader

NUM_TRAIN_SAMPLES = 10000
BATCH_SIZE = 32

subset_indices = random.sample(range(len(train_dataset)), NUM_TRAIN_SAMPLES)
train_subset = Subset(train_dataset, subset_indices)

train_loader = DataLoader(train_subset,
                          batch_size=BATCH_SIZE,
                          shuffle=True)

# 테스트 데이터셋은 학습 대상이 아니므로 그대로 이용
test_loader = DataLoader(test_dataset,
                         batch_size=BATCH_SIZE,
                         shuffle=False)

In [26]:
# 클래스 불균형 분석

# 학습 데이터
train_labels = torch.tensor([train_subset.dataset.targets[i] for i in subset_indices])
train_class_counts = torch.bincount(train_labels)
print(train_class_counts)

NUM_CLASSES = len(train_class_counts)

tensor([ 949, 1161,  976, 1066,  974,  895, 1010, 1036,  971,  962])


In [27]:
train_class_percentage = np.array(train_class_counts) * 100.0 / sum(train_class_counts)

train_y_distrib = pd.DataFrame({'class': list(range(NUM_CLASSES)),
                                'count': train_class_counts,
                                'percentage (%)': train_class_percentage})

train_y_distrib

Unnamed: 0,class,count,percentage (%)
0,0,949,9.49
1,1,1161,11.61
2,2,976,9.76
3,3,1066,10.66
4,4,974,9.74
5,5,895,8.95
6,6,1010,10.1
7,7,1036,10.36
8,8,971,9.71
9,9,962,9.62


In [28]:
# 테스트 데이터
test_labels = test_loader.dataset.targets
test_class_counts = torch.bincount(test_labels)
print(test_class_counts)

tensor([ 980, 1135, 1032, 1010,  982,  892,  958, 1028,  974, 1009])


In [29]:
test_class_percentage = np.array(test_class_counts) * 100.0 / sum(test_class_counts)

test_y_distrib = pd.DataFrame({'class': list(range(NUM_CLASSES)),
                               'count': test_class_counts,
                               'percentage (%)': test_class_percentage})

test_y_distrib

Unnamed: 0,class,count,percentage (%)
0,0,980,9.8
1,1,1135,11.35
2,2,1032,10.32
3,3,1010,10.1
4,4,982,9.82
5,5,892,8.92
6,6,958,9.58
7,7,1028,10.28
8,8,974,9.74
9,9,1009,10.09


**2. CNN 모델 정의**

In [30]:
# CNN 모델 정의

class CNN(nn.Module):

    # args:
    # - conv_activation : 모든 Conv. Layer 에 적용할 Activation Function
    # - fc_activation   : Fully-Connected Layer 에 적용할 Activation Function

    def __init__(self, conv_activation, fc_activation):
        super(CNN, self).__init__()
        self.conv_activation = conv_activation
        self.fc_activation = fc_activation

        # Create Conv. and Linear layer
        def create_layer(main_layer, activation):

            if activation == 'sigmoid':
                activation_layer = nn.Sigmoid()

            elif activation == 'relu':
                activation_layer = nn.ReLU()

            elif activation == 'leaky_relu_0.01':
                activation_layer = nn.LeakyReLU(negative_slope=0.01)

            elif activation == 'leaky_relu_0.1':
                activation_layer = nn.LeakyReLU(negative_slope=0.1)

            elif activation == 'prelu':
                activation_layer = nn.PReLU()

            elif activation == 'elu':
                activation_layer = nn.ELU(alpha=1.0)

            elif activation == 'tanh':
                activation_layer = nn.Tanh()

            elif activation == 'silu' or activation == 'swish':
                activation_layer = nn.SiLU()

            return nn.Sequential(
                main_layer,
                activation_layer
            )

        # Conv
        self.conv1 = create_layer(
            main_layer=nn.Conv2d(1, 32, kernel_size=3, padding=1),
            activation=self.conv_activation
        )
        self.pool1 = nn.MaxPool2d(2, 2)
        self.conv2 = create_layer(
            main_layer=nn.Conv2d(32, 64, kernel_size=3),
            activation=self.conv_activation
        )
        self.pool2 = nn.MaxPool2d(2, 2)
        self.conv3 = create_layer(
            main_layer=nn.Conv2d(64, 64, kernel_size=3),
            activation=self.conv_activation
        )

        # Fully Connected
        self.fc1 = create_layer(
            main_layer=nn.Linear(64 * 4 * 4, 64),
            activation=self.fc_activation
        )
        self.fc_final = nn.Sequential(
            nn.Linear(64, 10),
            nn.Softmax()  # Classification Task 의 Output Layer 이므로 Softmax 고정
        )

    def forward(self, x):

        # Conv
        x = self.conv1(x)
        x = self.pool1(x)
        x = self.conv2(x)
        x = self.pool2(x)
        x = self.conv3(x)

        x = x.view(-1, 64 * 4 * 4)

        # Fully Connected
        x = self.fc1(x)
        x = self.fc_final(x)

        return x

In [31]:
# 모델 구조 출력

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = CNN(conv_activation='relu', fc_activation='sigmoid').to(device)

print(summary(model, input_size=(BATCH_SIZE, 1, 28, 28)))

Layer (type:depth-idx)                   Output Shape              Param #
CNN                                      [32, 10]                  --
├─Sequential: 1-1                        [32, 32, 28, 28]          --
│    └─Conv2d: 2-1                       [32, 32, 28, 28]          320
│    └─ReLU: 2-2                         [32, 32, 28, 28]          --
├─MaxPool2d: 1-2                         [32, 32, 14, 14]          --
├─Sequential: 1-3                        [32, 64, 12, 12]          --
│    └─Conv2d: 2-3                       [32, 64, 12, 12]          18,496
│    └─ReLU: 2-4                         [32, 64, 12, 12]          --
├─MaxPool2d: 1-4                         [32, 64, 6, 6]            --
├─Sequential: 1-5                        [32, 64, 4, 4]            --
│    └─Conv2d: 2-5                       [32, 64, 4, 4]            36,928
│    └─ReLU: 2-6                         [32, 64, 4, 4]            --
├─Sequential: 1-6                        [32, 64]                  --
│    └

  return inner()


**3. 데이터셋 분리**

* Train Data -> Train Data + Valid Data (epoch) + Valid Data (하이퍼파라미터 최적화)

In [32]:
# 데이터셋 분리

from torch.utils.data import random_split

# 샘플 수
num_train = 3000
num_valid_epoch = 2000
num_valid_hpo = 5000

assert NUM_TRAIN_SAMPLES == num_train + num_valid_epoch + num_valid_hpo

# 데이터셋 분리
train_dataset, valid_epoch_dataset, valid_hpo_dataset =\
    random_split(train_subset, [num_train, num_valid_epoch, num_valid_hpo])

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
valid_epoch_loader = DataLoader(valid_epoch_dataset, batch_size=BATCH_SIZE, shuffle=False)
valid_hpo_loader = DataLoader(valid_hpo_dataset, batch_size=BATCH_SIZE, shuffle=False)

**4. 하이퍼파라미터 최적화 학습 실시 함수**

* 하이퍼파라미터 최적화 라이브러리는 Optuna 사용
* 하이퍼파라미터 탐색 120 회 실시
* 하이퍼파라미터 목록
  * Conv1, Conv2, Conv3 Layer 각각의 Dropout Rate
  * 첫번째 Fully Connected Layer 의 Dropout Rate


In [33]:
MAX_EPOCHS = 65536
EARLY_STOPPING_ROUNDS = 5  # Early Stopping Patience (epochs)
TRIAL_COUNT = 120          # HPO trial count

In [34]:
from sklearn.metrics import accuracy_score
from copy import deepcopy

In [35]:
# Optuna 설정

!pip install optuna
import optuna
import logging

optuna.logging.set_verbosity(logging.WARNING)



In [36]:
# 모델 학습 실시

# args :
# - model           : 학습할 모델
# - train_loader    : Training Data Loader
# - train_loss_list : 각 epoch 에서의 train loss 기록

# returns :
# - train_loss : 모델의 Train Loss

def run_train(model, train_loader, train_loss_list):
    model.train()
    train_loss = 0.0
    cnt = 0

    for images, labels in train_loader:
        images, labels = images.to(device), labels.to(device)

        # train 실시
        model.optimizer.zero_grad()
        outputs = model(images)

        loss = nn.CrossEntropyLoss()(outputs, labels)
        loss.backward()
        model.optimizer.step()

        train_loss += loss.item()
        cnt += 1

    train_loss_list.append(train_loss / len(train_loader))
    return train_loss_list[-1]

In [37]:
# 모델 validation 실시

# args :
# - model        : validation 할 모델
# - valid_loader : Validation Data Loader

# returns :
# - accuracy : 모델의 validation 정확도

def run_validation(model, valid_loader):
    model.eval()
    correct, total = 0, 0

    with torch.no_grad():
        for images, labels in valid_loader:
            images, labels = images.to(device), labels.to(device)
            outputs = model(images)

            # validation 실시 및 정확도 측정
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    accuracy = correct / total
    return accuracy

In [38]:
# 모델 학습 및 validation 전체 프로세스

# args :
# - model              : 학습할 모델
# - train_loader       : Training Data Loader
# - valid_epoch_loader : 각 epoch 마다 validation 할 Valid Data Loader
# - valid_hpo_loader   : 최종적으로 해당 하이퍼파라미터 조합에 대한 Valid Data Loader
# - verbose            : 학습 중 프로세스 출력 여부

# returns :
# - final_acc        : 해당 하이퍼파라미터 조합에 대한 최종 Accuracy (valid_acc 이 가장 높았던 epoch 의 모델로 측정)
# - best_epoch_model : valid_acc 이 가장 높았던 epoch 에서 생성된 모델

def run_model_common(model, train_loader, valid_epoch_loader, valid_hpo_loader,
                     verbose=False):

    train_loss_list = []       # train loss
    valid_acc_list = []        # valid accuracy
    max_valid_acc = 0.0        # max validation accuracy
    best_valid_acc_epoch = -1  # valid_acc 이 가장 높았던 epoch
    best_epoch_model = None    # valid_acc 이 가장 높았던 epoch 의 모델

    # 1. 학습 실시
    for epoch in range(MAX_EPOCHS):

        # 1-1. train model
        train_loss = run_train(model, train_loader, train_loss_list)

        # 1-2. validate model (with EPOCH VALID SET)
        epoch_acc = run_validation(model, valid_epoch_loader)
        valid_acc_list.append(epoch_acc)

        # 1-3. Early Stopping 처리 (overfitting 방지)
        if epoch_acc > max_valid_acc:
            max_valid_acc = epoch_acc
            best_valid_acc_epoch = epoch

            best_epoch_model = CNN(conv_activation=model.conv_activation,
                                   fc_activation=model.fc_activation).to(device)

            best_epoch_model.load_state_dict(model.state_dict())

            if verbose:
                print('best model updated')

        if epoch - best_valid_acc_epoch >= EARLY_STOPPING_ROUNDS:
            break

        # 1-4. 결과 출력
        if verbose:
            print(f"Epoch {epoch+1}, Loss: {train_loss:.4f}, Accuracy: {epoch_acc:.4f}")

    # check best-epoch model correctly loaded
    checked_acc = run_validation(best_epoch_model, valid_epoch_loader)

    if verbose:
        print(f"Best Epoch: {best_valid_acc_epoch}, Best Valid Acc: {max_valid_acc}")
        print(f"Valid Acc (with Epoch valid set) on Loaded Best Model: {checked_acc}")

    assert abs(max_valid_acc - checked_acc) < 1e-8

    # 2. validate best-epoch model (with HPO VALID SET)
    final_acc = run_validation(best_epoch_model, valid_hpo_loader)

    if verbose:
        print(f"Final Acc (with HPO valid set) on Loaded Best Model: {final_acc}")

    return final_acc, best_epoch_model

In [39]:
print(device)

cuda


**4-1. 실험 실시**

In [40]:
hpo_best_acc = 0              # 모든 Hyper-param 조합의 HPO Valid set 정확도 중 가장 높은 것
best_hyperparam_set = None    # HPO Valid set 정확도가 가장 높은 Hyper-param 조합
best_hyperparam_model = None  # best_hyperparam_set 의 Hyper-param 조합으로 학습된 모델

In [41]:
verbose_at_first_trial = True  # 1st trial 에만 학습 중 정보 출력

def objective(trial):
    global hpo_best_acc, best_hyperparam_set, best_hyperparam_model, verbose_at_first_trial

    # hyper-params
    activation_list = ['sigmoid', 'relu', 'leaky_relu_0.01', 'leaky_relu_0.1',
                       'prelu', 'elu', 'tanh', 'silu']
    params = {
        'conv_activation': trial.suggest_categorical('conv_activation', activation_list),
        'fc_activation': trial.suggest_categorical('fc_activation', activation_list),
        'learning_rate': trial.suggest_float('learning_rate', 0.0005, 0.01, log=True)
    }

    # define and run model
    model = CNN(conv_activation=params['conv_activation'],
                fc_activation=params['fc_activation']).to(device)

    model.optimizer = torch.optim.AdamW(model.parameters(),
                                        lr=params['learning_rate'])

    final_acc, best_epoch_model = run_model_common(model,
                                                   train_loader,
                                                   valid_epoch_loader,
                                                   valid_hpo_loader,
                                                   verbose=verbose_at_first_trial)

    verbose_at_first_trial = False

    # global best model 갱신
    if final_acc > hpo_best_acc:
        hpo_best_acc = final_acc
        best_hyperparam_set = params

        best_hyperparam_model = CNN(conv_activation=best_hyperparam_set['conv_activation'],
                                    fc_activation=best_hyperparam_set['fc_activation']).to(device)
        best_hyperparam_model.load_state_dict(best_epoch_model.state_dict())

        print(f'best_hyperparam_model updated with Accuracy={hpo_best_acc:.4f}')

    print(f"Params: {params}, Accuracy: {final_acc:.4f}")
    return final_acc

In [42]:
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=TRIAL_COUNT)

  return self._call_impl(*args, **kwargs)


best model updated
Epoch 1, Loss: 1.9567, Accuracy: 0.5945
best model updated
Epoch 2, Loss: 1.7678, Accuracy: 0.7340
best model updated
Epoch 3, Loss: 1.7196, Accuracy: 0.7470
Epoch 4, Loss: 1.6991, Accuracy: 0.7405
best model updated
Epoch 5, Loss: 1.7122, Accuracy: 0.7595
best model updated
Epoch 6, Loss: 1.6982, Accuracy: 0.7655
Epoch 7, Loss: 1.6868, Accuracy: 0.7215
Epoch 8, Loss: 1.6930, Accuracy: 0.7570
Epoch 9, Loss: 1.6968, Accuracy: 0.7600
Epoch 10, Loss: 1.6994, Accuracy: 0.7575
Best Epoch: 5, Best Valid Acc: 0.7655
Valid Acc (with Epoch valid set) on Loaded Best Model: 0.7655
Final Acc (with HPO valid set) on Loaded Best Model: 0.7626
best_hyperparam_model updated with Accuracy=0.7626
Params: {'conv_activation': 'elu', 'fc_activation': 'relu', 'learning_rate': 0.0020161198602182983}, Accuracy: 0.7626
best_hyperparam_model updated with Accuracy=0.9762
Params: {'conv_activation': 'elu', 'fc_activation': 'sigmoid', 'learning_rate': 0.0024462886019396427}, Accuracy: 0.9762
Par

In [43]:
# Test Dataset 성능 평가

print(f'best hyper-param: {best_hyperparam_set}, best acc: {hpo_best_acc}')

best hyper-param: {'conv_activation': 'prelu', 'fc_activation': 'tanh', 'learning_rate': 0.001987984615656252}, best acc: 0.9794


In [44]:
# best_hyperparam_model 이 정상적으로 load 되었는지 최종 확인

checked_hpo_acc = run_validation(best_hyperparam_model, valid_hpo_loader)
print(f"Valid Acc (with HPO valid set) on Best Hyper-param Model: {checked_hpo_acc}")

assert abs(hpo_best_acc - checked_hpo_acc) < 1e-8

Valid Acc (with HPO valid set) on Best Hyper-param Model: 0.9794


In [45]:
# 테스트셋에 대한 최종 정확도

hpo_final_acc = run_validation(best_hyperparam_model, test_loader)

print(f'Final HPO Acc (with test set) : {hpo_final_acc}')

Final HPO Acc (with test set) : 0.9802


**5. HPO 성능 결과 확인**

In [46]:
from optuna.visualization import plot_optimization_history

In [47]:
# HPO 추이

fig = plot_optimization_history(study)
fig.update_layout(width=1100,
                  height=700,
                  yaxis_title='Accuracy (HPO valid set)')
fig.show()

In [50]:
fig.update_layout(width=1100,
                  height=700,
                  yaxis_title='Accuracy (HPO valid set)',
                  yaxis=dict(range=[0.8, 0.983]))
fig.show()

**6. 각 Hyperparameter 값에 따른 성능 분포 확인**

In [51]:
# trial DataFrame 가져오기

trials_df = study.trials_dataframe()

In [52]:
trials_df

Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_conv_activation,params_fc_activation,params_learning_rate,state
0,0,0.7626,2025-03-01 13:22:41.097689,2025-03-01 13:23:05.998403,0 days 00:00:24.900714,elu,relu,0.002016,COMPLETE
1,1,0.9762,2025-03-01 13:23:05.998542,2025-03-01 13:23:38.847111,0 days 00:00:32.848569,elu,sigmoid,0.002446,COMPLETE
2,2,0.9724,2025-03-01 13:23:38.847267,2025-03-01 13:24:07.534923,0 days 00:00:28.687656,leaky_relu_0.1,sigmoid,0.001002,COMPLETE
3,3,0.9352,2025-03-01 13:24:07.535071,2025-03-01 13:24:22.161363,0 days 00:00:14.626292,prelu,relu,0.002156,COMPLETE
4,4,0.9678,2025-03-01 13:24:22.161500,2025-03-01 13:24:46.943706,0 days 00:00:24.782206,leaky_relu_0.01,elu,0.000889,COMPLETE
...,...,...,...,...,...,...,...,...,...
115,115,0.9760,2025-03-01 14:11:55.081448,2025-03-01 14:12:44.104692,0 days 00:00:49.023244,prelu,tanh,0.001626,COMPLETE
116,116,0.9760,2025-03-01 14:12:44.104866,2025-03-01 14:13:15.365209,0 days 00:00:31.260343,elu,sigmoid,0.001379,COMPLETE
117,117,0.9724,2025-03-01 14:13:15.365451,2025-03-01 14:13:43.628091,0 days 00:00:28.262640,relu,tanh,0.001869,COMPLETE
118,118,0.9784,2025-03-01 14:13:43.628286,2025-03-01 14:14:18.218560,0 days 00:00:34.590274,prelu,tanh,0.002756,COMPLETE


In [56]:
# Conv. Layer 및 Fully-Connected Layer 의 Activation Function 에 따른 정확도

import plotly.express as px

count_data = trials_df.groupby(by=['params_conv_activation',
                                   'params_fc_activation'], as_index=False)['value'].max()

fig = px.bar(count_data,
             x='params_conv_activation', y='value',
             color='params_fc_activation',
             barmode='group',
             title='Max Accuracy by Conv. and Fully-Connected Layer Activation Function')

fig.update_layout(width=1200, height=550,
                  yaxis=dict(range=[0.8, 1]),
                  yaxis_title='Accuracy')
fig.show()

In [57]:
# Learning Rate 에 따른 정확도
# Coloring = Conv. Layer Activation Function Type

fig = px.scatter(trials_df,
                 x="params_learning_rate", y="value",
                 color="params_conv_activation",
                 title="Accuracy by Learning Rate")

fig.update_layout(width=800, height=500,
                  xaxis_title='learning_rate',
                  yaxis_title='accuracy')
fig.show()

In [61]:
fig.update_layout(xaxis=dict(range=[0, 0.006]),
                  yaxis=dict(range=[0.6, 1.0]))
fig.show()

In [62]:
fig.update_layout(xaxis=dict(range=[0, 0.006]),
                  yaxis=dict(range=[0.9, 0.98]))
fig.show()

In [63]:
# Learning Rate 에 따른 정확도
# Coloring = Fully-Connected Layer Activation Function Type

fig = px.scatter(trials_df,
                 x="params_learning_rate", y="value",
                 color="params_fc_activation",
                 title="Accuracy by Learning Rate")

fig.update_layout(width=800, height=500,
                  xaxis_title='learning_rate',
                  yaxis_title='accuracy')
fig.show()

In [64]:
fig.update_layout(xaxis=dict(range=[0, 0.006]),
                  yaxis=dict(range=[0.6, 1.0]))
fig.show()

In [65]:
fig.update_layout(xaxis=dict(range=[0, 0.006]),
                  yaxis=dict(range=[0.9, 0.98]))
fig.show()