In [1]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.optim import SGD
from torch.optim.lr_scheduler import LinearLR
import numpy as np
import pandas as pd
import cv2
import os

### GPU 사용 설정

In [2]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cuda'

### 데이터 가져오기

In [3]:
# /content/drive/MyDrive/고모부_머신러닝/dogncat
paths = []
dataset_type = []
labels = []

def make_dataframe(dirpath):
    for dirname, _, filenames in os.walk(dirpath):
        for filename in filenames:
            file_path = dirname+'/'+filename
            paths.append(file_path)

            if '/training_set' in file_path:
                dataset_type.append('train')
            elif '/test_set' in file_path:
                dataset_type.append('test')
            else:
                dataset_type.append('N/A')
            
            if 'dogs' in file_path:
                labels.append('DOG')
            elif 'cats' in file_path:
                labels.append('CAT')
            else:
                labels.append('N/A')

    df = pd.DataFrame({'path' : paths, 'type' : dataset_type, 'label' : labels})

    return df

In [4]:
cnd_df = make_dataframe('/content/drive/MyDrive/고모부_머신러닝/dogncat')
cnd_df.head()

Unnamed: 0,path,type,label
0,/content/drive/MyDrive/고모부_머신러닝/dognc...,test,CAT
1,/content/drive/MyDrive/고모부_머신러닝/dognc...,test,CAT
2,/content/drive/MyDrive/고모부_머신러닝/dognc...,test,CAT
3,/content/drive/MyDrive/고모부_머신러닝/dognc...,test,CAT
4,/content/drive/MyDrive/고모부_머신러닝/dognc...,test,CAT


In [5]:
cnd_df = cnd_df[cnd_df['path'].str.contains('.jpg')].copy()
cnd_df.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 10028 entries, 0 to 10031
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   path    10028 non-null  object
 1   type    10028 non-null  object
 2   label   10028 non-null  object
dtypes: object(3)
memory usage: 313.4+ KB


In [45]:
# train, valid dataset 만들기

from sklearn.model_selection import train_test_split

# train, validation 분리
train_df, valid_df = train_test_split(cnd_df[cnd_df['type']=='train'], test_size = 0.25)

# train dataset
train_data = train_df['path'].values
train_label = train_df['label'].values
train_label_indices = train_df['label'].replace(['CAT', 'DOG'], [0, 1]).values

# validation dataset
valid_data = valid_df['path'].values
valid_label = valid_df['label'].values
valid_label_indices = valid_df['label'].replace(['CAT', 'DOG'], [0,1]).values


In [51]:
train_df.head()

Unnamed: 0,path,type,label
8713,/content/drive/MyDrive/고모부_머신러닝/dognc...,train,DOG
6647,/content/drive/MyDrive/고모부_머신러닝/dognc...,train,DOG
6660,/content/drive/MyDrive/고모부_머신러닝/dognc...,train,DOG
6563,/content/drive/MyDrive/고모부_머신러닝/dognc...,train,DOG
6276,/content/drive/MyDrive/고모부_머신러닝/dognc...,train,DOG


In [13]:
count_c = 0
count_d = 0
for file_name in valid_data:
    if 'cats' in file_name:
        count_c += 1
    elif 'dogs' in file_name:
        count_d += 1

print(count_c, count_d)

996 1006


### 샘플용 데이터 만들기

In [54]:
sample_dog = train_df[train_df['label'] == 'DOG'].sample(60)
sample_cat = train_df[train_df['label'] == 'CAT'].sample(60)

sample_df = pd.concat([sample_dog, sample_cat])
# sample_df = sample_df.sample(frac=1).reset_index()
sample_tr, sample_val = train_test_split(sample_df, test_size=0.2)

In [63]:
sample_val['label'].value_counts()

DOG    13
CAT    11
Name: label, dtype: int64

In [64]:
sample_tr_data = sample_tr['path'].values
sample_tr_label = sample_tr['label'].replace(['CAT', 'DOG'], [0,1]).values
sample_val_data = sample_val['path'].values
sample_val_label = sample_val['label'].replace(['CAT', 'DOG'], [0,1]).values

In [36]:
print(len(train_data))
print(len(train_label_indices))

print(len(valid_data))
print(len(valid_label_indices))


6003
6003
2002
2002


In [14]:
# test dataset 만들기
test_data = cnd_df['path'][cnd_df['type']=='test']
test_data

0       /content/drive/MyDrive/고모부_머신러닝/dognc...
1       /content/drive/MyDrive/고모부_머신러닝/dognc...
2       /content/drive/MyDrive/고모부_머신러닝/dognc...
3       /content/drive/MyDrive/고모부_머신러닝/dognc...
4       /content/drive/MyDrive/고모부_머신러닝/dognc...
                              ...                        
2020    /content/drive/MyDrive/고모부_머신러닝/dognc...
2021    /content/drive/MyDrive/고모부_머신러닝/dognc...
2022    /content/drive/MyDrive/고모부_머신러닝/dognc...
2023    /content/drive/MyDrive/고모부_머신러닝/dognc...
2024    /content/drive/MyDrive/고모부_머신러닝/dognc...
Name: path, Length: 2023, dtype: object

### Custom dataset 만들기

In [15]:
class MyDataset(Dataset):
    def __init__(self, datapath, label=None):
        super(MyDataset, self).__init__()

        self.path = datapath
        self.label = label

    def __len__(self):
        return len(self.path)

    def __getitem__(self,idx):

        image = cv2.cvtColor(cv2.imread(self.path[idx]), cv2.COLOR_BGR2RGB)
        image = cv2.resize(image, (244,244))
        image = np.asarray(image, dtype=np.float32).transpose(2,0,1)
        normal_image = (image - np.amin(image)) / (np.amax(image) - np.amin(image))
        
        if self.label is not None:
            label = self.label[idx]

        return normal_image, label


In [16]:
train_dataset = MyDataset(train_data, train_label_indices)
valid_dataset = MyDataset(valid_data, valid_label_indices)
test_dataset = MyDataset(test_data)

In [66]:
type(len(train_dataset))

int

### 샘플용 dataset만들기

In [70]:
sample_tr_dataset = MyDataset(sample_tr_data, sample_tr_label)
sample_val_dataset = MyDataset(sample_val_data, sample_val_label)

### Loader 만들기

In [18]:
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=16, shuffle=False) # 왜 valid loader에서는 shuffle을 false로 하지?

In [94]:
loader_data = next(iter(train_loader))

### 샘플용 loader 만들기

In [86]:
sample_tr_loader = DataLoader(sample_tr_dataset, batch_size=8, shuffle=True)
sample_val_loader = DataLoader(sample_val_dataset, batch_size=8, shuffle=False)

In [77]:
len(sample_val_loader)

3

### 모델 만들기

In [101]:
class CNN(nn.Module):
    def __init__(self):
        super(CNN, self).__init__()
        self.conv1 = nn.Conv2d(3, 5, kernel_size=3, padding='same')
        self.relu = nn.ReLU()
        self.fc1 = nn.Linear(5*244*244, 2)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        # print('input size :', x.shape)
        # print('input max :', torch.amax(x, dim=(1,2,3)))
        # print('input min :', torch.amin(x, dim=(1,2,3)))
        # print('input l2norm :', torch.linalg.vector_norm(x, dim=(1,2,3)))

        conv = self.conv1(x)
        # print('conv size :', conv.shape)
        # print('conv max :', torch.amax(conv, dim=(1,2,3)))
        # print('conv min :', torch.amin(conv, dim=(1,2,3)))
        # print('conv l2norm :', torch.linalg.vector_norm(conv, dim=(1,2,3)))

        conv_out = self.relu(conv)
        # print('conv_out size :', conv_out.shape)
        # print('conv_out max :', torch.amax(conv_out, dim=(1,2,3)))
        # print('conv_out min :', torch.amin(conv_out, dim=(1,2,3)))
        # print('conv_out l2norm :', torch.linalg.vector_norm(conv_out, dim=(1,2,3)))

        fc_input = conv_out.view(conv_out.size(0), -1)
        # print('fc_input size :', fc_input.shape)
        # print('fc_input max :', torch.amax(fc_input, dim=(1)))
        # print('fc_input min :', torch.amin(fc_input, dim=(1)))
        # print('fc_input l2norm :', torch.linalg.vector_norm(fc_input, dim=(1)))

        fc_logit = self.fc1(fc_input)
        # print('fc_logit size :', fc_logit.shape)
        # print('fc_logit max :', torch.amax(fc_logit, dim=(1)))
        # print('fc_logit min :', torch.amin(fc_logit, dim=(1)))
        # print('fc_logit l2norm :', torch.linalg.vector_norm(fc_logit, dim=(1)))
        # print()

        fc_output = self.softmax(fc_logit)

        return fc_logit, fc_output

In [102]:
model = CNN().to(device)
model

CNN(
  (conv1): Conv2d(3, 5, kernel_size=(3, 3), stride=(1, 1), padding=same)
  (relu): ReLU()
  (fc1): Linear(in_features=297680, out_features=2, bias=True)
  (softmax): Softmax(dim=1)
)

### loss function

In [103]:
loss_fn = nn.CrossEntropyLoss().to(device)

### optimzer, lr_scheduler 설정

In [104]:
optimizer = SGD(model.parameters(), lr=0.001)
scheduler = LinearLR(optimizer)

### 모델 training

In [23]:
def train_one_epoch(loader):
    running_loss = 0.
    last_loss = 0.

    for i, data in enumerate(loader):
        image_data, label_data = data
        image_data = image_data.to(device)
        label_data = label_data.to(device)

        optimizer.zero_grad()

        hyphothesis = model(image_data)

        loss = loss_fn(hyphothesis, label_data)

        loss.backward()

        # print(f'batch{i+1} loss : {loss}')
        # print()
        # print(f'gradient max :', torch.amax(torch.tensor([torch.amax(param.grad) for param in model.parameters()])))
        # print(f'gradient min :', torch.amin(torch.tensor([torch.amax(param.grad) for param in model.parameters()])))
        # print(f'gradient l2norm :', torch.amax(torch.tensor([torch.linalg.vector_norm(param.grad) for param in model.parameters()])))
        # print()
        optimizer.step()

        running_loss += loss.item()

        # last_loss = running_loss / len(loader)
        # print(f'Average batch loss :', last_loss)
        # print()
        # last_loss = 0


        if i+1 == len(loader):
            last_loss = running_loss / len(loader)
            print(f'Average batch loss :', last_loss)
            print()
            running_loss = 0

    return last_loss


### Training, Evaluation

In [105]:
epoch = 50
running_loss = 0.
running_vloss = 0.
for i in range(epoch):
    print(f'========= Epoch{i+1} =========')
    print()
    # train
    model.train(True)
    for batch_idx, data in enumerate(sample_tr_loader):
        image_data, label_data = data
        image_data = image_data.to(device)
        label_data = label_data.to(device)
        optimizer.zero_grad()
        tr_logit, tr_output= model(image_data)
        # print('hyphothesis :', tr_output)
        loss = loss_fn(tr_logit, label_data)
        print(f'batch{batch_idx+1} loss :', loss)
        # print()
        # print(f'gradient max :', torch.amax(torch.tensor([torch.amax(param.grad) for param in model.parameters()])))
        # print(f'gradient min :', torch.amin(torch.tensor([torch.amax(param.grad) for param in model.parameters()])))
        # print(f'gradient l2norm :', torch.amax(torch.tensor([torch.linalg.vector_norm(param.grad) for param in model.parameters()])))
        # print()
        running_loss += loss.item()
        loss.backward()
        optimizer.step()

        if batch_idx + 1 == len(sample_tr_loader):
            avg_loss = running_loss / len(sample_tr_loader)
            print('Loss/train :', avg_loss)
            running_loss = 0.

    model.train(False)

    #evaluate
    for batch_idx, val_data in enumerate(sample_val_loader):
        valid_image, valid_label = val_data
        valid_image = valid_image.to(device)
        valid_label = valid_label.to(device)
        val_logit, val_output = model(valid_image)
        vloss = loss_fn(val_logit, valid_label)
        running_vloss += vloss.item()
        
        if batch_idx + 1 == len(sample_val_loader):
            avg_vloss = running_vloss / len(sample_val_loader)
            print('Loss/valid :', avg_vloss)
            running_vloss = 0.



batch1 loss : tensor(0.6325, device='cuda:0', grad_fn=<NllLossBackward0>)
batch2 loss : tensor(0.9709, device='cuda:0', grad_fn=<NllLossBackward0>)
batch3 loss : tensor(2.6009, device='cuda:0', grad_fn=<NllLossBackward0>)
batch4 loss : tensor(1.1862, device='cuda:0', grad_fn=<NllLossBackward0>)
batch5 loss : tensor(5.0551, device='cuda:0', grad_fn=<NllLossBackward0>)
batch6 loss : tensor(0.9226, device='cuda:0', grad_fn=<NllLossBackward0>)
batch7 loss : tensor(2.0527, device='cuda:0', grad_fn=<NllLossBackward0>)
batch8 loss : tensor(1.3822, device='cuda:0', grad_fn=<NllLossBackward0>)
batch9 loss : tensor(2.4416, device='cuda:0', grad_fn=<NllLossBackward0>)
batch10 loss : tensor(2.8555, device='cuda:0', grad_fn=<NllLossBackward0>)
batch11 loss : tensor(3.2049, device='cuda:0', grad_fn=<NllLossBackward0>)
batch12 loss : tensor(2.8531, device='cuda:0', grad_fn=<NllLossBackward0>)
Loss/train : 2.1798370530207953
Loss/valid : 3.1596528689066568

batch1 loss : tensor(2.7537, device='cuda:0

### avg_loss, avg_vloss가 계속 같은 값으로 나오는 현상
- 학습이 진행되도 avg_loss와 avg_vloss가 계속 같은 값으로 나오는 현상이 발생했는데 처음에는 데이터 셋이나 모델에 문제가 있나 해서 살펴보았는데 딱히 문제는 없었다. 어디가 문제인가 살펴보니 loss가 변하지 않는다는 것은 파라미터 업데이트가 안된다는 것을 의미했고 다시 살펴보니 모델을 변경한 후에 optimizer를 새롭게 설정해주지 않아서 생긴 문제였다.

### 전체 데이터로 모델을 돌리기 전에 샘플 데이터로 우선적으로 돌려보기
- 모델 training은 많은 비용이 들어가는 작업이다. 특히, 데이터가 많고 클수록. 그렇기 때문에 비용을 절약하기 위해서는 전체 학습 데이터로 모델을 학습하기 전에 샘플 데이터로 우선적으로 학습을 진행하여 모델이 정상적으로 작동하는지 확인하는 작업이 필요하다.