In [1]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.optim import SGD
from torch.optim.lr_scheduler import LinearLR
import numpy as np
import pandas as pd
import cv2
import os
from sklearn.metrics import accuracy_score, recall_score, precision_score
import time

In [7]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


### GPU 사용 설정

In [2]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cpu'

### 데이터 가져오기

In [8]:
# /content/drive/MyDrive/고모부_머신러닝/dogncat
paths = []
dataset_type = []
labels = []

def make_dataframe(dirpath):
    for dirname, _, filenames in os.walk(dirpath):
        for filename in filenames:
            file_path = dirname+'/'+filename
            paths.append(file_path)

            if '/training_set' in file_path:
                dataset_type.append('train')
            elif '/test_set' in file_path:
                dataset_type.append('test')
            else:
                dataset_type.append('N/A')
            
            if 'dogs' in file_path:
                labels.append('DOG')
            elif 'cats' in file_path:
                labels.append('CAT')
            else:
                labels.append('N/A')

    df = pd.DataFrame({'path' : paths, 'type' : dataset_type, 'label' : labels})

    return df

In [9]:
cnd_df = make_dataframe('/content/drive/MyDrive/고모부_머신러닝/dogncat')
cnd_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10032 entries, 0 to 10031
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   path    10032 non-null  object
 1   type    10032 non-null  object
 2   label   10032 non-null  object
dtypes: object(3)
memory usage: 235.2+ KB


In [10]:
cnd_df.head(10)

Unnamed: 0,path,type,label
0,/content/drive/MyDrive/고모부_머신러닝/dognc...,test,CAT
1,/content/drive/MyDrive/고모부_머신러닝/dognc...,test,CAT
2,/content/drive/MyDrive/고모부_머신러닝/dognc...,test,CAT
3,/content/drive/MyDrive/고모부_머신러닝/dognc...,test,CAT
4,/content/drive/MyDrive/고모부_머신러닝/dognc...,test,CAT
5,/content/drive/MyDrive/고모부_머신러닝/dognc...,test,CAT
6,/content/drive/MyDrive/고모부_머신러닝/dognc...,test,CAT
7,/content/drive/MyDrive/고모부_머신러닝/dognc...,test,CAT
8,/content/drive/MyDrive/고모부_머신러닝/dognc...,test,CAT
9,/content/drive/MyDrive/고모부_머신러닝/dognc...,test,CAT


In [11]:
cnd_df = cnd_df[cnd_df['path'].str.contains('.jpg')].copy() # '.jpg'파일만 저장
cnd_df.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 10028 entries, 0 to 10031
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   path    10028 non-null  object
 1   type    10028 non-null  object
 2   label   10028 non-null  object
dtypes: object(3)
memory usage: 313.4+ KB


In [12]:
# train, valid dataset 만들기

from sklearn.model_selection import train_test_split

# train, validation 분리
train_df, valid_df = train_test_split(cnd_df[cnd_df['type']=='train'], test_size = 0.25)

# train dataset
train_data = train_df['path'].values
train_label = train_df['label'].values
train_label_indices = train_df['label'].replace(['CAT', 'DOG'], [0, 1]).values

# validation dataset
valid_data = valid_df['path'].values
valid_label = valid_df['label'].values
valid_label_indices = valid_df['label'].replace(['CAT', 'DOG'], [0,1]).values


In [13]:
train_df.head()

Unnamed: 0,path,type,label
7850,/content/drive/MyDrive/고모부_머신러닝/dognc...,train,DOG
7729,/content/drive/MyDrive/고모부_머신러닝/dognc...,train,DOG
7618,/content/drive/MyDrive/고모부_머신러닝/dognc...,train,DOG
7250,/content/drive/MyDrive/고모부_머신러닝/dognc...,train,DOG
8738,/content/drive/MyDrive/고모부_머신러닝/dognc...,train,DOG


In [14]:
print(len(train_data))
print(len(valid_data))

6003
2002


In [15]:
count_c = 0
count_d = 0
for file_name in valid_data:
    if 'cats' in file_name:
        count_c += 1
    elif 'dogs' in file_name:
        count_d += 1

print(count_c, count_d)

1006 996


### 샘플용 데이터 만들기

In [16]:
sample_dog = train_df[train_df['label'] == 'DOG'].sample(30)
sample_cat = train_df[train_df['label'] == 'CAT'].sample(30)

sample_df = pd.concat([sample_dog, sample_cat])
# sample_df = sample_df.sample(frac=1).reset_index()
sample_tr, sample_val = train_test_split(sample_df, test_size=0.2)

In [17]:
sample_tr['label'].value_counts()

DOG    24
CAT    24
Name: label, dtype: int64

In [18]:
sample_tr_data = sample_tr['path'].values
sample_tr_label = sample_tr['label'].replace(['CAT', 'DOG'], [0,1]).values
sample_val_data = sample_val['path'].values
sample_val_label = sample_val['label'].replace(['CAT', 'DOG'], [0,1]).values

### 테스트 데이터 만들기

In [19]:
# test 데이터 만들기
test_data = cnd_df['path'][cnd_df['type']=='test']
test_data

0       /content/drive/MyDrive/고모부_머신러닝/dognc...
1       /content/drive/MyDrive/고모부_머신러닝/dognc...
2       /content/drive/MyDrive/고모부_머신러닝/dognc...
3       /content/drive/MyDrive/고모부_머신러닝/dognc...
4       /content/drive/MyDrive/고모부_머신러닝/dognc...
                              ...                        
2020    /content/drive/MyDrive/고모부_머신러닝/dognc...
2021    /content/drive/MyDrive/고모부_머신러닝/dognc...
2022    /content/drive/MyDrive/고모부_머신러닝/dognc...
2023    /content/drive/MyDrive/고모부_머신러닝/dognc...
2024    /content/drive/MyDrive/고모부_머신러닝/dognc...
Name: path, Length: 2023, dtype: object

### Custom dataset 만들기

In [20]:
class MyDataset(Dataset):
    def __init__(self, datapath, label=None):
        super(MyDataset, self).__init__()

        self.path = datapath
        self.label = label

    def __len__(self):
        return len(self.path)

    def __getitem__(self,idx):

        image = cv2.cvtColor(cv2.imread(self.path[idx]), cv2.COLOR_BGR2RGB)
        image = cv2.resize(image, (244,244))
        image = np.asarray(image, dtype=np.float32).transpose(2,0,1)
        normal_image = (image - np.amin(image)) / (np.amax(image) - np.amin(image))
        
        if self.label is not None:
            label = self.label[idx]

        return normal_image, label


In [21]:
train_dataset = MyDataset(train_data, train_label_indices)
valid_dataset = MyDataset(valid_data, valid_label_indices)
test_dataset = MyDataset(test_data)

### 샘플용 dataset만들기

In [22]:
sample_tr_dataset = MyDataset(sample_tr_data, sample_tr_label)
sample_val_dataset = MyDataset(sample_val_data, sample_val_label)

In [23]:
print(len(sample_tr_dataset))
print(len(sample_val_dataset))

48
12


### Loader 만들기

In [24]:
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=16, shuffle=False) # 왜 valid loader에서는 shuffle을 false로 하지?

In [25]:
len(train_loader)

376

In [26]:
len(valid_loader)

126

### 샘플용 loader 만들기

In [27]:
sample_tr_loader = DataLoader(sample_tr_dataset, batch_size=4, shuffle=True)
sample_val_loader = DataLoader(sample_val_dataset, batch_size=2, shuffle=False)

In [28]:
len(sample_val_loader)

6

### 모델 만들기

In [29]:
class CNN(nn.Module):
    def __init__(self):
        super(CNN, self).__init__()
        self.conv1 = nn.Conv2d(3, 5, kernel_size=3, padding='same')
        self.relu = nn.ReLU()
        self.fc1 = nn.Linear(5*244*244, 2)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        # print('input size :', x.shape)
        # print('input max :', torch.amax(x, dim=(1,2,3)))
        # print('input min :', torch.amin(x, dim=(1,2,3)))
        # print('input l2norm :', torch.linalg.vector_norm(x, dim=(1,2,3)))

        conv = self.conv1(x)
        # print('conv size :', conv.shape)
        # print('conv max :', torch.amax(conv, dim=(1,2,3)))
        # print('conv min :', torch.amin(conv, dim=(1,2,3)))
        # print('conv l2norm :', torch.linalg.vector_norm(conv, dim=(1,2,3)))

        conv_out = self.relu(conv)
        # print('conv_out size :', conv_out.shape)
        # print('conv_out max :', torch.amax(conv_out, dim=(1,2,3)))
        # print('conv_out min :', torch.amin(conv_out, dim=(1,2,3)))
        # print('conv_out l2norm :', torch.linalg.vector_norm(conv_out, dim=(1,2,3)))

        fc_input = conv_out.view(conv_out.size(0), -1)
        # print('fc_input size :', fc_input.shape)
        # print('fc_input max :', torch.amax(fc_input, dim=(1)))
        # print('fc_input min :', torch.amin(fc_input, dim=(1)))
        # print('fc_input l2norm :', torch.linalg.vector_norm(fc_input, dim=(1)))

        fc_logit = self.fc1(fc_input)
        # print('fc_logit size :', fc_logit.shape)
        # print('fc_logit max :', torch.amax(fc_logit, dim=(1)))
        # print('fc_logit min :', torch.amin(fc_logit, dim=(1)))
        # print('fc_logit l2norm :', torch.linalg.vector_norm(fc_logit, dim=(1)))
        # print()

        return fc_logit

In [30]:
model = CNN().to(device)
model

CNN(
  (conv1): Conv2d(3, 5, kernel_size=(3, 3), stride=(1, 1), padding=same)
  (relu): ReLU()
  (fc1): Linear(in_features=297680, out_features=2, bias=True)
  (softmax): Softmax(dim=1)
)

### loss function

In [31]:
loss_fn = nn.CrossEntropyLoss().to(device)

### optimzer, lr_scheduler 설정

In [32]:
optimizer = SGD(model.parameters(), lr=0.001)
scheduler = LinearLR(optimizer)

### 모델 training

### Training, Evaluation

In [33]:
def calculate_conf_matrix(predicted, label, class_idx):
    TP_num = 0
    TN_num = 0
    FP_num = 0
    FN_num = 0
    for i in range(len(predicted)):
        if predicted[i] == class_idx and label[i] == class_idx:
            TP_num += 1

        elif predicted[i] == class_idx and label[i] != class_idx:
            FP_num += 1
            
        elif predicted[i] != class_idx and label[i] == class_idx:
            FN_num += 1

        elif predicted[i] != class_idx and label[i] != class_idx:
            TN_num += 1

    return TP_num, TN_num, FP_num, FN_num

def calculate_accuracy(conf_matrix_list):
    try:
        accuracy = (conf_matrix_list[0] + conf_matrix_list[1]) / (conf_matrix_list[0] + conf_matrix_list[1] + conf_matrix_list[2] + conf_matrix_list[3])
    except ZeroDivisionError:
        accuracy = 0
    return accuracy

def calculate_recall(conf_matrix_list):
    try: 
        recall = conf_matrix_list[0] / (conf_matrix_list[0] + conf_matrix_list[3])
    except ZeroDivisionError:
        recall = 0
    return recall

def calculate_precision(conf_matrix_list):
    try:
        precision = conf_matrix_list[0] / (conf_matrix_list[0] + conf_matrix_list[2])
    except ZeroDivisionError:
        precision = 0
    return precision

def calculate_acc_rec_pre(conf_matrix_dict : dict):
    for class_idx in conf_matrix_dict.keys():
        accuracy = calculate_accuracy(conf_matrix_dict[class_idx])
        recall = calculate_recall(conf_matrix_dict[class_idx])
        precision = calculate_precision(conf_matrix_dict[class_idx])

        print(f'class{class_idx} >>> accuracy : {accuracy}, recall : {recall}, precision : {precision}')

    

In [34]:
epoch = 100
running_loss = 0.
running_vloss = 0.
class_num = 2
accuracy = 0.
recall = 0.
precision = 0.

for i in range(epoch):
    print()
    print(f'========= Epoch{i+1} =========')
    print()
    # train
    train_start = time.time()
    batch_step = 0
    model.train()
    for batch_idx, data in enumerate(train_loader):
        image_data, label_data = data
        image_data = image_data.to(device)
        label_data = label_data.to(device)
        optimizer.zero_grad()
        tr_output= model(image_data)
        loss = loss_fn(tr_output, label_data)
        # print(f'batch{batch_idx+1} loss :', loss)
        running_loss += loss.item()
        loss.backward()
        optimizer.step()

        batch_step += 1

        if batch_idx + 1 == len(train_loader):
            avg_loss = running_loss / len(train_loader)
            print('Loss/train :', avg_loss)
            print('total batch step :', batch_step)
            running_loss = 0.
            batch_step = 0

    scheduler.step()
    train_end = time.time()
    print('train_time :', round(train_end - train_start, 2))

    print()

    #evaluate
    model.eval()
    TP_TN_FP_FN = {}
    for batch_idx, val_data in enumerate(valid_loader): # training data로 우선 evaluation해보기
        valid_image, valid_label = val_data
        valid_image = valid_image.to(device)
        valid_label = valid_label.to(device)
        val_output = model(valid_image)
        val_output_idx = torch.argmax(val_output, dim=1)
        # print(f'predicted : {val_output_idx}, actual : {valid_label}')

        for i, class_idx in enumerate(range(class_num)):
                TP, TN, FP, FN = calculate_conf_matrix(val_output_idx, valid_label, class_idx)
                if batch_idx == 0:
                    TP_TN_FP_FN[class_idx] = np.array([TP, TN, FP, FN])
                else:
                    TP_TN_FP_FN[class_idx] += np.array([TP, TN, FP, FN])

        # print(f'TP_TN_FP_FN : {TP_TN_FP_FN}')

        vloss = loss_fn(val_output, valid_label)
        running_vloss += vloss.item()
        
        if batch_idx + 1 == len(valid_loader):
            avg_vloss = running_vloss / len(valid_loader)
            calculate_acc_rec_pre(TP_TN_FP_FN)
            print('Loss/valid :', avg_vloss)
            # print('Accuracy :', accuracy_score(valid_label.cpu().detach().numpy(), val_output_idx.cpu().detach().numpy()))
            # print('Recall :', recall_score(valid_label.cpu().detach().numpy(), val_output_idx.cpu().detach().numpy()))
            # print('Precision :', precision_score(valid_label.cpu().detach().numpy(), val_output_idx.cpu().detach().numpy()))
            
            running_vloss = 0.




Loss/train : 1.027820708507553
total batch step : 376
train_time : 3069.67

class0 >>> accuracy : 0.5244755244755245, recall : 0.8996023856858847, precision : 0.515375854214123
class1 >>> accuracy : 0.5244755244755245, recall : 0.14558232931726908, precision : 0.5894308943089431
Loss/valid : 0.6851141869075714


Loss/train : 0.6857396576632845
total batch step : 376
train_time : 77.96

class0 >>> accuracy : 0.5999000999000998, recall : 0.4840954274353877, precision : 0.6332899869960988
class1 >>> accuracy : 0.5999000999000998, recall : 0.7168674698795181, precision : 0.5790754257907542
Loss/valid : 0.6701072899122087


Loss/train : 0.6651143571163746
total batch step : 376
train_time : 78.09

class0 >>> accuracy : 0.5924075924075924, recall : 0.36779324055666, precision : 0.6727272727272727
class1 >>> accuracy : 0.5924075924075924, recall : 0.8192771084337349, precision : 0.5619834710743802
Loss/valid : 0.6675090803986504


Loss/train : 0.6525126391268791
total batch step : 376
train

### avg_loss, avg_vloss가 계속 같은 값으로 나오는 현상
- 학습이 진행되도 avg_loss와 avg_vloss가 계속 같은 값으로 나오는 현상이 발생했는데 처음에는 데이터 셋이나 모델에 문제가 있나 해서 살펴보았는데 딱히 문제는 없었다. 어디가 문제인가 살펴보니 loss가 변하지 않는다는 것은 파라미터 업데이트가 안된다는 것을 의미했고 다시 살펴보니 모델을 변경한 후에 optimizer를 새롭게 설정해주지 않아서 생긴 문제였다.

### 전체 데이터로 모델을 돌리기 전에 샘플 데이터로 우선적으로 돌려보기
- 모델 training은 많은 비용이 들어가는 작업이다. 특히, 데이터가 많고 클수록. 그렇기 때문에 비용을 절약하기 위해서는 전체 학습 데이터로 모델을 학습하기 전에 샘플 데이터로 우선적으로 학습을 진행하여 모델이 정상적으로 작동하는지 확인하는 작업이 필요하다.

### accuracy, recall, precision을 구해봤는데 0.75, 0, 0이 나왔다. 왜지..?
- recall이랑 precision이 0이 나왔다는 건 TP가 0이라는 건데 그렇게 되면 accuracy가 0.75가 나올 수가 없다. 데이터셋이 불균형한 경우가 아닌 이상 나와봐야 0.5인데..(현재 -> 개 : 30, 고양이 : 30)
- 일단 모든 결과값을 출력해서 어디가 잘못된 것인지 확인해보자...

### epochs 100으로 training해본 결과
- epochs1에서는 1520초. 그 이후에는 매 epochs당 약 40초.
- 왜 2번째 epochs부터는 training 시간이 빨라질까?
- 일단 데이터가 다 들어가는지 확인하기 위해 몇번의 batch step이 돌아가는지 print해보기