## Dataset & DataLoader 살펴보기

-   Pytorch에서 배치 크기만큼 데이터를 조절하기 위한 메카니즘
-   Dataset : 사용 데이터를 기반으로 사용자 정의 클래스 작성
-   DataLoader : 지정된 Dataset에서 지정된 batch size만큼 피쳐와 타겟을 추출하여 전달


### [1] 모듈 로딩 및 데이터 준비 <hr>


In [44]:
### 모듈 로딩
# import os
# os.environ['KMP_DUPLICATE_LIB_OK']='True'
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

import numpy as np
import pandas as pd

import torchvision

print(torchvision.__version__)


0.15.2a0


In [2]:
### 데이터 준비

x_data = torch.IntTensor(
    [[10, 20, 30], [20, 30, 40], [30, 40, 50], [40, 50, 60], [50, 60, 70]]
)
y_data = torch.IntTensor([[20], [30], [40], [50], [60]])

print(f"x_data = > {x_data.shape}, {x_data.ndim}D")
print(f"y_data = > {y_data.shape}, {y_data.ndim}D")


x_data = > torch.Size([5, 3]), 2D
y_data = > torch.Size([5, 1]), 2D


### [2] 데이터셋 생성 <hr>


#### [2-1] TensorDataset 활용 : Dataset의 sub_class


In [3]:
# TensorDataset 클래스 로딩
from torch.utils.data import TensorDataset


In [4]:
dataset = TensorDataset(
    x_data, y_data
)  # x_data, y_data shape[0]이 다르면 dataset 생성 안 됨
dataset


<torch.utils.data.dataset.TensorDataset at 0x17ba4df04c0>

In [5]:
dataset.tensors


(tensor([[10, 20, 30],
         [20, 30, 40],
         [30, 40, 50],
         [40, 50, 60],
         [50, 60, 70]], dtype=torch.int32),
 tensor([[20],
         [30],
         [40],
         [50],
         [60]], dtype=torch.int32))

In [6]:
### __getitem__() 메서드 호출
dataset[0]


(tensor([10, 20, 30], dtype=torch.int32), tensor([20], dtype=torch.int32))

In [7]:
len(dataset)


5

#### [2-2] 사용자정의 데이터셋 생성


In [8]:
### 데이터 준비
filename = "../data/text/iris.csv"

irisDF = pd.read_csv(filename)
irisDF.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sepal_length  150 non-null    float64
 1   sepal_width   150 non-null    float64
 2   petal_length  150 non-null    float64
 3   petal_width   150 non-null    float64
 4   species       150 non-null    object 
dtypes: float64(4), object(1)
memory usage: 6.0+ KB


In [9]:
irisNP = np.loadtxt(filename, delimiter=",", skiprows=1, usecols=(0, 1, 2, 3))
irisNP[:2]


array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2]])

In [10]:
## 데이터의 타입 체크
type(irisDF), type(irisNP), irisDF.__class__.__name__, irisNP.__class__.__name__


(pandas.core.frame.DataFrame, numpy.ndarray, 'DataFrame', 'ndarray')

In [11]:
if irisDF.__class__.__name__ == "DataFrame":
    print("DF")
else:
    print("------")


DF


In [12]:
isinstance(irisDF, pd.DataFrame), isinstance(irisNP, pd.DataFrame), isinstance(
    irisNP, np.ndarray
)


(True, False, True)

In [13]:
isinstance([10], list), isinstance({"A": 22}, list)


(True, False)

In [88]:
### 사용자정의 DataSet 클래스
# - 데이터의 Tensor 변환
class DLDataset(Dataset):
    # 초기화 콜백함수(callback function)
    def __init__(self, x_data, y_data):
        super().__init__()
        # x,y 데이터 ==> ndarray
        x_data = x_data.values if isinstance(x_data, pd.DataFrame) else x_data
        y_data = y_data.values if isinstance(y_data, pd.DataFrame) else y_data

        # ndarray ==> tensor
        self.feature = torch.FloatTensor(x_data)
        self.target = torch.LongTensor(y_data).squeeze()

    # 데이터셋의 갯수 체크 콜백함수(callback function)
    def __len__(self):
        return self.target.shape[0]

    # 특정 인덱스 데이터 + 라벨 반환 콜백함수(callback function)
    def __getitem__(self, index):
        return self.feature[index], self.target[index]


In [89]:
## 피쳐와 라벨로 분리
featureDF = irisDF[irisDF.columns[:-1]]
targetDF = irisDF[irisDF.columns[-1]]

print(f"featureDF => {featureDF.shape}, {featureDF.ndim}D")
print(f"targetDF  => {targetDF.shape}, {targetDF.ndim}D")


featureDF => (150, 4), 2D
targetDF  => (150,), 1D


In [90]:
# object 타입 타겟 ===> int 타입 타겟 변환
from sklearn.preprocessing import LabelEncoder

targetNP = LabelEncoder().fit_transform(targetDF)
targetNP = targetNP.reshape(-1, 1)

print(targetNP.shape, targetNP.ndim)


(150, 1) 2


In [91]:
# 데이터셋 생성 -> DF, NP
my_dataset = DLDataset(featureDF, targetNP)


In [92]:
my_dataset[0], featureDF.iloc[0], targetDF[0]


((tensor([5.1000, 3.5000, 1.4000, 0.2000]), tensor(0)),
 sepal_length    5.1
 sepal_width     3.5
 petal_length    1.4
 petal_width     0.2
 Name: 0, dtype: float64,
 'setosa')

In [93]:
# 데이터셋 생성 -> NP, NP
my_dataset2 = DLDataset(irisNP, targetNP)
my_dataset2[0]


(tensor([5.1000, 3.5000, 1.4000, 0.2000]), tensor(0))

-   [2-3] 학습용, 검증용 테스트용 Dataset <hr>


In [94]:
### ===> 파이토치
from torch.utils.data import random_split

# 학습용, 검증용, 테스트 데이터 비율
seed = torch.Generator().manual_seed(42)

trainDS, validDS, testDS = random_split(my_dataset2, [0.7, 0.1, 0.2], generator=seed)

print(
    f"trainDS => {len(trainDS)}개, validDS => {len(validDS)}개, testDS => {len(testDS)}개"
)

print(f"Subset 속성 =>\nindices : {trainDS.indices}\ndataset : {trainDS.dataset}")
print(f"Subset 속성 =>\nindices : {validDS.indices}\ndataset : {validDS.dataset}")


trainDS => 105개, validDS => 15개, testDS => 30개
Subset 속성 =>
indices : [42, 95, 30, 64, 52, 35, 130, 40, 82, 17, 108, 94, 68, 97, 117, 127, 41, 44, 57, 140, 149, 32, 23, 102, 16, 113, 71, 18, 67, 66, 0, 25, 101, 112, 91, 3, 59, 116, 86, 84, 106, 142, 43, 39, 26, 98, 93, 20, 87, 19, 120, 114, 7, 63, 76, 89, 36, 45, 37, 56, 58, 122, 51, 145, 24, 21, 105, 62, 15, 11, 48, 133, 88, 50, 6, 134, 111, 8, 49, 75, 69, 124, 4, 147, 80, 100, 99, 141, 47, 107, 13, 109, 129, 28, 38, 53, 121, 5, 55, 31, 73, 74, 54, 29, 12]
dataset : <__main__.DLDataset object at 0x0000017BC2D44BE0>
Subset 속성 =>
indices : [22, 104, 81, 1, 103, 125, 85, 2, 96, 128, 27, 118, 77, 110, 146]
dataset : <__main__.DLDataset object at 0x0000017BC2D44BE0>


[3] DataLoader 생성 : 학습용, 검증용, 테스트용 <hr>


In [95]:
# DataLoader 생성
# drop_last 매개변수 : 배치사이즈로 데이터셋 분리 후 남는 데이터 처리 방법 설정 [기본 : False]
batch = 5
trainDL = DataLoader(trainDS, batch_size=batch)
validDL = DataLoader(validDS, batch_size=batch)
testDL = DataLoader(testDS, batch_size=batch)


In [96]:
# Epoch당 반복 단위
print(f"batch_size : {batch}")
print(
    f"trainDS => {len(trainDS)}개, validDS => {len(validDS)}개, testDS => {len(testDS)}개"
)
print(
    f"trainDL => {len(trainDL)}개, validDL => {len(validDL)}개, testDL => {len(testDL)}개"
)


batch_size : 5
trainDS => 105개, validDS => 15개, testDS => 30개
trainDL => 21개, validDL => 3개, testDL => 6개


[4] Model 클래스 정의 : 입/출력 피쳐 수, 층 수, 은닉층의 노드수 <hr>

-   구조 설계
    -   입력측 : 입력 <= 피쳐 갯수, iris 4개
    -   은닉층 : 마음대로 알아서 잘
    -   출력측 : 출력 <= [분류] 타겟 클래스 갯수, [회귀] 1개


In [97]:
# 모델 클래스 정의
# 클래스명 : CModel
class CModel(nn.Module):
    # 구성요소 정의 함수
    def __init__(self, in_, out_):
        super().__init__()
        self.input_layer = nn.Linear(in_, 100)
        self.hidden_layer = nn.Linear(100, 27)
        self.output_layer = nn.Linear(27, out_)
        self.relu = nn.ReLU()

    # 순방향 학습 진행 함수
    def forward(self, x):
        x = self.input_layer(x)  # W1X1+W2X2+...+WnXn+b 100개 반환
        x = self.relu(x)  # relu() 결과 100개 반환
        x = self.hidden_layer(x)  # W1X1+W2X2+...+WnXn+b 27개 반환
        x = self.relu(x)  # relu() 결과 27개 반환
        x = self.output_layer(x)  # W1X1+W2X2+...+WnXn+b out_개 반환
        return x


[5] 학습 준비 : 실행디바이스, 모델, 최적화, 손실함수, 학습횟수, 학습함수, 평가함수, 예측함수 <hr>


In [98]:
# 실행 디바이스 설정
DEVICE = "cuda:0" if torch.cuda.is_available() else "cpu"

# 학습 횟수
EPOCHS = 50


In [99]:
# 모델 인스턴스
IN, OUT = my_dataset2.feature.shape[1], len(torch.unique(my_dataset2.target))
model = CModel(IN, OUT).to(DEVICE)
print(f"IN: {IN}, OUT: {OUT}")
print(model)


IN: 4, OUT: 3
CModel(
  (input_layer): Linear(in_features=4, out_features=100, bias=True)
  (hidden_layer): Linear(in_features=100, out_features=27, bias=True)
  (output_layer): Linear(in_features=27, out_features=3, bias=True)
  (relu): ReLU()
)


In [100]:
# 손실함수
LOSS_FN = nn.CrossEntropyLoss().to(DEVICE)

# 최적화 인스턴스
OPTIMIZER = optim.Adam(model.parameters())


-   학습 및 검증관련 함수 정의


In [113]:
### ===> 학습 진행함수
def training():
    # 학습모드 => 정규화, 경사하강법, 드랍아웃 등의 기능을 활성화
    model.train()

    # 배치크기만큼 학습 진행 및 저장
    train_loss = []
    for cnt, (feature, target) in enumerate(trainDL):
        # 배치크가만큼의 학습 데이터 준비
        feature, target = feature.to(DEVICE), target.to(DEVICE)

        # 학습
        pre_target = model(feature)
        # print(f'pre_target => {pre_target.shape}, {pre_target.ndim}D')
        # print(f'target     => {target.shape}, {target.ndim}D')

        # 손실계산
        loss = LOSS_FN(pre_target, target)
        train_loss.append(loss.item())

        # W,b 업데이트
        OPTIMIZER.zero_grad()
        loss.backward()
        OPTIMIZER.step()

        # 배치 단위 학습 진행 메세지 출력
        # print(f"[Train {cnt} batch Loss] ==> {loss.item():.4f}")

    # 에포크 단위 학습 진행 메세지 출력
    print(f"[Train loss] ==> {loss.item():.4f}")

    return train_loss


In [114]:
### ===> 검증 및 평가 진행함수
def testing():
    model.eval()


In [115]:
### ===> 예측 함수
def predict():
    model.eval()


[6] 학습 진행 <hr>


In [116]:
for eps in range(EPOCHS + 1):
    # 학습
    train_loss = training()
    # 검증
    # testing()
    print(f"[{eps}/{EPOCHS}] {sum(train_loss)/len(train_loss)}")


[Train loss] ==> 0.0005
[0/50] 0.006206681532646707
[Train loss] ==> 0.0004
[1/50] 0.006115017592535531
[Train loss] ==> 0.0005
[2/50] 0.005850160969809319
[Train loss] ==> 0.0004
[3/50] 0.005841752132601826
[Train loss] ==> 0.0004
[4/50] 0.005832664227671644
[Train loss] ==> 0.0004
[5/50] 0.005809915737440211
[Train loss] ==> 0.0004
[6/50] 0.005426515055573657
[Train loss] ==> 0.0004
[7/50] 0.005437404360106614
[Train loss] ==> 0.0004
[8/50] 0.005266017439550653
[Train loss] ==> 0.0003
[9/50] 0.005628965044403837
[Train loss] ==> 0.0004
[10/50] 0.004992682599376643
[Train loss] ==> 0.0003
[11/50] 0.005111400021546399
[Train loss] ==> 0.0003
[12/50] 0.005047437883366088
[Train loss] ==> 0.0003
[13/50] 0.004962520392187538
[Train loss] ==> 0.0003
[14/50] 0.0048018172760099075
[Train loss] ==> 0.0003
[15/50] 0.004455418613620817
[Train loss] ==> 0.0003
[16/50] 0.004658917128576182
[Train loss] ==> 0.0003
[17/50] 0.0044017177601095425
[Train loss] ==> 0.0003
[18/50] 0.004549553498128219
[

In [55]:
EPOCHS = 1000
for epoch in range(EPOCHS + 1):
    for feature, target in trainDL:
        prediction = model(feature.float())
        train_loss = F.cross_entropy(prediction, target.squeeze().long())
        OPTIMIZER.zero_grad()
        train_loss.backward()
        OPTIMIZER.step()
    with torch.no_grad():
        for feature_val, target_val in validDL:
            pre_val = model(feature_val.float())
            val_loss = F.cross_entropy(pre_val, target_val.squeeze().long())

    if epoch % 10 == 0:
        print(
            f"Epoch {epoch:4d}/{EPOCHS} Train_loss: {train_loss.item():.6f} Val_loss: {val_loss.item():.6f}"
        )


Epoch    0/1000 Train_loss: 1.0349 Val_loss: 1.0406
Epoch   10/1000 Train_loss: 0.2642 Val_loss: 0.3417
Epoch   20/1000 Train_loss: 0.0836 Val_loss: 0.1914
Epoch   30/1000 Train_loss: 0.0389 Val_loss: 0.1445
Epoch   40/1000 Train_loss: 0.0223 Val_loss: 0.1196
Epoch   50/1000 Train_loss: 0.0144 Val_loss: 0.1036
Epoch   60/1000 Train_loss: 0.0101 Val_loss: 0.0922
Epoch   70/1000 Train_loss: 0.0073 Val_loss: 0.0834
Epoch   80/1000 Train_loss: 0.0055 Val_loss: 0.0761
Epoch   90/1000 Train_loss: 0.0041 Val_loss: 0.0692
Epoch  100/1000 Train_loss: 0.0032 Val_loss: 0.0641
Epoch  110/1000 Train_loss: 0.0025 Val_loss: 0.0583
Epoch  120/1000 Train_loss: 0.0019 Val_loss: 0.0537
Epoch  130/1000 Train_loss: 0.0013 Val_loss: 0.0476
Epoch  140/1000 Train_loss: 0.0012 Val_loss: 0.0454
Epoch  150/1000 Train_loss: 0.0009 Val_loss: 0.0416
Epoch  160/1000 Train_loss: 0.0008 Val_loss: 0.0389
Epoch  170/1000 Train_loss: 0.0006 Val_loss: 0.0352
Epoch  180/1000 Train_loss: 0.0005 Val_loss: 0.0331
Epoch  190/1

In [50]:
# DataLoader 속성
for _, (feature, target) in enumerate(trainDL):
    print(f"[{_}] feature {feature} \ntarget {target}")
    ## 로더에서 가지고 온 데이터 만큼 학습 진행


[0] feature tensor([[4.4000, 3.2000, 1.3000, 0.2000],
        [5.7000, 3.0000, 4.2000, 1.2000],
        [4.8000, 3.1000, 1.6000, 0.2000],
        [5.6000, 2.9000, 3.6000, 1.3000],
        [6.9000, 3.1000, 4.9000, 1.5000]], dtype=torch.float64) 
target tensor([[0],
        [1],
        [0],
        [1],
        [1]], dtype=torch.int32)
[1] feature tensor([[5.0000, 3.2000, 1.2000, 0.2000],
        [7.4000, 2.8000, 6.1000, 1.9000],
        [5.0000, 3.5000, 1.3000, 0.3000],
        [5.8000, 2.7000, 3.9000, 1.2000],
        [5.1000, 3.5000, 1.4000, 0.3000]], dtype=torch.float64) 
target tensor([[0],
        [2],
        [0],
        [1],
        [0]], dtype=torch.int32)
[2] feature tensor([[6.7000, 2.5000, 5.8000, 1.8000],
        [5.6000, 2.7000, 4.2000, 1.3000],
        [6.2000, 2.2000, 4.5000, 1.5000],
        [6.2000, 2.9000, 4.3000, 1.3000],
        [7.7000, 3.8000, 6.7000, 2.2000]], dtype=torch.float64) 
target tensor([[2],
        [1],
        [1],
        [1],
        [2]], dtype=to