Data File ==> DataFrame, Numpy (전처리) ==> Tensor ==> Dataset(피처 + 타겟) ==> DataLoader 생성

### Dataset & DataLoader 살펴보기
- PyTorch에서 배치 크기만큼 데이터를 조절하기 위한 메커니즘
- Dataset : 사용 데이터를 기반으로 사용자 정의 클래스 작성
- DataLoader : 지정된 Dataset에서 지정된 batch size만큼 피처와 타겟을 추출하여 전달

[1] 모듈로딩 및 데이터 준비

In [35]:
### ===> 모듈 로딩
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

import numpy as np
import pandas as pd

In [36]:
### ===> Data preparation
# Five samples with three int32 features each, and one int32 target per sample.
x_data = torch.tensor(
    [[10, 20, 30], [20, 30, 40], [30, 40, 50], [40, 50, 60], [50, 60, 70]],
    dtype=torch.int32,
)
y_data = torch.tensor([[20], [30], [40], [50], [60]], dtype=torch.int32)

# NOTE(review): the trailing "D" in these messages follows the dtype, not a
# dimension count.
print(f"x_data : {x_data.shape} {x_data.dtype}D")
print(f"y_data : {y_data.shape} {y_data.dtype}D")

x_data : torch.Size([5, 3]) torch.int32D
y_data : torch.Size([5, 1]) torch.int32D


[2] 데이터셋 생성

- [2-1] TensorDataset 활용 : Dataset의 sub_class

In [37]:
# TensorDataset 클래스 로딩
from torch.utils.data import TensorDataset

In [38]:
# Bundle the feature tensor and the target tensor into one TensorDataset
dataset = TensorDataset(x_data, y_data)
dataset

<torch.utils.data.dataset.TensorDataset at 0x25728b53f40>

In [39]:
# Tuple holding the tensors the dataset was built from
dataset.tensors

(tensor([[10, 20, 30],
         [20, 30, 40],
         [30, 40, 50],
         [40, 50, 60],
         [50, 60, 70]], dtype=torch.int32),
 tensor([[20],
         [30],
         [40],
         [50],
         [60]], dtype=torch.int32))

In [40]:
## Indexing the dataset invokes its __getitem__() method
dataset[0]

(tensor([10, 20, 30], dtype=torch.int32), tensor([20], dtype=torch.int32))

In [41]:
# Number of samples, obtained through the dataset's __len__() method
len(dataset)

5

- [2-2] 사용자 정의 데이터셋 생성

In [42]:
# Location of the iris CSV file (relative path)
file = '../data/iris.csv'
# header = None: the first line of the file is read as data, not as column names
irisDF = pd.read_csv(file, header = None)
# Name the five columns: four measurements followed by the class label
irisDF.columns = ['sepal_length', 'sepal_width', 'petal_length','petal_width','variety']
irisDF

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,variety
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,Iris-virginica
146,6.3,2.5,5.0,1.9,Iris-virginica
147,6.5,3.0,5.2,2.0,Iris-virginica
148,6.2,3.4,5.4,2.3,Iris-virginica


In [43]:
# Read only columns 0-3 (the numeric measurements) of the same file into an ndarray
irisNP = np.loadtxt(file, delimiter = ',', usecols = [0,1,2,3])
irisNP

array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2],
       [5. , 3.6, 1.4, 0.2],
       [5.4, 3.9, 1.7, 0.4],
       [4.6, 3.4, 1.4, 0.3],
       [5. , 3.4, 1.5, 0.2],
       [4.4, 2.9, 1.4, 0.2],
       [4.9, 3.1, 1.5, 0.1],
       [5.4, 3.7, 1.5, 0.2],
       [4.8, 3.4, 1.6, 0.2],
       [4.8, 3. , 1.4, 0.1],
       [4.3, 3. , 1.1, 0.1],
       [5.8, 4. , 1.2, 0.2],
       [5.7, 4.4, 1.5, 0.4],
       [5.4, 3.9, 1.3, 0.4],
       [5.1, 3.5, 1.4, 0.3],
       [5.7, 3.8, 1.7, 0.3],
       [5.1, 3.8, 1.5, 0.3],
       [5.4, 3.4, 1.7, 0.2],
       [5.1, 3.7, 1.5, 0.4],
       [4.6, 3.6, 1. , 0.2],
       [5.1, 3.3, 1.7, 0.5],
       [4.8, 3.4, 1.9, 0.2],
       [5. , 3. , 1.6, 0.2],
       [5. , 3.4, 1.6, 0.4],
       [5.2, 3.5, 1.5, 0.2],
       [5.2, 3.4, 1.4, 0.2],
       [4.7, 3.2, 1.6, 0.2],
       [4.8, 3.1, 1.6, 0.2],
       [5.4, 3.4, 1.5, 0.4],
       [5.2, 4.1, 1.5, 0.1],
       [5.5, 4.2, 1.4, 0.2],
       [4.9, 3

In [44]:
# Check the data types: the type objects themselves and their class names
type(irisDF), type(irisNP), irisDF.__class__.__name__, irisNP.__class__.__name__

(pandas.core.frame.DataFrame, numpy.ndarray, 'DataFrame', 'ndarray')

In [45]:
# Report whether irisDF's class name is 'DataFrame'
print('DF' if irisDF.__class__.__name__ == 'DataFrame' else '-----')

DF


In [46]:
# Test each object against both container types; one result per output line
print(
    *(
        isinstance(obj, kind)
        for obj, kind in (
            (irisDF, pd.DataFrame),
            (irisNP, pd.DataFrame),
            (irisDF, np.ndarray),
            (irisNP, np.ndarray),
        )
    ),
    sep='\n',
)

True
False
False
True


In [47]:
### User-defined Dataset class
# - Converts the incoming data to tensors

class DLDataset(Dataset):
    """Dataset that serves (feature, target) pairs from in-memory data.

    Both inputs are converted to tensors once, at construction time, with
    ``torch.tensor``; the dtype is inferred from the data (for example,
    float64 features stay float64).
    """

    def __init__(self, x_data, y_data):
        """Build the feature and target tensors.

        Args:
            x_data: Features as a pandas DataFrame or Series, a NumPy array,
                or anything else ``torch.tensor`` accepts.
            y_data: Targets; same accepted types as ``x_data``.

        Raises:
            ValueError: If features and targets do not contain the same
                number of samples.
        """
        super().__init__()

        # pandas objects -> ndarray; NumPy arrays (and anything else) are
        # handed to torch.tensor unchanged.
        x_data = self._to_array(x_data)
        y_data = self._to_array(y_data)

        # ndarray -> tensor
        self.feature = torch.tensor(x_data)
        self.target = torch.tensor(y_data)

        # Fail here rather than with an IndexError in the middle of iteration.
        n_feature, n_target = len(self.feature), len(self.target)
        if n_feature != n_target:
            raise ValueError(
                f"x_data has {n_feature} samples but y_data has {n_target}"
            )

    @staticmethod
    def _to_array(data):
        """Return the underlying ndarray of a pandas object; pass others through."""
        # A Series must be unwrapped as well: otherwise it reaches torch.tensor
        # as a generic sequence and is indexed by label instead of position.
        if isinstance(data, (pd.DataFrame, pd.Series)):
            return data.values
        return data

    def __len__(self):
        """Return the number of samples (size of the target's first dimension)."""
        return self.target.shape[0]

    def __getitem__(self, index):
        """Return the ``(feature, target)`` pair stored at ``index``."""
        return self.feature[index], self.target[index]


In [48]:
## Separate the feature columns from the label column
featureDF = irisDF.loc[:, irisDF.columns[:-1]]  # every column except the last
targetDF = irisDF.loc[:, irisDF.columns[-1]]  # the last column only

print(f"featureDF => {featureDF.shape} , {featureDF.ndim}D")
print(f"targetDF => {targetDF.shape} , {targetDF.ndim}D")

featureDF => (150, 4) , 2D
targetDF => (150,) , 1D


In [58]:
# Convert the object-typed target into an int-typed target
from sklearn.preprocessing import LabelEncoder

# Encode the class labels as integers
targetNP = LabelEncoder().fit_transform(targetDF)
# Reshape the 1-D result into a single column: (150,) -> (150, 1)
targetNP = targetNP.reshape(-1,1)
print(targetNP.shape, targetNP.ndim)

(150, 1) 2


In [59]:
# Create the dataset from the feature DataFrame and the encoded target array
my_dataset = DLDataset(featureDF, targetNP)

In [60]:
# First sample, returned as a (feature, target) pair of tensors
my_dataset[0]

(tensor([5.1000, 3.5000, 1.4000, 0.2000], dtype=torch.float64), tensor([0]))

In [62]:
# First row of the feature DataFrame (selected by position)
featureDF.iloc[0]

sepal_length    5.1
sepal_width     3.5
petal_length    1.4
petal_width     0.2
Name: 0, dtype: float64

In [63]:
# Target entry stored under index label 0
targetDF[0]

0