### User Defined Dataset
- PyTorch에서 딥러닝 시 대량의 데이터 사용에 따른 부하(H/W, S/W) 및 많은 시간 소요에 대한 해결책으로 제시
- 대량 데이터셋 전용 처리 모듈 제공
- Dataset과 DataLoader
    - Dataset: 데이터 전처리, 텐서화 등의 작업 진행
    - DataLoader: Dataset instance를 사용해서 배치크기 만큼 데이터를 추출

In [40]:
import pandas as pd
import numpy as np

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset

# LabelEncoding
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

In [2]:
# Seed PyTorch's random number generator with a fixed value.
torch.manual_seed(46)

# Use CUDA when torch reports it as available; otherwise stay on the CPU.
if torch.cuda.is_available():
    DEVICE = 'cuda'
else:
    DEVICE = 'cpu'
print(f"device: {DEVICE}")

device: cpu


In [3]:
# Relative path to the iris CSV file used as the data source in this notebook.
FILE_PATH = '../../EX_ML/data/iris.csv'

In [4]:
# Read the CSV at FILE_PATH into a DataFrame and preview its first rows.
iris_df = pd.read_csv(FILE_PATH)
iris_df.head()

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width,variety
0,5.1,3.5,1.4,0.2,Setosa
1,4.9,3.0,1.4,0.2,Setosa
2,4.7,3.2,1.3,0.2,Setosa
3,4.6,3.1,1.5,0.2,Setosa
4,5.0,3.6,1.4,0.2,Setosa


In [5]:
# Replace the 'variety' column with its integer label encoding, then show
# how many rows fall into each encoded class.
encoder = LabelEncoder()
iris_df['variety'] = encoder.fit_transform(iris_df['variety'])
iris_df['variety'].value_counts()

variety
0    50
1    50
2    50
Name: count, dtype: int64

- 사용자 정의 데이터셋 클래스 생성
---
- 클래스 목적: 학습용 데이터셋 텐서화 및 전처리 진행
- 부모 클래스: torch.utils.data.Dataset
- 클래스 이름: CustomDataSet
- 매개변수: feature_df, label_df
---

In [6]:
class CustomDataSet(Dataset):
    """Dataset built from a feature DataFrame and a label DataFrame.

    Indexing an instance returns one row from each frame as float tensors.
    """

    def __init__(self, featuer_df, label_df):
        """Store both frames and record the feature frame's dimensions.

        Args:
            featuer_df: DataFrame holding the features (the parameter
                spelling is kept as-is for callers passing it by keyword).
            label_df: DataFrame holding the labels; it is indexed with the
                same positional row index as ``featuer_df``.
        """
        super().__init__()
        self.feature_df = featuer_df
        self.label_df = label_df

        shape = featuer_df.shape
        self.n_rows = shape[0]
        self.n_features = shape[1]

    def __len__(self):
        """Return the number of rows in the feature frame."""
        return self.n_rows

    def __getitem__(self, idx):
        """Return ``(feature_ts, label_ts)`` for the row at position ``idx``."""
        feature_values = self.feature_df.iloc[idx].values
        feature_ts = torch.FloatTensor(feature_values)

        label_values = self.label_df.iloc[idx].values
        label_ts = torch.FloatTensor(label_values)

        return feature_ts, label_ts

- 사용자 정의 데이터셋 클래스 생성
---
- 클래스 기능: 파일 기반 데이터셋. 파일을 읽어 마지막 컬럼을 라벨, 나머지 컬럼을 피처로 분리하며 그 외 동작은 위의 CustomDataSet과 동일
- 부모 클래스: torch.utils.data.Dataset
- 클래스 이름: CustomFlieDataSet
- 매개변수: file_path
---

In [7]:
# Convert a data file into a DataFrame, picking the reader by file extension.
# NOTE: the function name keeps its original spelling ("conver...") because
# other code in this file calls it by that name.
def converDataFrame(file_path, exit_header=0):
    """Load ``file_path`` into a pandas DataFrame based on its extension.

    Args:
        file_path: Path to the data file (``str`` or path-like).
        exit_header: Row number to use as the header. Forwarded as
            ``header`` to the CSV, Excel and delimited-text readers. It is
            not used for JSON because ``pd.read_json`` has no ``header``
            parameter.

    Returns:
        The DataFrame produced by ``pd.read_csv`` (``csv``),
        ``pd.read_json`` (``json``), ``pd.read_excel`` (``xlsx``/``xls``)
        or ``pd.read_table`` (any other extension, or none).
    """
    # Lower-case the extension so "DATA.CSV" is dispatched like "data.csv".
    ext = str(file_path).rsplit('.', 1)[-1].lower()

    if ext == 'csv':
        return pd.read_csv(file_path, header=exit_header)
    if ext == 'json':
        # Passing header= to read_json raises TypeError, so it is omitted.
        return pd.read_json(file_path)
    if ext in ('xlsx', 'xls'):
        return pd.read_excel(file_path, header=exit_header)
    return pd.read_table(file_path, header=exit_header)

In [51]:
class CustomFlieDataSet(Dataset):
    """File-backed dataset: the last column is the label, the rest are features.

    The file is loaded with ``converDataFrame``. When the label column is not
    numeric it is one-hot encoded, and ``__getitem__`` returns the one-hot row
    as the label tensor; numeric labels are returned as they are.

    Attributes:
        feature_df: DataFrame of every column except the last.
        label_df: Single-column DataFrame holding the last column.
        label_onehot_df: Output of ``OneHotEncoder.transform`` on ``label_df``
            for non-numeric labels, otherwise ``None``.
        n_rows: Number of rows in ``feature_df``.
        n_features: Number of columns in ``feature_df``.
    """

    def __init__(self, file_path):
        """Load ``file_path`` and split it into feature and label frames.

        Args:
            file_path: Path of the data file handed to ``converDataFrame``.
        """
        super().__init__()
        data_df = converDataFrame(file_path)

        self.feature_df = data_df[data_df.columns[:-1]]
        self.label_df = data_df[[data_df.columns[-1]]]

        # Always define the attribute so numeric-label files do not leave it
        # missing on the instance.
        self.label_onehot_df = None

        # Any non-numeric label dtype (object, string, category, ...) cannot
        # be turned into a float tensor directly, so encode it.
        label_series = self.label_df.iloc[:, 0]
        if not pd.api.types.is_numeric_dtype(label_series):
            encoder = OneHotEncoder()
            encoder.fit(self.label_df)
            self.label_onehot_df = encoder.transform(self.label_df)

        self.n_rows = self.feature_df.shape[0]
        self.n_features = self.feature_df.shape[1]

    def __len__(self):
        """Return the number of rows in the feature frame."""
        return self.n_rows

    def __getitem__(self, idx):
        """Return ``(feature_ts, label_ts)`` float tensors for row ``idx``.

        ``label_ts`` is the one-hot row when the labels were encoded in
        ``__init__``; otherwise it is the raw label value(s) of the row.
        """
        feature_ts = torch.FloatTensor(self.feature_df.iloc[idx].values)

        if self.label_onehot_df is not None:
            onehot_row = self.label_onehot_df[idx]
            # The encoder may hand back a sparse matrix; densify the row
            # before building the tensor.
            if hasattr(onehot_row, "toarray"):
                onehot_row = onehot_row.toarray()
            label_ts = torch.FloatTensor(onehot_row).reshape(-1)
        else:
            label_ts = torch.FloatTensor(self.label_df.iloc[idx].values)

        return feature_ts, label_ts

- create dataset instance
---

In [52]:
# Build the file-backed dataset directly from the CSV path.
file_ds = CustomFlieDataSet(FILE_PATH)

In [53]:
# Preview the first rows of the feature frame split off by the dataset.
file_ds.feature_df.head()

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [58]:
file_ds.label_onehot_df      # TODO: needs a bit more study to fully understand

<150x3 sparse matrix of type '<class 'numpy.float64'>'
	with 150 stored elements in Compressed Sparse Row format>

In [13]:
# Split the frame: every column except the last is a feature, and the last
# column is the label. The double brackets keep the label as a DataFrame.
feature_df = iris_df[iris_df.columns[:-1]]
label_df = iris_df[[iris_df.columns[-1]]]

# Report the shape and dimensionality of both frames.
print(f"feature & label shape: {feature_df.shape}, {label_df.shape}")
print(f"feature & label ndim: {feature_df.ndim}D, {label_df.ndim}D")

feature & label shape: (150, 4), (150, 1)
feature & label ndim: 2D, 2D


In [14]:
# Wrap the iris feature and label frames in the custom Dataset.
iris_ds = CustomDataSet(feature_df, label_df)

In [15]:
# Preview the first rows of the feature frame held by the dataset.
iris_ds.feature_df.head()

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [16]:
# Preview the first rows of the label frame held by the dataset.
iris_ds.label_df.head()

Unnamed: 0,variety
0,0
1,0
2,0
3,0
4,0


In [17]:
# Index the dataset (calls __getitem__): a (feature tensor, label tensor) pair.
iris_ds[0]

(tensor([5.1000, 3.5000, 1.4000, 0.2000]), tensor([0.]))

- connect DataLoader
- create DataLoader instance
---

In [18]:
# DataLoader takes a Dataset instance; batch_size defaults to 1.
# It accepts many other hyperparameters that are left at their defaults here.
iris_dl = DataLoader(iris_ds, batch_size=32)    # author's note: 32 is a commonly used batch size

In [19]:
# Look at the first batch only, to check the batched tensor shapes.
for data_ts, lable_ts in iris_dl:
    print(data_ts.shape, lable_ts.shape)
    break

torch.Size([32, 4]) torch.Size([32, 1])
