# utils.data Module

In [5]:
# !pip uninstall -y torch==1.9.0
# !pip install torch==1.7.0

Found existing installation: torch 1.9.0
Uninstalling torch-1.9.0:
  Successfully uninstalled torch-1.9.0
Collecting torch==1.7.0
  Using cached torch-1.7.0-cp37-cp37m-manylinux1_x86_64.whl (776.7 MB)
Collecting dataclasses
  Using cached dataclasses-0.6-py3-none-any.whl (14 kB)
Processing /home/jovyan/.cache/pip/wheels/56/b0/fe/4410d17b32f1f0c3cf54cdfb2bc04d7b4b8f4ae377e2229ba0/future-0.18.2-py3-none-any.whl
Installing collected packages: dataclasses, future, torch
Successfully installed dataclasses-0.6 future-0.18.2 torch-1.7.0


In [1]:
import torch
import time
import pandas as pd
from torch.utils import data
import numpy as np

## Declare Dataset Class

In [2]:
class NumDataset(data.Dataset):
    """Map-style dataset backed by the leading rows of a CSV file.

    The file is read with ``pd.read_csv`` and must contain ``data`` and
    ``label`` columns. Only the first ``number_length`` rows are kept.
    """

    def __init__(self,fname,number_length):
        """Load the CSV and cache the two columns as NumPy arrays.

        Args:
            fname: location of the CSV file, passed straight to ``pd.read_csv``.
            number_length: maximum number of leading rows to keep.
        """
        super().__init__()
        self.df=pd.read_csv(fname).head(number_length)
        # Requested cap, kept for reference; the real size is len(self.data).
        self.number_length=number_length

        # Bracket access cannot be shadowed by DataFrame attributes.
        self.data=self.df["data"].values
        self.label=self.df["label"].values

    def __len__(self):
        """Return the number of rows actually loaded.

        This can be smaller than ``number_length`` when the CSV is short.
        Reporting the real size keeps ``len()`` consistent with the indices
        that ``__getitem__`` can serve (e.g. for ``random_split``).
        """
        return len(self.data)

    def __getitem__(self, idx):
        """Return the ``(data, label)`` pair stored at position ``idx``."""
        return self.data[idx],self.label[idx]

In [3]:
'''The dataset class can now be instantiated and queried.'''
# Build a NumDataset from "data.csv" with number_length=6.
dataset=NumDataset("data.csv",6)

In [4]:
'''The dataset is iterable; each step calls __getitem__ in index order.'''
# Print every (data, label) pair together with the Python type of each value.
for d,l in dataset:
    print(f"{d}{type(d)}|{l} {type(l)}")

0.3314613320867217<class 'numpy.float64'>|0 <class 'numpy.int64'>
0.2032911268458797<class 'numpy.float64'>|1 <class 'numpy.int64'>
-1.515299754708565<class 'numpy.float64'>|2 <class 'numpy.int64'>
-0.9327277456926928<class 'numpy.float64'>|3 <class 'numpy.int64'>
-0.7552733088729586<class 'numpy.float64'>|4 <class 'numpy.int64'>
-0.12054968098895108<class 'numpy.float64'>|5 <class 'numpy.int64'>


## Split to Subsets

In [5]:
'''random_split partitions a Dataset into subsets; the requested lengths must add up to the original length.'''
L=len(dataset)
# Two subsets of L//3 items each; the third subset takes whatever remains.
# No generator is passed here, so which items land in each subset may differ
# between runs unless the global torch RNG is seeded elsewhere.
seta,setb,setc=data.dataset.random_split(dataset,[L//3,L//3,L-L//3*2])

In [6]:
# Print each (data, label) pair of the first subset, with the type of each value.
for d,l in seta:
    print(f"{d}{type(d)}|{l} {type(l)}")

-0.7552733088729586<class 'numpy.float64'>|4 <class 'numpy.int64'>
-0.9327277456926928<class 'numpy.float64'>|3 <class 'numpy.int64'>


In [7]:
# Print each (data, label) pair of the second subset, with the type of each value.
for d,l in setb:
    print(f"{d}{type(d)}|{l} {type(l)}")

-0.12054968098895108<class 'numpy.float64'>|5 <class 'numpy.int64'>
-1.515299754708565<class 'numpy.float64'>|2 <class 'numpy.int64'>


In [8]:
# Print each (data, label) pair of the third subset, with the type of each value.
for d,l in setc:
    print(f"{d}{type(d)}|{l} {type(l)}")

0.3314613320867217<class 'numpy.float64'>|0 <class 'numpy.int64'>
0.2032911268458797<class 'numpy.float64'>|1 <class 'numpy.int64'>


## Concat datasets

In [9]:
'''Join two datasets end to end.'''
# setd serves the items of seta followed by the items of setb.
setd=data.dataset.ConcatDataset((seta,setb))

In [10]:
# Print each (data, label) pair of the concatenated dataset, with the type of each value.
for d,l in setd:
    print(f"{d}{type(d)}|{l} {type(l)}")

-0.7552733088729586<class 'numpy.float64'>|4 <class 'numpy.int64'>
-0.9327277456926928<class 'numpy.float64'>|3 <class 'numpy.int64'>
-0.12054968098895108<class 'numpy.float64'>|5 <class 'numpy.int64'>
-1.515299754708565<class 'numpy.float64'>|2 <class 'numpy.int64'>


In [11]:
'''Concatenation is also supported directly with the + operator.'''
# Iterate the result of seta+setb and print each pair with its types.
for d,l in seta+setb:
    print(f"{d}{type(d)}|{l} {type(l)}")

-0.7552733088729586<class 'numpy.float64'>|4 <class 'numpy.int64'>
-0.9327277456926928<class 'numpy.float64'>|3 <class 'numpy.int64'>
-0.12054968098895108<class 'numpy.float64'>|5 <class 'numpy.int64'>
-1.515299754708565<class 'numpy.float64'>|2 <class 'numpy.int64'>


## Iterable Dataset

In [36]:
class NumIterDataset(data.IterableDataset):
    """Iterable-style dataset that streams ``(float, int)`` pairs from a CSV file.

    The first line of the file is treated as a header and skipped. Every
    following line is split on ``","`` into a float value and an int label.
    At most ``number_length`` pairs are produced per iteration.
    """

    def __init__(self,fname,number_length):
        """Remember where to read from and how many rows to emit.

        Args:
            fname: path of the CSV file; it is opened each time the dataset
                is iterated, not here.
            number_length: maximum number of data rows to yield.
        """
        # Zero-argument super() actually runs the parent initializer;
        # ``super(NumIterDataset)`` alone only builds an unbound super object.
        super().__init__()
        self.fname=fname
        self.number_length=number_length

    # __len__ is deliberately not defined: iteration works without it, and an
    # iterable dataset may even be endless.

    def __iter__(self):
        """Yield ``(float, int)`` pairs parsed from the file, header excluded.

        Raises:
            ValueError: if a data line does not split into exactly two
                fields or the fields cannot be converted.
        """
        # The context manager closes the file when the generator finishes,
        # breaks out early, or is garbage-collected.
        with open(self.fname) as file_itr:
            # Skip the header; the default keeps an empty file from raising
            # StopIteration inside the generator (a RuntimeError under PEP 479).
            next(file_itr,None)
            for idx,line in enumerate(file_itr):
                if idx>=self.number_length:
                    break
                d,l=line.split(",")
                yield float(d),int(l)
iter_dataset=NumIterDataset("data.csv",6)

In [37]:
# Print each (data, label) pair streamed from iter_dataset, with the type of each value.
for d,l in iter_dataset:
    print(f"{d}{type(d)}|{l} {type(l)}")

0.3314613320867217<class 'float'>|0 <class 'int'>
0.20329112684587966<class 'float'>|1 <class 'int'>
-1.515299754708565<class 'float'>|2 <class 'int'>
-0.9327277456926928<class 'float'>|3 <class 'int'>
-0.7552733088729586<class 'float'>|4 <class 'int'>
-0.12054968098895107<class 'float'>|5 <class 'int'>


**IterableDataset 無法用 random_split 做切分，因為它沒有定義長度（`__len__`），也不支援用 index 取值（`__getitem__`）**

## ChainDataset

In [38]:
'''Chain two datasets one after the other; the two may follow different rules.'''
# NOTE: this rebinds seta/setb (used above for the random_split subsets) and
# iter_dataset; both new datasets read the first 3 rows of the same file.
seta=NumIterDataset("data.csv",3)
setb=NumIterDataset("data.csv",3)
iter_dataset=data.dataset.ChainDataset((seta,setb))

In [39]:
# Print each (data, label) pair from the chained dataset, with the type of each value.
for d,l in iter_dataset:
    print(f"{d}{type(d)}|{l} {type(l)}")

0.3314613320867217<class 'float'>|0 <class 'int'>
0.20329112684587966<class 'float'>|1 <class 'int'>
-1.515299754708565<class 'float'>|2 <class 'int'>
0.3314613320867217<class 'float'>|0 <class 'int'>
0.20329112684587966<class 'float'>|1 <class 'int'>
-1.515299754708565<class 'float'>|2 <class 'int'>


## Tensor Dataset

In [25]:
'''Put already-prepared tensors straight into a dataset.'''
# Pairs 5 random values from torch.rand with the labels 0..4 from torch.arange.
t_dataset=data.dataset.TensorDataset(torch.rand(5),torch.arange(0,5))

In [26]:
'''Items come out as tensors.'''
# Print each (data, label) pair of t_dataset, with the type of each value.
for d,l in t_dataset:
    print(f"{d}{type(d)}|{l} {type(l)}")

0.46013665199279785<class 'torch.Tensor'>|0 <class 'torch.Tensor'>
0.4718555212020874<class 'torch.Tensor'>|1 <class 'torch.Tensor'>
0.9486867785453796<class 'torch.Tensor'>|2 <class 'torch.Tensor'>
0.7282485365867615<class 'torch.Tensor'>|3 <class 'torch.Tensor'>
0.6869804263114929<class 'torch.Tensor'>|4 <class 'torch.Tensor'>
