# Data Loader - Dataset Operations 2
這邊介紹更多操作，其中有一部分需要用到兩個以上的Dataset

## 內容
* 基本loader
* take/skip
* map
* zip
* concatenate

In [1]:
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt

**以tf.data.Dataset.from_generator這種loader類型示範**

先準備好一個data generator，會yield指定數量的數字

In [2]:
def f(rng):
    """Yield the integers 0, 1, ..., rng - 1 in ascending order."""
    yield from range(rng)

In [3]:
# Sanity check: drain the generator into a list to see the values it yields.
list(f(3))

[0, 1, 2]

---

## Take, Skip

In [4]:
# Wrap the generator f as a dataset: args=[10] is passed on to f, and the
# yielded integers are exposed as float32 tensors.
d = tf.data.Dataset.from_generator(
    f,
    args=[10],
    output_types=tf.float32,
)

In [5]:
%%time
'''Take可以擷取dataset一部分'''
# take(n) keeps only the first n elements of the dataset.
# The enumerate() index was never used, so iterate over the dataset directly.
for x in d.take(3):
    print(x)

<tf.Tensor: shape=(), dtype=float32, numpy=0.0>
<tf.Tensor: shape=(), dtype=float32, numpy=1.0>
<tf.Tensor: shape=(), dtype=float32, numpy=2.0>
CPU times: user 25.9 ms, sys: 25.2 ms, total: 51.1 ms
Wall time: 48.7 ms


In [6]:
%%time
# take(4) yields the first four elements of the dataset.
# The enumerate() index was never used, so iterate over the dataset directly.
for x in d.take(4):
    print(x)

<tf.Tensor: shape=(), dtype=float32, numpy=0.0>
<tf.Tensor: shape=(), dtype=float32, numpy=1.0>
<tf.Tensor: shape=(), dtype=float32, numpy=2.0>
<tf.Tensor: shape=(), dtype=float32, numpy=3.0>
CPU times: user 28.4 ms, sys: 21.8 ms, total: 50.2 ms
Wall time: 47.6 ms


In [7]:
%%time
'''Skip可以跳過dataset一部分'''
# skip(n) drops the first n elements and yields the remainder.
# The enumerate() index was never used, so iterate over the dataset directly.
for x in d.skip(5):
    print(x)

<tf.Tensor: shape=(), dtype=float32, numpy=5.0>
<tf.Tensor: shape=(), dtype=float32, numpy=6.0>
<tf.Tensor: shape=(), dtype=float32, numpy=7.0>
<tf.Tensor: shape=(), dtype=float32, numpy=8.0>
<tf.Tensor: shape=(), dtype=float32, numpy=9.0>
CPU times: user 23.4 ms, sys: 29.1 ms, total: 52.5 ms
Wall time: 49.6 ms


In [8]:
%%time
'''Skip可以跳過dataset一部分'''
# skip(6) drops the first six elements, leaving the last four of the ten.
# The enumerate() index was never used, so iterate over the dataset directly.
for x in d.skip(6):
    print(x)

<tf.Tensor: shape=(), dtype=float32, numpy=6.0>
<tf.Tensor: shape=(), dtype=float32, numpy=7.0>
<tf.Tensor: shape=(), dtype=float32, numpy=8.0>
<tf.Tensor: shape=(), dtype=float32, numpy=9.0>
CPU times: user 23.3 ms, sys: 35.5 ms, total: 58.7 ms
Wall time: 55.7 ms


---

## Map

In [9]:
'''套用指定算式做前處理，例如一開始是檔案名稱的dataset，可依序讀檔，前處理'''
# map() applies a function to every element as a preprocessing step; e.g. a
# dataset of file names could be mapped to loaded and preprocessed contents.
# Here each of the six generated values is simply doubled.
d = (
    tf.data.Dataset.from_generator(f, args=[6], output_types=tf.float32)
    .map(lambda x: x * 2)
)

In [10]:
'''依指定算式做前處理'''
# Iterating the mapped dataset yields the preprocessed (doubled) values.
# The enumerate() index was never used, so iterate over the dataset directly.
for x in d:
    print(x)

tf.Tensor(0.0, shape=(), dtype=float32)
tf.Tensor(2.0, shape=(), dtype=float32)
tf.Tensor(4.0, shape=(), dtype=float32)
tf.Tensor(6.0, shape=(), dtype=float32)
tf.Tensor(8.0, shape=(), dtype=float32)
tf.Tensor(10.0, shape=(), dtype=float32)


In [11]:
# Two otherwise identical 1000-element pipelines for a timing comparison:
# d1 uses map() with default settings, d2 lets tf.data pick the level of
# map parallelism via AUTOTUNE.
d1 = (
    tf.data.Dataset.from_generator(f, args=[1000], output_types=tf.float32)
    .map(lambda x: x * 2)
)
d2 = (
    tf.data.Dataset.from_generator(f, args=[1000], output_types=tf.float32)
    .map(lambda x: x * 2, num_parallel_calls=tf.data.experimental.AUTOTUNE)
)

In [12]:
%%time
# Baseline: drain d1 (map without num_parallel_calls) purely to time it.
for i,x in enumerate(d1):
    pass

CPU times: user 225 ms, sys: 84.8 ms, total: 310 ms
Wall time: 250 ms


In [13]:
%%time
'''使用num_parallel_calls做平行化會快一點'''
# Drain d2 (map with num_parallel_calls=AUTOTUNE) to time it against d1;
# parallelizing the map makes iteration a little faster.
# NOTE(review): the mapped function is a trivial x*2, so the gain is presumably
# small here — confirm with a heavier map function.
for i,x in enumerate(d2):
    pass

CPU times: user 225 ms, sys: 97.2 ms, total: 322 ms
Wall time: 204 ms


---

## Zip

In [14]:
'''把兩個dataset的資料並聯，可用於data跟label相並'''
# zip pairs up the elements of two datasets position by position, which is
# handy for combining data with labels. d0 has 3 (shuffled) elements and
# d1 has 4.
d0 = (
    tf.data.Dataset.from_generator(f, args=[3], output_types=tf.float32)
    .shuffle(3)
)
d1 = tf.data.Dataset.from_generator(f, args=[4], output_types=tf.float32)
d2 = tf.data.Dataset.zip((d0, d1))

In [15]:
%%time
'''數量對不上時，最後面的會被砍掉'''
# When the datasets differ in length, zip stops at the shorter one, so the
# surplus trailing element of the longer dataset is dropped.
# The enumerate() index was never used, so iterate over the dataset directly.
for x in d2:
    print(x)

(<tf.Tensor: shape=(), dtype=float32, numpy=1.0>, <tf.Tensor: shape=(), dtype=float32, numpy=0.0>)
(<tf.Tensor: shape=(), dtype=float32, numpy=2.0>, <tf.Tensor: shape=(), dtype=float32, numpy=1.0>)
(<tf.Tensor: shape=(), dtype=float32, numpy=0.0>, <tf.Tensor: shape=(), dtype=float32, numpy=2.0>)
CPU times: user 58.9 ms, sys: 79.1 ms, total: 138 ms
Wall time: 134 ms


---

## Concatenate

In [16]:
'''把兩個dataset接起來'''
# concatenate joins two datasets end to end: the elements of d1 follow the
# elements of d0.
d0 = (
    tf.data.Dataset.from_generator(f, args=[3], output_types=tf.float32)
    .shuffle(3)
)
d1 = tf.data.Dataset.from_generator(f, args=[4], output_types=tf.float32)
# Use the documented instance-method form instead of calling through the class.
d2 = d0.concatenate(d1)

In [17]:
%%time
# Reference run: iterate d0 and then d1 separately, to compare the output and
# timing with iterating the concatenated dataset.
for i,x in enumerate(d0):
    print(x)
for i,x in enumerate(d1):
    print(x)

<tf.Tensor: shape=(), dtype=float32, numpy=2.0>
<tf.Tensor: shape=(), dtype=float32, numpy=1.0>
<tf.Tensor: shape=(), dtype=float32, numpy=0.0>
<tf.Tensor: shape=(), dtype=float32, numpy=0.0>
<tf.Tensor: shape=(), dtype=float32, numpy=1.0>
<tf.Tensor: shape=(), dtype=float32, numpy=2.0>
<tf.Tensor: shape=(), dtype=float32, numpy=3.0>
CPU times: user 37.1 ms, sys: 88.2 ms, total: 125 ms
Wall time: 123 ms


In [18]:
%%time
'''接起來可能會稍慢但是很方便'''
# Iterating the concatenated dataset may be slightly slower than iterating the
# two parts separately, but it is more convenient.
for i,x in enumerate(d2):
    print(x)

<tf.Tensor: shape=(), dtype=float32, numpy=2.0>
<tf.Tensor: shape=(), dtype=float32, numpy=1.0>
<tf.Tensor: shape=(), dtype=float32, numpy=0.0>
<tf.Tensor: shape=(), dtype=float32, numpy=0.0>
<tf.Tensor: shape=(), dtype=float32, numpy=1.0>
<tf.Tensor: shape=(), dtype=float32, numpy=2.0>
<tf.Tensor: shape=(), dtype=float32, numpy=3.0>
CPU times: user 38.6 ms, sys: 85.2 ms, total: 124 ms
Wall time: 120 ms


In [19]:
'''不過有繼承cache所以如果有先pre-run過就沒差太多'''
# The concatenated dataset reuses the caches of its inputs, so once d0 and d1
# have each been iterated fully, iterating d2 costs little extra.
# Because cache() comes after shuffle(3), the shuffled order produced on the
# first full pass is what gets cached and replayed afterwards.
d0 = (
    tf.data.Dataset.from_generator(f, args=[3], output_types=tf.float32)
    .shuffle(3)
    .cache()
)
d1 = (
    tf.data.Dataset.from_generator(f, args=[4], output_types=tf.float32)
    .cache()
)
# Use the documented instance-method form instead of calling through the class.
d2 = d0.concatenate(d1)

In [20]:
%%time
'''兩個分開跑'''
# Run the two cached datasets separately; this is the first full pass over
# each of them (the "pre-run" that later iterations benefit from).
for i,x in enumerate(d0):
    print(x)
for i,x in enumerate(d1):
    print(x)

<tf.Tensor: shape=(), dtype=float32, numpy=1.0>
<tf.Tensor: shape=(), dtype=float32, numpy=2.0>
<tf.Tensor: shape=(), dtype=float32, numpy=0.0>
<tf.Tensor: shape=(), dtype=float32, numpy=0.0>
<tf.Tensor: shape=(), dtype=float32, numpy=1.0>
<tf.Tensor: shape=(), dtype=float32, numpy=2.0>
<tf.Tensor: shape=(), dtype=float32, numpy=3.0>
CPU times: user 37.9 ms, sys: 88.9 ms, total: 127 ms
Wall time: 124 ms


In [21]:
%%time
'''跑concate起來的，因為兩個都有pre-run過所以很快'''
# Iterate the concatenated dataset; it is fast because both inputs were
# already pre-run (and therefore cached) before this loop.
for i,x in enumerate(d2):
    print(x)

<tf.Tensor: shape=(), dtype=float32, numpy=1.0>
<tf.Tensor: shape=(), dtype=float32, numpy=2.0>
<tf.Tensor: shape=(), dtype=float32, numpy=0.0>
<tf.Tensor: shape=(), dtype=float32, numpy=0.0>
<tf.Tensor: shape=(), dtype=float32, numpy=1.0>
<tf.Tensor: shape=(), dtype=float32, numpy=2.0>
<tf.Tensor: shape=(), dtype=float32, numpy=3.0>
CPU times: user 13.5 ms, sys: 3.48 ms, total: 17 ms
Wall time: 13.7 ms
