# TF2.0 Loader製作- Interaction

## 內容
* 基本loader
* map
* zip
* concatenate

In [1]:
import tensorflow as tf
from functools import partial
from pprint import pprint
import numpy as np
import matplotlib.pyplot as plt

**以tf.data.Dataset.from_generator這種loader類型示範**

先準備好一個 data generator：給定 `rng`，會依序 yield 0 到 `rng-1` 的整數

In [2]:
def f(rng):
    """Yield the integers 0, 1, ..., rng - 1 in ascending order.

    Args:
        rng: Exclusive upper bound of the sequence; passed straight to
            ``range``.

    Yields:
        Each integer in ``range(rng)``, one at a time.
    """
    yield from range(rng)

In [43]:
# A generator is already its own iterator, so it can be materialised directly.
list(f(3))

[0, 1, 2]

---

## Map

In [312]:
'''套用指定算式做前處理，例如一開始是檔案名稱的dataset，可依序讀檔，前處理'''
# map() applies a function to every element as preprocessing. A typical use is
# to start from a dataset of file names and read/preprocess each file in turn.
# Here the generator yields 0..5 (as float32) and every value is doubled.
d = (
    tf.data.Dataset
    .from_generator(f, args=[6], output_types=tf.float32)
    .map(lambda x: x * 2)
)

In [313]:
'''依指定算式做前處理'''
# Walk the mapped dataset and print every element it produces.
for x in d:
    print(x)

tf.Tensor(0.0, shape=(), dtype=float32)
tf.Tensor(2.0, shape=(), dtype=float32)
tf.Tensor(4.0, shape=(), dtype=float32)
tf.Tensor(6.0, shape=(), dtype=float32)
tf.Tensor(8.0, shape=(), dtype=float32)
tf.Tensor(10.0, shape=(), dtype=float32)


In [332]:
# Two otherwise identical 1000-element pipelines used for a speed comparison:
# d1 maps sequentially, d2 passes num_parallel_calls=AUTOTUNE to map().
d1 = (
    tf.data.Dataset
    .from_generator(f, args=[1000], output_types=tf.float32)
    .map(lambda x: x * 2)
)
d2 = (
    tf.data.Dataset
    .from_generator(f, args=[1000], output_types=tf.float32)
    .map(lambda x: x * 2, num_parallel_calls=tf.data.experimental.AUTOTUNE)
)

In [333]:
%%time
# Drain the sequential pipeline; the loop body is empty because only the
# cost of iterating matters here.
for element in d1:
    pass

CPU times: user 268 ms, sys: 77 ms, total: 344 ms
Wall time: 318 ms


In [334]:
%%time
'''使用num_parallel_calls做平行化會快一點'''
# Drain the pipeline that was built with num_parallel_calls; parallelising the
# map step is expected to be a little faster than the sequential version.
for element in d2:
    pass

CPU times: user 290 ms, sys: 93.1 ms, total: 383 ms
Wall time: 237 ms


---

## Zip

In [293]:
'''把兩個dataset的資料並聯，可用於data跟label相並'''
# zip() joins several datasets side by side, element by element -- e.g. to
# pair data with labels.
# NOTE(review): only d0 is shuffled here, so the resulting pairs are not
# index-aligned; for real data/label pairs, zip first and shuffle afterwards.
d0 = (
    tf.data.Dataset
    .from_generator(f, args=[3], output_types=tf.float32)
    .shuffle(3)
)
d1 = tf.data.Dataset.from_generator(f, args=[4], output_types=tf.float32)
d2 = tf.data.Dataset.zip((d0, d1))

In [294]:
%%time
'''數量對不上時，最後面的會被砍掉'''
# When the zipped datasets differ in length, iteration stops at the shorter
# one and the surplus trailing elements are dropped.
for x in d2:
    print(x)

(<tf.Tensor: shape=(), dtype=float32, numpy=2.0>, <tf.Tensor: shape=(), dtype=float32, numpy=0.0>)
(<tf.Tensor: shape=(), dtype=float32, numpy=1.0>, <tf.Tensor: shape=(), dtype=float32, numpy=1.0>)
(<tf.Tensor: shape=(), dtype=float32, numpy=0.0>, <tf.Tensor: shape=(), dtype=float32, numpy=2.0>)
CPU times: user 48.9 ms, sys: 72.1 ms, total: 121 ms
Wall time: 117 ms


---

## Concatenate

In [281]:
'''把兩個dataset接起來'''
# concatenate() appends the elements of one dataset after those of another.
d0 = (
    tf.data.Dataset
    .from_generator(f, args=[3], output_types=tf.float32)
    .shuffle(3)
)
d1 = tf.data.Dataset.from_generator(f, args=[4], output_types=tf.float32)
d2 = d0.concatenate(d1)

In [282]:
%%time
# Baseline for comparison: iterate the two source datasets back to back,
# d0 first and then d1.
for dataset in (d0, d1):
    for x in dataset:
        pprint(x)

<tf.Tensor: shape=(), dtype=float32, numpy=2.0>
<tf.Tensor: shape=(), dtype=float32, numpy=0.0>
<tf.Tensor: shape=(), dtype=float32, numpy=1.0>
<tf.Tensor: shape=(), dtype=float32, numpy=0.0>
<tf.Tensor: shape=(), dtype=float32, numpy=1.0>
<tf.Tensor: shape=(), dtype=float32, numpy=2.0>
<tf.Tensor: shape=(), dtype=float32, numpy=3.0>
CPU times: user 39 ms, sys: 88.6 ms, total: 128 ms
Wall time: 123 ms


In [283]:
%%time
'''接起來可能會稍慢但是很方便'''
# Iterating the concatenated dataset may be slightly slower than iterating the
# parts separately, but it is more convenient.
for x in d2:
    pprint(x)

<tf.Tensor: shape=(), dtype=float32, numpy=2.0>
<tf.Tensor: shape=(), dtype=float32, numpy=0.0>
<tf.Tensor: shape=(), dtype=float32, numpy=1.0>
<tf.Tensor: shape=(), dtype=float32, numpy=0.0>
<tf.Tensor: shape=(), dtype=float32, numpy=1.0>
<tf.Tensor: shape=(), dtype=float32, numpy=2.0>
<tf.Tensor: shape=(), dtype=float32, numpy=3.0>
CPU times: user 64.3 ms, sys: 68.7 ms, total: 133 ms
Wall time: 129 ms


In [284]:
'''不過有繼承cache所以如果有先pre-run過就沒差太多'''
# The concatenated dataset reuses the caches of its inputs, so once d0 and d1
# have been iterated (pre-run), iterating d2 costs about the same.
# NOTE(review): cache() sits after shuffle(), so per the tf.data docs the first
# shuffled order is what gets cached and replayed -- confirm this is intended.
d0 = (
    tf.data.Dataset
    .from_generator(f, args=[3], output_types=tf.float32)
    .shuffle(3)
    .cache()
)
d1 = (
    tf.data.Dataset
    .from_generator(f, args=[4], output_types=tf.float32)
    .cache()
)
d2 = d0.concatenate(d1)

In [285]:
%%time
# Iterate each cached source dataset once in full (d0, then d1); this is the
# pre-run pass that fills their caches.
for dataset in (d0, d1):
    for x in dataset:
        pprint(x)

<tf.Tensor: shape=(), dtype=float32, numpy=2.0>
<tf.Tensor: shape=(), dtype=float32, numpy=1.0>
<tf.Tensor: shape=(), dtype=float32, numpy=0.0>
<tf.Tensor: shape=(), dtype=float32, numpy=0.0>
<tf.Tensor: shape=(), dtype=float32, numpy=1.0>
<tf.Tensor: shape=(), dtype=float32, numpy=2.0>
<tf.Tensor: shape=(), dtype=float32, numpy=3.0>
CPU times: user 45.2 ms, sys: 85 ms, total: 130 ms
Wall time: 125 ms


In [286]:
%%time
# Iterate the concatenation of the two cached datasets and pretty-print
# every element.
for x in d2:
    pprint(x)

<tf.Tensor: shape=(), dtype=float32, numpy=2.0>
<tf.Tensor: shape=(), dtype=float32, numpy=1.0>
<tf.Tensor: shape=(), dtype=float32, numpy=0.0>
<tf.Tensor: shape=(), dtype=float32, numpy=0.0>
<tf.Tensor: shape=(), dtype=float32, numpy=1.0>
<tf.Tensor: shape=(), dtype=float32, numpy=2.0>
<tf.Tensor: shape=(), dtype=float32, numpy=3.0>
CPU times: user 19.1 ms, sys: 527 µs, total: 19.6 ms
Wall time: 16.1 ms
