# **自定義資料集（Custom Dataset）**
訓練神經網路時，資料量往往相當龐大，無法一次全部讀進記憶體；因此需要建立 Dataset，將資料拆分成數個較小的批次（batch），分批讀取並送入模型訓練。

## 本章節內容大綱
* ### [tf.data.Dataset](#Dataset)
    * #### from_tensors
    * #### from_tensor_slices
    * #### from_generator
* ### [Dataset operation](#Operation)
---

## 匯入套件

In [None]:
import tensorflow as tf

<a name="Dataset"></a>
## tf.data.Dataset

In [None]:
# Toy data shared by the examples below: a (5, 2) tensor of uniform random
# features and a length-5 vector of int64 labels drawn with maxval=5.
value = tf.random.uniform((5, 2))
target = tf.random.uniform((5,), maxval=5, dtype=tf.int64)

* ### from_tensors

In [None]:
# from_tensors treats the whole (value, target) tuple as ONE dataset element,
# so iterating dataset1 yields a single item holding all 5 rows.
dataset1 = tf.data.Dataset.from_tensors((value, target))

In [None]:
dataset1

In [None]:
it = iter(dataset1)
print(next(it))

In [None]:
for idx, elem in enumerate(dataset1):
    print(f'{idx}. {elem}')

In [None]:
list(dataset1.as_numpy_iterator())

* ### from_tensor_slices

In [None]:
# from_tensor_slices slices along the first axis, so dataset2 yields one
# (value[i], target[i]) pair per row -- 5 elements for the tensors above.
dataset2 = tf.data.Dataset.from_tensor_slices((value, target))

In [None]:
it = iter(dataset2)
print('0.', next(it))
print('1.', next(it))

In [None]:
for idx, elem in enumerate(dataset2):
    print(f'{idx}. {elem}')

In [None]:
list(dataset2.as_numpy_iterator())

* ### from_generator

In [None]:
# generator function
def sample(value, target):
    """Yield ``(features, label)`` pairs, one per row of ``value``.

    Args:
        value: Row-iterable 2-D array of features (``from_generator`` hands
            its ``args`` to the generator as NumPy arrays).
        target: 1-D sequence of labels aligned with the rows of ``value``.

    Yields:
        Tuples ``(value[i], target[i])`` for every row index ``i``.
    """
    # Iterate over the data itself rather than a hard-coded count of 5, so the
    # generator neither overruns shorter inputs nor truncates longer ones.
    for features, label in zip(value, target):
        yield (features, label)


# Describe each yielded element with output_signature (output_types is
# deprecated) and take shapes/dtypes from the source tensors, so the labels
# are declared as int64 like `target` rather than the mismatched tf.uint8.
dataset3 = tf.data.Dataset.from_generator(
    sample,
    args=(value, target),
    output_signature=(
        tf.TensorSpec(shape=value.shape[1:], dtype=value.dtype),
        tf.TensorSpec(shape=(), dtype=target.dtype),
    ),
)

In [None]:
it = iter(dataset3)
print('0.', next(it))
print('1.', next(it))

In [None]:
list(dataset3.as_numpy_iterator())

* ### ZipDataset

In [None]:
x_dataset = tf.data.Dataset.from_tensor_slices(value)
y_dataset = tf.data.Dataset.from_tensor_slices(target)

In [None]:
dataset4 = tf.data.Dataset.zip((x_dataset, y_dataset))

In [None]:
# Each element of the zipped dataset is an (x, y) pair holding one sample from
# each source dataset; enumerate supplies the running index (iterating the
# dataset directly would unpack the pair itself into idx/elem).
for idx, (x, y) in enumerate(dataset4):
    print(f'{idx}. {x.numpy()}, {y.numpy()}')

In [None]:
list(dataset4.as_numpy_iterator())

<a name="Operation"></a>
## Dataset 物件的操作

In [None]:
random_v = tf.random.normal((10, 4))
dataset = tf.data.Dataset.from_tensor_slices(random_v)

In [None]:
for idx, elem in enumerate(dataset):
    print(f'{idx}. {elem.numpy()}')

* ### shuffle

In [None]:
# buffer_size=3 is smaller than the 10-element dataset, so elements are only
# shuffled within a sliding 3-element buffer (a local, not uniform, shuffle);
# reshuffle_each_iteration=True draws a new order on every pass.
shuffle_dataset = dataset.shuffle(buffer_size=3, reshuffle_each_iteration=True)

In [None]:
shuffle_dataset

In [None]:
for idx, elem in enumerate(shuffle_dataset):
    print(f'{idx}. {elem.numpy()}')

* ### batch

In [None]:
# Group consecutive elements into batches of 2, so each element becomes a
# (2, 4) tensor instead of a single length-4 row.
batch_dataset = dataset.batch(batch_size=2)

In [None]:
for idx, elem in enumerate(batch_dataset):
    print(f'{idx}. {elem.numpy()}')

* ### repeat

In [None]:
# Repeat the dataset twice end-to-end, giving 20 elements from the 10 rows.
repeat_dataset = dataset.repeat(count=2)

In [None]:
for idx, elem in enumerate(repeat_dataset):
    print(f'{idx}. {elem.numpy()}')

* ### take

In [None]:
# Keep only the first 5 elements of the dataset.
take_dataset = dataset.take(count=5)

In [None]:
for idx, elem in enumerate(take_dataset):
    print(f'{idx}. {elem.numpy()}')

* ### prefetch：在模型訓練目前這批資料的同時，於背景預先讀取並轉換下一批資料。

In [None]:
import time
class ArtificialDataset(tf.data.Dataset):
    """Synthetic dataset that imitates slow file reads with ``time.sleep``.

    Calling ``ArtificialDataset(n)`` does not build an instance of this class:
    ``__new__`` returns the dataset produced by
    ``tf.data.Dataset.from_generator``, declared as int64 elements of shape
    ``(1,)``, one per sample index.
    """

    def __new__(cls, num_samples=3):
        # Each element is a one-item vector carrying the sample index.
        element_spec = tf.TensorSpec(shape=(1,), dtype=tf.int64)
        return tf.data.Dataset.from_generator(
            cls._generator,
            args=(num_samples,),
            output_signature=element_spec,
        )

    def _generator(num_samples):
        # No ``self``: the function is looked up on the class and handed to
        # ``from_generator`` as a plain callable.
        # Simulated cost of opening the file.
        time.sleep(0.03)

        for index in range(num_samples):
            # Simulated cost of reading one record (line) from the file.
            time.sleep(0.015)

            yield (index,)

In [None]:
# Simulate the running time of a training loop
def benchmark(dataset, num_epochs=2):
    """Iterate ``dataset`` for ``num_epochs`` passes and print the elapsed time.

    Every element costs a fixed 10 ms sleep standing in for one training step.
    Nothing is returned; the timing is written to stdout.
    """
    began = time.perf_counter()
    for _ in range(num_epochs):
        for _ in dataset:
            # Stand-in for performing a training step
            time.sleep(0.01)
    elapsed = time.perf_counter() - began
    print("Execution time:", elapsed)

In [None]:
benchmark(ArtificialDataset())

![](https://i.imgur.com/9JGnltT.png)

In [None]:
benchmark(
    ArtificialDataset()
    .prefetch(tf.data.AUTOTUNE)
)

![](https://i.imgur.com/s2OWzTP.png)

* ### cache：將第一次讀出的資料保留在快取（預設為記憶體）中，後續的 epoch 可直接重複使用，不必重新讀取。

In [None]:
benchmark(
    ArtificialDataset()
    .cache()
)

![](https://i.imgur.com/CIKqA5l.png)