# tf.data Basic Mechanism

This notebook is inspired by the [official TensorFlow Guide](https://www.tensorflow.org/guide/data).

In [1]:
import tensorflow as tf
import numpy as np

## Extract Data from Memory

In [2]:
# Wrap an in-memory NumPy array in a Dataset: from_tensor_slices yields one
# element per entry along the first axis (here, six scalars).
x = np.array([8, 3, 0, 8, 2, 1])
dataset = tf.data.Dataset.from_tensor_slices(x)
# Bare last expression so the notebook displays the dataset's repr.
dataset

<TensorSliceDataset element_spec=TensorSpec(shape=(), dtype=tf.int64, name=None)>

In [7]:
# Show the NumPy array the dataset was built from.
x

array([8, 3, 0, 8, 2, 1])

The `Dataset` object is a Python iterable. This makes it possible to consume its
elements using a for loop:

In [3]:
# Iterate the dataset directly; every element arrives as a scalar tf.Tensor.
for element in dataset:
    print(element)

tf.Tensor(8, shape=(), dtype=int64)
tf.Tensor(3, shape=(), dtype=int64)
tf.Tensor(0, shape=(), dtype=int64)
tf.Tensor(8, shape=(), dtype=int64)
tf.Tensor(2, shape=(), dtype=int64)
tf.Tensor(1, shape=(), dtype=int64)


In [4]:
# Create an explicit Python iterator so elements can be pulled one at a time
# with next() instead of through a for loop.
it = iter(dataset)

In [6]:
# next() advances the iterator by one element on every call, so re-running
# this cell prints the following element rather than the first one again.
# .numpy() converts the eager tensor to its plain NumPy value for printing.
print(next(it).numpy())

3


## A More Realistic Example
A dataset with 200 samples and 10 features.

In [10]:
# 200 rows x 10 columns of uniform random floats. Slicing along the first
# axis turns this into 200 dataset elements, each a vector of length 10.
# NOTE(review): no seed is passed here, so the values presumably differ on
# every run -- set one if reproducible output matters.
x = tf.random.uniform([200, 10])
tf_dataset = tf.data.Dataset.from_tensor_slices(x)

In [11]:
# Preview only the head of the dataset: the element is printed before the
# index check, so this shows indices 0 through 11 (12 elements) and stops.
for index, row in enumerate(tf_dataset):
    print(row)
    if index >= 11:
        break

tf.Tensor(
[0.597754   0.554912   0.573985   0.71291053 0.9184737  0.39477813
 0.7358029  0.8952974  0.02176118 0.546155  ], shape=(10,), dtype=float32)
tf.Tensor(
[0.19919181 0.5963838  0.25596595 0.41930163 0.6867647  0.127123
 0.9119897  0.06392038 0.74711764 0.11514914], shape=(10,), dtype=float32)
tf.Tensor(
[0.17310655 0.61516094 0.79892015 0.5103539  0.27302158 0.0331192
 0.09656012 0.60670817 0.33081293 0.5840672 ], shape=(10,), dtype=float32)
tf.Tensor(
[0.22187018 0.98300123 0.12284315 0.49635708 0.3872652  0.54655313
 0.94389856 0.03722155 0.05015683 0.04314578], shape=(10,), dtype=float32)
tf.Tensor(
[0.45467985 0.75664186 0.69882774 0.24388158 0.84250057 0.84621716
 0.14228678 0.11126041 0.13774371 0.40562153], shape=(10,), dtype=float32)
tf.Tensor(
[0.9243996  0.8916346  0.26576865 0.78188336 0.49977922 0.23688781
 0.8735614  0.8708105  0.5708258  0.86342394], shape=(10,), dtype=float32)
tf.Tensor(
[0.3246554  0.89491856 0.9434403  0.11232805 0.3151616  0.0623275
 0.20481

In [13]:
# element_spec describes a single element of the dataset (not the whole
# dataset): here, one float32 vector of length 10.
tf_dataset.element_spec

TensorSpec(shape=(10,), dtype=tf.float32, name=None)

In [17]:
# One integer label per sample. For integer dtypes tf.random.uniform draws
# from [minval, maxval), so maxval=2 produces binary labels in {0, 1}.
y = tf.random.uniform([200], maxval=2, dtype=tf.int32)

In [18]:
# Display all 200 generated labels.
y

<tf.Tensor: shape=(200,), dtype=int32, numpy=
array([0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0,
       1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0,
       1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0,
       1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1,
       0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0,
       1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1,
       1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0,
       1, 1], dtype=int32)>

In [19]:
# Passing a tuple slices both tensors in lockstep along their first axis, so
# each dataset element is a (features, label) pair.
# Note: this rebinds `dataset`, replacing the dataset built earlier.
dataset = tf.data.Dataset.from_tensor_slices((x, y))

In [16]:
# For a dataset of pairs the spec is a tuple: one TensorSpec per component.
dataset.element_spec

(TensorSpec(shape=(10,), dtype=tf.float32, name=None),
 TensorSpec(shape=(), dtype=tf.int32, name=None))

In [20]:
# Preview the first 7 (features, label) pairs (indices 0 through 6).
# The loop variables are deliberately NOT named `x` / `y`: reusing those names
# would rebind the notebook-level tensors to the last element visited, so
# re-running the cell that builds `dataset` from (x, y) would silently slice
# a single sample instead of the full data.
for i, (features, label) in enumerate(dataset):
    print(f"x is {features}")
    print(f"y is {label}")
    if i > 5:
        break

x is [0.597754   0.554912   0.573985   0.71291053 0.9184737  0.39477813
 0.7358029  0.8952974  0.02176118 0.546155  ]
y is 0
x is [0.19919181 0.5963838  0.25596595 0.41930163 0.6867647  0.127123
 0.9119897  0.06392038 0.74711764 0.11514914]
y is 1
x is [0.17310655 0.61516094 0.79892015 0.5103539  0.27302158 0.0331192
 0.09656012 0.60670817 0.33081293 0.5840672 ]
y is 1
x is [0.22187018 0.98300123 0.12284315 0.49635708 0.3872652  0.54655313
 0.94389856 0.03722155 0.05015683 0.04314578]
y is 0
x is [0.45467985 0.75664186 0.69882774 0.24388158 0.84250057 0.84621716
 0.14228678 0.11126041 0.13774371 0.40562153]
y is 1
x is [0.9243996  0.8916346  0.26576865 0.78188336 0.49977922 0.23688781
 0.8735614  0.8708105  0.5708258  0.86342394]
y is 0
x is [0.3246554  0.89491856 0.9434403  0.11232805 0.3151616  0.0623275
 0.20481491 0.0499624  0.21202755 0.5303097 ]
y is 1


## Zip Two Datasets

In [21]:
# Features and labels are generated independently and each wrapped in its own
# dataset, ready to be combined element-wise.
x = tf.random.uniform([200, 10])
ds_x = tf.data.Dataset.from_tensor_slices(x)

y = tf.random.uniform([200], maxval=2, dtype=tf.int32)
ds_y = tf.data.Dataset.from_tensor_slices(y)

In [22]:
# Print the per-element spec of the feature dataset, then the label dataset.
for component_ds in (ds_x, ds_y):
    print(component_ds.element_spec)

TensorSpec(shape=(10,), dtype=tf.float32, name=None)
TensorSpec(shape=(), dtype=tf.int32, name=None)


In [23]:
# zip pairs the i-th element of ds_x with the i-th element of ds_y, producing
# a dataset of (features, label) tuples.
ds = tf.data.Dataset.zip((ds_x, ds_y))

In [24]:
# The zipped dataset's spec is a tuple holding the spec of each input dataset.
ds.element_spec

(TensorSpec(shape=(10,), dtype=tf.float32, name=None),
 TensorSpec(shape=(), dtype=tf.int32, name=None))

## Using `take` in a for Loop

In [25]:
# take(10) yields a dataset limited to the first 10 elements, which replaces
# the manual enumerate/break pattern for previewing data.
# The loop variables avoid the names `x` / `y` so the notebook-level tensors
# are not overwritten by the last element of the loop.
for _features, label in ds.take(10):
    print(label)

tf.Tensor(0, shape=(), dtype=int32)
tf.Tensor(1, shape=(), dtype=int32)
tf.Tensor(0, shape=(), dtype=int32)
tf.Tensor(0, shape=(), dtype=int32)
tf.Tensor(1, shape=(), dtype=int32)
tf.Tensor(1, shape=(), dtype=int32)
tf.Tensor(1, shape=(), dtype=int32)
tf.Tensor(0, shape=(), dtype=int32)
tf.Tensor(0, shape=(), dtype=int32)
tf.Tensor(0, shape=(), dtype=int32)


## Data Shuffling
https://www.tensorflow.org/api_docs/python/tf/data/Dataset#shuffle

In [26]:
# A small ordered tensor (0..4) so that the effect of shuffling is easy to see.
x = tf.range(5)

In [27]:
# Show the tensor's values before it is turned into a dataset.
x

<tf.Tensor: shape=(5,), dtype=int32, numpy=array([0, 1, 2, 3, 4], dtype=int32)>

In [35]:
# One scalar dataset element per entry of x.
ds = tf.data.Dataset.from_tensor_slices(x)

In [36]:
# Baseline: without shuffling, elements come out in their original order.
for element in ds:
    print(element)

tf.Tensor(0, shape=(), dtype=int32)
tf.Tensor(1, shape=(), dtype=int32)
tf.Tensor(2, shape=(), dtype=int32)
tf.Tensor(3, shape=(), dtype=int32)
tf.Tensor(4, shape=(), dtype=int32)


In [37]:
# shuffle fills a buffer of `buffer_size` elements and samples from it at
# random. A buffer as large as the dataset (5 here) gives a full uniform
# shuffle; a smaller buffer only shuffles locally.
# By default the order is reshuffled on every new iteration over the dataset.
ds = ds.shuffle(buffer_size=5)

In [32]:
# Walk the dataset once and print each element in the order it is produced.
for element in ds:
    print(element)

tf.Tensor(4, shape=(), dtype=int32)
tf.Tensor(2, shape=(), dtype=int32)
tf.Tensor(0, shape=(), dtype=int32)
tf.Tensor(3, shape=(), dtype=int32)
tf.Tensor(1, shape=(), dtype=int32)


## Repeat Dataset
https://www.tensorflow.org/api_docs/python/tf/data/Dataset#repeat

In [38]:
# repeat(2) replays the upstream dataset twice, so 5 elements become 10.
# The result is bound to a new name instead of rebinding `ds`: with
# `ds = ds.repeat(2)` every re-run of this cell would compound the repeat
# (10, then 20, then 40 elements, ...), making the output depend on how many
# times the cell was executed.
ds_repeated = ds.repeat(2)
for elem in ds_repeated:
    print(elem)

tf.Tensor(0, shape=(), dtype=int32)
tf.Tensor(4, shape=(), dtype=int32)
tf.Tensor(3, shape=(), dtype=int32)
tf.Tensor(1, shape=(), dtype=int32)
tf.Tensor(2, shape=(), dtype=int32)
tf.Tensor(1, shape=(), dtype=int32)
tf.Tensor(4, shape=(), dtype=int32)
tf.Tensor(0, shape=(), dtype=int32)
tf.Tensor(2, shape=(), dtype=int32)
tf.Tensor(3, shape=(), dtype=int32)


## Batching
https://www.tensorflow.org/api_docs/python/tf/data/Dataset#batch

In [39]:
# Dataset.range(100) yields the integers 0..99 as scalar elements.
dataset = tf.data.Dataset.range(100)


In [40]:
# Look at the first five elements only.
for element in dataset.take(5):
    print(element)

tf.Tensor(0, shape=(), dtype=int64)
tf.Tensor(1, shape=(), dtype=int64)
tf.Tensor(2, shape=(), dtype=int64)
tf.Tensor(3, shape=(), dtype=int64)
tf.Tensor(4, shape=(), dtype=int64)


In [41]:
# batch(4) groups every 4 consecutive elements into one tensor of shape (4,).
# NOTE: this rebinds `dataset`, so the cell is not idempotent -- running it a
# second time would batch the batches. Re-run the cell that creates the
# range dataset first if this cell needs to be executed again.
dataset = dataset.batch(4)

In [42]:
# take(5) limits the loop to the first five elements of `dataset`.
for batch in dataset.take(5):
    print(batch)

tf.Tensor([0 1 2 3], shape=(4,), dtype=int64)
tf.Tensor([4 5 6 7], shape=(4,), dtype=int64)
tf.Tensor([ 8  9 10 11], shape=(4,), dtype=int64)
tf.Tensor([12 13 14 15], shape=(4,), dtype=int64)
tf.Tensor([16 17 18 19], shape=(4,), dtype=int64)
