In [1]:
import tensorflow as tf

In [2]:
import numpy as np
import numpy.random as rnd

In [5]:
dat = tf.data.Dataset.from_tensor_slices(
    tensors = {
        "name": ["a", "b", "c"],
        "id": [1, 2, 3],
        "x": rnd.rand(3, 2),
        "y": rnd.rand(3, 1),
    }
)

In [6]:
dat

<TensorSliceDataset shapes: {name: (), id: (), x: (2,), y: (1,)}, types: {name: tf.string, id: tf.int32, x: tf.float64, y: tf.float64}>

##### One shot iterator
No initializer, we can iterate once and at the end we get `tf.errors.OutOfRangeError`. Since we have no initializer, we can not reinitialize the iterator in the same session.

In [26]:
iterator = dat.make_one_shot_iterator()

In [27]:
row = iterator.get_next()

In [29]:
with tf.Session() as sess:
    
    for i in range(4):
        try:
            print(sess.run(row))
        except tf.errors.OutOfRangeError as exc:
            print(type(exc))

{'name': b'a', 'id': 1, 'x': array([0.30432704, 0.26048328]), 'y': array([0.0285219])}
{'name': b'b', 'id': 2, 'x': array([0.54543213, 0.25145685]), 'y': array([0.65539612])}
{'name': b'c', 'id': 3, 'x': array([0.12961328, 0.82632586]), 'y': array([0.81019914])}
<class 'tensorflow.python.framework.errors_impl.OutOfRangeError'>


#### Initializable iterator

In [30]:
iterator = dat.make_initializable_iterator()

In [31]:
row = iterator.get_next()

In [34]:
with tf.Session() as sess:
    sess.run(iterator.initializer)
    for i in range(4):
        try:
            print(sess.run(row))
        except tf.errors.OutOfRangeError as exc:
            print(type(exc))
            
    print("\nWe can reinitialize and iterate once again:\n")
    sess.run(iterator.initializer)
    for i in range(4):
        try:
            print(sess.run(row))
        except tf.errors.OutOfRangeError as exc:
            print(type(exc))

{'name': b'a', 'id': 1, 'x': array([0.30432704, 0.26048328]), 'y': array([0.0285219])}
{'name': b'b', 'id': 2, 'x': array([0.54543213, 0.25145685]), 'y': array([0.65539612])}
{'name': b'c', 'id': 3, 'x': array([0.12961328, 0.82632586]), 'y': array([0.81019914])}
<class 'tensorflow.python.framework.errors_impl.OutOfRangeError'>

We can reinitialize and iterate once again:

{'name': b'a', 'id': 1, 'x': array([0.30432704, 0.26048328]), 'y': array([0.0285219])}
{'name': b'b', 'id': 2, 'x': array([0.54543213, 0.25145685]), 'y': array([0.65539612])}
{'name': b'c', 'id': 3, 'x': array([0.12961328, 0.82632586]), 'y': array([0.81019914])}
<class 'tensorflow.python.framework.errors_impl.OutOfRangeError'>


If we define the dataset from variables or placeholders, we can change the data:

In [46]:
x_data = tf.placeholder(np.float32, [10, 2])
y_data = tf.placeholder(np.float32, [10, 1])
dat2 = tf.data.Dataset.from_tensor_slices({"x": x_data, "y": y_data})

In [47]:
iterator = dat2.make_initializable_iterator()

In [48]:
row = iterator.get_next()

In [49]:
with tf.Session() as sess:
    sess.run(iterator.initializer, 
        feed_dict={
            x_data: np.ones([10, 2]),
            y_data: np.ones([10, 1]),
        }
    )
    for i in range(4):
        print(sess.run(row)) 

{'x': array([1., 1.], dtype=float32), 'y': array([1.], dtype=float32)}
{'x': array([1., 1.], dtype=float32), 'y': array([1.], dtype=float32)}
{'x': array([1., 1.], dtype=float32), 'y': array([1.], dtype=float32)}
{'x': array([1., 1.], dtype=float32), 'y': array([1.], dtype=float32)}


#### Reinitializable iterator
Initializer is defined independently of the Datasets and Datasets can be assigned to it and changed in the session-run-time.

In [51]:
iterator = tf.data.Iterator.from_structure(
    output_types = dat.output_types,
    output_shapes = dat.output_shapes
)

In [57]:
row = iterator.get_next()

In [54]:
train_data_initializer = iterator.make_initializer(dat)

In [50]:
dat2 = tf.data.Dataset.from_tensor_slices(
    tensors = {
        "name": ["d", "e", "f"],
        "id": [4, 5, 6],
        "x": rnd.rand(3, 2),
        "y": rnd.rand(3, 1),
    }
)

In [55]:
test_data_initializer = iterator.make_initializer(dat2)

In [60]:
with tf.Session() as sess:
    sess.run(train_data_initializer)
    print(sess.run(row)) 
        
    sess.run(test_data_initializer)
    print(sess.run(row)) 
    
    sess.run(train_data_initializer)
    print(sess.run(row)) 

{'name': b'a', 'id': 1, 'x': array([0.30432704, 0.26048328]), 'y': array([0.0285219])}
{'name': b'd', 'id': 4, 'x': array([0.08073161, 0.78845586]), 'y': array([0.67837993])}
{'name': b'a', 'id': 1, 'x': array([0.30432704, 0.26048328]), 'y': array([0.0285219])}


#### Feedable iterator
switches between iterators. So unlike the preceding one, the state of each iterators is preserved after switching.

In [61]:
iterator1 = dat.make_one_shot_iterator()
iterator2 = dat.make_one_shot_iterator()

In [78]:
handle = tf.placeholder(dtype=tf.string, shape = [], name = "iterator_handle")
iterator = tf.data.Iterator.from_string_handle(
    string_handle=handle, 
    output_types=dat.output_types, 
    output_shapes=dat.output_shapes
)

In [79]:
row = iterator.get_next()

In [83]:
with tf.Session() as sess:
    h1 = sess.run(iterator1.string_handle())
    h2 = sess.run(iterator2.string_handle())
    print(h1, h2)
    
    print(sess.run(row), {handle: h1}) 
    

b"\n,/job:localhost/replica:0/task:0/device:CPU:0\x12\tlocalhost\x1a\x15_39_OneShotIterator_2 \xe4\xed\xb1\xef\xf7\xd8\x80\x89,*9class tensorflow::`anonymous namespace'::IteratorResource" b"\n,/job:localhost/replica:0/task:0/device:CPU:0\x12\tlocalhost\x1a\x15_40_OneShotIterator_3 \xe4\xed\xb1\xef\xf7\xd8\x80\x89,*9class tensorflow::`anonymous namespace'::IteratorResource"


InvalidArgumentError: You must feed a value for placeholder tensor 'iterator_handle' with dtype string
	 [[Node: iterator_handle = Placeholder[dtype=DT_STRING, shape=[], _device="/job:localhost/replica:0/task:0/device:CPU:0"]()]]

Caused by op 'iterator_handle', defined at:
  File "C:\~Lokalni data\WPy-3662\python-3.6.6.amd64\lib\runpy.py", line 193, in _run_module_as_main
    "__main__", mod_spec)
  File "C:\~Lokalni data\WPy-3662\python-3.6.6.amd64\lib\runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "C:\~Lokalni data\WPy-3662\python-3.6.6.amd64\lib\site-packages\ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "C:\~Lokalni data\WPy-3662\python-3.6.6.amd64\lib\site-packages\traitlets\config\application.py", line 658, in launch_instance
    app.start()
  File "C:\~Lokalni data\WPy-3662\python-3.6.6.amd64\lib\site-packages\ipykernel\kernelapp.py", line 497, in start
    self.io_loop.start()
  File "C:\~Lokalni data\WPy-3662\python-3.6.6.amd64\lib\site-packages\tornado\platform\asyncio.py", line 132, in start
    self.asyncio_loop.run_forever()
  File "C:\~Lokalni data\WPy-3662\python-3.6.6.amd64\lib\asyncio\base_events.py", line 422, in run_forever
    self._run_once()
  File "C:\~Lokalni data\WPy-3662\python-3.6.6.amd64\lib\asyncio\base_events.py", line 1434, in _run_once
    handle._run()
  File "C:\~Lokalni data\WPy-3662\python-3.6.6.amd64\lib\asyncio\events.py", line 145, in _run
    self._callback(*self._args)
  File "C:\~Lokalni data\WPy-3662\python-3.6.6.amd64\lib\site-packages\tornado\platform\asyncio.py", line 122, in _handle_events
    handler_func(fileobj, events)
  File "C:\~Lokalni data\WPy-3662\python-3.6.6.amd64\lib\site-packages\tornado\stack_context.py", line 300, in null_wrapper
    return fn(*args, **kwargs)
  File "C:\~Lokalni data\WPy-3662\python-3.6.6.amd64\lib\site-packages\zmq\eventloop\zmqstream.py", line 450, in _handle_events
    self._handle_recv()
  File "C:\~Lokalni data\WPy-3662\python-3.6.6.amd64\lib\site-packages\zmq\eventloop\zmqstream.py", line 480, in _handle_recv
    self._run_callback(callback, msg)
  File "C:\~Lokalni data\WPy-3662\python-3.6.6.amd64\lib\site-packages\zmq\eventloop\zmqstream.py", line 432, in _run_callback
    callback(*args, **kwargs)
  File "C:\~Lokalni data\WPy-3662\python-3.6.6.amd64\lib\site-packages\tornado\stack_context.py", line 300, in null_wrapper
    return fn(*args, **kwargs)
  File "C:\~Lokalni data\WPy-3662\python-3.6.6.amd64\lib\site-packages\ipykernel\kernelbase.py", line 283, in dispatcher
    return self.dispatch_shell(stream, msg)
  File "C:\~Lokalni data\WPy-3662\python-3.6.6.amd64\lib\site-packages\ipykernel\kernelbase.py", line 233, in dispatch_shell
    handler(stream, idents, msg)
  File "C:\~Lokalni data\WPy-3662\python-3.6.6.amd64\lib\site-packages\ipykernel\kernelbase.py", line 399, in execute_request
    user_expressions, allow_stdin)
  File "C:\~Lokalni data\WPy-3662\python-3.6.6.amd64\lib\site-packages\ipykernel\ipkernel.py", line 208, in do_execute
    res = shell.run_cell(code, store_history=store_history, silent=silent)
  File "C:\~Lokalni data\WPy-3662\python-3.6.6.amd64\lib\site-packages\ipykernel\zmqshell.py", line 537, in run_cell
    return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
  File "C:\~Lokalni data\WPy-3662\python-3.6.6.amd64\lib\site-packages\IPython\core\interactiveshell.py", line 2662, in run_cell
    raw_cell, store_history, silent, shell_futures)
  File "C:\~Lokalni data\WPy-3662\python-3.6.6.amd64\lib\site-packages\IPython\core\interactiveshell.py", line 2785, in _run_cell
    interactivity=interactivity, compiler=compiler, result=result)
  File "C:\~Lokalni data\WPy-3662\python-3.6.6.amd64\lib\site-packages\IPython\core\interactiveshell.py", line 2901, in run_ast_nodes
    if self.run_code(code, result):
  File "C:\~Lokalni data\WPy-3662\python-3.6.6.amd64\lib\site-packages\IPython\core\interactiveshell.py", line 2961, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-78-48e62ff1c6cd>", line 1, in <module>
    handle = tf.placeholder(tf.string, shape = [], name = "iterator_handle")
  File "C:\~Lokalni data\WPy-3662\python-3.6.6.amd64\lib\site-packages\tensorflow\python\ops\array_ops.py", line 1735, in placeholder
    return gen_array_ops.placeholder(dtype=dtype, shape=shape, name=name)
  File "C:\~Lokalni data\WPy-3662\python-3.6.6.amd64\lib\site-packages\tensorflow\python\ops\gen_array_ops.py", line 5928, in placeholder
    "Placeholder", dtype=dtype, shape=shape, name=name)
  File "C:\~Lokalni data\WPy-3662\python-3.6.6.amd64\lib\site-packages\tensorflow\python\framework\op_def_library.py", line 787, in _apply_op_helper
    op_def=op_def)
  File "C:\~Lokalni data\WPy-3662\python-3.6.6.amd64\lib\site-packages\tensorflow\python\util\deprecation.py", line 454, in new_func
    return func(*args, **kwargs)
  File "C:\~Lokalni data\WPy-3662\python-3.6.6.amd64\lib\site-packages\tensorflow\python\framework\ops.py", line 3155, in create_op
    op_def=op_def)
  File "C:\~Lokalni data\WPy-3662\python-3.6.6.amd64\lib\site-packages\tensorflow\python\framework\ops.py", line 1717, in __init__
    self._traceback = tf_stack.extract_stack()

InvalidArgumentError (see above for traceback): You must feed a value for placeholder tensor 'iterator_handle' with dtype string
	 [[Node: iterator_handle = Placeholder[dtype=DT_STRING, shape=[], _device="/job:localhost/replica:0/task:0/device:CPU:0"]()]]


## Operations
#### `repeat` and `batch`

In [110]:
iterator = dat.repeat().batch(2).make_one_shot_iterator()
batch = iterator.get_next()
batch

{'name': <tf.Tensor 'IteratorGetNext_16:1' shape=(?,) dtype=string>,
 'id': <tf.Tensor 'IteratorGetNext_16:0' shape=(?,) dtype=int32>,
 'x': <tf.Tensor 'IteratorGetNext_16:2' shape=(?, 2) dtype=float64>,
 'y': <tf.Tensor 'IteratorGetNext_16:3' shape=(?, 1) dtype=float64>}

In [111]:
with tf.Session() as sess:
    for i in range(2):
        print(sess.run(batch))

{'name': array([b'a', b'b'], dtype=object), 'id': array([1, 2]), 'x': array([[0.30432704, 0.26048328],
       [0.54543213, 0.25145685]]), 'y': array([[0.0285219 ],
       [0.65539612]])}
{'name': array([b'c', b'a'], dtype=object), 'id': array([3, 1]), 'x': array([[0.12961328, 0.82632586],
       [0.30432704, 0.26048328]]), 'y': array([[0.81019914],
       [0.0285219 ]])}


In [112]:
help(dat.shuffle)

Help on method shuffle in module tensorflow.python.data.ops.dataset_ops:

shuffle(buffer_size, seed=None, reshuffle_each_iteration=None) method of tensorflow.python.data.ops.dataset_ops.TensorSliceDataset instance
    Randomly shuffles the elements of this dataset.
    
    Args:
      buffer_size: A `tf.int64` scalar `tf.Tensor`, representing the
        number of elements from this dataset from which the new
        dataset will sample.
      seed: (Optional.) A `tf.int64` scalar `tf.Tensor`, representing the
        random seed that will be used to create the distribution. See
        @{tf.set_random_seed} for behavior.
      reshuffle_each_iteration: (Optional.) A boolean, which if true indicates
        that the dataset should be pseudorandomly reshuffled each time it is
        iterated over. (Defaults to `True`.)
    
    Returns:
      Dataset: A `Dataset`.



#### `shuffle`
The mysterious argument `buffer_size` means the following. Imagine the data is sequentioally read from disk or whatever. In order to properly shuffle the samples we would need to read them all in. This argument controls how many rows we first read from the `Dataset` and then shuffle. In particular, when `buffer_size = 1`, there is no shuffling.

In [126]:
iterator = dat.shuffle(buffer_size = 100).make_one_shot_iterator()
row = iterator.get_next()

In [136]:
with tf.Session() as sess:
    for i in range(3):
        print(sess.run(row))

{'name': b'b', 'id': 2, 'x': array([0.54543213, 0.25145685]), 'y': array([0.65539612])}
{'name': b'a', 'id': 1, 'x': array([0.30432704, 0.26048328]), 'y': array([0.0285219])}
{'name': b'c', 'id': 3, 'x': array([0.12961328, 0.82632586]), 'y': array([0.81019914])}


#### `map`

In [150]:
dat_transf = dat.map(lambda x: {"x_sum": tf.reduce_sum(x["x"]), "y_square": x["y"]**2})

In [151]:
iterator = dat_transf.make_one_shot_iterator()
row = iterator.get_next()

In [152]:
with tf.Session() as sess:
    for i in range(3):
        print(sess.run(row))

{'x_sum': 0.5648103279428714, 'y_square': array([0.0008135])}
{'x_sum': 0.7968889858971598, 'y_square': array([0.42954407])}
{'x_sum': 0.9559391372359453, 'y_square': array([0.65642265])}


In [154]:
1200 * 800 * 3 * 4 * 2

23040000

# Training with shuffling and batching

In [3]:
dataset = tf.data.Dataset.from_tensor_slices(np.arange(10))

In [4]:
train_dataset = dataset.shuffle(100).batch(4)

In [5]:
train_iterator = train_dataset.make_initializable_iterator()

In [6]:
train_batch = train_iterator.get_next()

In [11]:
with tf.Session() as sess:
    for epoch in range(3):
        sess.run(train_iterator.initializer)
    
        try:
            while True:
                print(sess.run(train_batch))
        except tf.errors.OutOfRangeError:
            print(f"End of epoch {epoch}.")

[9 1 3 0]
[8 5 6 2]
[4 7]
End of epoch 0.
[0 1 4 9]
[8 5 6 7]
[3 2]
End of epoch 1.
[3 5 0 4]
[2 6 9 1]
[7 8]
End of epoch 2.
