In [0]:
import tensorflow as tf

In [0]:
dataset1=tf.data.Dataset.from_tensor_slices(tf.range(10))
# Emits data of 10, 11, 12, 13, 14, (One element at a time)

In [0]:
dataset2=tf.data.Dataset.from_tensor_slices((tf.range(30,45,3),tf.range(10,15)))
# Emits data of (30, 60), (33, 62), (36, 64), (39, 66), (42, 68)
# Emits one tuple at a time

In [4]:
dataset3 = tf.data.Dataset.from_tensor_slices((tf.range(10), tf.range(5)))
# Dataset not possible as zeroth dimenion is different at 10 and 5

ValueError: ignored

Just like from_tensor_slices, this method also accepts individual (or multiple) Numpy (or Tensors) objects. But this method doesn’t support batching of data, i.e all the data will be given out instantly.

In [0]:
dataset1=tf.data.Dataset.from_tensors(tf.range(10))
# Emits data of [10, 11, 12, 13, 14]
# Holds entire list as one element

In [0]:
dataset2 = tf.data.Dataset.from_tensors((tf.range(30, 45, 3), tf.range(60, 70, 2)))
# Emits data of ([30, 33, 36, 39, 42], [60, 62, 64, 66, 68])
# Holds entire tuple as one element

In [0]:
dataset3 = tf.data.Dataset.from_tensors((tf.range(10), tf.range(5)))
# Possible with from_tensors, regardless of zeroth dimension mismatch of constituent elements.
# Emits data of ([1, 2, 3, 4, 5, 6, 7, 8, 9], [0, 1, 2, 3, 4])
# Holds entire tuple as one element

In this method, a generator function is passed as input. This method is useful in cases where you wish to generate the data at runtime and as such no raw data exists with you or in scenarios where your training data is extremely huge and it is not possible to store them in your disk.

In [8]:
def generator(sequence_type):
  if sequence_type==1:
    for i in range(5):
      yield 10+i
  elif sequence_type==2:
    for i in range(5):
      yield (30+3*i,60+2*i)
  else:
    for i in range(1,4):
      yield (i,['Hi']*i)
      
dataset1=tf.data.Dataset.from_generator(generator,(tf.int32),args=([1]))
# Emits data of 10, 11, 12, 13, 14, (One element at a time)

Instructions for updating:
tf.py_func is deprecated in TF V2. Instead, use
    tf.py_function, which takes a python function which manipulates tf eager
    tensors instead of numpy arrays. It's easy to convert a tf eager tensor to
    an ndarray (just call tensor.numpy()) but having access to eager tensors
    means `tf.py_function`s can use accelerators such as GPUs as well as
    being differentiable using a gradient tape.
    


In [0]:
dataset2=tf.data.Dataset.from_generator(generator,(tf.int32,tf.int32),args=([2]))
# Emits data of (30, 60), (33, 62), (36, 64), (39, 66), (42, 68)
# Emits one tuple at a time

In [0]:
dataset3=tf.data.Dataset.from_generator(generator,(tf.int32,tf.string),args=([3]))
# Emits data of (1, ['Hi']), (2, ['Hi', 'Hi']), (3, ['Hi', 'Hi', 'Hi'])
# Emits one tuple at a time

In [0]:
dataset = tf.data.Dataset.from_tensor_slices(tf.range(10))
# Create a dataset with data of [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

dataset = dataset.repeat(2)
# Duplicate the dataset
# Data will be [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

dataset = dataset.shuffle(5)
# Shuffle the dataset
# Assumed shuffling: [3, 0, 7, 9, 4, 2, 5, 0, 1, 7, 5, 9, 4, 6, 2, 8, 6, 8, 1, 3]

def map_fn(x):
    return x * 3

dataset = dataset.map(map_fn)
# Same as dataset = dataset.map(lambda x: x * 3)
# Multiply each element with 3 using map transformation
# Dataset: [9, 0, 21, 27, 12, 6, 15, 0, 3, 21, 15, 27, 12, 18, 6, 24, 18, 24, 3, 9]

def filter_fn(x):
    return tf.reshape(tf.not_equal(x % 5, 1), [])

dataset = dataset.filter(filter_fn)
# Same as dataset = dataset.filter(lambda x: tf.reshape(tf.not_equal(x % 5, 1), []))
# Filter out all those elements whose modulus 5 returns 1
# Dataset: [9, 0, 27, 12, 15, 0, 3, 15, 27, 12, 18, 24, 18, 24, 3, 9]

dataset = dataset.batch(4)
# Batch at every 4 elements
# Dataset: [9, 0, 27, 12], [15, 0, 3, 15], [27, 12, 18, 24], [18, 24, 3, 9]

**Ordering of transformation**

The ordering of the application of the transformation is very important. Your model may learn differently for the same Dataset but differently ordered transformations.

In [0]:
# Ordering #1
dataset1 = tf.data.Dataset.from_tensor_slices(tf.range(10))
# Dataset: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

dataset1 = dataset1.batch(4)
# Dataset: [0, 1, 2, 3], [4, 5, 6, 7], [8, 9]

dataset1 = dataset1.repeat(2)
# Dataset: [0, 1, 2, 3], [4, 5, 6, 7], [8, 9], [0, 1, 2, 3], [4, 5, 6, 7], [8, 9]
# Notice a 2 element batch in between

dataset1 = dataset1.shuffle(4)
# Shuffles at batch level.
# Dataset: [0, 1, 2, 3], [4, 5, 6, 7], [8, 9], [8, 9], [0, 1, 2, 3], [4, 5, 6, 7]



# Ordering #2
dataset2 = tf.data.Dataset.from_tensor_slices(tf.range(10))
# Dataset: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

dataset2 = dataset2.shuffle(4)
# Dataset: [3, 1, 0, 4, 5, 8, 6, 9, 7, 2]

dataset2 = dataset2.repeat(2)
# Dataset: [3, 1, 0, 4, 5, 8, 6, 9, 7, 2, 3, 1, 0, 4, 5, 8, 6, 9, 7, 2]

dataset2 = dataset2.batch(4)
# Dataset: [3, 1, 0, 4], [5, 8, 6, 9], [7, 2, 3, 1], [0, 4, 5, 8], [6, 9, 7, 2]

In [13]:
from tensorflow.examples.tutorials.mnist import input_data

mnist = input_data.read_data_sets("MNIST_data/", reshape=False)
X_train, y_train = mnist.train.images, mnist.train.labels
X_val, y_val     = mnist.validation.images, mnist.validation.labels
X_test, y_test   = mnist.test.images, mnist.test.labels

Instructions for updating:
Please use alternatives such as official/mnist/dataset.py from tensorflow/models.
Instructions for updating:
Please write your own downloading logic.
Instructions for updating:
Please use urllib or similar directly.
Successfully downloaded train-images-idx3-ubyte.gz 9912422 bytes.
Instructions for updating:
Please use tf.data to implement this functionality.
Extracting MNIST_data/train-images-idx3-ubyte.gz
Successfully downloaded train-labels-idx1-ubyte.gz 28881 bytes.
Instructions for updating:
Please use tf.data to implement this functionality.
Extracting MNIST_data/train-labels-idx1-ubyte.gz
Successfully downloaded t10k-images-idx3-ubyte.gz 1648877 bytes.
Extracting MNIST_data/t10k-images-idx3-ubyte.gz
Successfully downloaded t10k-labels-idx1-ubyte.gz 4542 bytes.
Extracting MNIST_data/t10k-labels-idx1-ubyte.gz
Instructions for updating:
Please use alternatives such as official/mnist/dataset.py from tensorflow/models.


In [0]:
import tensorflow.contrib.slim as slim

# LeNet-5 model
class Model:
    def __init__(self, data_X, data_y):
        self.n_class = 10
        self._create_architecture(data_X, data_y)

    def _create_architecture(self, data_X, data_y):
        y_hot = tf.one_hot(data_y, depth = self.n_class)
        logits = self._create_model(data_X)
        predictions = tf.argmax(logits, 1, output_type = tf.int32)
        self.loss = tf.reduce_sum(tf.nn.softmax_cross_entropy_with_logits_v2(labels = y_hot, 
                                                                              logits = logits))
        self.optimizer = tf.train.AdamOptimizer(learning_rate = 0.001).minimize(self.loss)
        self.accuracy = tf.reduce_sum(tf.cast(tf.equal(predictions, data_y), tf.float32))

    def _create_model(self, X):
        X1 = X - 0.5
        X1 = tf.pad(X1, tf.constant([[0, 0], [2, 2], [2, 2], [0, 0]]))
        with slim.arg_scope([slim.conv2d, slim.fully_connected], 
                            weights_initializer = tf.truncated_normal_initializer(0.0, 0.1)):
            net = slim.conv2d(X1, 6, [5, 5], padding = 'VALID')
            net = slim.max_pool2d(net, [2, 2])
            net = slim.conv2d(net, 16, [5, 5], padding = 'VALID')
            net = slim.max_pool2d(net, [2, 2])
            
            net = tf.reshape(net, [-1, 400])
            net = slim.fully_connected(net, 120)
            net = slim.fully_connected(net, 84)
            net = slim.fully_connected(net, self.n_class, activation_fn = None)
        return net

#OVERVIEW OF ITERATOR
```
# Create dataset and perform transformations on it
dataset = << Create Dataset object >>
dataset = << Perform transformations on dataset >>

# Create iterator
iterator = << Create iterator using dataset >>
next_batch = iterator.get_next()

# Create session
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())

    try: 
        # Keep running next_batch till the Dataset is exhausted
        while True:
            sess.run(next_batch)
            
    except tf.errors.OutOfRangeError:
        pass
```



 iterator doesn’t keep track of how many elements are present in the Dataset. Hence, it is normal to keep running the iterator’s get_next operation till Tensorflow’s tf.errors.OutOfRangeError exception is occurred.

#One-shot iterator
This is the most basic type of iterator. All the data with all types of transformations that is needed in the dataset has to be decided before the Dataset is fed into this iterator. One-shot iterator will iterate through all the elements present in Dataset and once exhausted, cannot be used anymore. As a result, the Dataset generated for this iterator can tend to occupy a lot of memory.

In [15]:
!pip install tqdm



In [16]:
from tqdm import tqdm
epochs=10
batch_size=64
iterations=len(y_train)*epochs

dataset=tf.data.Dataset.from_tensor_slices((X_train,y_train))
# Generate the complete Dataset required in the pipeline

dataset=dataset.repeat(epochs).batch(batch_size)
iterator=dataset.make_one_shot_iterator()

data_X,data_y=iterator.get_next()
data_y=tf.cast(data_y,tf.int32)
model=Model(data_X,data_y)

with tf.Session() as sess, tqdm(total=iterations) as pbar:#Instantly make your loops show a smart progress meter - just wrap any iterable with tqdm(iterable), and you’re done!
  sess.run(tf.global_variables_initializer())
  tot_accuracy=0
  try:
    while True:
      accuracy, _ = sess.run([model.accuracy, model.optimizer])
      tot_accuracy += accuracy
      pbar.update(batch_size)
  except tf.errors.OutOfRangeError:
        pass
      
print('\nAverage training accuracy: {:.4f}'.format(tot_accuracy / iterations))

Instructions for updating:
Colocations handled automatically by placer.


550016it [06:37, 1403.65it/s]                            


Average training accuracy: 0.9820





# Initializable
In One-shot iterator, we had the shortfall of repetition of same training dataset in memory and there was absence of periodically validating our model using validation dataset in our code. In initializable iterator we overcome these problems. Initializable iterator has to be initialized with dataset before it starts running.

In [17]:
epochs=10
batch_size=64
iterations=len(y_train)*epochs

placeholder_X = tf.placeholder(tf.float32, [None, 28, 28, 1])
placeholder_y = tf.placeholder(tf.int32, [None])

dataset = tf.data.Dataset.from_tensor_slices((placeholder_X, placeholder_y))
dataset = dataset.batch(batch_size)
iterator = dataset.make_initializable_iterator()

data_X, data_y = iterator.get_next()
data_y = tf.cast(data_y, tf.int32)
model = Model(data_X, data_y)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    
    for epoch_no in range(epochs):
        train_loss, train_accuracy = 0, 0
        val_loss, val_accuracy = 0, 0

        # Initialize iterator with training data
        sess.run(iterator.initializer, feed_dict = {placeholder_X: X_train, placeholder_y: y_train})
        try:
            with tqdm(total = len(y_train)) as pbar:
                while True:
                    _, loss, acc = sess.run([model.optimizer, model.loss, model.accuracy])
                    train_loss += loss 
                    train_accuracy += acc
                    pbar.update(batch_size) #Manually update the progress bar, useful for streams
        except tf.errors.OutOfRangeError:
            pass
    
        # Initialize iterator with validation data
        sess.run(iterator.initializer, feed_dict = {placeholder_X: X_val, placeholder_y: y_val})
        try:
            while True:
                loss, acc = sess.run([model.loss, model.accuracy])
                val_loss += loss 
                val_accuracy += acc
        except tf.errors.OutOfRangeError:
            pass
    
        print('\nEpoch No: {}'.format(epoch_no + 1))
        print('Train accuracy = {:.4f}, loss = {:.4f}'.format(train_accuracy / len(y_train), 
                                                        train_loss / len(y_train)))
        print('Val accuracy = {:.4f}, loss = {:.4f}'.format(val_accuracy / len(y_val), 
                                                        val_loss / len(y_val)))

55040it [00:40, 1357.73it/s]                           



Epoch No: 1
Train accuracy = 0.9226, loss = 0.2502
Val accuracy = 0.9670, loss = 0.1137


55040it [00:39, 1376.83it/s]                           



Epoch No: 2
Train accuracy = 0.9771, loss = 0.0731
Val accuracy = 0.9784, loss = 0.0657


55040it [00:40, 1362.87it/s]                           



Epoch No: 3
Train accuracy = 0.9843, loss = 0.0505
Val accuracy = 0.9834, loss = 0.0538


55040it [00:40, 1366.67it/s]                           



Epoch No: 4
Train accuracy = 0.9881, loss = 0.0384
Val accuracy = 0.9852, loss = 0.0529


55040it [00:40, 1363.63it/s]                           



Epoch No: 5
Train accuracy = 0.9913, loss = 0.0289
Val accuracy = 0.9860, loss = 0.0522


55040it [00:40, 1372.44it/s]                           



Epoch No: 6
Train accuracy = 0.9928, loss = 0.0234
Val accuracy = 0.9848, loss = 0.0527


55040it [00:40, 1375.96it/s]                           



Epoch No: 7
Train accuracy = 0.9941, loss = 0.0196
Val accuracy = 0.9856, loss = 0.0609


55040it [00:40, 1374.96it/s]                           



Epoch No: 8
Train accuracy = 0.9943, loss = 0.0172
Val accuracy = 0.9854, loss = 0.0592


55040it [00:39, 1458.33it/s]                           



Epoch No: 9
Train accuracy = 0.9950, loss = 0.0154
Val accuracy = 0.9856, loss = 0.0579


55040it [00:39, 1385.50it/s]                           



Epoch No: 10
Train accuracy = 0.9960, loss = 0.0118
Val accuracy = 0.9868, loss = 0.0589


#Reinitializable
In initializable iterator, there was a shortfall of different datasets undergoing the same pipeline before the Dataset is fed into the iterator. This problem is overcome by reinitializable iterator as we have the ability to feed different types of Datasets thereby undergoing different pipelines. Only one care has to be taken is that different Datasets are of the same data type

In [18]:
def map_fn(x, y):
    x=x*3.0+62.0
    return x, y

epochs = 10
batch_size = 64

placeholder_X = tf.placeholder(tf.float32, shape = [None, 28, 28, 1])
placeholder_y = tf.placeholder(tf.int32, shape = [None])

# Create separate Datasets for training and validation
train_dataset = tf.data.Dataset.from_tensor_slices((placeholder_X, placeholder_y))
train_dataset = train_dataset.batch(batch_size).map(map_fn)
val_dataset = tf.data.Dataset.from_tensor_slices((placeholder_X, placeholder_y))
val_dataset = val_dataset.batch(batch_size)

# Iterator has to have same output types across all Datasets to be used
iterator = tf.data.Iterator.from_structure(train_dataset.output_types, train_dataset.output_shapes)
data_X, data_y = iterator.get_next()
data_y = tf.cast(data_y, tf.int32)
model = Model(data_X, data_y)

# Initialize with required Datasets
train_iterator = iterator.make_initializer(train_dataset)
val_iterator = iterator.make_initializer(val_dataset)

with tf.Session() as sess:#, tqdm(total=len(y_train)) as pbar::
    sess.run(tf.global_variables_initializer())

    for epoch_no in range(epochs):
        train_loss, train_accuracy = 0, 0
        val_loss, val_accuracy = 0, 0

        # Start train iterator
        sess.run(train_iterator, feed_dict = {placeholder_X: X_train, placeholder_y: y_train})
        try:
            with tqdm(total = len(y_train)) as pbar:
                while True:
                    _, acc, loss = sess.run([model.optimizer, model.accuracy, model.loss])
                    train_loss += loss
                    train_accuracy += acc
                    pbar.update(batch_size)
        except tf.errors.OutOfRangeError:
            pass

        # Start validation iterator
        sess.run(val_iterator, feed_dict = {placeholder_X: X_val, placeholder_y: y_val})
        try:
            while True:
                acc, loss = sess.run([model.accuracy, model.loss])
                val_loss += loss
                val_accuracy += acc
        except tf.errors.OutOfRangeError:
            pass

        print('\nEpoch: {}'.format(epoch_no + 1))
        print('Train accuracy: {:.4f}, loss: {:.4f}'.format(train_accuracy / len(y_train),
                                                             train_loss / len(y_train)))
        print('Val accuracy: {:.4f}, loss: {:.4f}\n'.format(val_accuracy / len(y_val), 
                                                            val_loss / len(y_val)))

55040it [00:39, 1400.85it/s]                           



Epoch: 1
Train accuracy: 0.7831, loss: 0.7269
Val accuracy: 0.2474, loss: 2.6010



55040it [00:38, 1435.86it/s]                           



Epoch: 2
Train accuracy: 0.9271, loss: 0.2346
Val accuracy: 0.2870, loss: 2.2820



55040it [00:38, 1425.60it/s]                           



Epoch: 3
Train accuracy: 0.9513, loss: 0.1575
Val accuracy: 0.2962, loss: 2.1088



55040it [00:38, 1414.60it/s]                           



Epoch: 4
Train accuracy: 0.9610, loss: 0.1241
Val accuracy: 0.3652, loss: 1.8208



55040it [00:39, 1404.82it/s]                           



Epoch: 5
Train accuracy: 0.9678, loss: 0.1067
Val accuracy: 0.3698, loss: 1.7952



55040it [00:39, 1405.75it/s]                           



Epoch: 6
Train accuracy: 0.9729, loss: 0.0876
Val accuracy: 0.3692, loss: 1.7915



55040it [00:39, 1395.43it/s]                           



Epoch: 7
Train accuracy: 0.9765, loss: 0.0768
Val accuracy: 0.3998, loss: 1.7029



55040it [00:39, 1401.88it/s]                           



Epoch: 8
Train accuracy: 0.9782, loss: 0.0697
Val accuracy: 0.4804, loss: 1.5795



55040it [00:39, 1398.81it/s]                           



Epoch: 9
Train accuracy: 0.9800, loss: 0.0645
Val accuracy: 0.5110, loss: 1.5920



55040it [00:39, 1404.41it/s]                           



Epoch: 10
Train accuracy: 0.9831, loss: 0.0547
Val accuracy: 0.4642, loss: 1.5961



**comparing tqdm position from above and below is, in above after each epoch you can see a progress bar but in below only for 1st epoch you can see a progress bar**