In [1]:
import tensorflow as tf

In [2]:
dataset1=tf.data.Dataset.from_tensor_slices(tf.range(10))
# Emits data of 10, 11, 12, 13, 14, (One element at a time)

In [3]:
dataset2=tf.data.Dataset.from_tensor_slices((tf.range(30,45,3),tf.range(10,15)))
# Emits data of (30, 60), (33, 62), (36, 64), (39, 66), (42, 68)
# Emits one tuple at a time

In [None]:
dataset3 = tf.data.Dataset.from_tensor_slices((tf.range(10), tf.range(5)))
# Dataset not possible as zeroth dimenion is different at 10 and 5

Just like from_tensor_slices, this method also accepts individual (or multiple) Numpy (or Tensors) objects. But this method doesn’t support batching of data, i.e all the data will be given out instantly.

In [5]:
dataset1=tf.data.Dataset.from_tensors(tf.range(10))
# Emits data of [10, 11, 12, 13, 14]
# Holds entire list as one element

In [6]:
dataset2 = tf.data.Dataset.from_tensors((tf.range(30, 45, 3), tf.range(60, 70, 2)))
# Emits data of ([30, 33, 36, 39, 42], [60, 62, 64, 66, 68])
# Holds entire tuple as one element

In [7]:
dataset3 = tf.data.Dataset.from_tensors((tf.range(10), tf.range(5)))
# Possible with from_tensors, regardless of zeroth dimension mismatch of constituent elements.
# Emits data of ([1, 2, 3, 4, 5, 6, 7, 8, 9], [0, 1, 2, 3, 4])
# Holds entire tuple as one element

In this method, a generator function is passed as input. This method is useful in cases where you wish to generate the data at runtime and as such no raw data exists with you or in scenarios where your training data is extremely huge and it is not possible to store them in your disk.

In [8]:
def generator(sequence_type):
  if sequence_type==1:
    for i in range(5):
      yield 10+i
  elif sequence_type==2:
    for i in range(5):
      yield (30+3*i,60+2*i)
  else:
    for i in range(1,4):
      yield (i,['Hi']*i)
      
dataset1=tf.data.Dataset.from_generator(generator,(tf.int32),args=([1]))
# Emits data of 10, 11, 12, 13, 14, (One element at a time)

Instructions for updating:
tf.py_func is deprecated in TF V2. Instead, use
    tf.py_function, which takes a python function which manipulates tf eager
    tensors instead of numpy arrays. It's easy to convert a tf eager tensor to
    an ndarray (just call tensor.numpy()) but having access to eager tensors
    means `tf.py_function`s can use accelerators such as GPUs as well as
    being differentiable using a gradient tape.
    


In [9]:
dataset2=tf.data.Dataset.from_generator(generator,(tf.int32,tf.int32),args=([2]))
# Emits data of (30, 60), (33, 62), (36, 64), (39, 66), (42, 68)
# Emits one tuple at a time

In [10]:
dataset3=tf.data.Dataset.from_generator(generator,(tf.int32,tf.string),args=([3]))
# Emits data of (1, ['Hi']), (2, ['Hi', 'Hi']), (3, ['Hi', 'Hi', 'Hi'])
# Emits one tuple at a time

In [11]:
dataset = tf.data.Dataset.from_tensor_slices(tf.range(10))
# Create a dataset with data of [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

dataset = dataset.repeat(2)
# Duplicate the dataset
# Data will be [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

dataset = dataset.shuffle(5)
# Shuffle the dataset
# Assumed shuffling: [3, 0, 7, 9, 4, 2, 5, 0, 1, 7, 5, 9, 4, 6, 2, 8, 6, 8, 1, 3]

def map_fn(x):
    return x * 3

dataset = dataset.map(map_fn)
# Same as dataset = dataset.map(lambda x: x * 3)
# Multiply each element with 3 using map transformation
# Dataset: [9, 0, 21, 27, 12, 6, 15, 0, 3, 21, 15, 27, 12, 18, 6, 24, 18, 24, 3, 9]

def filter_fn(x):
    return tf.reshape(tf.not_equal(x % 5, 1), [])

dataset = dataset.filter(filter_fn)
# Same as dataset = dataset.filter(lambda x: tf.reshape(tf.not_equal(x % 5, 1), []))
# Filter out all those elements whose modulus 5 returns 1
# Dataset: [9, 0, 27, 12, 15, 0, 3, 15, 27, 12, 18, 24, 18, 24, 3, 9]

dataset = dataset.batch(4)
# Batch at every 4 elements
# Dataset: [9, 0, 27, 12], [15, 0, 3, 15], [27, 12, 18, 24], [18, 24, 3, 9]

**Ordering of transformation**

The ordering of the application of the transformation is very important. Your model may learn differently for the same Dataset but differently ordered transformations.

In [12]:
# Ordering #1
dataset1 = tf.data.Dataset.from_tensor_slices(tf.range(10))
# Dataset: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

dataset1 = dataset1.batch(4)
# Dataset: [0, 1, 2, 3], [4, 5, 6, 7], [8, 9]

dataset1 = dataset1.repeat(2)
# Dataset: [0, 1, 2, 3], [4, 5, 6, 7], [8, 9], [0, 1, 2, 3], [4, 5, 6, 7], [8, 9]
# Notice a 2 element batch in between

dataset1 = dataset1.shuffle(4)
# Shuffles at batch level.
# Dataset: [0, 1, 2, 3], [4, 5, 6, 7], [8, 9], [8, 9], [0, 1, 2, 3], [4, 5, 6, 7]



# Ordering #2
dataset2 = tf.data.Dataset.from_tensor_slices(tf.range(10))
# Dataset: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

dataset2 = dataset2.shuffle(4)
# Dataset: [3, 1, 0, 4, 5, 8, 6, 9, 7, 2]

dataset2 = dataset2.repeat(2)
# Dataset: [3, 1, 0, 4, 5, 8, 6, 9, 7, 2, 3, 1, 0, 4, 5, 8, 6, 9, 7, 2]

dataset2 = dataset2.batch(4)
# Dataset: [3, 1, 0, 4], [5, 8, 6, 9], [7, 2, 3, 1], [0, 4, 5, 8], [6, 9, 7, 2]

In [13]:
from tensorflow.examples.tutorials.mnist import input_data

mnist = input_data.read_data_sets("MNIST_data/", reshape=False)
X_train, y_train = mnist.train.images, mnist.train.labels
X_val, y_val     = mnist.validation.images, mnist.validation.labels
X_test, y_test   = mnist.test.images, mnist.test.labels

Instructions for updating:
Please use alternatives such as official/mnist/dataset.py from tensorflow/models.
Instructions for updating:
Please write your own downloading logic.
Instructions for updating:
Please use tf.data to implement this functionality.
Extracting MNIST_data/train-images-idx3-ubyte.gz
Instructions for updating:
Please use tf.data to implement this functionality.
Extracting MNIST_data/train-labels-idx1-ubyte.gz
Extracting MNIST_data/t10k-images-idx3-ubyte.gz
Extracting MNIST_data/t10k-labels-idx1-ubyte.gz
Instructions for updating:
Please use alternatives such as official/mnist/dataset.py from tensorflow/models.


In [14]:
import tensorflow.contrib.slim as slim

# LeNet-5 model
class Model:
    def __init__(self, data_X, data_y):
        self.n_class = 10
        self._create_architecture(data_X, data_y)

    def _create_architecture(self, data_X, data_y):
        y_hot = tf.one_hot(data_y, depth = self.n_class)
        logits = self._create_model(data_X)
        predictions = tf.argmax(logits, 1, output_type = tf.int32)
        self.loss = tf.reduce_sum(tf.nn.softmax_cross_entropy_with_logits_v2(labels = y_hot, 
                                                                              logits = logits))
        self.optimizer = tf.train.AdamOptimizer(learning_rate = 0.001).minimize(self.loss)
        self.accuracy = tf.reduce_sum(tf.cast(tf.equal(predictions, data_y), tf.float32))

    def _create_model(self, X):
        X1 = X - 0.5
        X1 = tf.pad(X1, tf.constant([[0, 0], [2, 2], [2, 2], [0, 0]]))
        with slim.arg_scope([slim.conv2d, slim.fully_connected], 
                            weights_initializer = tf.truncated_normal_initializer(0.0, 0.1)):
            net = slim.conv2d(X1, 6, [5, 5], padding = 'VALID')
            net = slim.max_pool2d(net, [2, 2])
            net = slim.conv2d(net, 16, [5, 5], padding = 'VALID')
            net = slim.max_pool2d(net, [2, 2])
            
            net = tf.reshape(net, [-1, 400])
            net = slim.fully_connected(net, 120)
            net = slim.fully_connected(net, 84)
            net = slim.fully_connected(net, self.n_class, activation_fn = None)
        return net

#OVERVIEW OF ITERATOR
```
# Create dataset and perform transformations on it
dataset = << Create Dataset object >>
dataset = << Perform transformations on dataset >>

# Create iterator
iterator = << Create iterator using dataset >>
next_batch = iterator.get_next()

# Create session
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())

    try: 
        # Keep running next_batch till the Dataset is exhausted
        while True:
            sess.run(next_batch)
            
    except tf.errors.OutOfRangeError:
        pass
```



 iterator doesn’t keep track of how many elements are present in the Dataset. Hence, it is normal to keep running the iterator’s get_next operation till Tensorflow’s tf.errors.OutOfRangeError exception is occurred.

#One-shot iterator
This is the most basic type of iterator. All the data with all types of transformations that is needed in the dataset has to be decided before the Dataset is fed into this iterator. One-shot iterator will iterate through all the elements present in Dataset and once exhausted, cannot be used anymore. As a result, the Dataset generated for this iterator can tend to occupy a lot of memory.

In [15]:
!pip install tqdm



In [16]:
from tqdm import tqdm
epochs=10
batch_size=64
iterations=len(y_train)*epochs

dataset=tf.data.Dataset.from_tensor_slices((X_train,y_train))
# Generate the complete Dataset required in the pipeline

dataset=dataset.repeat(epochs).batch(batch_size)
iterator=dataset.make_one_shot_iterator()

data_X,data_y=iterator.get_next()
data_y=tf.cast(data_y,tf.int32)
model=Model(data_X,data_y)

with tf.Session() as sess, tqdm(total=iterations) as pbar:#Instantly make your loops show a smart progress meter - just wrap any iterable with tqdm(iterable), and you’re done!
    sess.run(tf.global_variables_initializer())
    tot_accuracy=0
    try:
        while True:
            accuracy, _ = sess.run([model.accuracy, model.optimizer])
            tot_accuracy += accuracy
            pbar.update(batch_size)
    except tf.errors.OutOfRangeError:
        pass
      
print('\nAverage training accuracy: {:.4f}'.format(tot_accuracy / iterations))

Instructions for updating:
Colocations handled automatically by placer.


550016it [02:17, 3987.09it/s]                                                  



Average training accuracy: 0.9812


# Initializable
In One-shot iterator, we had the shortfall of repetition of same training dataset in memory and there was absence of periodically validating our model using validation dataset in our code. In initializable iterator we overcome these problems. Initializable iterator has to be initialized with dataset before it starts running.

In [17]:
epochs=10
batch_size=64
iterations=len(y_train)*epochs

placeholder_X = tf.placeholder(tf.float32, [None, 28, 28, 1])
placeholder_y = tf.placeholder(tf.int32, [None])

dataset = tf.data.Dataset.from_tensor_slices((placeholder_X, placeholder_y))
dataset = dataset.batch(batch_size)
iterator = dataset.make_initializable_iterator()

data_X, data_y = iterator.get_next()
data_y = tf.cast(data_y, tf.int32)
model = Model(data_X, data_y)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    
    for epoch_no in range(epochs):
        train_loss, train_accuracy = 0, 0
        val_loss, val_accuracy = 0, 0

        # Initialize iterator with training data
        sess.run(iterator.initializer, feed_dict = {placeholder_X: X_train, placeholder_y: y_train})
        try:
            with tqdm(total = len(y_train)) as pbar:
                while True:
                    _, loss, acc = sess.run([model.optimizer, model.loss, model.accuracy])
                    train_loss += loss 
                    train_accuracy += acc
                    pbar.update(batch_size) #Manually update the progress bar, useful for streams
        except tf.errors.OutOfRangeError:
            pass
    
        # Initialize iterator with validation data
        sess.run(iterator.initializer, feed_dict = {placeholder_X: X_val, placeholder_y: y_val})
        try:
            while True:
                loss, acc = sess.run([model.loss, model.accuracy])
                val_loss += loss 
                val_accuracy += acc
        except tf.errors.OutOfRangeError:
            pass
    
        print('\nEpoch No: {}'.format(epoch_no + 1))
        print('Train accuracy = {:.4f}, loss = {:.4f}'.format(train_accuracy / len(y_train), 
                                                        train_loss / len(y_train)))
        print('Val accuracy = {:.4f}, loss = {:.4f}'.format(val_accuracy / len(y_val), 
                                                        val_loss / len(y_val)))

55040it [00:09, 5702.16it/s]                                                   



Epoch No: 1
Train accuracy = 0.9232, loss = 0.2610
Val accuracy = 0.9592, loss = 0.1424


55040it [00:06, 7961.73it/s]                                                   



Epoch No: 2
Train accuracy = 0.9764, loss = 0.0769
Val accuracy = 0.9724, loss = 0.0969


55040it [00:06, 7884.24it/s]                                                   



Epoch No: 3
Train accuracy = 0.9836, loss = 0.0537
Val accuracy = 0.9784, loss = 0.0806


55040it [00:07, 7526.51it/s]                                                   



Epoch No: 4
Train accuracy = 0.9873, loss = 0.0408
Val accuracy = 0.9812, loss = 0.0753


55040it [00:07, 7776.25it/s]                                                   



Epoch No: 5
Train accuracy = 0.9903, loss = 0.0319
Val accuracy = 0.9858, loss = 0.0562


55040it [00:06, 7952.54it/s]                                                   



Epoch No: 6
Train accuracy = 0.9922, loss = 0.0262
Val accuracy = 0.9888, loss = 0.0471


55040it [00:07, 7736.92it/s]                                                   



Epoch No: 7
Train accuracy = 0.9936, loss = 0.0216
Val accuracy = 0.9880, loss = 0.0487


55040it [00:07, 7806.02it/s]                                                   



Epoch No: 8
Train accuracy = 0.9940, loss = 0.0187
Val accuracy = 0.9878, loss = 0.0528


55040it [00:07, 7653.06it/s]                                                   



Epoch No: 9
Train accuracy = 0.9939, loss = 0.0183
Val accuracy = 0.9902, loss = 0.0474


55040it [00:07, 7699.05it/s]                                                   



Epoch No: 10
Train accuracy = 0.9955, loss = 0.0145
Val accuracy = 0.9882, loss = 0.0487


#Reinitializable
In initializable iterator, there was a shortfall of different datasets undergoing the same pipeline before the Dataset is fed into the iterator. This problem is overcome by reinitializable iterator as we have the ability to feed different types of Datasets thereby undergoing different pipelines. Only one care has to be taken is that different Datasets are of the same data type

In [18]:
def map_fn(x, y):
    x=x*3.0+62.0
    return x, y

epochs = 10
batch_size = 64

placeholder_X = tf.placeholder(tf.float32, shape = [None, 28, 28, 1])
placeholder_y = tf.placeholder(tf.int32, shape = [None])

# Create separate Datasets for training and validation
train_dataset = tf.data.Dataset.from_tensor_slices((placeholder_X, placeholder_y))
train_dataset = train_dataset.batch(batch_size).map(map_fn)
val_dataset = tf.data.Dataset.from_tensor_slices((placeholder_X, placeholder_y))
val_dataset = val_dataset.batch(batch_size)

# Iterator has to have same output types across all Datasets to be used
iterator = tf.data.Iterator.from_structure(train_dataset.output_types, train_dataset.output_shapes)
data_X, data_y = iterator.get_next()
data_y = tf.cast(data_y, tf.int32)
model = Model(data_X, data_y)

# Initialize with required Datasets
train_iterator = iterator.make_initializer(train_dataset)
val_iterator = iterator.make_initializer(val_dataset)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())

    for epoch_no in range(epochs):
        train_loss, train_accuracy = 0, 0
        val_loss, val_accuracy = 0, 0

        # Start train iterator
        sess.run(train_iterator, feed_dict = {placeholder_X: X_train, placeholder_y: y_train})
        try:
            with tqdm(total = len(y_train)) as pbar:
                while True:
                    _, acc, loss = sess.run([model.optimizer, model.accuracy, model.loss])
                    train_loss += loss
                    train_accuracy += acc
                    pbar.update(batch_size)
        except tf.errors.OutOfRangeError:
            pass

        # Start validation iterator
        sess.run(val_iterator, feed_dict = {placeholder_X: X_val, placeholder_y: y_val})
        try:
            while True:
                acc, loss = sess.run([model.accuracy, model.loss])
                val_loss += loss
                val_accuracy += acc
        except tf.errors.OutOfRangeError:
            pass

        print('\nEpoch: {}'.format(epoch_no + 1))
        print('Train accuracy: {:.4f}, loss: {:.4f}'.format(train_accuracy / len(y_train),
                                                             train_loss / len(y_train)))
        print('Val accuracy: {:.4f}, loss: {:.4f}\n'.format(val_accuracy / len(y_val), 
                                                            val_loss / len(y_val)))

55040it [00:08, 6155.97it/s]                                                   



Epoch: 1
Train accuracy: 0.7415, loss: 0.8195
Val accuracy: 0.4628, loss: 1.6218



55040it [00:07, 7438.07it/s]                                                   



Epoch: 2
Train accuracy: 0.9357, loss: 0.2060
Val accuracy: 0.6182, loss: 1.1918



55040it [00:07, 7714.17it/s]                                                   



Epoch: 3
Train accuracy: 0.9583, loss: 0.1331
Val accuracy: 0.7546, loss: 0.9779



55040it [00:07, 7633.97it/s]                                                   



Epoch: 4
Train accuracy: 0.9662, loss: 0.1042
Val accuracy: 0.7558, loss: 0.9080



55040it [00:07, 7671.16it/s]                                                   



Epoch: 5
Train accuracy: 0.9728, loss: 0.0869
Val accuracy: 0.7886, loss: 0.8332



55040it [00:07, 7673.30it/s]                                                   



Epoch: 6
Train accuracy: 0.9776, loss: 0.0717
Val accuracy: 0.8134, loss: 0.7441



55040it [00:07, 7444.10it/s]                                                   



Epoch: 7
Train accuracy: 0.9785, loss: 0.0667
Val accuracy: 0.8156, loss: 0.7388



55040it [00:07, 7704.42it/s]                                                   



Epoch: 8
Train accuracy: 0.9815, loss: 0.0565
Val accuracy: 0.7874, loss: 0.7514



55040it [00:06, 7867.33it/s]                                                   



Epoch: 9
Train accuracy: 0.9827, loss: 0.0520
Val accuracy: 0.8062, loss: 0.6939



55040it [00:07, 7804.89it/s]                                                   



Epoch: 10
Train accuracy: 0.9857, loss: 0.0439
Val accuracy: 0.8034, loss: 0.7030



**comparing tqdm position from above and below is in above after each epoch you can see a progress bar but in below only for 1st epoch you can see a progress bar**

In [19]:
def map_fn(x,y):
  x=x*3.0+62.0
  return (x,y)

epochs = 10
batch_size = 64

placeholder_X = tf.placeholder(tf.float32, shape = [None, 28, 28, 1])
placeholder_y = tf.placeholder(tf.int32, shape = [None])

# Create separate Datasets for training and validation
train_dataset = tf.data.Dataset.from_tensor_slices((placeholder_X, placeholder_y))
train_dataset = train_dataset.batch(batch_size).map(lambda x, y: map_fn(x, y))
val_dataset = tf.data.Dataset.from_tensor_slices((placeholder_X, placeholder_y))
val_dataset = val_dataset.batch(batch_size)

# Iterator has to have same output types across all Datasets to be used
iterator = tf.data.Iterator.from_structure(train_dataset.output_types, train_dataset.output_shapes)
data_X, data_y = iterator.get_next()
data_y = tf.cast(data_y, tf.int32)
model = Model(data_X, data_y)

# Initialize with required Datasets
train_iterator = iterator.make_initializer(train_dataset)
val_iterator = iterator.make_initializer(val_dataset)

with tf.Session() as sess, tqdm(total=len(y_train)) as pbar:
  sess.run(tf.global_variables_initializer())
  for epoch_no in range(epochs):
        train_loss, train_accuracy = 0, 0
        val_loss, val_accuracy = 0, 0
        
        # Start train iterator
        sess.run(train_iterator,feed_dict={placeholder_X:X_train,placeholder_y:y_train})
        try:
          while True:
            _, acc, loss = sess.run([model.optimizer, model.accuracy, model.loss])
            train_loss += loss
            train_accuracy += acc
            pbar.update(batch_size)
        except tf.errors.OutOfRangeError:
            pass
        # Start validation iterator
        sess.run(val_iterator, feed_dict = {placeholder_X: X_val, placeholder_y: y_val})
        try:
            while True:
                acc, loss = sess.run([model.accuracy, model.loss])
                val_loss += loss
                val_accuracy += acc
        except tf.errors.OutOfRangeError:
            pass

        print('\nEpoch: {}'.format(epoch_no + 1))
        print('Train accuracy: {:.4f}, loss: {:.4f}'.format(train_accuracy / len(y_train),
                                                             train_loss / len(y_train)))
        print('Val accuracy: {:.4f}, loss: {:.4f}\n'.format(val_accuracy / len(y_val), 
                                                            val_loss / len(y_val)))

100%|█████████████████████████████████▉| 54848/55000 [00:13<00:00, 7129.57it/s]


Epoch: 1
Train accuracy: 0.1323, loss: 2.2529
Val accuracy: 0.1028, loss: 3.2966



109440it [00:25, 7213.70it/s]                                                  


Epoch: 2
Train accuracy: 0.2219, loss: 1.9559
Val accuracy: 0.0894, loss: 4.0944



164864it [00:35, 7810.61it/s]


Epoch: 3
Train accuracy: 0.2319, loss: 1.9323
Val accuracy: 0.0964, loss: 5.6891



219904it [00:44, 7449.13it/s]


Epoch: 4
Train accuracy: 0.2343, loss: 1.9302
Val accuracy: 0.0934, loss: 6.5835



275136it [00:53, 7510.07it/s]


Epoch: 5
Train accuracy: 0.1114, loss: 2.3046
Val accuracy: 0.1098, loss: 5.8575



329472it [01:03, 7606.06it/s]


Epoch: 6
Train accuracy: 0.1121, loss: 2.3016
Val accuracy: 0.1100, loss: 5.8905



385024it [01:12, 7355.96it/s]


Epoch: 7
Train accuracy: 0.1123, loss: 2.3015
Val accuracy: 0.1098, loss: 5.7503



439616it [01:21, 7545.59it/s]


Epoch: 8
Train accuracy: 0.1122, loss: 2.3014
Val accuracy: 0.1098, loss: 5.6159



494912it [01:30, 7689.05it/s]


Epoch: 9
Train accuracy: 0.1123, loss: 2.3014
Val accuracy: 0.1100, loss: 5.5049



550144it [01:40, 7027.48it/s]


Epoch: 10
Train accuracy: 0.1124, loss: 2.3014
Val accuracy: 0.1100, loss: 5.4021



550400it [01:41, 5402.39it/s]
