[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1LYv_JIsXTQCxxwvtedF8IKgzAE8hhBh1#scrollTo=AKB-MYwPla8C)


In [1]:
import tensorflow as tf
import numpy as np

## Creating a tensorflow dataset using a list

In [2]:
# Each item of the Python list becomes one element of the dataset
list_data = tf.data.Dataset.from_tensor_slices([1, 2, 3])

In [3]:
list_data.element_spec  # type specification (shape and dtype) of one element of this dataset

TensorSpec(shape=(), dtype=tf.int32, name=None)

## Creating a tensorflow dataset using a numpy array

In [4]:
# Build a dataset from a 3x3 NumPy array
np_data = tf.data.Dataset.from_tensor_slices(
    np.array([
        [1, 2, 3],
        [4, 5, 6],
        [7, 8, 9],
    ])
)

In [5]:
# Each element is one row of the 3x3 array, hence the shape (3,) shown below
np_data.element_spec

TensorSpec(shape=(3,), dtype=tf.int64, name=None)

## A real use case example of creating a tensorflow dataset from numpy array

In [6]:
# Load the MNIST train/test splits
train, test = tf.keras.datasets.mnist.load_data()
# The training split is an (images, labels) pair
images, lables = train[0], train[1]
# Quick preprocess: divide the pixel values by 255.0
images = images / 255.0

# Create the dataset from the (images, labels) pair
mnist_dataset = tf.data.Dataset.from_tensor_slices((images, lables))

In [7]:
# A tuple input gives a tuple spec: one TensorSpec for the image, one for the label
mnist_dataset.element_spec 

(TensorSpec(shape=(28, 28), dtype=tf.float64, name=None),
 TensorSpec(shape=(), dtype=tf.uint8, name=None))

Now let us look at how the training images and labels appear inside the TensorFlow dataset. We can read data out of a TensorFlow dataset with the as_numpy_iterator() method, which lets us access each element as a NumPy array. In our case it lets us inspect the dataset.

In [8]:
# Pull only the first example instead of iterating over the whole training set.
# X is the first input array and y is the corresponding label.
X, y = next(iter(mnist_dataset.as_numpy_iterator()))
print("Shape of X, y")
print(X.shape, y.shape)
print(type(X), ", This number is -->", y)

Shape of X, y
(28, 28) ()
<class 'numpy.ndarray'> , This number is --> 5


In [9]:
# The input tuple may hold more than two arrays; here the labels are passed twice,
# so every element has three components
mnist_dataset2 = tf.data.Dataset.from_tensor_slices((images, lables, lables))

In [10]:
# One TensorSpec per component of the three-part element
mnist_dataset2.element_spec 

(TensorSpec(shape=(28, 28), dtype=tf.float64, name=None),
 TensorSpec(shape=(), dtype=tf.uint8, name=None),
 TensorSpec(shape=(), dtype=tf.uint8, name=None))

## Important tensorflow methods

1. range: Creates a Dataset of a step-separated range of values. Signature: range(start, stop, step). When only one argument is given, it is used as the stop value.

In [11]:
# A single argument is used as the (exclusive) stop value
range_dataset = tf.data.Dataset.range(5)
print(list(range_dataset.as_numpy_iterator()))

# start=1, stop=10, step=3, with the elements produced as float32
range_stepped_dataset = tf.data.Dataset.range(1, 10, 3, output_type=tf.float32)
print(list(range_stepped_dataset.as_numpy_iterator()))

[0, 1, 2, 3, 4]
[1.0, 4.0, 7.0]


2. map: Lets you apply a function to each element of a dataset.

In [12]:
def div_by_4(x):
    """Return ``x`` divided by 4.

    Intended to be passed to ``Dataset.map`` so that it is applied to
    every element of a dataset.
    """
    quarter = x / 4
    return quarter


3. take: Creates a new dataset containing at most `count` elements, where `count` is the number passed to it. Following is a quick demonstration of how the take method works. It also works with batched datasets; that behavior will be demonstrated in the batch section.

In [13]:
# Apply div_by_4 to every element of range_dataset and collect the results in a list
list(range_dataset.map(div_by_4).as_numpy_iterator())

[0.0, 0.25, 0.5, 0.75, 1.0]

In [14]:
# Iterate over only the first 6 (image, label) pairs
for X,y in mnist_dataset.take(6).as_numpy_iterator():
  print(X.shape, y)

(28, 28) 5
(28, 28) 0
(28, 28) 4
(28, 28) 1
(28, 28) 9
(28, 28) 2


4. skip: Creates a Dataset that skips the first `count` elements of the dataset. Let's see how this method works: in the example below, we skip the first 3 elements of mnist_dataset and take the next 3. If you have been following along in the tutorial, we should see the images labelled 1, 9, and 2, in that order.

In [15]:
# Skip the first 3 (image, label) pairs, then iterate over the next 3
for X,y in mnist_dataset.skip(3).take(3).as_numpy_iterator():
  print(X.shape, y)

(28, 28) 1
(28, 28) 9
(28, 28) 2


In [16]:
# take() returns a new dataset, so it can be stored and inspected like any other
fist_six_images_taken = mnist_dataset.take(6)
print('Lenght of the dataset', len(fist_six_images_taken))
print(fist_six_images_taken.element_spec)

for X, y in fist_six_images_taken.as_numpy_iterator():
    print(X.shape, y)

# Expected output:
# Lenght of the dataset 6
# (TensorSpec(shape=(28, 28), dtype=tf.float64, name=None), TensorSpec(shape=(), dtype=tf.uint8, name=None))
# (28, 28) 5
# (28, 28) 0
# (28, 28) 4
# (28, 28) 1
# (28, 28) 9
# (28, 28) 2

Lenght of the dataset 6
(TensorSpec(shape=(28, 28), dtype=tf.float64, name=None), TensorSpec(shape=(), dtype=tf.uint8, name=None))
(28, 28) 5
(28, 28) 0
(28, 28) 4
(28, 28) 1
(28, 28) 9
(28, 28) 2


5. repeat: This method repeats the dataset's values, in order, the given number of times.

In [17]:
dataset2 = tf.data.Dataset.range(3)
# Go through the three elements three times, keeping their order
repeated_dataset = dataset2.repeat(count=3)
print(repeated_dataset.element_spec)
print(list(repeated_dataset.as_numpy_iterator()))

TensorSpec(shape=(), dtype=tf.int64, name=None)
[0, 1, 2, 0, 1, 2, 0, 1, 2]


6. shuffle: Randomly shuffles the elements of this dataset.

In [18]:
dataset3 = tf.data.Dataset.range(3)
print(list(dataset3.as_numpy_iterator()))
# No seed is set, so the shuffled order can differ every time this cell runs
print(list(dataset3.shuffle(buffer_size=3).as_numpy_iterator()))


[0, 1, 2]
[0, 2, 1]


In [19]:
# Output from another run of the cell above (the shuffled order changes between runs):
# [0, 1, 2]
# [1, 2, 0]

7. zip: Creates a dataset by pairing up the corresponding elements of the given datasets.

In [20]:
np_array = np.array([
    [1, 2, 3],
    [4, 5, 6],
    [7, 8, 9],
])
labels = np.array([0, 0, 1])

# One dataset for the rows, one for the labels
d_ds = tf.data.Dataset.from_tensor_slices(np_array)
lab_tf = tf.data.Dataset.from_tensor_slices(labels)

# Zipping the 2 tf datasets pairs each row with its label
zipped_dataset = tf.data.Dataset.zip((d_ds, lab_tf))

print('Array   , Label')
for x, y in zipped_dataset.as_numpy_iterator():
    print(x, ',', y)

# Expected output:
# Array   , Label
# [1 2 3] , 0
# [4 5 6] , 0
# [7 8 9] , 1

Array   , Label
[1 2 3] , 0
[4 5 6] , 0
[7 8 9] , 1


8. batch
This function is used to combine consecutive elements of a dataset into batches based on the batch_size specified.

In [21]:
# Group consecutive (array, label) pairs two at a time; the last batch holds
# the single leftover pair
batched_ds = zipped_dataset.batch(2)

print('batched dataset, lables')
for i in batched_ds:
    print(i[0].shape, '        ,  ', i[1].shape)
print()

print('Elements of the batched dataset')
for X, y in batched_ds.as_numpy_iterator():
    print('X= ', X)
    print('y= ', y)

# Expected output:
# batched dataset, lables
# (2, 3)         ,   (2,)
# (1, 3)         ,   (1,)
#
# Elements of the batched dataset
# X=  [[1 2 3]
#  [4 5 6]]
# y=  [0 0]
# X=  [[7 8 9]]
# y=  [1]

batched dataset, lables
(2, 3)         ,   (2,)
(1, 3)         ,   (1,)

Elements of the batched dataset
X=  [[1 2 3]
 [4 5 6]]
y=  [0 0]
X=  [[7 8 9]]
y=  [1]


In [22]:
# With drop_remainder=True the incomplete final batch is discarded
batched_ds = zipped_dataset.batch(2, drop_remainder=True)

print('batched dataset, lables')
for i in batched_ds:
    print(i[0].shape, i[1].shape)
print()

print('Elements of the batched dataset')
for X, y in batched_ds.as_numpy_iterator():
    print('X= ', X)
    print('y= ', y)

# Expected output:
# batched dataset, lables
# (2, 3) (2,)
#
# Elements of the batched dataset
# X=  [[1 2 3]
#  [4 5 6]]
# y=  [0 0]

batched dataset, lables
(2, 3) (2,)

Elements of the batched dataset
X=  [[1 2 3]
 [4 5 6]]
y=  [0 0]


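9. window: Groups the elements of a dataset into windows of a given size; `shift` sets how far the start of each window moves from the start of the previous one.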

In [23]:
data = tf.data.Dataset.range(10)
print('Orginal data', list(data.as_numpy_iterator()))
#  [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

# Windows of 5 elements; each window starts 1 element after the previous one.
# drop_remainder=True keeps only full-size windows.
windowed_data = data.window(5, shift=1, drop_remainder=True)
# Each window is itself iterable, so print its values on one line
for window_data in windowed_data:
    for val in window_data:
        print(val.numpy(), end=" ")
    print()
# Expected output so far:
# Orginal data [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
# 0 1 2 3 4 
# 1 2 3 4 5 
# 2 3 4 5 6 
# 3 4 5 6 7 
# 4 5 6 7 8 
# 5 6 7 8 9 
print('-------')

# with shift = 2: each window starts 2 elements after the previous one
windowed_data = data.window(5, shift=2, drop_remainder=True)
for window_data in windowed_data:
    for val in window_data:
        print(val.numpy(), end=" ")
    print()

# Expected output for shift = 2:
# 0 1 2 3 4 
# 2 3 4 5 6 
# 4 5 6 7 8 

Orginal data [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
0 1 2 3 4 
1 2 3 4 5 
2 3 4 5 6 
3 4 5 6 7 
4 5 6 7 8 
5 6 7 8 9 
-------
0 1 2 3 4 
2 3 4 5 6 
4 5 6 7 8 


## Method Chaining in tensorflow for your pipeline

In [24]:
ndataset = tf.data.Dataset.range(10)
# 1. Slide a 5-element window over the range, moving one element at a time
ndataset = ndataset.window(5, shift=1, drop_remainder=True)
# 2. Turn every window into a single 5-element tensor
ndataset = ndataset.flat_map(lambda w: w.batch(5))
# 3. Split each tensor: all but the last value -> x, the last value -> y
ndataset = ndataset.map(lambda w: (w[:-1], w[-1:]))
# 4. Shuffle the (x, y) pairs
ndataset = ndataset.shuffle(buffer_size=10)
# 5. Group the pairs into batches of 3 and prefetch one batch
ndataset = ndataset.batch(3)
ndataset = ndataset.prefetch(1)

for x, y in ndataset:
    print("x = ", list(x.numpy()))
    print("y = ", list(y.numpy()))

# Sample output (the order varies between runs because of the shuffle):
# x =  [array([4, 5, 6, 7]), array([5, 6, 7, 8]), array([3, 4, 5, 6])]
# y =  [array([8]), array([9]), array([7])]
# x =  [array([0, 1, 2, 3]), array([2, 3, 4, 5]), array([1, 2, 3, 4])]
# y =  [array([4]), array([6]), array([5])]

x =  [array([3, 4, 5, 6]), array([5, 6, 7, 8]), array([0, 1, 2, 3])]
y =  [array([7]), array([9]), array([4])]
x =  [array([4, 5, 6, 7]), array([1, 2, 3, 4]), array([2, 3, 4, 5])]
y =  [array([8]), array([5]), array([6])]


## A much better way of chaining them together

In [25]:
# Same pipeline as a single chained expression; the surrounding parentheses
# remove the need for backslash line continuations
ndataset2 = (
    tf.data.Dataset.range(10)
    .window(5, shift=1, drop_remainder=True)
    .flat_map(lambda window: window.batch(5))
    .map(lambda window2: (window2[:-1], window2[-1:]))
    .shuffle(buffer_size=10)
    .batch(3)
    .prefetch(1)
)

for x, y in ndataset2:
    print("x = ", list(x.numpy()))
    print("y = ", list(y.numpy()))

# Sample output (the order varies between runs because of the shuffle):
# x =  [array([1, 2, 3, 4]), array([2, 3, 4, 5]), array([5, 6, 7, 8])]
# y =  [array([5]), array([6]), array([9])]
# x =  [array([0, 1, 2, 3]), array([4, 5, 6, 7]), array([3, 4, 5, 6])]
# y =  [array([4]), array([8]), array([7])]

x =  [array([4, 5, 6, 7]), array([3, 4, 5, 6]), array([2, 3, 4, 5])]
y =  [array([8]), array([7]), array([6])]
x =  [array([0, 1, 2, 3]), array([1, 2, 3, 4]), array([5, 6, 7, 8])]
y =  [array([4]), array([5]), array([9])]


## My preferred way:
### Chain the transformations together using a function (makes the code much more readable, easier to understand, and reusable in most cases)

In [26]:
def make_window_dataset(ds, window_size=5, shift=1, batch_size=3, shuffle_buffer=10):
    '''
    Build a shuffled, batched dataset of (inputs, target) windows from ``ds``.

    Args:
      ds: the dataset to window; it is used through its ``window`` method and
        the dataset methods chained after it.
      window_size: number of consecutive elements in each window (default 5).
        The last element of a window becomes the target and the preceding
        ``window_size - 1`` elements become the inputs.
      shift: how many elements the start of the window moves forward between
        consecutive windows (default 1).
      batch_size: number of (inputs, target) pairs per output batch (default 3).
      shuffle_buffer: buffer size passed to ``shuffle`` (default 10).

    Returns:
      The result of chaining window -> flat_map -> map -> shuffle -> batch ->
      prefetch(1) on ``ds``.
    '''
    return (
        ds.window(window_size, shift=shift, drop_remainder=True)
        # Batch by window_size (previously a hard-coded 5) so that each window
        # becomes exactly one tensor whatever window size the caller picks.
        .flat_map(lambda window: window.batch(window_size))
        # All but the last value are the inputs; the last value is the target.
        .map(lambda window: (window[:-1], window[-1:]))
        .shuffle(buffer_size=shuffle_buffer)
        .batch(batch_size)
        .prefetch(1)
    )

In [27]:
# Dataset.apply passes the dataset to make_window_dataset and returns its result
nds3 = tf.data.Dataset.range(10, 20).apply(make_window_dataset)

for x, y in nds3:
    print("x = ", list(x.numpy()))
    print("y = ", list(y.numpy()))

x =  [array([13, 14, 15, 16]), array([11, 12, 13, 14]), array([12, 13, 14, 15])]
y =  [array([17]), array([15]), array([16])]
x =  [array([15, 16, 17, 18]), array([10, 11, 12, 13]), array([14, 15, 16, 17])]
y =  [array([19]), array([14]), array([18])]
