<h1 align="center">Tensorflow Input Pipeline
</h1>

In [268]:
import tensorflow as tf

# Create tf dataset from a list

In [270]:
dataset=[12,34,-54,34,65,-45,34,65]
df=tf.data.Dataset.from_tensor_slices(dataset)
df

<_TensorSliceDataset element_spec=TensorSpec(shape=(), dtype=tf.int32, name=None)>

# Iterate through tf dataset

In [201]:
for sales in df.as_numpy_iterator():
    print(sales)

12
34
-54
34
65
-45
34
65


In [203]:
df=df.filter(lambda x:x>0)
for sales in df.as_numpy_iterator():
    print(sales)

12
34
34
65
34
65


# Iterate through elements as numpy elements

In [204]:
df=df.map(lambda x:x*5)
for sales in df.as_numpy_iterator():
    print(sales)

60
170
170
325
170
325


In [207]:
df=df.shuffle(2)
for sales in df.as_numpy_iterator():
    print(sales)

60
170
170
170
325
325


In [208]:
df=df.batch(2)
for sales in df.as_numpy_iterator():
    print(sales)


[170  60]
[325 170]
[325 170]


# Convert sales numbers from USA dollars ($) to PKR Rupees (INR) Assuming 1->285 conversation rate

In [384]:
daily_sales_numbers = [21, 22, -108, -1, 32, 34,31]
tf_dataset = tf.data.Dataset.from_tensor_slices(daily_sales_numbers)

tf_dataset = tf_dataset.filter(lambda x: x>0).map(lambda y: y*285).shuffle(2).batch(2)
for sales in tf_dataset.as_numpy_iterator():
    print(sales)

[5985 6270]
[9120 8835]
[9690]


In [386]:
images_ds = tf.data.Dataset.list_files('images/*/*', shuffle=False)


In [387]:
image_counts=len(images_ds)
image_counts

3475

In [388]:
print(type(images_ds))

<class 'tensorflow.python.data.ops.from_tensor_slices_op._TensorSliceDataset'>


In [389]:
for file in images_ds.take(3):
    print(file.numpy())

b'images\\Pepper__bell___Bacterial_spot\\0022d6b7-d47c-4ee2-ae9a-392a53f48647___JR_B.Spot 8964.JPG'
b'images\\Pepper__bell___Bacterial_spot\\006adb74-934f-448f-a14f-62181742127b___JR_B.Spot 3395.JPG'
b'images\\Pepper__bell___Bacterial_spot\\00f2e69a-1e56-412d-8a79-fdce794a17e4___JR_B.Spot 3132.JPG'


# Perform all of the above operations in one shot

In [390]:
images_ds = images_ds.shuffle(200)
for file in images_ds.take(3):
    print(file.numpy())

b'images\\Pepper__bell___Bacterial_spot\\197fdd19-46d1-46f7-8e79-aa9545f76ff0___JR_B.Spot 9000.JPG'
b'images\\Pepper__bell___Bacterial_spot\\2a8a3b2f-d65a-416b-a251-63bb89f448c3___JR_B.Spot 3322.JPG'
b'images\\Pepper__bell___Bacterial_spot\\264753e7-97b9-4f1e-a1c2-a9239eb691b9___JR_B.Spot 8866.JPG'


In [393]:
class_names = ["Early","Normal","Old"]


In [394]:
train_size = int(image_counts*0.8)
train_ds = images_ds.take(train_size)
test_ds = images_ds.skip(train_size)

In [289]:
len(train_ds)


2780

In [290]:
len(test_ds)


695

In [291]:
def get_label(file_path):
    import os
    parts = tf.strings.split(file_path, os.path.sep)
    return parts[-2]

In [292]:
get_label("images\\Pepper__bell___Bacterial_spot\\11092e1c-9fdc-4405-85b9-a8d7548f12bb___JR_B.Spot 9051.JPG")


<tf.Tensor: shape=(), dtype=string, numpy=b'Pepper__bell___Bacterial_spot'>

In [293]:
def process_image(file_path):
    label = get_label(file_path)
    img = tf.io.read_file(file_path) # load the raw data from the file as a string
    img = tf.image.decode_jpeg(img)
    img = tf.image.resize(img, [128, 128])
    return img, label

In [294]:
img, label = process_image("images\\Pepper__bell___Bacterial_spot\\11092e1c-9fdc-4405-85b9-a8d7548f12bb___JR_B.Spot 9051.JPG")
img.numpy()[:2]

array([[[153.5 , 139.5 , 138.5 ],
        [142.25, 128.25, 127.25],
        [143.5 , 129.5 , 128.5 ],
        [142.25, 128.25, 127.25],
        [138.  , 124.  , 123.  ],
        [151.  , 137.  , 136.  ],
        [156.25, 142.25, 141.25],
        [162.25, 148.25, 147.25],
        [148.75, 134.75, 133.75],
        [143.25, 129.25, 128.25],
        [151.75, 137.75, 136.75],
        [141.  , 127.  , 126.  ],
        [147.25, 133.25, 132.25],
        [150.25, 136.25, 135.25],
        [145.5 , 131.5 , 130.5 ],
        [148.  , 134.  , 133.  ],
        [153.5 , 139.5 , 138.5 ],
        [159.5 , 145.5 , 144.5 ],
        [153.  , 139.  , 138.  ],
        [143.5 , 129.5 , 128.5 ],
        [156.  , 142.  , 141.  ],
        [155.  , 141.  , 140.  ],
        [155.25, 141.25, 140.25],
        [155.75, 141.75, 140.75],
        [147.25, 133.25, 132.25],
        [148.  , 134.  , 133.  ],
        [156.25, 142.25, 141.25],
        [149.  , 135.  , 134.  ],
        [147.75, 133.75, 132.75],
        [142.7

In [295]:
train_ds = train_ds.map(process_image)
test_ds = test_ds.map(process_image)

In [296]:
for image, label in train_ds.take(1):
    print("****",image)
    print("****",label)

**** tf.Tensor(
[[[169.5  156.5  147.5 ]
  [175.75 162.75 153.75]
  [177.   164.   155.  ]
  ...
  [166.   148.   136.  ]
  [155.25 137.25 125.25]
  [146.25 128.25 116.25]]

 [[179.5  166.5  157.5 ]
  [187.75 174.75 165.75]
  [180.75 167.75 158.75]
  ...
  [148.   130.   118.  ]
  [166.75 148.75 136.75]
  [147.25 129.25 117.25]]

 [[177.75 164.75 155.75]
  [168.5  155.5  146.5 ]
  [181.5  168.5  159.5 ]
  ...
  [151.75 133.75 121.75]
  [151.25 133.25 121.25]
  [138.5  120.5  108.5 ]]

 ...

 [[171.75 155.75 142.75]
  [169.5  153.5  140.5 ]
  [168.   152.   139.  ]
  ...
  [150.5  129.5  110.5 ]
  [169.   148.   129.  ]
  [168.25 147.25 128.25]]

 [[177.25 161.25 148.25]
  [180.75 164.75 151.75]
  [170.75 154.75 141.75]
  ...
  [155.   134.   115.  ]
  [160.25 139.25 120.25]
  [163.5  142.5  123.5 ]]

 [[176.75 160.75 147.75]
  [172.   156.   143.  ]
  [171.   155.   142.  ]
  ...
  [153.5  132.5  113.5 ]
  [161.75 140.75 121.75]
  [152.25 131.25 112.25]]], shape=(128, 128, 3), dtype=fl

In [297]:
def scale(image, label):
    return image/255, label

In [298]:
train_ds = train_ds.map(scale)


In [299]:
for image, label in train_ds.take(5):
    print("****Image: ",image.numpy()[0][0])
    print("****Label: ",label.numpy())

****Image:  [0.5833333  0.5441176  0.54019606]
****Label:  b'Pepper__bell___Bacterial_spot'
****Image:  [0.7676471 0.7441176 0.7519608]
****Label:  b'Pepper__bell___Bacterial_spot'
****Image:  [0.6607843 0.6137255 0.6137255]
****Label:  b'Pepper__bell___Bacterial_spot'
****Image:  [0.65392154 0.6186274  0.59313726]
****Label:  b'Pepper__bell___Bacterial_spot'
****Image:  [0.59705883 0.5735294  0.5892157 ]
****Label:  b'Pepper__bell___Bacterial_spot'


<h1 align="center">Optimize Tensorflow Pipeline Performance: prefetch & cache  Deep Learning</h1>

In [342]:
import tensorflow as tf
import time

In [344]:
tf.__version__

'2.17.0'

# Prefetch

In [350]:
class FileDataset(tf.data.Dataset):
    def read_file_in_batches(num_samples):
        # Opening the file
        time.sleep(0.03)

        for sample_idx in range(num_samples):
            # Reading data (line, record) from the file
            time.sleep(0.015)

            yield (sample_idx,)

    def __new__(cls, num_samples=3):
        return tf.data.Dataset.from_generator(
            cls.read_file_in_batches,
            output_signature = tf.TensorSpec(shape = (1,), dtype = tf.int64),
            args=(num_samples,)
        )

In [354]:
def benchmark(dataset, num_epochs=2):
    for epoch_num in range(num_epochs):
        for sample in dataset:
            # Performing a training step
            time.sleep(0.01)


In [356]:
%%timeit
benchmark(FileDataset())

598 ms ± 117 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [412]:
%%timeit
benchmark(FileDataset().prefetch(1))

346 ms ± 41.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [360]:
%%timeit
benchmark(FileDataset().prefetch(tf.data.AUTOTUNE))

531 ms ± 43.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


# As you can notice above, using prefetch improves the performance from 598 ms to 414 and 531 ms

# Cache

In [362]:
dataset = tf.data.Dataset.range(5)
dataset = dataset.map(lambda x: x**2)
dataset = dataset.cache("mycache.txt")
# The first time reading through the data will generate the data using
# `range` and `map`.
list(dataset.as_numpy_iterator())

[0, 1, 4, 9, 16]

In [364]:
# Subsequent iterations read from the cache.
list(dataset.as_numpy_iterator())

[0, 1, 4, 9, 16]

In [366]:
def mapped_function(s):
    # Do some hard pre-processing
    tf.py_function(lambda: time.sleep(0.03), [], ())
    return s

In [368]:
%%timeit -r1 -n1
benchmark(FileDataset().map(mapped_function), 5)

2.07 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [370]:
%%timeit -r1 -n1
benchmark(FileDataset().map(mapped_function).cache(), 5)

530 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
