<a href="https://colab.research.google.com/github/RifatMuhtasim/Deep_Learning/blob/main/Learn/45.Tensorflow_Prefetch_Cache.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import tensorflow as tf
import numpy as np
import time

In [2]:
tf.__version__

'2.15.0'

# Prefetch

In [3]:
class FileDataset(tf.data.Dataset):
  """Toy dataset whose generator sleeps before and between samples.

  Calling ``FileDataset(num_samples)`` does not produce a ``FileDataset``
  instance: ``__new__`` returns the dataset built by
  ``tf.data.Dataset.from_generator`` around ``read_files_in_batches``.
  """

  @staticmethod
  def read_files_in_batches(num_samples):
    """Yield the one-element tuples ``(0,)``, ``(1,)``, ... ``(num_samples - 1,)``.

    Sleeps 30 ms once up front and 15 ms before each sample.
    Declared as a static method because it takes no ``self``/``cls``; without
    the decorator it would only work when looked up on the class itself.

    Args:
      num_samples: Number of samples to generate.

    Yields:
      A 1-tuple holding the index of the current sample.
    """
    # One-off start-up delay before the first sample
    time.sleep(0.03)
    for sample_idx in range(num_samples):
      # Reading data (line, record) from the file
      time.sleep(0.015)
      yield (sample_idx, )

  def __new__(cls, num_samples=3):
    """Build the generator-backed dataset.

    Args:
      num_samples: Number of samples each pass over the dataset produces.

    Returns:
      The dataset returned by ``tf.data.Dataset.from_generator``, whose
      elements are declared as ``int64`` tensors of shape ``(1,)``.
    """
    return tf.data.Dataset.from_generator(
        cls.read_files_in_batches,
        output_signature = tf.TensorSpec(shape=(1,), dtype=tf.int64),
        args = (num_samples, )
    )

In [4]:
def benchmark(dataset, num_epochs=2):
  """Iterate over `dataset` `num_epochs` times, sleeping 10 ms per sample.

  Args:
    dataset: Any iterable; it is iterated from the start once per epoch.
    num_epochs: How many full passes to make over `dataset`.

  Returns:
    None. The function is only useful for its running time.
  """
  for _ in range(num_epochs):
    for _ in dataset:
      # Fixed per-sample delay (presumably stands in for a training step)
      time.sleep(0.01)

In [5]:
%%timeit
# Baseline: iterate the generator-backed dataset with no prefetching.
benchmark(FileDataset())

371 ms ± 82.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [6]:
%%timeit
# Same benchmark, with prefetch(1) applied to the dataset.
benchmark(FileDataset().prefetch(1))

433 ms ± 87.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [7]:
%%timeit
# Same benchmark, with the prefetch buffer size left to tf.data.AUTOTUNE.
benchmark(FileDataset().prefetch(tf.data.AUTOTUNE))

322 ms ± 43.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


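As the timings above show, prefetching with `tf.data.AUTOTUNE` gave the lowest mean run time (322 ms, versus 371 ms without prefetching), whereas `prefetch(1)` was not faster in this run (433 ms). The differences are within the reported run-to-run variation, so treat them as indicative rather than conclusive.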

# Cache

In [8]:
# Square the integers 0..4, then cache the mapped elements using "mycache.txt".
dataset = (
    tf.data.Dataset.range(5)
    .map(lambda x: x**2)
    .cache("mycache.txt")
)
list(dataset.as_numpy_iterator())

[0, 1, 4, 9, 16]

In [9]:
def mapped_function(s):
  """Return `s` unchanged after invoking a 30 ms sleep through `tf.py_function`.

  Args:
    s: The dataset element being mapped.

  Returns:
    `s`, untouched. The output of the `tf.py_function` call is discarded.
  """
  def _sleep_briefly():
    # Fixed delay (presumably stands in for expensive pre-processing)
    time.sleep(0.03)

  tf.py_function(_sleep_briefly, [], ())
  return s

In [10]:
%%timeit -r1 -n1
# Five epochs over the mapped dataset with no cache; timed once (-r1 -n1).
benchmark(FileDataset().map(mapped_function), 5)

1.07 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [11]:
%%timeit -r1 -n1
# Same five epochs, with cache() (no filename) applied after the map; timed once.
benchmark(FileDataset().map(mapped_function).cache(), 5)

366 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
