# Import libraries

In [33]:
import tensorflow as tf
import time

In [34]:
# Display the installed TensorFlow version (the recorded output below is from 2.10.1).
tf.__version__

'2.10.1'

# Create a dummy dataset class that simulates slow reads from files

In [35]:
class FileDataset(tf.data.Dataset):
    """Toy dataset that imitates slow, file-backed reads.

    Calling ``FileDataset(num_samples)`` does not build an instance of this
    class: ``__new__`` hands back the dataset produced by
    ``tf.data.Dataset.from_generator``, whose elements are one-element int64
    records.
    """

    def read_files_in_batches(num_samples):
        """Yield ``(index,)`` for every index below ``num_samples``, with artificial delays.

        Looked up on the class (``cls.read_files_in_batches``), so it takes no ``self``.
        """
        # Stand-in for the one-off cost of opening the files.
        time.sleep(0.03)
        for record_idx in range(num_samples):
            # Stand-in for the cost of reading one line/record from the file.
            time.sleep(0.015)
            yield (record_idx,)

    def __new__(cls, num_samples=3):
        """Return a generator-backed dataset producing ``num_samples`` records."""
        # Each element is a length-1 int64 vector, matching the 1-tuples yielded above.
        record_spec = tf.TensorSpec(shape=(1,), dtype=tf.int64)
        return tf.data.Dataset.from_generator(
            cls.read_files_in_batches,
            args=(num_samples,),
            output_signature=record_spec,
        )

In [36]:
def benchmark(dataset, num_epochs=2):
    """Walk through ``dataset`` ``num_epochs`` times, pausing 10 ms per element.

    The pause stands in for a training step; the elements themselves are
    ignored and nothing is returned.
    """
    for _epoch in range(num_epochs):
        for _element in dataset:
            # Simulated training step.
            time.sleep(0.01)

# Time taken to iterate over FileDataset (baseline, no prefetch)

In [37]:
%%timeit
# Baseline: run the benchmark on a fresh FileDataset with no prefetching.
benchmark(FileDataset())

369 ms ± 9.66 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


# Time taken to iterate over FileDataset using prefetch

In [38]:
%%timeit
# Same benchmark, with prefetch(1) applied to the dataset.
benchmark(FileDataset().prefetch(1))

305 ms ± 14.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [39]:
%%timeit
# Same benchmark, passing tf.data.AUTOTUNE instead of a fixed prefetch size.
benchmark(FileDataset().prefetch(tf.data.AUTOTUNE))

310 ms ± 8.29 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


# Create a dummy dataset

In [40]:
# Dataset of the integers 0..4; print each element as a NumPy value.
dataset=tf.data.Dataset.range(5)
for d in dataset:
    print(d.numpy())

0
1
2
3
4


# Lambda function on dummy dataset

In [41]:
# Square every element with map() and print the results.
# NOTE(review): this rebinds `dataset`, so re-running this cell squares the
# already-squared values again; re-run the cell that creates the range dataset first.
dataset=dataset.map(lambda x:x**2)
for d in dataset:
    print(d.numpy())

0
1
4
9
16


# Using cache function

In [42]:
# Wrap the squared dataset in cache() and read it once in full as NumPy values.
dataset=dataset.cache()
list(dataset.as_numpy_iterator())

[0, 1, 4, 9, 16]

In [43]:
# Second full pass over the same cached dataset; the values are unchanged.
list(dataset.as_numpy_iterator())

[0, 1, 4, 9, 16]

In [44]:
def mapped_function(s):
    """Return ``s`` unchanged after a simulated 30 ms preprocessing step.

    The delay runs as Python code wrapped in ``tf.py_function`` with no inputs
    and no outputs; only its side effect (the sleep) matters.
    """
    def _simulate_preprocessing():
        time.sleep(0.03)

    tf.py_function(_simulate_preprocessing, [], ())
    return s

In [45]:
# Build the mapped dataset without iterating it; the cell output is its repr with the element spec.
FileDataset().map(mapped_function)

<MapDataset element_spec=TensorSpec(shape=(1,), dtype=tf.int64, name=None)>

In [46]:
%%timeit -n1 -r1

# Five epochs with the map step and no cache (one timed run, one loop).
benchmark(FileDataset().map(mapped_function),5)

1.35 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [47]:
%%timeit -n1 -r1

# Same five epochs, with cache() applied after the map step (one timed run, one loop).
benchmark(FileDataset().map(mapped_function).cache(),5)

557 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
