Data pipelines work on the principle of ETL, which stands for extract, transform, load.

tfds and tf.data are used

TFDS and tf.data in a nutshell

```python
import tensorflow as tf
import tensorflow_datasets as tfds

# construct a tf.data.Dataset by downloading and extracting
dataset = tfds.load(name = "mnist", split = "train")

# Transform
dataset = dataset.shuffle(NUM_SAMPLES) # buffer size
dataset = dataset.repeat(NUM_EPOCHS)
dataset = dataset.map(lambda x : ...)
dataset = dataset.batch(BATCH_SIZE)

# Load
iterator = dataset.take(10) # To fetch 10 samples from the dataset
for data in iterator:
    # Access data to use it
```

In [9]:
import tensorflow as tf
import tensorflow_datasets as tfds

dataset = tfds.load(name = "mnist")
#dataset = tfds.load(name = "mnist", split = "train")



In [17]:
#Inspecting shapes and datatypes
dataset['train']

<_PrefetchDataset element_spec={'image': TensorSpec(shape=(28, 28, 1), dtype=tf.uint8, name=None), 'label': TensorSpec(shape=(), dtype=tf.int64, name=None)}>

In [18]:
# Checking if the dataset is an instance of tf.data.Dataset
assert isinstance(dataset['train'], tf.data.Dataset)

In [19]:
# See available datasets
tfds.list_builders()

['abstract_reasoning',
 'accentdb',
 'aeslc',
 'aflw2k3d',
 'ag_news_subset',
 'ai2_arc',
 'ai2_arc_with_ir',
 'amazon_us_reviews',
 'anli',
 'answer_equivalence',
 'arc',
 'asqa',
 'asset',
 'assin2',
 'asu_table_top_converted_externally_to_rlds',
 'austin_buds_dataset_converted_externally_to_rlds',
 'austin_sailor_dataset_converted_externally_to_rlds',
 'austin_sirius_dataset_converted_externally_to_rlds',
 'bair_robot_pushing_small',
 'bc_z',
 'bccd',
 'beans',
 'bee_dataset',
 'beir',
 'berkeley_autolab_ur5',
 'berkeley_cable_routing',
 'berkeley_fanuc_manipulation',
 'berkeley_gnm_cory_hall',
 'berkeley_gnm_recon',
 'berkeley_gnm_sac_son',
 'berkeley_mvp_converted_externally_to_rlds',
 'berkeley_rpt_converted_externally_to_rlds',
 'big_patent',
 'bigearthnet',
 'billsum',
 'binarized_mnist',
 'binary_alpha_digits',
 'ble_wind_field',
 'blimp',
 'booksum',
 'bool_q',
 'bot_adversarial_dialogue',
 'bridge',
 'bucc',
 'c4',
 'c4_wsrs',
 'caltech101',
 'caltech_birds2010',
 'caltech_b

In [20]:
# Viewing a datasets metadata
mnist, info = tfds.load(name="mnist", with_info =True)
info

tfds.core.DatasetInfo(
    name='mnist',
    full_name='mnist/3.0.1',
    description="""
    The MNIST database of handwritten digits.
    """,
    homepage='http://yann.lecun.com/exdb/mnist/',
    data_dir='/Users/siva/tensorflow_datasets/mnist/3.0.1',
    file_format=tfrecord,
    download_size=11.06 MiB,
    dataset_size=21.00 MiB,
    features=FeaturesDict({
        'image': Image(shape=(28, 28, 1), dtype=uint8),
        'label': ClassLabel(shape=(), dtype=int64, num_classes=10),
    }),
    supervised_keys=('image', 'label'),
    disable_shuffling=False,
    splits={
        'test': <SplitInfo num_examples=10000, num_shards=1>,
        'train': <SplitInfo num_examples=60000, num_shards=1>,
    },
    citation="""@article{lecun2010mnist,
      title={MNIST handwritten digit database},
      author={LeCun, Yann and Cortes, Corinna and Burges, CJ},
      journal={ATT Labs [Online]. Available: http://yann.lecun.com/exdb/mnist},
      volume={2},
      year={2010}
    }""",
)

In [22]:
print('Image features:', info.features['image'])

Image features: Image(shape=(28, 28, 1), dtype=uint8)


In [24]:
print('Number of training examples', info.splits['train'].num_examples)
print('Number of test examples', info.splits['test'].num_examples)

Number of training examples 60000
Number of test examples 10000


### Loding a specific version
```python
mnist = tfds.load("mnist:1.*.*")
```

In [26]:
# Loading a dataset(as_supervised = True)
'''
If you do this with as supervised equals true, 
then your dataset will be preformatted into tuples of data and label.

If you set as a false, 
your dataset will be available as a dictionary.
'''

dataset = tfds.load('mnist', as_supervised = True)

# Inspecting shapes of a batch
for image , label in dataset['train'].take(1):
    print("Supervised_True:", image.shape, label.shape)
    
dataset = tfds.load('mnist')

# Inspecting shapes of a batch
for image , label in dataset['train'].take(1):
    print("Supervised_False:", image.shape)

Supervised_True: (28, 28, 1) ()


2024-03-12 10:55:39.982397: W tensorflow/core/kernels/data/cache_dataset_ops.cc:858] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.
2024-03-12 10:55:39.983643: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
2024-03-12 10:55:40.085220: W tensorflow/core/kernels/data/cache_dataset_ops.cc:858] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.

AttributeError: 'str' object has no attribute 'shape'

```Python
# Using splits

train_set = tfds.load(name, split=tfds.Split.TRAIN)
validation_set = tfds.load(name, split=tfds.Split.VALIDATION)
test_set = tfds.load(name, split=tfds.Split.TEST)
all_set = tfds.load(name, split=tfds.Split.ALL)
```

```Python
# Non conventional naming slpits
split = tfds.Split('test2015')
ds = tfds.load('coco2014', split=split)
```

In [None]:
# Dataset Builder: Explicit call instead of tfds.load method

mnist_builder = tfds.builder("mnist")

mnist_builder.download_and_prepare()

mnist_builder.as_dataset(split= tfds.Split.TRAIN)
