# The tf.data API

In [3]:
import tensorflow as tf

In [4]:
# Slice a 1-D tensor along its first axis: each scalar becomes one element.
X = tf.constant([1, 2, 3, 4, 5, 6, 7, 8, 9])
dataset = tf.data.Dataset.from_tensor_slices(X)

In [5]:
# Show the dataset's repr, which reports the element_spec (shape and dtype).
dataset

<_TensorSliceDataset element_spec=TensorSpec(shape=(), dtype=tf.int32, name=None)>

In [6]:
# Iterating over the dataset yields its elements one at a time as scalar tensors.
for item in dataset:
    print(item)

tf.Tensor(1, shape=(), dtype=int32)
tf.Tensor(2, shape=(), dtype=int32)
tf.Tensor(3, shape=(), dtype=int32)
tf.Tensor(4, shape=(), dtype=int32)
tf.Tensor(5, shape=(), dtype=int32)
tf.Tensor(6, shape=(), dtype=int32)
tf.Tensor(7, shape=(), dtype=int32)
tf.Tensor(8, shape=(), dtype=int32)
tf.Tensor(9, shape=(), dtype=int32)


2025-07-24 14:50:58.974589: I tensorflow/core/framework/local_rendezvous.cc:407] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


In [7]:
# tf.range(10) holds the integers 0..9; slicing gives one scalar per element.
X = tf.range(10)
dataset = tf.data.Dataset.from_tensor_slices(X)
dataset

<_TensorSliceDataset element_spec=TensorSpec(shape=(), dtype=tf.int32, name=None)>

In [8]:
# Walk through the ten elements produced from tf.range(10).
for item in dataset:
    print(item)

tf.Tensor(0, shape=(), dtype=int32)
tf.Tensor(1, shape=(), dtype=int32)
tf.Tensor(2, shape=(), dtype=int32)
tf.Tensor(3, shape=(), dtype=int32)
tf.Tensor(4, shape=(), dtype=int32)
tf.Tensor(5, shape=(), dtype=int32)
tf.Tensor(6, shape=(), dtype=int32)
tf.Tensor(7, shape=(), dtype=int32)
tf.Tensor(8, shape=(), dtype=int32)
tf.Tensor(9, shape=(), dtype=int32)


2025-07-24 14:50:58.985799: I tensorflow/core/framework/local_rendezvous.cc:407] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


In [9]:
# Dataset.range() builds the sequence directly; note that its elements are
# int64, whereas slicing tf.range(...) above produced int32 elements.
dataset = tf.data.Dataset.range(3)
dataset

<_RangeDataset element_spec=TensorSpec(shape=(), dtype=tf.int64, name=None)>

In [10]:
# Datasets are iterable but not indexable: subscripting raises TypeError,
# which is caught here so the message is shown instead of a traceback.
try:
    dataset[3]
except TypeError as err:
    print(err)


'_RangeDataset' object is not subscriptable


### tf.data Dataset with Structured Data

#### Tuple of Tensors

In [13]:
# Passing a tuple slices every tensor along its first axis in lockstep, so
# element i is the pair (X[i], y[i]).
X = tf.constant([[1, 2], [3, 4], [5, 6]])
y = tf.constant([0, 1, 0])
dataset = tf.data.Dataset.from_tensor_slices((X, y))

In [14]:
# Each element is a (feature row, label) tuple of tensors.
for item in dataset:
    print(item)

(<tf.Tensor: shape=(2,), dtype=int32, numpy=array([1, 2], dtype=int32)>, <tf.Tensor: shape=(), dtype=int32, numpy=0>)
(<tf.Tensor: shape=(2,), dtype=int32, numpy=array([3, 4], dtype=int32)>, <tf.Tensor: shape=(), dtype=int32, numpy=1>)
(<tf.Tensor: shape=(2,), dtype=int32, numpy=array([5, 6], dtype=int32)>, <tf.Tensor: shape=(), dtype=int32, numpy=0>)


#### Dictionary of Tensors

In [16]:
# Every key maps to a tensor with the same leading dimension; slicing yields
# one dict per element holding the matching entry of each feature.
inputs = {
    "feature1": tf.constant([1.0, 2.0, 3.0]),
    "feature2": tf.constant([10.0, 20.0, 30.0]),
}

dataset = tf.data.Dataset.from_tensor_slices(inputs)

In [17]:
# Each element is a dict with the same keys as `inputs`, one scalar per key.
for item in dataset:
    print(item)

{'feature1': <tf.Tensor: shape=(), dtype=float32, numpy=1.0>, 'feature2': <tf.Tensor: shape=(), dtype=float32, numpy=10.0>}
{'feature1': <tf.Tensor: shape=(), dtype=float32, numpy=2.0>, 'feature2': <tf.Tensor: shape=(), dtype=float32, numpy=20.0>}
{'feature1': <tf.Tensor: shape=(), dtype=float32, numpy=3.0>, 'feature2': <tf.Tensor: shape=(), dtype=float32, numpy=30.0>}


2025-07-24 14:50:59.035704: I tensorflow/core/framework/local_rendezvous.cc:407] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


#### Nested Tuples

In [19]:
# The nesting of the input structure is preserved: every element has the
# form ((X[i], meta[i]), y[i]).
X = tf.constant([[1, 2], [3, 4], [5, 6]])
y = tf.constant([0, 1, 0])
meta = tf.constant([100, 200, 300])

dataset = tf.data.Dataset.from_tensor_slices(((X, meta), y))

In [20]:
# Print each nested element followed by a blank line for readability.
for item in dataset:
    print(item, "\n")

((<tf.Tensor: shape=(2,), dtype=int32, numpy=array([1, 2], dtype=int32)>, <tf.Tensor: shape=(), dtype=int32, numpy=100>), <tf.Tensor: shape=(), dtype=int32, numpy=0>) 

((<tf.Tensor: shape=(2,), dtype=int32, numpy=array([3, 4], dtype=int32)>, <tf.Tensor: shape=(), dtype=int32, numpy=200>), <tf.Tensor: shape=(), dtype=int32, numpy=1>) 

((<tf.Tensor: shape=(2,), dtype=int32, numpy=array([5, 6], dtype=int32)>, <tf.Tensor: shape=(), dtype=int32, numpy=300>), <tf.Tensor: shape=(), dtype=int32, numpy=0>) 



## Chaining Transformations

In [22]:
dataset = (
    tf.data.Dataset.from_tensor_slices(tf.range(10))
    .repeat(3)  # 30 elements in total: 0..9 three times over
    .batch(7)   # groups of 7; the final batch holds the 2 leftover elements
)

In [23]:
# Because repeat() comes before batch(), batches straddle the boundary
# between repetitions, and only the very last batch is shorter.
for item in dataset:
    print(item)

tf.Tensor([0 1 2 3 4 5 6], shape=(7,), dtype=int32)
tf.Tensor([7 8 9 0 1 2 3], shape=(7,), dtype=int32)
tf.Tensor([4 5 6 7 8 9 0], shape=(7,), dtype=int32)
tf.Tensor([1 2 3 4 5 6 7], shape=(7,), dtype=int32)
tf.Tensor([8 9], shape=(2,), dtype=int32)


In [24]:
# map() is applied after batch(), so the function receives a whole batch.
dataset = dataset.map(lambda batch: batch * 2)
for item in dataset:
    print(item)

tf.Tensor([ 0  2  4  6  8 10 12], shape=(7,), dtype=int32)
tf.Tensor([14 16 18  0  2  4  6], shape=(7,), dtype=int32)
tf.Tensor([ 8 10 12 14 16 18  0], shape=(7,), dtype=int32)
tf.Tensor([ 2  4  6  8 10 12 14], shape=(7,), dtype=int32)
tf.Tensor([16 18], shape=(2,), dtype=int32)


In [25]:
# Keep only the batches whose elements sum to more than 50 (the predicate is
# evaluated per batch, not per individual value).
dataset = dataset.filter(lambda batch: tf.reduce_sum(batch) > 50)
for item in dataset:
    print(item)

tf.Tensor([14 16 18  0  2  4  6], shape=(7,), dtype=int32)
tf.Tensor([ 8 10 12 14 16 18  0], shape=(7,), dtype=int32)
tf.Tensor([ 2  4  6  8 10 12 14], shape=(7,), dtype=int32)


2025-07-24 14:50:59.109136: I tensorflow/core/framework/local_rendezvous.cc:407] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


In [26]:
# take(2) limits iteration to the first two elements (here, batches).
for item in dataset.take(2):
    print(item)

tf.Tensor([14 16 18  0  2  4  6], shape=(7,), dtype=int32)
tf.Tensor([ 8 10 12 14 16 18  0], shape=(7,), dtype=int32)


In [27]:
import time

def slow_map_fn(x):
    """Return ``x * x`` after logging the input and pausing for one second."""
    tf.print("Processing", x)
    time.sleep(1)  # simulate slow processing
    return x * x

# tf.py_function wraps the plain-Python function so it can be used in map().
# With AUTOTUNE several elements are processed concurrently, which is why the
# "Processing" messages appear out of order while results stay in order.
dataset = tf.data.Dataset.range(10)
dataset = dataset.map(
    lambda x: tf.py_function(slow_map_fn, [x], tf.int64),
    num_parallel_calls=tf.data.AUTOTUNE,
)
for item in dataset:
    print(item.numpy())

Processing 5
Processing 2
Processing 6
Processing 1
Processing 3
Processing 4
Processing 0
Processing 7
0
Processing 8
1
Processing 9
4
9
16
25
36
49
64
81


## Shuffling the data

In [29]:
# shuffle() comes after batch(), so whole batches are reordered while the
# contents of each batch keep their original order.
dataset = tf.data.Dataset.range(10).repeat(2).batch(7)
dataset = dataset.shuffle(buffer_size=4)
for item in dataset:
    print(item)

tf.Tensor([4 5 6 7 8 9], shape=(6,), dtype=int64)
tf.Tensor([7 8 9 0 1 2 3], shape=(7,), dtype=int64)
tf.Tensor([0 1 2 3 4 5 6], shape=(7,), dtype=int64)


In [30]:
# shuffle() comes before batch(), so individual elements are shuffled (using
# a 10-element buffer) and only then grouped into batches of 7.
dataset = tf.data.Dataset.range(20)
dataset = dataset.shuffle(buffer_size=10).batch(7)
for item in dataset:
    print(item)

tf.Tensor([ 7  0 11  2  3  5 13], shape=(7,), dtype=int64)
tf.Tensor([16 12  4 17  1 18 14], shape=(7,), dtype=int64)
tf.Tensor([ 8  9 10  6 15 19], shape=(6,), dtype=int64)


## Interleaving lines from multiple files

In [32]:
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split

housing = fetch_california_housing()

# The target is reshaped into a single column before splitting. The same
# random_state is used for both splits so the partition is reproducible.
X_train_full, X_test, y_train_full, y_test = train_test_split(
    housing.data,
    housing.target.reshape(-1, 1),
    random_state=42,
)
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train_full,
    y_train_full,
    random_state=42,
)

In [33]:
from pathlib import Path
import numpy as np

def save_to_csv_files(data, name_prefix, header=None, n_parts=10):
    """Split ``data`` row-wise into ``n_parts`` CSV files and return their paths.

    Files are written to ``datasets/housing`` (relative to the current working
    directory, created if missing) and named ``my_<name_prefix>_<NN>.csv``.

    Args:
        data: Indexable 2-D collection of rows; every value is written via ``str``.
        name_prefix: Label inserted into each file name.
        header: Optional header line written at the top of every file.
        n_parts: Number of files; rows are split with ``np.array_split``, so
            part sizes differ by at most one row.

    Returns:
        List of the written file paths (as strings), in part order.
    """
    out_dir = Path() / "datasets" / "housing"
    out_dir.mkdir(parents=True, exist_ok=True)

    written = []
    index_chunks = np.array_split(np.arange(len(data)), n_parts)
    for part_no, indices in enumerate(index_chunks):
        target = out_dir / "my_{}_{:02d}.csv".format(name_prefix, part_no)
        written.append(str(target))
        with open(target, "w") as fh:
            if header is not None:
                fh.write(header + "\n")
            for i in indices:
                fh.write(",".join(map(str, data[i])) + "\n")
    return written

# Append the target as the last column of each split, so every CSV row holds
# the features followed by the label.
train_data = np.c_[X_train, y_train]
valid_data = np.c_[X_valid, y_valid]
test_data = np.c_[X_test, y_test]

# Column names for the header line: the feature names plus the target column.
header_cols = housing.feature_names + ["MedianHouseValue"]
header = ",".join(header_cols)


In [34]:
# The training split is spread over 20 files, the smaller splits over 10 each.
train_filepaths = save_to_csv_files(train_data, "train", header, n_parts=20)
valid_filepaths = save_to_csv_files(valid_data, "valid", header, n_parts=10)
test_filepaths = save_to_csv_files(test_data, "test", header, n_parts=10)

In [35]:
# Preview the header and the first three data rows of the first training
# file. The context manager closes the file handle instead of leaking it.
with open(train_filepaths[0]) as f:
    print("".join(f.readlines()[:4]))

MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedianHouseValue
3.5214,15.0,3.0499445061043287,1.106548279689234,1447.0,1.6059933407325193,37.63,-122.43,1.442
5.3275,5.0,6.490059642147117,0.9910536779324056,3464.0,3.4433399602385686,33.69,-117.39,1.687
3.1,29.0,7.5423728813559325,1.5915254237288134,1328.0,2.2508474576271187,38.44,-122.98,1.621



In [36]:
# List the 20 training file paths returned by save_to_csv_files.
train_filepaths

['datasets/housing/my_train_00.csv',
 'datasets/housing/my_train_01.csv',
 'datasets/housing/my_train_02.csv',
 'datasets/housing/my_train_03.csv',
 'datasets/housing/my_train_04.csv',
 'datasets/housing/my_train_05.csv',
 'datasets/housing/my_train_06.csv',
 'datasets/housing/my_train_07.csv',
 'datasets/housing/my_train_08.csv',
 'datasets/housing/my_train_09.csv',
 'datasets/housing/my_train_10.csv',
 'datasets/housing/my_train_11.csv',
 'datasets/housing/my_train_12.csv',
 'datasets/housing/my_train_13.csv',
 'datasets/housing/my_train_14.csv',
 'datasets/housing/my_train_15.csv',
 'datasets/housing/my_train_16.csv',
 'datasets/housing/my_train_17.csv',
 'datasets/housing/my_train_18.csv',
 'datasets/housing/my_train_19.csv']

In [37]:
# List the 10 test file paths returned by save_to_csv_files.
test_filepaths

['datasets/housing/my_test_00.csv',
 'datasets/housing/my_test_01.csv',
 'datasets/housing/my_test_02.csv',
 'datasets/housing/my_test_03.csv',
 'datasets/housing/my_test_04.csv',
 'datasets/housing/my_test_05.csv',
 'datasets/housing/my_test_06.csv',
 'datasets/housing/my_test_07.csv',
 'datasets/housing/my_test_08.csv',
 'datasets/housing/my_test_09.csv']

#### Building an Input Pipeline

In [39]:
# Turn the path list into a dataset of file names. The names come back in a
# shuffled order, and the seed keeps that order reproducible.
filepath_dataset = tf.data.Dataset.list_files(train_filepaths, seed=42)

In [40]:
# The file names are yielded as string tensors, not in their original list order.
for filepath in filepath_dataset:
    print(filepath)

tf.Tensor(b'datasets/housing/my_train_05.csv', shape=(), dtype=string)
tf.Tensor(b'datasets/housing/my_train_16.csv', shape=(), dtype=string)
tf.Tensor(b'datasets/housing/my_train_01.csv', shape=(), dtype=string)
tf.Tensor(b'datasets/housing/my_train_17.csv', shape=(), dtype=string)
tf.Tensor(b'datasets/housing/my_train_00.csv', shape=(), dtype=string)
tf.Tensor(b'datasets/housing/my_train_14.csv', shape=(), dtype=string)
tf.Tensor(b'datasets/housing/my_train_10.csv', shape=(), dtype=string)
tf.Tensor(b'datasets/housing/my_train_02.csv', shape=(), dtype=string)
tf.Tensor(b'datasets/housing/my_train_12.csv', shape=(), dtype=string)
tf.Tensor(b'datasets/housing/my_train_19.csv', shape=(), dtype=string)
tf.Tensor(b'datasets/housing/my_train_07.csv', shape=(), dtype=string)
tf.Tensor(b'datasets/housing/my_train_09.csv', shape=(), dtype=string)
tf.Tensor(b'datasets/housing/my_train_13.csv', shape=(), dtype=string)
tf.Tensor(b'datasets/housing/my_train_15.csv', shape=(), dtype=string)
tf.Ten

In [41]:
n_readers = 5

# Read n_readers files at a time and interleave their lines; skip(1) drops
# the header row at the top of every file.
dataset = filepath_dataset.interleave(
    lambda path: tf.data.TextLineDataset(path).skip(1),
    cycle_length=n_readers,
)

In [42]:
# Peek at the first four interleaved lines; at this stage they are still raw
# byte strings that have not been parsed.
for line in dataset.take(4):
    print(line)

tf.Tensor(b'4.5909,16.0,5.475877192982456,1.0964912280701755,1357.0,2.9758771929824563,33.63,-117.71,2.418', shape=(), dtype=string)
tf.Tensor(b'2.4792,24.0,3.4547038327526134,1.1341463414634145,2251.0,3.921602787456446,34.18,-118.38,2.0', shape=(), dtype=string)
tf.Tensor(b'4.2708,45.0,5.121387283236994,0.953757225433526,492.0,2.8439306358381504,37.48,-122.19,2.67', shape=(), dtype=string)
tf.Tensor(b'2.1856,41.0,3.7189873417721517,1.0658227848101265,803.0,2.0329113924050635,32.76,-117.12,1.205', shape=(), dtype=string)


## Preprocessing the Data

In [44]:
from sklearn.preprocessing import StandardScaler

# Learn per-feature mean and scale from the training features only.
scaler = StandardScaler()
scaler.fit(X_train)

In [45]:
# Per-feature statistics learned by the scaler, reused by preprocess().
X_mean = scaler.mean_
X_std = scaler.scale_
n_inputs = 8  # feature columns per CSV row; the ninth column is the target

def parse_csv_line(line):
    """Decode one CSV line into a ``(features, target)`` pair of tensors.

    The first ``n_inputs`` fields are features and default to 0.0 when empty.
    The last field is the target; its default is an empty float32 tensor
    (per the ``tf.io.decode_csv`` docs this marks the column as required).

    Returns:
        A tuple ``(x, y)``: ``x`` stacks the feature fields and ``y`` stacks
        the single target field, giving a one-element tensor.
    """
    feature_defaults = [0.] * n_inputs
    target_default = tf.constant([], dtype=tf.float32)
    fields = tf.io.decode_csv(
        line, record_defaults=feature_defaults + [target_default]
    )
    feature_fields, target_fields = fields[:-1], fields[-1:]
    return tf.stack(feature_fields), tf.stack(target_fields)

def preprocess(line):
    """Parse a CSV line and standardize its features.

    Features are centred with ``X_mean`` and divided by ``X_std``; the target
    is returned unchanged.
    """
    features, target = parse_csv_line(line)
    scaled = (features - X_mean) / X_std
    return scaled, target

In [46]:
# Sanity check on a hand-written line; the fifth field is deliberately left
# empty to exercise the 0.0 default for missing feature values.
preprocess(b'3.5214,15.0,3.0499445061043287,1.106548279689234,,1.6059933407325193,37.63,-122.43,0')

(<tf.Tensor: shape=(8,), dtype=float32, numpy=
 array([-0.19397889, -1.0778131 , -0.9433854 ,  0.01485314, -1.2998114 ,
        -0.5729162 ,  0.9292612 , -1.4221538 ], dtype=float32)>,
 <tf.Tensor: shape=(1,), dtype=float32, numpy=array([0.], dtype=float32)>)

## Putting Everything Together + Prefetching

In [48]:
def csv_reader_dataset(
    filepath,
    n_readers=5,
    n_read_threads=None,
    n_parse_threads=4,
    shuffle_buffer_size=10000,
    seed=42,
    batch_size=32,
    repeat=False,
):
    """Build a shuffled, batched, prefetching dataset from CSV files.

    Args:
        filepath: File path(s) or pattern(s) handed to ``Dataset.list_files``.
        n_readers: Number of files read concurrently (``cycle_length``).
        n_read_threads: ``num_parallel_calls`` for the interleave step.
        n_parse_threads: ``num_parallel_calls`` for the ``preprocess`` map.
        shuffle_buffer_size: Buffer size for the example-level shuffle.
        seed: Seed for both the file-name shuffle and the example shuffle.
        batch_size: Number of examples per batch.
        repeat: If true, the examples repeat indefinitely (before batching).

    Returns:
        A dataset of ``(features, target)`` batches with one batch prefetched.
    """
    files = tf.data.Dataset.list_files(filepath, seed=seed)
    # Interleave lines from several files, dropping each file's header row.
    lines = files.interleave(
        lambda path: tf.data.TextLineDataset(path).skip(1),
        cycle_length=n_readers,
        num_parallel_calls=n_read_threads,
    )
    examples = lines.map(preprocess, num_parallel_calls=n_parse_threads)
    examples = examples.shuffle(buffer_size=shuffle_buffer_size, seed=seed)
    if repeat:
        examples = examples.repeat()

    return examples.batch(batch_size).prefetch(1)

In [49]:
# Inspect two small batches to confirm the shapes: X is (4, 8), y is (4, 1).
example_set = csv_reader_dataset(train_filepaths, batch_size=4)
for X_batch, y_batch in example_set.take(2):
    print("X =", X_batch)
    print("y =", y_batch)
    print()

X = tf.Tensor(
[[-1.3957452  -0.04940685 -0.22830808  0.22648273  2.2593622   0.35200632
   0.9667386  -1.4121602 ]
 [ 2.7112627  -1.0778131   0.69413143 -0.14870553  0.51810503  0.3507294
  -0.82285154  0.80680597]
 [-0.13484643 -1.868895    0.01032507 -0.13787179 -0.12893449  0.03143518
   0.2687057   0.13212144]
 [ 0.09031774  0.9789995   0.1327582  -0.13753782 -0.23388447  0.10211545
   0.97610843 -1.4121602 ]], shape=(4, 8), dtype=float32)
y = tf.Tensor(
[[1.819]
 [3.674]
 [0.954]
 [2.725]], shape=(4, 1), dtype=float32)

X = tf.Tensor(
[[ 0.05218809 -2.0271113   0.2940109  -0.02403445  0.16218767 -0.02844518
   1.4117942  -0.93737936]
 [-0.672276    0.02970133 -0.76922584 -0.15086786  0.4962024  -0.02741998
  -0.7853724   0.77182245]
 [-0.8111771   0.34613404 -0.21826383 -0.0801027   0.06636376  0.26724264
   0.1937491   0.30204034]
 [-0.689403    1.8491895  -0.80511904 -0.08778115  1.0903106  -0.36003128
   0.994848   -1.4171551 ]], shape=(4, 8), dtype=float32)
y = tf.Tensor(
[[1

## Using Dataset with Keras

In [51]:
# Only the training set repeats indefinitely; its consumers bound each epoch
# with steps_per_epoch / take(). The validation and test sets stay finite, so
# evaluate() or predict() on them ends even when no `steps` argument is given.
train_set = csv_reader_dataset(train_filepaths, repeat=True)
valid_set = csv_reader_dataset(valid_filepaths)
test_set = csv_reader_dataset(test_filepaths)

In [52]:
# Start from a clean Keras state and fix the random seed so that weight
# initialization is reproducible.
tf.keras.backend.clear_session()
tf.keras.utils.set_random_seed(42)

In [53]:
# Small regression network: one hidden ReLU layer (He-normal initialization)
# followed by a single linear output unit.
model = tf.keras.Sequential([
    tf.keras.layers.Dense(20, activation="relu", kernel_initializer="he_normal"),
    tf.keras.layers.Dense(1),
])

In [54]:
model.compile(loss="mse", optimizer="nadam")

# train_set repeats indefinitely, so the number of steps per epoch is given
# explicitly; 32 is csv_reader_dataset's default batch size.
step_per_epoch = len(X_train) // 32
validation_steps = len(X_valid) // 32
model.fit(
    train_set,
    epochs=3,
    steps_per_epoch=step_per_epoch,
    validation_data=valid_set,
    validation_steps=validation_steps,
)

Epoch 1/3
[1m362/362[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 819us/step - loss: 3.8101 - val_loss: 3.9114
Epoch 2/3
[1m362/362[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 769us/step - loss: 0.9672 - val_loss: 0.8660
Epoch 3/3
[1m362/362[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 748us/step - loss: 0.6777 - val_loss: 0.6843


<keras.src.callbacks.history.History at 0x17ae748f0>

In [55]:
# Evaluate on a fixed number of full 32-example batches from the test set.
model.evaluate(test_set, steps=len(X_test) // 32)

[1m161/161[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 434us/step - loss: 0.5918 


0.5716007947921753

In [56]:
# Use the first two test batches as stand-ins for new, unseen data.
new_set = test_set.take(2)
y_pred = model.predict(new_set)

[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step  


In [57]:
optimizer = tf.keras.optimizers.Adam(learning_rate=0.01)
loss_fn = tf.keras.losses.MeanSquaredError()
steps_per_epoch = len(X_train) // 32
n_epochs = 5

# Hand-written training loop: forward pass under a GradientTape, then one
# optimizer step per batch.
for epoch in range(1, n_epochs + 1):
    print("\rEpoch: {}/{}".format(epoch, n_epochs), end="")
    # train_set repeats forever, so take() bounds each epoch to a fixed
    # number of batches.
    for X_batch, y_batch in train_set.take(steps_per_epoch):
        with tf.GradientTape() as tape:
            # training=True makes layers such as Dropout or BatchNormalization
            # use their training behaviour. It changes nothing for this
            # Dense-only model but keeps the loop correct if the model grows.
            y_pred = model(X_batch, training=True)
            main_loss = tf.reduce_mean(loss_fn(y_batch, y_pred))
            # Add any extra losses (e.g. regularizers) registered on the model.
            loss = tf.add_n([main_loss] + model.losses)

        gradient = tape.gradient(loss, model.trainable_variables)
        optimizer.apply_gradients(zip(gradient, model.trainable_variables))
print("\nTraining Completed!")

Epoch: 1/5

2025-07-24 14:51:06.502270: I tensorflow/core/framework/local_rendezvous.cc:407] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


Epoch: 5/5
Training Completed!


In [58]:
@tf.function
def train_step(model, X_batch, y_batch, optimizer, loss_fn):
    """Run one optimization step on a single batch and return its loss.

    Args:
        model: Keras model to train; called with ``training=True``.
        X_batch: Batch of input features.
        y_batch: Batch of targets matching ``X_batch``.
        optimizer: Keras optimizer used to apply the gradients.
        loss_fn: Callable ``loss_fn(y_true, y_pred)`` returning the main loss.

    Returns:
        The total loss for the batch (main loss plus ``model.losses``).
        Callers that ignore the return value are unaffected.
    """
    with tf.GradientTape() as tape:
        # training=True makes layers such as Dropout or BatchNormalization
        # use their training behaviour during the forward pass.
        y_pred = model(X_batch, training=True)
        main_loss = tf.reduce_mean(loss_fn(y_batch, y_pred))
        loss = tf.add_n([main_loss] + model.losses)
    gradient = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradient, model.trainable_variables))
    return loss
    
        


steps_per_epoch = len(X_train) // 32
optimizer = tf.keras.optimizers.Adam(learning_rate=0.01)
loss_fn = tf.keras.losses.MeanSquaredError()
n_epochs = 5

# Same loop as before, but each step runs inside the compiled train_step.
for epoch in range(1, n_epochs + 1):
    print("\rEpoch: {}/{}".format(epoch, n_epochs), end="")
    for X_batch, y_batch in train_set.take(steps_per_epoch):
        train_step(model, X_batch, y_batch, optimizer, loss_fn)
print("\nTraining Completed!")

Epoch: 5/5
Training Completed!


# The TFRecord Format

In [60]:
# Each write() call stores one binary record in the TFRecord file.
with tf.io.TFRecordWriter("my_data.tfrecord") as f:
    f.write(b"This is the first record.")
    f.write(b"This is just after first record")

In [61]:
filepaths = ["my_data.tfrecord"]

# Records are read back in the order written, as raw byte strings.
dataset = tf.data.TFRecordDataset(filepaths)
for item in dataset:
    print(item)

tf.Tensor(b'This is the first record.', shape=(), dtype=string)
tf.Tensor(b'This is just after first record', shape=(), dtype=string)


2025-07-24 14:51:18.664655: I tensorflow/core/kernels/data/tf_record_dataset_op.cc:381] TFRecordDataset `buffer_size` is unspecified, default to 262144


In [62]:

filepaths = ["my_test_{}".format(i) for i in range(5)]
for i, filepath in enumerate(filepaths):
    with tf.io.TFRecordWriter(filepath) as f:
        for j in range(3):
            # TFRecord records are binary, so encode the text explicitly;
            # this matches the bytes literals written in the earlier cell.
            f.write("File: {} Record: {}".format(i, j).encode("utf-8"))

# num_parallel_reads=3 reads three files at a time and interleaves their
# records, as the output order shows.
dataset = tf.data.TFRecordDataset(filepaths, num_parallel_reads=3)
for item in dataset:
    print(item)
       
    


tf.Tensor(b'File: 0 Record: 0', shape=(), dtype=string)
tf.Tensor(b'File: 1 Record: 0', shape=(), dtype=string)
tf.Tensor(b'File: 2 Record: 0', shape=(), dtype=string)
tf.Tensor(b'File: 0 Record: 1', shape=(), dtype=string)
tf.Tensor(b'File: 1 Record: 1', shape=(), dtype=string)
tf.Tensor(b'File: 2 Record: 1', shape=(), dtype=string)
tf.Tensor(b'File: 0 Record: 2', shape=(), dtype=string)
tf.Tensor(b'File: 1 Record: 2', shape=(), dtype=string)
tf.Tensor(b'File: 2 Record: 2', shape=(), dtype=string)
tf.Tensor(b'File: 3 Record: 0', shape=(), dtype=string)
tf.Tensor(b'File: 4 Record: 0', shape=(), dtype=string)
tf.Tensor(b'File: 3 Record: 1', shape=(), dtype=string)
tf.Tensor(b'File: 4 Record: 1', shape=(), dtype=string)
tf.Tensor(b'File: 3 Record: 2', shape=(), dtype=string)
tf.Tensor(b'File: 4 Record: 2', shape=(), dtype=string)
