# Creating a Dataset
From a Tensor

In [1]:
import tensorflow as tf

# Create a dataset from a tensor
X = tf.range(10)
dataset = tf.data.Dataset.from_tensor_slices(X)

# Iterating over the dataset
for item in dataset:
    print(item)


tf.Tensor(0, shape=(), dtype=int32)
tf.Tensor(1, shape=(), dtype=int32)
tf.Tensor(2, shape=(), dtype=int32)
tf.Tensor(3, shape=(), dtype=int32)
tf.Tensor(4, shape=(), dtype=int32)
tf.Tensor(5, shape=(), dtype=int32)
tf.Tensor(6, shape=(), dtype=int32)
tf.Tensor(7, shape=(), dtype=int32)
tf.Tensor(8, shape=(), dtype=int32)
tf.Tensor(9, shape=(), dtype=int32)


Using a Nested Structure

In [2]:
# Create a dataset with nested structures
X_nested = {"a": ([1, 2, 3], [4, 5, 6]), "b": [7, 8, 9]}
dataset = tf.data.Dataset.from_tensor_slices(X_nested)

# Iterating over the dataset
for item in dataset:
    print(item)


{'a': (<tf.Tensor: shape=(), dtype=int32, numpy=1>, <tf.Tensor: shape=(), dtype=int32, numpy=4>), 'b': <tf.Tensor: shape=(), dtype=int32, numpy=7>}
{'a': (<tf.Tensor: shape=(), dtype=int32, numpy=2>, <tf.Tensor: shape=(), dtype=int32, numpy=5>), 'b': <tf.Tensor: shape=(), dtype=int32, numpy=8>}
{'a': (<tf.Tensor: shape=(), dtype=int32, numpy=3>, <tf.Tensor: shape=(), dtype=int32, numpy=6>), 'b': <tf.Tensor: shape=(), dtype=int32, numpy=9>}


# Chaining Transformations
Transformations like repeat(), batch(), and map() can be chained for efficient preprocessing.

Example: Repeat and Batch

In [3]:
# Repeat the dataset 3 times and batch it
dataset = tf.data.Dataset.from_tensor_slices(tf.range(10))
dataset = dataset.repeat(3).batch(7)

# Iterating over the transformed dataset
for item in dataset:
    print(item)


tf.Tensor([0 1 2 3 4 5 6], shape=(7,), dtype=int32)
tf.Tensor([7 8 9 0 1 2 3], shape=(7,), dtype=int32)
tf.Tensor([4 5 6 7 8 9 0], shape=(7,), dtype=int32)
tf.Tensor([1 2 3 4 5 6 7], shape=(7,), dtype=int32)
tf.Tensor([8 9], shape=(2,), dtype=int32)


Example: Applying a Mapping Function

In [4]:
# Apply a transformation (e.g., multiplying by 2)
dataset = dataset.map(lambda x: x * 2)

for item in dataset:
    print(item)


tf.Tensor([ 0  2  4  6  8 10 12], shape=(7,), dtype=int32)
tf.Tensor([14 16 18  0  2  4  6], shape=(7,), dtype=int32)
tf.Tensor([ 8 10 12 14 16 18  0], shape=(7,), dtype=int32)
tf.Tensor([ 2  4  6  8 10 12 14], shape=(7,), dtype=int32)
tf.Tensor([16 18], shape=(2,), dtype=int32)


Example: Filtering Data

In [5]:
# Filter items based on a condition (sum > 50)
dataset = dataset.filter(lambda x: tf.reduce_sum(x) > 50)

for item in dataset:
    print(item)


tf.Tensor([14 16 18  0  2  4  6], shape=(7,), dtype=int32)
tf.Tensor([ 8 10 12 14 16 18  0], shape=(7,), dtype=int32)
tf.Tensor([ 2  4  6  8 10 12 14], shape=(7,), dtype=int32)


Example: Taking a Few Items

In [6]:
# Take only the first 2 items
for item in dataset.take(2):
    print(item)


tf.Tensor([14 16 18  0  2  4  6], shape=(7,), dtype=int32)
tf.Tensor([ 8 10 12 14 16 18  0], shape=(7,), dtype=int32)


# Shuffling the Data
Shuffling ensures that the dataset instances are independent and identically distributed (IID).

Simple Shuffling

In [7]:
# Shuffle the dataset with a buffer size of 4
dataset = tf.data.Dataset.range(10).repeat(2)
dataset = dataset.shuffle(buffer_size=4, seed=42).batch(7)

for item in dataset:
    print(item)


tf.Tensor([1 4 2 3 5 0 6], shape=(7,), dtype=int64)
tf.Tensor([9 8 2 0 3 1 4], shape=(7,), dtype=int64)
tf.Tensor([5 7 9 6 7 8], shape=(6,), dtype=int64)


## Advanced Shuffling for Large Datasets
For datasets that do not fit in memory:

Shuffle the source data itself (e.g., using Linux's shuf command).
Split data into multiple files.
Use tf.data to read files randomly, interleave records, and add shuffling buffers.

In [10]:
import tensorflow as tf
import os

# Example: Replace with actual paths or glob patterns
file_paths = ["/content/sample_data/california_housing_test.csv"]

# Verify that files exist
for path in file_paths:
    if not os.path.exists(path):
        print(f"File does not exist: {path}")

# Create a dataset from the list of files
dataset = tf.data.Dataset.list_files(file_paths)

# Interleave lines from files
dataset = dataset.interleave(
    lambda file: tf.data.TextLineDataset(file),
    cycle_length=3,  # Number of files to read concurrently
    block_length=1
)

# Shuffle the dataset
dataset = dataset.shuffle(buffer_size=1000)  # Adjust buffer size based on memory

# Inspect the dataset
for item in dataset.take(5):
    print(item.numpy().decode("utf-8"))



-119.010000,35.380000,52.000000,114.000000,26.000000,158.000000,26.000000,1.075000,67500.000000
-122.140000,40.070000,31.000000,2053.000000,465.000000,1193.000000,447.000000,1.492300,44400.000000
-118.370000,33.950000,5.000000,6955.000000,2062.000000,3591.000000,1566.000000,3.111000,247600.000000
-117.810000,33.820000,22.000000,2898.000000,335.000000,1057.000000,324.000000,10.811100,500001.000000
-117.920000,34.120000,32.000000,2552.000000,576.000000,2161.000000,548.000000,2.945900,144400.000000


# Interleaving File Data
Interleave: Use the interleave() method to read and combine lines from multiple files simultaneously. This improves parallelism:

In [None]:
dataset = filepath_dataset.interleave(
    lambda filepath: tf.data.TextLineDataset(filepath).skip(1),
    cycle_length=n_readers
)


# Preprocessing Data
Parsing CSV Lines: Convert raw CSV lines into feature tensors

In [12]:
def parse_csv_line(line):
    defs = [0.] * n_inputs + [tf.constant([], dtype=tf.float32)]
    fields = tf.io.decode_csv(line, record_defaults=defs)
    return tf.stack(fields[:-1]), tf.stack(fields[-1:])


Scaling Features: Normalize the features using precomputed mean and standard deviation

In [13]:
def preprocess(line):
    x, y = parse_csv_line(line)
    return (x - X_mean) / X_std, y


##  Efficient Input Pipeline
Combine everything into a reusable function:

In [14]:
def csv_reader_dataset(filepaths, n_readers=5, n_read_threads=None,
                       n_parse_threads=5, shuffle_buffer_size=10_000,
                       batch_size=32, seed=42):
    dataset = tf.data.Dataset.list_files(filepaths, seed=seed)
    dataset = dataset.interleave(
        lambda filepath: tf.data.TextLineDataset(filepath).skip(1),
        cycle_length=n_readers, num_parallel_calls=n_read_threads)
    dataset = dataset.map(preprocess, num_parallel_calls=n_parse_threads)
    dataset = dataset.shuffle(shuffle_buffer_size, seed=seed)
    return dataset.batch(batch_size).prefetch(1)


# Using with Keras
Create datasets for training, validation, and testing:

In [None]:
train_set = csv_reader_dataset(train_filepaths)
valid_set = csv_reader_dataset(valid_filepaths)
test_set = csv_reader_dataset(test_filepaths)


In [None]:
model = tf.keras.Sequential([...])
model.compile(loss="mse", optimizer="sgd")
model.fit(train_set, validation_data=valid_set, epochs=5)
test_mse = model.evaluate(test_set)


In [None]:
# For predictions
new_set = test_set.take(3)  # Get 3 new samples
y_pred = model.predict(new_set)


# TFRecord Basics
A TFRecord file stores binary data, consisting of:

A length field (size of each record)
A CRC checksum for integrity verification
The data payload (actual record content)
Writing TFRecord Files

In [17]:
import tensorflow as tf

with tf.io.TFRecordWriter("my_data.tfrecord") as f:
    f.write(b"This is the first record")
    f.write(b"And this is the second record")


Reading TFRecord Files

In [18]:
filepaths = ["my_data.tfrecord"]
dataset = tf.data.TFRecordDataset(filepaths)

for item in dataset:
    print(item)  # Outputs tf.Tensor objects


tf.Tensor(b'This is the first record', shape=(), dtype=string)
tf.Tensor(b'And this is the second record', shape=(), dtype=string)


#  Compressed TFRecord Files
TFRecord files can be compressed using formats like GZIP for network efficiency.

Writing Compressed TFRecords

In [19]:
options = tf.io.TFRecordOptions(compression_type="GZIP")
with tf.io.TFRecordWriter("compressed.tfrecord", options) as f:
    f.write(b"Compressed data")


In [20]:
dataset = tf.data.TFRecordDataset(["compressed.tfrecord"], compression_type="GZIP")


# Protobuf Introduction
Definition
Protobuf (Protocol Buffers) is a binary serialization format widely used in TFRecord files. A protobuf schema defines structured data using .proto files, such as

In [None]:
syntax = "proto3";

message Person {
  string name = 1;
  int32 id = 2;
  repeated string email = 3;
}


# Using Protobuf in Python
Once compiled with protoc, the generated Python access classes can be used to create, manipulate, and serialize objects

In [None]:
from person_pb2 import Person

person = Person(name="Alice", id=123, email=["a@b.com", "c@d.com"])
serialized = person.SerializeToString()  # Serialize to binary
print(serialized)

# Deserialize
person2 = Person()
person2.ParseFromString(serialized)
print(person2)


#  TensorFlow Protobufs
The Example protobuf is commonly used in TFRecord files for datasets. Its structure is:

Features: A mapping of feature names to values, where each value is:
A BytesList (e.g., strings)
A FloatList (e.g., floats)
An Int64List (e.g., integers)

In [22]:
from tensorflow.train import BytesList, FloatList, Int64List, Feature, Features, Example

example = Example(
    features=Features(
        feature={
            "name": Feature(bytes_list=BytesList(value=[b"Alice"])),
            "id": Feature(int64_list=Int64List(value=[123])),
            "emails": Feature(bytes_list=BytesList(value=[b"a@b.com", b"c@d.com"])),
        }
    )
)


Writing Example Protobufs to TFRecord

In [None]:
with tf.io.TFRecordWriter("example.tfrecord") as f:
    f.write(example.SerializeToString())


## Reading and Parsing Protobufs
To parse serialized examples, define a feature description and use tf.io.parse_single_example()

In [None]:
feature_description = {
    "name": tf.io.FixedLenFeature([], tf.string),
    "id": tf.io.FixedLenFeature([], tf.int64),
    "emails": tf.io.VarLenFeature(tf.string),
}

def parse_example(serialized):
    return tf.io.parse_single_example(serialized, feature_description)

dataset = tf.data.TFRecordDataset("example.tfrecord").map(parse_example)

for item in dataset:
    print(item["name"].numpy())  # Example: b'Alice'


# Handling Nested Data with SequenceExample
For hierarchical data (e.g., text documents with sentences), use the SequenceExample protobuf, which supports:

### Contextual features:
Metadata like document title or author
### Feature lists:
Nested lists such as sentences or comments

In [None]:
message SequenceExample {
  Features context = 1;
  FeatureLists feature_lists = 2;
}


In [None]:
from tensorflow.train import FeatureList, FeatureLists, SequenceExample

sequence_example = SequenceExample(
    context=Features(
        feature={
            "title": Feature(bytes_list=BytesList(value=[b"My Document"])),
        }
    ),
    feature_lists=FeatureLists(
        feature_list={
            "sentences": FeatureList(
                feature=[
                    Feature(bytes_list=BytesList(value=[b"Hello", b"world"])),
                    Feature(bytes_list=BytesList(value=[b"This", b"is", b"TensorFlow"])),
                ]
            )
        }
    )
)


Serializing and Writing

In [None]:
with tf.io.TFRecordWriter("sequence.tfrecord") as f:
    f.write(sequence_example.SerializeToString())


# Storing Images and Tensors
You can store images or tensors in TFRecords using BytesList.

In [None]:
image = tf.io.encode_jpeg(tf.random.uniform([128, 128, 3], maxval=255, dtype=tf.int32))
example = Example(
    features=Features(
        feature={"image": Feature(bytes_list=BytesList(value=[image.numpy()]))}
    )
)

with tf.io.TFRecordWriter("images.tfrecord") as f:
    f.write(example.SerializeToString())


Reading and Decoding

In [None]:
def parse_image(serialized):
    features = {"image": tf.io.FixedLenFeature([], tf.string)}
    parsed = tf.io.parse_single_example(serialized, features)
    return tf.io.decode_jpeg(parsed["image"])

dataset = tf.data.TFRecordDataset("images.tfrecord").map(parse_image)


# Key Preprocessing Layers
Normalization Layer

Standardizes numerical features by centering them at zero with a unit standard deviation.

In [None]:
norm_layer = tf.keras.layers.Normalization()
norm_layer.adapt(X_train)
model = tf.keras.Sequential([norm_layer, tf.keras.layers.Dense(1)])


Preprocess before training

In [None]:
norm_layer.adapt(X_train)
X_train_scaled = norm_layer(X_train)
model = tf.keras.Sequential([tf.keras.layers.Dense(1)])


Discretization Layer
Transforms numerical features into categorical ones by binning values

In [None]:
discretize_layer = tf.keras.layers.Discretization(bin_boundaries=[18, 50])
age_categories = discretize_layer(age_data)


# CategoryEncoding Layer

Encodes integer categories into:
One-hot: Binary vectors for each category.
Multi-hot: Multi-category occurrences.
Count: Frequency of categories.

In [None]:
onehot_layer = tf.keras.layers.CategoryEncoding(num_tokens=3)
one_hot_encoded = onehot_layer(age_categories)


# StringLookup Layer
Encodes text categories into integers or one-hot vectors.


In [None]:
cities = ["Paris", "Auckland", "San Francisco"]
str_lookup_layer = tf.keras.layers.StringLookup(output_mode="one_hot")
str_lookup_layer.adapt(cities)
encoded = str_lookup_layer(["Paris", "Auckland"])


# Custom Preprocessing Layers
If a suitable layer isn't available, you can implement a custom one

In [None]:
class MyNormalization(tf.keras.layers.Layer):
    def adapt(self, X):
        self.mean_ = np.mean(X, axis=0)
        self.std_ = np.std(X, axis=0)

    def call(self, inputs):
        return (inputs - self.mean_) / (self.std_ + tf.keras.backend.epsilon())


Integration with tf.data API
Preprocessing layers can be adapted to datasets using

In [None]:
dataset = dataset.map(lambda X, y: (norm_layer(X), y))


# Text Preprocessing with TextVectorization

In [24]:
import tensorflow as tf

# Training data
train_data = ["To be", "!(to be)", "That's the question", "Be, be, be."]

# Initialize and adapt the TextVectorization layer
text_vec_layer = tf.keras.layers.TextVectorization(output_mode="tf_idf")
text_vec_layer.adapt(train_data)

# Transform input sentences
result = text_vec_layer(["Be good!", "Question: be or be?"])
print(result)


tf.Tensor(
[[0.96725637 0.6931472  0.         0.         0.         0.        ]
 [0.96725637 1.3862944  0.         0.         0.         1.0986123 ]], shape=(2, 6), dtype=float32)


Using Pretrained Language Model Components

In [25]:
import tensorflow as tf
import tensorflow_hub as hub

# Load the pretrained module
hub_layer = hub.KerasLayer("https://tfhub.dev/google/nnlm-en-dim50/2")

# Encode sentences
sentence_embeddings = hub_layer(tf.constant(["To be", "Not to be"]))
print(sentence_embeddings.numpy().round(2))


[[-0.25  0.28  0.01  0.1   0.14  0.16  0.25  0.02  0.07  0.13 -0.19  0.06
  -0.04 -0.07  0.   -0.08 -0.14 -0.16  0.02 -0.24  0.16 -0.16 -0.03  0.03
  -0.14  0.03 -0.09 -0.04 -0.14 -0.19  0.07  0.15  0.18 -0.23 -0.07 -0.08
   0.01 -0.01  0.09  0.14 -0.03  0.03  0.08  0.1  -0.01 -0.03 -0.07 -0.1
   0.05  0.31]
 [-0.2   0.2  -0.08  0.02  0.19  0.05  0.22 -0.09  0.02  0.19 -0.02 -0.14
  -0.2  -0.04  0.01 -0.07 -0.22 -0.1   0.16 -0.44  0.31 -0.1   0.23  0.15
  -0.05  0.15 -0.13 -0.04 -0.08 -0.16 -0.1   0.13  0.13 -0.18 -0.04  0.03
  -0.1  -0.07  0.07  0.03 -0.08  0.02  0.05  0.07 -0.14 -0.1  -0.18 -0.13
  -0.04  0.15]]


Image Preprocessing Example

In [26]:
import tensorflow as tf
from sklearn.datasets import load_sample_images

# Load sample images
images = load_sample_images()["images"]

# Center crop images
crop_image_layer = tf.keras.layers.CenterCrop(height=100, width=100)
cropped_images = crop_image_layer(images)

# Rescale images
rescale_layer = tf.keras.layers.Rescaling(scale=1.0/255)
rescaled_images = rescale_layer(cropped_images)


In [27]:
# Data Augmentation Example
data_augmentation = tf.keras.Sequential([
    tf.keras.layers.RandomFlip("horizontal_and_vertical"),
    tf.keras.layers.RandomRotation(0.2),
    tf.keras.layers.RandomZoom(0.1),
])

# Apply augmentation to images
augmented_images = data_augmentation(cropped_images)


# The TensorFlow Datasets Project

In [28]:
# Import TensorFlow Datasets
import tensorflow_datasets as tfds
import tensorflow as tf

# Load the MNIST dataset, split into training, validation, and test sets
train_set, valid_set, test_set = tfds.load(
    name="mnist",
    split=["train[:90%]", "train[90%:]", "test"],
    as_supervised=True  # Ensures the dataset returns (image, label) tuples
)

# Prepare the datasets
train_set = train_set.shuffle(buffer_size=10_000, seed=42).batch(32).prefetch(1)
valid_set = valid_set.batch(32).cache()
test_set = test_set.batch(32).cache()

# Set random seed for reproducibility
tf.random.set_seed(42)

# Define a simple neural network model
model = tf.keras.Sequential([
    tf.keras.layers.Flatten(input_shape=(28, 28)),  # Flatten the input images
    tf.keras.layers.Dense(10, activation="softmax")  # Output layer with 10 classes
])

# Compile the model
model.compile(
    loss="sparse_categorical_crossentropy",  # Suitable loss function for integer labels
    optimizer="nadam",  # Nesterov-accelerated Adaptive Moment Estimation
    metrics=["accuracy"]  # Track accuracy during training
)

# Train the model
history = model.fit(
    train_set,
    validation_data=valid_set,
    epochs=5
)

# Evaluate the model on the test set
test_loss, test_accuracy = model.evaluate(test_set)

# Print test results
print(f"Test Loss: {test_loss:.4f}")
print(f"Test Accuracy: {test_accuracy:.4f}")


Downloading and preparing dataset 11.06 MiB (download: 11.06 MiB, generated: 21.00 MiB, total: 32.06 MiB) to /root/tensorflow_datasets/mnist/3.0.1...


Dl Completed...:   0%|          | 0/5 [00:00<?, ? file/s]

Dataset mnist downloaded and prepared to /root/tensorflow_datasets/mnist/3.0.1. Subsequent calls will reuse this data.
Epoch 1/5


  super().__init__(**kwargs)


[1m1688/1688[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 5ms/step - accuracy: 0.7581 - loss: 19.2089 - val_accuracy: 0.8690 - val_loss: 6.2683
Epoch 2/5
[1m1688/1688[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.8770 - loss: 5.8573 - val_accuracy: 0.8858 - val_loss: 5.8402
Epoch 3/5
[1m1688/1688[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.8821 - loss: 5.2617 - val_accuracy: 0.8892 - val_loss: 5.1971
Epoch 4/5
[1m1688/1688[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 2ms/step - accuracy: 0.8857 - loss: 4.9461 - val_accuracy: 0.8847 - val_loss: 5.3869
Epoch 5/5
[1m1688/1688[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step - accuracy: 0.8883 - loss: 4.8653 - val_accuracy: 0.8782 - val_loss: 5.8640
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.8859 - loss: 5.4981
Test Loss: 5.6026
Test Accuracy: 0.8835
