Indoor Scene Classification with TFLu and the ESP32-CAM

In [1]:
%tensorflow_version 2.x

Colab only includes TensorFlow 2.x; %tensorflow_version has no effect.


Python libraries

In [2]:
import numpy as np
import pathlib
import tensorflow as tf
import tensorflow_datasets as tfds
import zipfile
from tensorflow.keras.applications.mobilenet_v2 import MobileNetV2
from tensorflow.keras.models import Model

Constants

In [8]:
MODEL_ALPHA = 0.35
MODEL_INPUT_WIDTH = 48
MODEL_INPUT_HEIGHT = 48
TFL_MODEL_FILE = "indoor_scene_recognition.tflite"
TFL_MODEL_HEADER_FILE = "indoor_scene_recognition_model.h"
TF_MODEL = "indoor_scene_recognition"

Transfer learning with Keras

Unzip the dataset (dataset.zip)

In [3]:
import zipfile
with zipfile.ZipFile("dataset.zip", 'r') as zip_ref:
  zip_ref.extractall(".")
data_dir = "dataset"

Prepare the train (80%) and validation (20%) datasets

In [4]:
train_ds = tf.keras.utils.image_dataset_from_directory(
  data_dir,
  validation_split=0.2,
  subset="training",
  seed=123,
  interpolation="bilinear",
  image_size=(48, 48))
val_ds = tf.keras.utils.image_dataset_from_directory(
  data_dir,
  validation_split=0.2,
  subset="validation",
  seed=123,
  interpolation="bilinear",
  image_size=(48, 48))

Found 43 files belonging to 3 classes.
Using 35 files for training.
Found 43 files belonging to 3 classes.
Using 8 files for validation.


Get the name of the classes

In [5]:
class_names = train_ds.class_names
num_classes = len(class_names)
print(class_names)

['Glass', 'Mag', 'Unknown']


Rescale the pixel values from [0, 255] tp [-1, 1]

In [6]:
rescale = tf.keras.layers.Rescaling(1./255, offset= -1)
train_ds = train_ds.map(lambda x, y: (rescale(x), y))
val_ds   = val_ds.map(lambda x, y: (rescale(x), y))

Import the MobileNet v2 pre-trained model

In [9]:
# https://github.com/keras-team/keras-applications/blob/master/keras_applications/mobilenet_v2.py
base_model = MobileNetV2(input_shape=(MODEL_INPUT_WIDTH, MODEL_INPUT_HEIGHT, 3),
                         include_top=False,
                         weights='imagenet',
                         alpha=0.35)
base_model.trainable = False

feat_extr = base_model

  base_model = MobileNetV2(input_shape=(MODEL_INPUT_WIDTH, MODEL_INPUT_HEIGHT, 3),


Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/mobilenet_v2/mobilenet_v2_weights_tf_dim_ordering_tf_kernels_0.35_224_no_top.h5
[1m2019640/2019640[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step


Augment the input data

In [11]:
augmen = tf.keras.Sequential([
  tf.keras.layers.RandomRotation(0.2),
  tf.keras.layers.RandomFlip('horizontal'),
])

train_ds = train_ds.map(lambda x, y: (augmen(x), y))
val_ds   = val_ds.map(lambda x, y: (augmen(x), y))

Prepare the classification head

In [12]:
global_avg_layer = tf.keras.layers.GlobalAveragePooling2D()
dense_layer = tf.keras.layers.Dense(num_classes, activation='softmax')

Build the model architecture

In [13]:
inputs = tf.keras.Input(shape=(MODEL_INPUT_WIDTH, MODEL_INPUT_HEIGHT, 3))
x = global_avg_layer(feat_extr.layers[-1].output)
x = tf.keras.layers.Dropout(0.2)(x)
outputs = dense_layer(x)
model = tf.keras.Model(inputs=feat_extr.inputs, outputs=outputs)

Compile the model with a 0.0005 learning rate

In [14]:
lr = 0.0005
model.compile(
  optimizer=tf.keras.optimizers.Adam(learning_rate=lr),
  loss=tf.losses.SparseCategoricalCrossentropy(from_logits=False),
  metrics=['accuracy'])

Model summary

In [15]:
model.summary()

Train the model with 100 epochs

In [17]:
model.fit(
  train_ds,
  validation_data=val_ds,
  epochs=100
)

Epoch 1/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 193ms/step - accuracy: 0.5580 - loss: 0.9213 - val_accuracy: 0.6250 - val_loss: 0.7558
Epoch 2/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 337ms/step - accuracy: 0.6863 - loss: 0.8321 - val_accuracy: 0.7500 - val_loss: 0.7419
Epoch 3/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 352ms/step - accuracy: 0.6274 - loss: 0.8553 - val_accuracy: 0.6250 - val_loss: 0.9718
Epoch 4/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 332ms/step - accuracy: 0.6568 - loss: 0.9328 - val_accuracy: 0.2500 - val_loss: 1.0478
Epoch 5/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 357ms/step - accuracy: 0.5979 - loss: 0.7042 - val_accuracy: 0.6250 - val_loss: 0.7903
Epoch 6/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 242ms/step - accuracy: 0.8527 - loss: 0.6008 - val_accuracy: 0.5000 - val_loss: 0.9382
Epoch 7/100
[1m2/2[0m [32m━━━━━

<keras.src.callbacks.history.History at 0x7ed5fdb24710>

Save the TensorFlow model

In [21]:
model.save("indoor_scene_recognition.keras")
model.export("indoor_scene_recognition")
model.save("indoor_scene_recognition.h5")

Saved artifact at 'indoor_scene_recognition'. The following endpoints are available:

* Endpoint 'serve'
  args_0 (POSITIONAL_ONLY): List[TensorSpec(shape=(None, 48, 48, 3), dtype=tf.float32, name='keras_tensor')]
Output Type:
  TensorSpec(shape=(None, 3), dtype=tf.float32, name=None)
Captures:
  139457976534800: TensorSpec(shape=(), dtype=tf.resource, name=None)
  139457976536720: TensorSpec(shape=(), dtype=tf.resource, name=None)
  139457976536336: TensorSpec(shape=(), dtype=tf.resource, name=None)
  139457976535376: TensorSpec(shape=(), dtype=tf.resource, name=None)
  139457976533840: TensorSpec(shape=(), dtype=tf.resource, name=None)
  139457976537296: TensorSpec(shape=(), dtype=tf.resource, name=None)
  139457976537872: TensorSpec(shape=(), dtype=tf.resource, name=None)
  139457976536912: TensorSpec(shape=(), dtype=tf.resource, name=None)
  139457976534992: TensorSpec(shape=(), dtype=tf.resource, name=None)
  139457976537680: TensorSpec(shape=(), dtype=tf.resource, name=None)
  13



Preparing and testing the quantized TFLite model

Unzip the test dataset (test_samples.zip)

In [25]:
with zipfile.ZipFile("test_samples.zip", 'r') as zip_ref:
    zip_ref.extractall(".")
test_dir = "test_samples"

Rescale the pixel values from [0, 255] to [-1, 1]

In [27]:
test_ds = tf.keras.utils.image_dataset_from_directory(test_dir,
                                                      interpolation="bilinear",
                                                      image_size=(MODEL_INPUT_WIDTH, MODEL_INPUT_HEIGHT))
test_ds  = test_ds.map(lambda x, y: (rescale(x), y))

Found 13 files belonging to 1 classes.


Quantize the TensorFlow model with the TFLite converter

In [28]:
repr_ds = test_ds.unbatch()

def representative_data_gen():
  for i_value, o_value in repr_ds.batch(1).take(48):
    yield [i_value]

converter = tf.lite.TFLiteConverter.from_saved_model(TF_MODEL)
converter.representative_dataset = tf.lite.RepresentativeDataset(representative_data_gen)
converter.optimizations = [tf.lite.Optimize.DEFAULT]
converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8]
converter.inference_input_type = tf.int8

tfl_model = converter.convert()

Get the TFLite model size in bytes

In [29]:
size_tfl_model = len(tfl_model)
print(len(tfl_model), "bytes")

619840 bytes


Initialize the TFLite interpreter

In [30]:
# Initialize the TFLite interpreter
interpreter = tf.lite.Interpreter(model_content=tfl_model)

# Allocate the tensors
interpreter.allocate_tensors()

    TF 2.20. Please use the LiteRT interpreter from the ai_edge_litert package.
    See the [migration guide](https://ai.google.dev/edge/litert/migration)
    for details.
    


Get input quantization parameters

In [31]:
# Get input/output layer information
i_details = interpreter.get_input_details()[0]
o_details = interpreter.get_output_details()[0]

# Get input quantization parameters.
i_quant = i_details["quantization_parameters"]
i_scale      = i_quant['scales'][0]
i_zero_point = i_quant['zero_points'][0]

Evaluate the accuracy of the quantized TFLite model

In [32]:
test_ds0 = val_ds.unbatch()

num_correct_samples = 0
num_total_samples   = len(list(test_ds0.batch(1)))

for i_value, o_value in test_ds0.batch(1):
  i_value = (i_value / i_scale) + i_zero_point
  i_value = tf.cast(i_value, dtype=tf.int8)
  interpreter.set_tensor(i_details["index"], i_value)
  interpreter.invoke()
  o_pred = interpreter.get_tensor(o_details["index"])[0]

  if np.argmax(o_pred) == o_value:
    num_correct_samples += 1

print("Accuracy:", num_correct_samples/num_total_samples)

Accuracy: 0.75


Convert the TFLite model to C-byte array with xxd

In [33]:
open("model.tflite", "wb").write(tfl_model)
!apt-get update && apt-get -qq install xxd
!xxd -c 60 -i model.tflite > indoor_scene_recognition.h

0% [Working]            Hit:1 https://cli.github.com/packages stable InRelease
0% [Waiting for headers] [Waiting for headers] [Waiting for headers] [Connected                                                                               Get:2 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,632 B]
0% [Waiting for headers] [Waiting for headers] [Waiting for headers] [Waiting f                                                                               Hit:3 http://archive.ubuntu.com/ubuntu jammy InRelease
0% [Waiting for headers] [Waiting for headers] [Waiting for headers] [Waiting f                                                                               Get:4 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
0% [Waiting for headers] [4 InRelease 14.2 kB/129 kB 11%] [Waiting for headers]                                                                               Get:5 https://developer.download.nvidia.com/compute/cuda/rep