# Load images

This tutorial provides a simple example of how to load an image dataset using `tf.data`.

The dataset used in this example is distributed as directories of images, with one class of image per directory.

In [None]:
# Mount Google Drive at /gdrive; the dataset archive is read from there below.
from google.colab import drive
drive.mount('/gdrive')

## Setup

In [None]:
# Create the local `data` directory used as the dataset root.
%mkdir data

In [None]:
# Extract the dataset archive from the mounted Drive (quietly).
# NOTE(review): no `-d` target is given, so files are extracted into the current
# working directory, while later cells read from ./data -- confirm the archive
# itself contains a top-level `data/` folder.
!unzip -q /gdrive/MyDrive/kaggle_cars_data.zip 

In [None]:
# Time magic: install and load the `autotime` extension so cell run times are
# reported.
!pip install ipython-autotime
%load_ext autotime

In [None]:
# Install/upgrade the TensorBoard profiler plugin; the training cells below
# request profiling through the `profile_batch` callback argument.
!pip install -U tensorboard_plugin_profile
# Load the TensorBoard notebook extension.
%load_ext tensorboard

In [None]:
from datetime import datetime
from packaging import version

# TensorFlow and tf.keras
import tensorflow as tf
from tensorflow import keras

# Report the installed TensorFlow version and refuse to run on 1.x.
print(f"Using TF version: {tf.__version__}")
tf_major_version = version.parse(tf.__version__).release[0]
assert tf_major_version >= 2, "This notebook requires TensorFlow 2.0 or above."

In [None]:
from PIL import Image

import IPython.display as display
import numpy as np
import matplotlib.pyplot as plt
import os
import pathlib

In [None]:
# Abort early when no GPU is attached to the runtime.
device_name = tf.test.gpu_device_name()
if device_name:
    print('Found GPU at: {}'.format(device_name))
else:
    raise SystemError('GPU device not found')

In [None]:
# Sentinel passed as `num_parallel_calls` / prefetch `buffer_size` below so that
# tf.data chooses those values itself.
AUTOTUNE = tf.data.experimental.AUTOTUNE

### Retrieve the images

Before you start any training, you will need a set of images to teach the network about the new classes you want to recognize.

In [None]:
# Dataset root; each split lives in its own sub-directory.
root_path = "./data"

train_data_dir = pathlib.Path(f"{root_path}/train/")
val_data_dir = pathlib.Path(f"{root_path}/validation/")
test_data_dir = pathlib.Path(f"{root_path}/test/")

print(f"Training data from path: {train_data_dir}, Validation data from path: {val_data_dir}, Test data from path: {test_data_dir}")

In [None]:
# Count the .jpg files found one level below each split directory
# (i.e. <split>/<class>/<image>.jpg) without materialising the file lists.
train_image_count = sum(1 for _ in train_data_dir.glob('*/*.jpg'))
val_image_count = sum(1 for _ in val_data_dir.glob('*/*.jpg'))
test_image_count = sum(1 for _ in test_data_dir.glob('*/*.jpg'))

print(f"Number of train images: {train_image_count}, Number of val images: {val_image_count}, Number of test images: {test_image_count}")

In [None]:
# Class names are the sub-directory names of the training split.
# `Path.glob` yields entries in arbitrary order, so sort them: `class_names[0]`
# is the positive class in `get_label` and must be the same on every run.
# Non-directory entries (stray files) are skipped.
class_names = np.array(sorted(
    item.name for item in train_data_dir.glob('*') if item.is_dir()))
print(f"Class names: {class_names}")

In [None]:
# Print the number of .jpg images per class folder: first for the training
# split, then validation, then test.
for split_dir in (train_data_dir, val_data_dir, test_data_dir):
  for fol in class_names:
    print (f"Folder {fol} has {len(list(split_dir.glob('%s/*.jpg' % fol)))} images")

In [None]:
# Batch size and the size every image is resized to.
batch_size = 32
img_height = 100
img_width = 100
# Number of batches needed to see every training image once. `np.ceil` returns
# a float, so convert explicitly: Keras expects an integer step count.
steps_per_epoch = int(np.ceil(train_image_count / batch_size))

## Load using `tf.data`

The `keras.preprocessing` method (not shown in this notebook) is convenient, but has three downsides:

1. It's slow. See the performance section below.
1. It lacks fine-grained control.
1. It is not well integrated with the rest of TensorFlow.

To load the files as a `tf.data.Dataset` first create a dataset of the file paths:

In [None]:
# One dataset of file paths per split. Only `.jpg` files are matched: the
# images are decoded with `decode_jpeg`, and the image counts computed earlier
# (which determine `steps_per_epoch`) use the same `*/*.jpg` pattern.
train_list_ds = tf.data.Dataset.list_files(str(train_data_dir/'*/*.jpg'))
val_list_ds = tf.data.Dataset.list_files(str(val_data_dir/'*/*.jpg'))
test_list_ds = tf.data.Dataset.list_files(str(test_data_dir/'*/*.jpg'))

In [None]:
# Show three file paths from each split (train, validation, test) as a sanity
# check.
for list_ds in (train_list_ds, val_list_ds, test_list_ds):
  for f in list_ds.take(3):
    print(f.numpy())

Write a short pure-TensorFlow function that converts a file path to an `(img, label)` pair:

In [None]:
def get_label(file_path):
  """Return the boolean label for an image path.

  The label is True when the image's parent directory (its class folder) is
  named `class_names[0]`, and False otherwise.
  """
  # Break the path into its components; the class folder is the second to last.
  path_components = tf.strings.split(file_path, os.path.sep)
  class_dir = path_components[-2]
  return class_dir == class_names[0]

In [None]:
def decode_img(img):
  """Decode raw JPEG bytes into a resized float32 RGB image.

  Args:
    img: Scalar string tensor holding the JPEG-encoded file contents.

  Returns:
    A float32 tensor of shape `[img_height, img_width, 3]`.
  """
  # convert the compressed string to a 3D uint8 tensor
  img = tf.image.decode_jpeg(img, channels=3)
  # Use `convert_image_dtype` to convert to floats in the [0,1] range.
  img = tf.image.convert_image_dtype(img, tf.float32)
  # resize the image to the desired size; `tf.image.resize` expects `size` as
  # [new_height, new_width], in that order.
  return tf.image.resize(img, [img_height, img_width])

In [None]:
def process_path(file_path):
  """Map an image file path to an `(image, label)` pair.

  The image comes from reading the file and passing its contents through
  `decode_img`; the label comes from `get_label`.
  """
  # Read the file contents as a raw (still encoded) string.
  raw_bytes = tf.io.read_file(file_path)
  return decode_img(raw_bytes), get_label(file_path)

Use `Dataset.map` to create a dataset of `image, label` pairs:

In [None]:
# Turn training file paths into (image, label) pairs; `num_parallel_calls`
# lets several images be loaded/processed at the same time.
train_labeled_ds = train_list_ds.map(process_path, num_parallel_calls=AUTOTUNE)

In [None]:
# Peek at a single (image, label) element to sanity-check the mapped dataset.
for image, label in train_labeled_ds.take(1):
  print("Image shape: ", image.numpy().shape)
  print("Label: ", label.numpy())

In [None]:
# Turn validation file paths into (image, label) pairs; `num_parallel_calls`
# lets several images be loaded/processed at the same time.
val_labeled_ds = val_list_ds.map(process_path, num_parallel_calls=AUTOTUNE)

In [None]:
# Turn test file paths into (image, label) pairs; `num_parallel_calls`
# lets several images be loaded/processed at the same time.
test_labeled_ds = test_list_ds.map(process_path, num_parallel_calls=AUTOTUNE)

### Basic methods for training

To train a model with this dataset you will want the data:

* To be well shuffled.
* To be batched.
* Batches to be available as soon as possible.

These features can be easily added using the `tf.data` API.

In [None]:
def prepare_for_training(ds, cache=True, shuffle_buffer_size=1000):
  """Turn a dataset of examples into an endless stream of shuffled batches.

  Args:
    ds: Dataset of individual examples.
    cache: Falsy to disable caching; a non-empty string to cache to that file
      path; any other truthy value to cache in memory.
    shuffle_buffer_size: Number of elements held in the shuffle buffer.

  Returns:
    The dataset after (optional) caching, shuffling, repeating, batching by
    the module-level `batch_size`, and prefetching.
  """
  if cache:
    # A string names a cache file (for preprocessing work on datasets that
    # don't fit in memory); otherwise the data is loaded once and kept in
    # memory.
    ds = ds.cache(cache) if isinstance(cache, str) else ds.cache()

  # Shuffle, repeat forever, then batch. `prefetch` lets the dataset fetch
  # batches in the background while the model is training.
  return (ds.shuffle(buffer_size=shuffle_buffer_size)
            .repeat()
            .batch(batch_size)
            .prefetch(buffer_size=AUTOTUNE))

In [None]:
def prepare_for_testing(ds, cache=True):
  """Batch a dataset for evaluation, without shuffling or repeating.

  Args:
    ds: Dataset of individual examples.
    cache: Falsy to disable caching; a non-empty string to cache to that file
      path; any other truthy value to cache in memory.

  Returns:
    The dataset after (optional) caching, batching by the module-level
    `batch_size`, and prefetching.
  """
  if cache:
    # A string names a cache file (for preprocessing work on datasets that
    # don't fit in memory); otherwise the data is loaded once and kept in
    # memory.
    ds = ds.cache(cache) if isinstance(cache, str) else ds.cache()

  # `prefetch` lets the dataset fetch batches in the background while they are
  # being consumed.
  return ds.batch(batch_size).prefetch(buffer_size=AUTOTUNE)

In [None]:
def show_batch(image_batch, label_batch):
  """Plot up to 25 images of a batch in a 5x5 grid, titled by class name.

  Args:
    image_batch: Indexable batch of images accepted by `plt.imshow`.
    label_batch: Indexable batch of boolean labels; a true label is shown as
      `class_names[0]`, a false one as `class_names[1]`.
  """
  plt.figure(figsize=(10,10))
  # A batch can hold fewer than 25 images (e.g. the last batch of a small
  # validation/test split), so never index past its end.
  num_images = min(25, len(image_batch))
  for n in range(num_images):
    plt.subplot(5,5,n+1)
    plt.imshow(image_batch[n])
    plt.title(class_names[0] if label_batch[n] else class_names[1])
    plt.axis('off')

In [None]:
# Cached, shuffled, endlessly repeating, batched training dataset.
train_ds = prepare_for_training(train_labeled_ds)

# Pull one batch and display it as a visual sanity check.
image_batch, label_batch = next(iter(train_ds))
show_batch(image_batch.numpy(), label_batch.numpy())

In [None]:
# Cached, batched validation dataset (not shuffled by this step, not repeated).
val_ds = prepare_for_testing(val_labeled_ds)

# Pull the first batch and display it as a visual sanity check.
image_batch, label_batch = next(iter(val_ds))
show_batch(image_batch.numpy(), label_batch.numpy())

In [None]:
# Cached, batched test dataset (not shuffled by this step, not repeated).
test_ds = prepare_for_testing(test_labeled_ds)

# Pull the first batch and display it as a visual sanity check.
image_batch, label_batch = next(iter(test_ds))
show_batch(image_batch.numpy(), label_batch.numpy())

## Performance

Note: This section just shows a couple of easy tricks that may help performance. For an in depth guide see [Input Pipeline Performance](../../guide/performance/datasets).

To investigate, first here's a function to check the performance of our datasets:

In [None]:
import time
default_timeit_steps = 100

def timeit(ds, steps=default_timeit_steps):
  """Measure how quickly batches can be pulled from a dataset.

  Draws `steps` batches from `ds`, printing a progress dot every 10 batches,
  then prints the elapsed time and the throughput in images per second
  (computed from the module-level `batch_size`).

  Args:
    ds: Iterable dataset yielding batches; it must provide at least `steps`
      batches.
    steps: Number of batches to draw.
  """
  # `perf_counter` is the clock intended for measuring intervals.
  start = time.perf_counter()
  it = iter(ds)
  for i in range(steps):
    next(it)
    if i%10 == 0:
      print('.',end='')
  print()
  end = time.perf_counter()

  duration = end-start
  print("{} batches: {} s".format(steps, duration))
  print("{:0.5f} Images/s".format(batch_size*steps/duration))

In [None]:
# Derive the model input shape from the size the images are resized to, so the
# two cannot drift apart; 3 is the channel count requested when decoding.
input_shape = (img_height, img_width, 3)
# A single output unit for the binary (class_names[0] vs. rest) label.
num_outputs = 1
num_epochs = 10

In [None]:
def get_model(input_shape, num_neurons, num_outputs):
    """Build an uncompiled network with a single hidden layer.

    The input is flattened, passed through one ReLU `Dense` layer of
    `num_neurons` units, and then through a `Dense` layer of `num_outputs`
    units that has no activation.
    """
    model = keras.Sequential()
    model.add(keras.layers.Flatten(input_shape=input_shape))
    model.add(keras.layers.Dense(num_neurons, activation='relu'))
    model.add(keras.layers.Dense(num_outputs))
    return model

In [None]:
num_neurons = 64
one_layer_64 = get_model(input_shape, num_neurons, num_outputs)
# The model's last layer has no activation, so it outputs logits: the loss uses
# `from_logits=True`, and accuracy must threshold at 0.0 (logit 0 corresponds to
# probability 0.5). The plain 'accuracy' string would threshold the logits at
# 0.5 instead. The metric keeps the name 'accuracy' so history keys are
# unchanged.
one_layer_64.compile(optimizer='adam',
              loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              metrics=[tf.keras.metrics.BinaryAccuracy(name='accuracy',
                                                       threshold=0.0)])

# Log to a per-model directory and profile batches 500 to 520.
logdir = f"logs/scalars/one_layer_{num_neurons}"
tensorboard_callback = keras.callbacks.TensorBoard(log_dir=logdir,
                                                   histogram_freq = 1,
                                                   profile_batch = '500,520')

# `train_ds` repeats forever, so the epoch length is given explicitly (as an
# integer number of batches).
training_history = one_layer_64.fit(
    train_ds,
    validation_data=val_ds,
    epochs=num_epochs,
    steps_per_epoch=int(steps_per_epoch),
    callbacks=[tensorboard_callback])

# history['loss'] holds the per-epoch training loss.
print(f"Average training loss: {np.average(training_history.history['loss'])}")
val_loss, val_acc = one_layer_64.evaluate(val_ds, verbose=2)
test_loss, test_acc = one_layer_64.evaluate(test_ds, verbose=2)

print(f"\nTest accuracy: {test_acc}")
print(f"\nValidation accuracy: {val_acc}")

In [None]:
num_neurons = 128
one_layer_128 = get_model(input_shape, num_neurons, num_outputs)
# The model's last layer has no activation, so it outputs logits: the loss uses
# `from_logits=True`, and accuracy must threshold at 0.0 (logit 0 corresponds to
# probability 0.5). The plain 'accuracy' string would threshold the logits at
# 0.5 instead. The metric keeps the name 'accuracy' so history keys are
# unchanged.
one_layer_128.compile(optimizer='adam',
              loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              metrics=[tf.keras.metrics.BinaryAccuracy(name='accuracy',
                                                       threshold=0.0)])

# Log to a per-model directory and profile batches 500 to 520.
logdir = f"logs/scalars/one_layer_{num_neurons}"
tensorboard_callback = keras.callbacks.TensorBoard(log_dir=logdir,
                                                   histogram_freq = 1,
                                                   profile_batch = '500,520')

# `train_ds` repeats forever, so the epoch length is given explicitly (as an
# integer number of batches).
training_history = one_layer_128.fit(
    train_ds,
    validation_data=val_ds,
    epochs=num_epochs,
    steps_per_epoch=int(steps_per_epoch),
    callbacks=[tensorboard_callback])

# history['loss'] holds the per-epoch training loss.
print(f"Average training loss: {np.average(training_history.history['loss'])}")
val_loss, val_acc = one_layer_128.evaluate(val_ds, verbose=2)
test_loss, test_acc = one_layer_128.evaluate(test_ds, verbose=2)

print(f"\nTest accuracy: {test_acc}")
print(f"\nValidation accuracy: {val_acc}")

In [None]:
num_neurons = 256
one_layer_256 = get_model(input_shape, num_neurons, num_outputs)
# The model's last layer has no activation, so it outputs logits: the loss uses
# `from_logits=True`, and accuracy must threshold at 0.0 (logit 0 corresponds to
# probability 0.5). The plain 'accuracy' string would threshold the logits at
# 0.5 instead. The metric keeps the name 'accuracy' so history keys are
# unchanged.
one_layer_256.compile(optimizer='adam',
              loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              metrics=[tf.keras.metrics.BinaryAccuracy(name='accuracy',
                                                       threshold=0.0)])

# Log to a per-model directory and profile batches 500 to 520.
logdir = f"logs/scalars/one_layer_{num_neurons}"
tensorboard_callback = keras.callbacks.TensorBoard(log_dir=logdir,
                                                   histogram_freq = 1,
                                                   profile_batch = '500,520')

# `train_ds` repeats forever, so the epoch length is given explicitly (as an
# integer number of batches).
training_history = one_layer_256.fit(
    train_ds,
    validation_data=val_ds,
    epochs=num_epochs,
    steps_per_epoch=int(steps_per_epoch),
    callbacks=[tensorboard_callback])

# history['loss'] holds the per-epoch training loss.
print(f"Average training loss: {np.average(training_history.history['loss'])}")
val_loss, val_acc = one_layer_256.evaluate(val_ds, verbose=2)
test_loss, test_acc = one_layer_256.evaluate(test_ds, verbose=2)

print(f"\nTest accuracy: {test_acc}")
print(f"\nValidation accuracy: {val_acc}")

In [None]:
num_neurons = 512
one_layer_512 = get_model(input_shape, num_neurons, num_outputs)
# The model's last layer has no activation, so it outputs logits: the loss uses
# `from_logits=True`, and accuracy must threshold at 0.0 (logit 0 corresponds to
# probability 0.5). The plain 'accuracy' string would threshold the logits at
# 0.5 instead. The metric keeps the name 'accuracy' so history keys are
# unchanged.
one_layer_512.compile(optimizer='adam',
              loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              metrics=[tf.keras.metrics.BinaryAccuracy(name='accuracy',
                                                       threshold=0.0)])

# Log to a per-model directory and profile batches 500 to 520.
logdir = f"logs/scalars/one_layer_{num_neurons}"
tensorboard_callback = keras.callbacks.TensorBoard(log_dir=logdir,
                                                   histogram_freq = 1,
                                                   profile_batch = '500,520')

# `train_ds` repeats forever, so the epoch length is given explicitly (as an
# integer number of batches).
training_history = one_layer_512.fit(
    train_ds,
    validation_data=val_ds,
    epochs=num_epochs,
    steps_per_epoch=int(steps_per_epoch),
    callbacks=[tensorboard_callback])

# history['loss'] holds the per-epoch training loss.
print(f"Average training loss: {np.average(training_history.history['loss'])}")
val_loss, val_acc = one_layer_512.evaluate(val_ds, verbose=2)
test_loss, test_acc = one_layer_512.evaluate(test_ds, verbose=2)

print(f"\nTest accuracy: {test_acc}")
print(f"\nValidation accuracy: {val_acc}")

In [None]:
# Open TensorBoard on the parent log directory so the runs written by the
# training cells (one sub-directory per model) can be compared.
%tensorboard --logdir logs/scalars