# Description of the dataset and the task
- Data Collection
- Implications on the types of conclusions that could be made from the data
- Description of the variables, observations, and/or structure of the data
- Target task

In [None]:
import pandas as pd
from pathlib import Path
import os.path
import matplotlib.pyplot as plt
from matplotlib import cm
import numpy as np
import os
import PIL
import PIL.Image
from PIL import Image
import tensorflow as tf
import tensorflow_datasets as tfds
from tensorflow.keras.preprocessing import image
import dask.bag as bag
from dask.diagnostics import ProgressBar
import matplotlib.cm as cm

In [None]:
# Load the training, validation, and test image datasets from disk.
batch_size = 32
img_height, img_width = 224, 224

train_path = Path('dataset/train')
test_path = Path('dataset/test')

# Options shared by the training and validation subsets. Both subsets are
# carved out of the same directory, so they use the same split fraction and
# the same seed.
_split_options = dict(
    validation_split=0.2,
    seed=123,
    image_size=(img_height, img_width),
    batch_size=batch_size,
)

train_ds = tf.keras.utils.image_dataset_from_directory(
    train_path, subset="training", **_split_options)

val_ds = tf.keras.utils.image_dataset_from_directory(
    train_path, subset="validation", **_split_options)

# The test images live in their own directory, so no split is requested.
# batch_size is not passed here, so the library default applies.
test_ds = tf.keras.utils.image_dataset_from_directory(
    test_path,
    seed=123,
    image_size=(img_height, img_width))

# Class names as reported by the training dataset, and how many there are.
class_names = train_ds.class_names
print(class_names)
print(len(class_names))

In [None]:
# Keep the class names as a NumPy array so they can be indexed with an array
# of predicted class ids. The array is built from the `class_names` list
# captured when the datasets were loaded rather than from
# `train_ds.class_names`: `train_ds` is later replaced by the result of
# `.map(...)`, which no longer carries a `class_names` attribute, so
# re-running this cell after that point would otherwise fail.
class_names = np.array(class_names)
print(class_names)

In [None]:
# Peek at a single batch to confirm the tensor shapes produced by the loader.
for image_batch, labels_batch in train_ds.take(1):
    print(image_batch.shape)
    print(labels_batch.shape)

In [None]:
import matplotlib.pyplot as plt

# Preview the first nine images of one training batch in a 3x3 grid,
# titling each image with its class name.
fig, axes = plt.subplots(3, 3, figsize=(10, 10))
for images, labels in train_ds.take(1):
    for idx, ax in enumerate(axes.flat):
        # Pixel values are cast to uint8 for display.
        ax.imshow(images[idx].numpy().astype("uint8"))
        ax.set_title(class_names[labels[idx]])
        ax.axis("off")

# Exploratory data analysis

This block of code initializes file paths for the folders and images that are going to be used for the exploratory data analysis.

In [None]:
# train_dir = Path('dataset/train')
# train_filepaths_jpg = list(train_dir.rglob(r'**/*.jpg'))
# train_filepaths_jpeg = list(train_dir.rglob(r'**/*.jpeg'))
# train_filepaths_png = list(train_dir.rglob(r'**/*.png'))
# train_filepaths = train_filepaths_jpg + train_filepaths_jpeg + train_filepaths_png


# def proc_img(filepath):
#     """ Create a DataFrame with the filepath and the labels of the pictures
#     """

#     labels = [str(filepath[i]).split("\\")[-2] \
#               for i in range(len(filepath))]

#     filepath = pd.Series(filepath, name='Filepath').astype(str)
#     labels = pd.Series(labels, name='Label')

#     # Concatenate filepaths and labels
#     df = pd.concat([filepath, labels], axis=1)

#     # Shuffle the DataFrame and reset index
#     df = df.sample(frac=1).reset_index(drop = True)
    
#     return df

# train_df = proc_img(train_filepaths)
# train_df = train_df.sort_values("Label")
# labels = train_df["Label"].unique()
# labels.sort()
# # train_df
# # train_df.loc[train_df["Label"] == label]

# i=0
# imagePath = {}

# for label in labels:
#     imagePath[i] = [fn for fn in os.listdir(f'{train_dir}\\{label}') if fn.endswith('.jpg')]
#     temp = [fn for fn in os.listdir(f'{train_dir}\\{label}') if fn.endswith('.png')]
#     for j in temp:
#         imagePath[i].append(j)
#     temp = [fn for fn in os.listdir(f'{train_dir}\\{label}') if fn.endswith('.jpeg')]
#     for j in temp:
#         imagePath[i].append(j)
#     i+=1

# Distribution of Sizes
The distribution of sizes is shown by obtaining the dimensions of each image, namely its height, width, and depth (if it has one). The height and width are then compiled into a scatterplot for easy visualization, with each fruit and vegetable having its own graph. Knowing the distribution of sizes helps us understand which size is appropriate for pre-processing, so that the sizes of the images can be normalized and the model can be better tuned.

In [None]:
# # Distribution of Sizes
# classDirectories = {}
# for label in labels:
#     classDirectories[label] = 'dataset/train/' + label + '/'

# def get_dims(file):
#     im = Image.open(file)
#     arr = np.array(im)
#     if (len(arr.shape) == 3):
#         h,w,d = arr.shape
#     else:
#         h,w = arr.shape
#     return h,w

In [None]:
# for n,d in classDirectories.items():
#     filepath = d
#     filelist = [filepath + f for f in os.listdir(filepath)]
#     dims = bag.from_sequence(filelist).map(get_dims)
#     with ProgressBar():
#         dims = dims.compute()
#         dim_df = pd.DataFrame(dims, columns=['height', 'width'])
#         sizes = dim_df.groupby(['height', 'width']).size().reset_index().rename(columns={0:'count'})
#         sizes.plot.scatter(x='width', y='height');
#         plt.xlim(0, 8000)
#         plt.ylim(0, 8000)
#         plt.title('Image Sizes (pixels) | {}'.format(n))

It can be seen from the results above that most images cluster within the 0-2000 range for both height and width, so we run the plot again with the view limited to that range.

In [None]:
# for n,d in classDirectories.items():
#     filepath = d
#     filelist = [filepath + f for f in os.listdir(filepath)]
#     dims = bag.from_sequence(filelist).map(get_dims)
#     with ProgressBar():
#         dims = dims.compute()
#         dim_df = pd.DataFrame(dims, columns=['height', 'width'])
#         sizes = dim_df.groupby(['height', 'width']).size().reset_index().rename(columns={0:'count'})
#         sizes.plot.scatter(x='width', y='height');
#         plt.xlim(0, 2000)
#         plt.ylim(0, 2000)
#         plt.title('Image Sizes (pixels) | {}'.format(n))

It can be observed that most images land in the size 250x250 to 750x750

# Distribution of Labels
By checking the directory of each label, the number of images per fruit and vegetable can be shown in a bar graph, with the number of images on the y-axis and the label on the x-axis. Knowing the distribution of labels helps determine which labels have a different image count from the others. It is important for each label to have the same number of images, as an imbalance may affect the training of the model later on.

In [None]:
# # Distribution of Labels
# number_classes = {}
# for label in labels:
#     number_classes[label] = len(os.listdir(f'{train_dir}\\{label}'))
# f, ax = plt.subplots(figsize=(33,20)) # set the size that you'd like (width, height)
# plt.bar(number_classes.keys(), number_classes.values(), width = .5)
# plt.title("Number of Images by Class");
# plt.xlabel('Class Name');
# plt.ylabel('# Images');

# Data Pre-processing and Cleaning

In [None]:
import numpy as np
import time

import PIL.Image as Image
import matplotlib.pylab as plt

import tensorflow as tf
import tensorflow_hub as hub

import datetime

%load_ext tensorboard

# Model Training

We will make use of **Inception V3**, a pre-trained network for image classification. Through this network, **transfer learning** will be performed to make the classification of fruits and vegetables much easier.

We begin by extracting the labels from **ImageNet**, a database of images, to initially test out its labels on our dataset. We also extract Inception V3 for later use.

In [None]:
# TensorFlow Hub handle of the pre-trained Inception V3 module.
# NOTE(review): the handle path says "feature_vector", yet it is assigned to
# `classifier_model` below — confirm that a feature-vector module (rather
# than a classification module) is what is intended here.
inception_v3 = "https://tfhub.dev/google/tf2-preview/inception_v3/feature_vector/4"

classifier_model = inception_v3

We define the size of the images and create a **sequential model**. With this type of model, the network is built one layer at a time. Inception V3 is wrapped in a Keras layer to be called later.

In [None]:
IMAGE_SHAPE = (224, 224)

# Wrap the TF Hub module in a Keras layer that accepts IMAGE_SHAPE images with
# 3 channels, then make that layer the model's only layer.
hub_layer = hub.KerasLayer(classifier_model, input_shape=IMAGE_SHAPE + (3,))
classifier = tf.keras.Sequential([hub_layer])

We normalize the values of the input to the range [0, 1] and apply this to our datasets.

In [None]:
normalization_layer = tf.keras.layers.Rescaling(1./255)


def _rescale(images, labels):
    """Scale the images by 1/255 and pass the labels through unchanged."""
    return normalization_layer(images), labels


# Apply the same rescaling to every dataset.
train_ds = train_ds.map(_rescale)
val_ds = val_ds.map(_rescale)
test_ds = test_ds.map(_rescale)

While the input pipeline is running, optimization algorithms are implemented to monitor the CPU allocation and tune the value dynamically at runtime

In [None]:
AUTOTUNE = tf.data.AUTOTUNE

# Cache each dataset and prefetch upcoming batches, letting tf.data choose the
# prefetch buffer size.
train_ds, val_ds, test_ds = (
    ds.cache().prefetch(buffer_size=AUTOTUNE)
    for ds in (train_ds, val_ds, test_ds)
)

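We then fetch one batch from the training dataset and one from the test dataset to inspect their shapes (the test dataset itself is shuffled later, before the test predictions are plotted).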

In [None]:
# Grab one batch from the training set and one from the test set and print
# their shapes as a sanity check. Both pairs of variables are reused by later
# cells for prediction and plotting.
for image_batch, labels_batch in train_ds:
    print(image_batch.shape)
    print(labels_batch.shape)
    break

for test_image_batch, test_labels_batch in test_ds:
    # Report the test batch itself, not the training batch fetched above.
    print(test_image_batch.shape)
    print(test_labels_batch.shape)
    break

As observed, making use of the labels from ImageNet was not a good idea, as some labels exist in ImageNet but not in our dataset.

Now, we will make use of Inception V3 against our dataset.

In [None]:
# TensorFlow Hub handle of the pre-trained Inception V3 feature-vector module,
# used below as the frozen base of the transfer-learning model.
inception_v3 = "https://tfhub.dev/google/tf2-preview/inception_v3/feature_vector/4"

feature_extractor_model = inception_v3

In [None]:
# Wrap the TF Hub module as a Keras layer taking 224x224 images with 3
# channels. trainable=False keeps the module's weights fixed, so only layers
# added on top of it are updated during training.
feature_extractor_layer = hub.KerasLayer(
    feature_extractor_model,
    input_shape=(224, 224, 3),
    trainable=False)

In [None]:
# Run one training batch through the feature extractor and print the shape of
# the resulting feature tensor.
feature_batch = feature_extractor_layer(image_batch)
print(feature_batch.shape)

Again, we create a model from the pre-trained model with the addition of a dense layer. A dense layer is a fully connected layer: it receives input from all neurons of the previous layer.

In [None]:
num_classes = len(class_names)

# Stack a dense layer with one output per class on top of the feature
# extractor, adding the layers one at a time.
model = tf.keras.Sequential()
model.add(feature_extractor_layer)
model.add(tf.keras.layers.Dense(num_classes))

model.summary()

In [None]:
# Call the (still untrained) model directly on one training batch to obtain
# its raw outputs.
predictions = model(image_batch)

In [None]:
# Display the shape of the outputs; presumably (batch size, num_classes) —
# confirm against the cell output.
predictions.shape

The model will be configured with **Adam** as its optimizer, as it is the best adaptive optimizer for sparse data. It is a stochastic gradient descent method based on adaptive estimation of first-order and second-order moments. We make use of **SparseCategoricalCrossentropy** as the loss function since we have more than one class.

A visualization of the events will also be shown which includes: Metrics summary plots. Training graph visualization. Weight histograms.

In [None]:
# Configure training: Adam optimizer, sparse categorical cross-entropy
# computed on raw logits, and accuracy as the reported metric.
adam = tf.keras.optimizers.Adam()
logit_loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
model.compile(optimizer=adam, loss=logit_loss, metrics=['acc'])

# Write TensorBoard logs to a directory named after the current timestamp.
run_stamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
log_dir = "logs/fit/" + run_stamp
tensorboard_callback = tf.keras.callbacks.TensorBoard(
    log_dir=log_dir,
    histogram_freq=1,  # Enable histogram computation for every epoch.
)

The number of epochs was initially set to 10.

In [None]:
NUM_EPOCHS = 20

# Train on the training split and validate on the validation split.
# `callbacks` is documented as a list of callback instances, so the
# TensorBoard callback is wrapped in a list.
history = model.fit(train_ds,
                    validation_data=val_ds,
                    epochs=NUM_EPOCHS,
                    callbacks=[tensorboard_callback])

In [None]:
# Launch TensorBoard on the logs written under logs/fit.
%tensorboard --logdir logs/fit

Upon observing the change in accuracy of the model, it can be observed that upon reaching 9 epochs, the accuracy no longer improves and the graph goes along the horizontal axis.

In [None]:
# Evaluate the trained model on the test dataset. The result is unpacked as
# the loss followed by the single 'acc' metric configured at compile time.
loss, accuracy = model.evaluate(test_ds)
print('Test accuracy :', accuracy)

Testing the model against the test dataset, a **loss  of 0.5047 and an accuracy of 0.8576** was achieved.

We now test the model against a batch of the training dataset and plot the results.

In [None]:
# Predict on the training batch, take the highest-scoring class id for each
# image, and translate the ids into class names.
predicted_batch = model.predict(image_batch)
predicted_id = tf.math.argmax(predicted_batch, axis=-1)
predicted_label_batch = class_names[predicted_id]
print(predicted_label_batch)

In [None]:
plt.figure(figsize=(18,20))
plt.subplots_adjust(hspace=0.5)

# Show up to 30 training images in a 6x5 grid, each titled with its predicted
# label. The count is clamped to the batch size so that a batch with fewer
# than 30 images does not raise an index error.
num_shown = min(30, int(image_batch.shape[0]))
for n in range(num_shown):
    plt.subplot(6,5,n+1)
    plt.imshow(image_batch[n])
    plt.title(predicted_label_batch[n].title())
    plt.axis('off')
_ = plt.suptitle("Model predictions")

Again, it will be tested against a batch of the test dataset and will be plotted.

In [None]:
# Shuffle the (already batched) test dataset with a buffer of 2 elements, then
# take one batch for the prediction and plotting cells that follow.
test_ds = test_ds.shuffle(2)
for test_image_batch, test_labels_batch in test_ds:
    # Print the shapes of the test batch itself, not of the training batch.
    print(test_image_batch.shape)
    print(test_labels_batch.shape)
    break

In [None]:
# Predict on the test batch, take the highest-scoring class id for each
# image, and translate the ids into class names.
predicted_batch = model.predict(test_image_batch)
predicted_id = tf.math.argmax(predicted_batch, axis=-1)
predicted_label_batch = class_names[predicted_id]
print(predicted_label_batch)

In [None]:
plt.figure(figsize=(18,20))
plt.subplots_adjust(hspace=0.5)

# Show up to 30 test images in a 6x5 grid, each titled with its predicted
# label. The count is clamped to the batch size so that a batch with fewer
# than 30 images does not raise an index error.
num_shown = min(30, int(test_image_batch.shape[0]))
for n in range(num_shown):
    plt.subplot(6,5,n+1)
    plt.imshow(test_image_batch[n])
    plt.title(predicted_label_batch[n].title())
    plt.axis('off')
_ = plt.suptitle("Model predictions")

To avoid time being consumed while training the data, the model will be saved for future use.

In [None]:
# Save the trained model under a directory named after the current Unix
# timestamp, truncated to whole seconds.
t = time.time()

export_path = "saved_models/{}".format(int(t))
model.save(export_path)

# Display the export path as the cell output.
export_path

In [None]:
# Load the model back from the path it was just saved to.
reloaded = tf.keras.models.load_model(export_path)

In [None]:
# Predict on the same training batch with both the in-memory model and the
# reloaded one so their outputs can be compared.
result_batch = model.predict(image_batch)
reloaded_result_batch = reloaded.predict(image_batch)

In [None]:
# Largest absolute difference between the two sets of predictions; a value at
# or near zero means the save/load round trip preserved the model's outputs.
abs(reloaded_result_batch - result_batch).max()

In [None]:
# Convert the reloaded model's outputs on the training batch into class ids
# (highest score per image) and then into class names.
reloaded_predicted_id = tf.math.argmax(reloaded_result_batch, axis=-1)
reloaded_predicted_label_batch = class_names[reloaded_predicted_id]
print(reloaded_predicted_label_batch)

In [None]:
plt.figure(figsize=(18,20))
plt.subplots_adjust(hspace=0.5)
# Show up to 30 training images in a 6x5 grid, each titled with the label
# predicted by the reloaded model. The count is clamped to the batch size so
# that a batch with fewer than 30 images does not raise an index error.
num_shown = min(30, int(image_batch.shape[0]))
for n in range(num_shown):
    plt.subplot(6,5,n+1)
    plt.imshow(image_batch[n])
    plt.title(reloaded_predicted_label_batch[n].title())
    plt.axis('off')
_ = plt.suptitle("Model predictions")

In [None]:
# predict on the test dataset
reloaded_result_batch = reloaded.predict(test_image_batch)
reloaded_predicted_id = tf.math.argmax(reloaded_result_batch, axis=-1)
reloaded_predicted_label_batch = class_names[reloaded_predicted_id]
print(reloaded_predicted_label_batch)

In [None]:
plt.figure(figsize=(18,20))
plt.subplots_adjust(hspace=0.5)
# Show up to 30 test images in a 6x5 grid, each titled with the label
# predicted by the reloaded model. The count is clamped to the batch size so
# that a batch with fewer than 30 images does not raise an index error.
num_shown = min(30, int(test_image_batch.shape[0]))
for n in range(num_shown):
    plt.subplot(6,5,n+1)
    plt.imshow(test_image_batch[n])
    plt.title(reloaded_predicted_label_batch[n].title())
    plt.axis('off')
_ = plt.suptitle("Model predictions")

# Model Selection and Hyperparameter Tuning
- Change number of epochs
- Change the batch size
- Add/reduce the number of layers
- Change activation function in dense layer
- Change kernel/bias(?)

# Insights and conclusions