# Description of the dataset and the task
- Data Collection
- Implications for the types of conclusions that can be drawn from the data
- Description of the variables, observations, and/or structure of the data
- Target task

In [1]:
import numpy as np
import pandas as pd
from pathlib import Path
import os.path
import matplotlib.pyplot as plt
import tensorflow as tf

In [5]:
# Collect the image file paths of the training, validation and test splits.
# Every split directory is searched recursively for .jpg, .jpeg and .png
# files; the per-extension lists are then concatenated in that order.
train_path = Path('dataset/train')
train_imgs_jpg, train_imgs_jpeg, train_imgs_png = (
    list(train_path.glob(pattern))
    for pattern in (r'**/*.jpg', r'**/*.jpeg', r'**/*.png')
)
train_imgs = [*train_imgs_jpg, *train_imgs_jpeg, *train_imgs_png]

val_path = Path('dataset/validation')
val_imgs_jpg, val_imgs_jpeg, val_imgs_png = (
    list(val_path.glob(pattern))
    for pattern in (r'**/*.jpg', r'**/*.jpeg', r'**/*.png')
)
val_imgs = [*val_imgs_jpg, *val_imgs_jpeg, *val_imgs_png]

test_path = Path('dataset/test')
test_imgs_jpg, test_imgs_jpeg, test_imgs_png = (
    list(test_path.glob(pattern))
    for pattern in (r'**/*.jpg', r'**/*.jpeg', r'**/*.png')
)
test_imgs = [*test_imgs_jpg, *test_imgs_jpeg, *test_imgs_png]

def processData(filepath, random_state=None):
    """Build a shuffled DataFrame of image file paths and their class labels.

    The label of an image is the name of the directory that directly
    contains it, e.g. ``dataset/train/apple/Image_1.jpg`` -> ``apple``.

    Args:
        filepath: Sequence of image paths (``pathlib.Path`` objects or
            strings).
        random_state: Optional seed forwarded to ``DataFrame.sample`` so the
            shuffle can be reproduced. ``None`` (the default) leaves the
            shuffle unseeded.

    Returns:
        A DataFrame with the columns ``Filepaths`` (each path converted to a
        string) and ``Labels``, with the rows in shuffled order and the
        index reset to 0..n-1.
    """
    items = list(filepath)

    # Take the parent directory name via pathlib instead of splitting the
    # string on "\\": the split only works with Windows separators and raises
    # IndexError for paths that use "/".
    labels = pd.Series([Path(p).parent.name for p in items], name='Labels')
    filepaths = pd.Series(items, name='Filepaths').astype(str)

    # Concatenate filepaths and labels column-wise
    df = pd.concat([filepaths, labels], axis=1)

    # Shuffle the rows and discard the pre-shuffle index
    return df.sample(frac=1, random_state=random_state).reset_index(drop=True)

# Build one shuffled (file path, label) DataFrame per split, processing the
# training, test and validation lists in that order.
train_df, test_df, val_df = map(processData, (train_imgs, test_imgs, val_imgs))

['dataset', 'train', 'apple', 'Image_1.jpg']
['dataset', 'test', 'apple', 'Image_1.jpg']
['dataset', 'validation', 'apple', 'Image_1.jpg']


In [13]:
# Print the image count, the number of distinct labels and the labels
# themselves for each split.
for header, frame in (
    ('----- Training set -----', train_df),
    ('\n----- Validation set -----', val_df),
    ('\n----- Test set -----', test_df),
):
    print(header)
    print(f'Number of images: {frame.shape[0]}\n')
    print(f'Number of different labels: {len(frame.Labels.unique())}\n')
    print(f'Labels: {frame.Labels.unique()}')

----- Training set -----
Number of images: 3278

Number of different labels: 36

Labels: ['chilli pepper' 'lettuce' 'garlic' 'cucumber' 'potato' 'orange' 'pear'
 'grapes' 'capsicum' 'corn' 'raddish' 'ginger' 'peas' 'banana' 'jalepeno'
 'turnip' 'sweetpotato' 'paprika' 'cauliflower' 'carrot' 'pomegranate'
 'watermelon' 'lemon' 'mango' 'eggplant' 'cabbage' 'sweetcorn' 'pineapple'
 'beetroot' 'onion' 'bell pepper' 'spinach' 'apple' 'kiwi' 'tomato'
 'soy beans']

----- Validation set -----
Number of images: 340

Number of different labels: 36

Labels: ['bell pepper' 'capsicum' 'sweetcorn' 'spinach' 'mango' 'onion'
 'sweetpotato' 'chilli pepper' 'jalepeno' 'watermelon' 'apple' 'cabbage'
 'peas' 'pomegranate' 'raddish' 'corn' 'eggplant' 'carrot' 'lettuce'
 'potato' 'banana' 'ginger' 'orange' 'pineapple' 'lemon' 'turnip' 'kiwi'
 'garlic' 'grapes' 'cucumber' 'beetroot' 'paprika' 'tomato' 'soy beans'
 'pear' 'cauliflower']

----- Test set -----
Number of images: 340

Number of different labels:

In [None]:
# Preview the first five rows of the training DataFrame: file paths in one
# column, labels in the other.
train_df.head(n=5)

# Exploratory data analysis
- Mean Image
- Distribution of the Size of Images
- Distribution of labels

# Data Pre-processing and Cleaning
- Resize all images to the same dimensions

In [None]:
# Load the images for the CNN model and augment the training images.
pp_function = tf.keras.applications.mobilenet_v2.preprocess_input

# The augmentation options are ImageDataGenerator constructor arguments.
# flow_from_dataframe only swallows them through **kwargs without using them,
# so passing them there (as before) applied no augmentation at all.
train_img_generator = tf.keras.preprocessing.image.ImageDataGenerator(
    preprocessing_function=pp_function,
    rotation_range=30,
    zoom_range=0.15,
    width_shift_range=0.2,
    height_shift_range=0.2,
    shear_range=0.15,
    horizontal_flip=True,
    fill_mode="nearest",
)
# Validation and test images are only preprocessed, never augmented.
test_img_generator = tf.keras.preprocessing.image.ImageDataGenerator(
    preprocessing_function=pp_function,
)

# x_col / y_col must match the column names produced by processData
# ('Filepaths' and 'Labels').
train_images = train_img_generator.flow_from_dataframe(
    dataframe=train_df,
    x_col='Filepaths',
    y_col='Labels',
    target_size=(224, 224),
    color_mode='rgb',
    class_mode='categorical',
    batch_size=32,
    shuffle=True,
    seed=0,
)

# The previous augmentation kwargs were ignored here, so using the plain
# generator keeps the validation pipeline's effective behaviour unchanged.
val_images = test_img_generator.flow_from_dataframe(
    dataframe=val_df,
    x_col='Filepaths',
    y_col='Labels',
    target_size=(224, 224),
    color_mode='rgb',
    class_mode='categorical',
    batch_size=32,
    shuffle=True,
    seed=0,
)

# Keep the test order fixed so predictions line up with test_df rows.
test_images = test_img_generator.flow_from_dataframe(
    dataframe=test_df,
    x_col='Filepaths',
    y_col='Labels',
    target_size=(224, 224),
    color_mode='rgb',
    class_mode='categorical',
    batch_size=32,
    shuffle=False,
)

In [None]:
# `input_shape` was never defined, so the original call raised NameError.
# (224, 224, 3) matches target_size=(224, 224) and color_mode='rgb' used by
# the image generators.
# NOTE(review): the remaining MobileNetV2 arguments are left at their
# defaults - confirm whether the stock classification head is wanted here.
input_shape = (224, 224, 3)
pretrained = tf.keras.applications.MobileNetV2(input_shape=input_shape)

# Model Training

# Model Selection and Hyperparameter Tuning

# Insights and conclusions