In [1]:
import os
from pathlib import Path

import numpy
import pandas as pd
from matplotlib.image import imread

- Select root directory

In [3]:
# Project root = parent of the notebook's working directory, kept as a string
# with a trailing slash so later cells can append relative paths directly.
root_directory = f"{Path.cwd().parent}/"
print(root_directory)

/workspace/Mildew-Detection-in-Cherry-Leaves-P5/


In [4]:
# Remember the notebook's working directory so it can be restored later,
# then switch to the project root for the next cell.
current_working_directory = os.getcwd()
os.chdir(root_directory)

In [5]:
# Point the Kaggle CLI at the project root (root_directory) for its configuration;
# kaggle.json is expected to be in the working directory, which an earlier cell set to root_directory.

os.environ["KAGGLE_CONFIG_DIR"] = root_directory
# Restrict the credentials file to owner read/write only.
! chmod 600 kaggle.json

In [6]:
# Restore the working directory that was saved before switching to the project root.
os.chdir(current_working_directory)

In [7]:
# kaggle dataset and download

KaggleDatasetPath = "codeinstitute/cherry-leaves"
DestinationFolder = f"{root_directory}inputs/cherry_leaves-datases"
! kaggle datasets download -d {KaggleDatasetPath} -p {DestinationFolder}

Downloading cherry-leaves.zip to /workspace/Mildew-Detection-in-Cherry-Leaves-P5/inputs/cherry_leaves-datases
 89%|█████████████████████████████████▊    | 49.0M/55.0M [00:01<00:00, 42.6MB/s]
100%|██████████████████████████████████████| 55.0M/55.0M [00:01<00:00, 36.4MB/s]


In [None]:
# Unzip the downloaded archive into DestinationFolder and, only if extraction
# succeeds (&&), delete the zip file.

! unzip {DestinationFolder}/*.zip -d {DestinationFolder} \
    && rm {DestinationFolder}/*.zip

# Data preparation

## Data cleaning
* check and remove non image files

In [9]:
def remove_non_image_file(my_data_dir):
    """Delete non-image files from every class sub-folder of ``my_data_dir``.

    A file counts as an image when its name ends (case-insensitively) with
    ``.png``, ``.jpg`` or ``.jpeg``. For each sub-folder, the number of image
    files kept and the number of non-image files removed are printed.

    Parameters
    ----------
    my_data_dir : str
        Directory whose immediate sub-folders hold the files to check.
        A trailing slash is optional.

    Notes
    -----
    Entries of ``my_data_dir`` that are not directories, and entries of a
    sub-folder that are not regular files, are skipped without being counted
    or removed.
    """
    image_extension = ('.png', '.jpg', '.jpeg')

    for folder in os.listdir(my_data_dir):
        folder_path = os.path.join(my_data_dir, folder)
        if not os.path.isdir(folder_path):
            # A stray file next to the class folders cannot be listed; skip it.
            continue

        image_count = 0
        removed_count = 0
        for given_file in os.listdir(folder_path):
            # The path must include the class folder; omitting it made
            # os.remove target a file that does not exist.
            file_location = os.path.join(folder_path, given_file)
            if not os.path.isfile(file_location):
                continue
            if given_file.lower().endswith(image_extension):
                image_count += 1
            else:
                os.remove(file_location)  # removes non image files
                removed_count += 1

        print(f"Folder: {folder} - has image file", image_count)
        print(f"Folder: {folder} - has non-image file", removed_count)

In [13]:
# Clean the same dataset folder that the later cells read images from
# (inputs/cherry-leaves-datasets/cherry-leaves/).
remove_non_image_file(my_data_dir=f'{root_directory}inputs/cherry-leaves-datasets/cherry-leaves/')

Folder: healthy - has image file 2104
Folder: healthy - has non-image file 0
Folder: powdery_mildew - has image file 2104
Folder: powdery_mildew - has non-image file 0


- images directory

In [18]:
# Location of the extracted dataset and of its two class sub-folders.
images_directory = f"{root_directory}inputs/cherry-leaves-datasets/cherry-leaves/"
healthy_image_directory = f"{images_directory}healthy/"
non_healthy_image_directory = f"{images_directory}powdery_mildew/"

print(healthy_image_directory, non_healthy_image_directory, sep="\n")

/workspace/Mildew-Detection-in-Cherry-Leaves-P5/inputs/cherry-leaves-datasets/cherry-leaves/healthy/
/workspace/Mildew-Detection-in-Cherry-Leaves-P5/inputs/cherry-leaves-datasets/cherry-leaves/powdery_mildew/


- Define the train, test and validation dataset directories

In [16]:
# Output folders for the three dataset splits, all under <root>/model/.
train_directory = f"{root_directory}model/data_train/"
test_directory = f"{root_directory}model/data_test/"
val_directory = f"{root_directory}model/data_validation/"

print(train_directory, test_directory, val_directory, sep="\n")

/workspace/Mildew-Detection-in-Cherry-Leaves-P5/model/data_train/
/workspace/Mildew-Detection-in-Cherry-Leaves-P5/model/data_test/
/workspace/Mildew-Detection-in-Cherry-Leaves-P5/model/data_validation/


- Add image labels: healthy (output = 1) vs. powdery_mildew (output = 0)

In [19]:
# List the files of each class and tag them: 1 = healthy, 0 = powdery mildew.
healthy_class = os.listdir(healthy_image_directory)
non_healthy_class = os.listdir(non_healthy_image_directory)

labelled_files = [(name, 1) for name in healthy_class]
labelled_files += [(name, 0) for name in non_healthy_class]

images_dataframe = pd.DataFrame(
    [{"filename": name, "output": label} for name, label in labelled_files]
)

In [20]:
# Show the labelled file list (filename + output label).
images_dataframe

Unnamed: 0,filename,output
0,0008f3d3-2f85-4973-be9a-1b520b8b59fc___JR_HL 4...,1
1,0008f3d3-2f85-4973-be9a-1b520b8b59fc___JR_HL 4...,1
2,0008f3d3-2f85-4973-be9a-1b520b8b59fc___JR_HL 4...,1
3,002efba9-09b3-43de-93b7-5c2460185cde___JR_HL 9...,1
4,0048afb8-b950-4c57-9e72-7e26282327ee___JR_HL 9...,1
...,...,...
4203,ffdeb404-b84d-4389-9cc9-e1d3159374fe___FREC_Pw...,0
4204,fff3ae4b-4bce-4b7a-b53c-98c482d9d8fd___FREC_Pw...,0
4205,fff3ae4b-4bce-4b7a-b53c-98c482d9d8fd___FREC_Pw...,0
4206,fffc6a5c-66f3-4d73-afd2-84862fb83c4f___FREC_Pw...,0


In [21]:
def evaluate_image_average_size(row):
    """Read the image referenced by ``row`` and record its height and width.

    The image is loaded from ``healthy_image_directory`` when ``row.output``
    is 1 and from ``non_healthy_image_directory`` otherwise.

    Parameters
    ----------
    row : pandas.Series
        A row providing ``filename`` and ``output``.

    Returns
    -------
    pandas.Series
        The same row with ``height`` and ``width`` entries added.
    """
    if row.output == 1:
        source_directory = healthy_image_directory
    else:
        source_directory = non_healthy_image_directory

    img = imread(f"{source_directory}{row.filename}")

    # Only the first two axes are needed. Slicing (rather than unpacking three
    # values) also works for grayscale images, whose array is 2-D.
    height, width = img.shape[:2]

    row["height"] = height
    row["width"] = width

    return row


# Add height/width columns by measuring every image, one row at a time.
images_dataframe = images_dataframe.apply(evaluate_image_average_size, axis=1)

In [22]:
# Preview the first ten rows, now including the height and width columns.
images_dataframe.head(10)

Unnamed: 0,filename,output,height,width
0,0008f3d3-2f85-4973-be9a-1b520b8b59fc___JR_HL 4...,1,256,256
1,0008f3d3-2f85-4973-be9a-1b520b8b59fc___JR_HL 4...,1,256,256
2,0008f3d3-2f85-4973-be9a-1b520b8b59fc___JR_HL 4...,1,256,256
3,002efba9-09b3-43de-93b7-5c2460185cde___JR_HL 9...,1,256,256
4,0048afb8-b950-4c57-9e72-7e26282327ee___JR_HL 9...,1,256,256
5,0048afb8-b950-4c57-9e72-7e26282327ee___JR_HL 9...,1,256,256
6,0048afb8-b950-4c57-9e72-7e26282327ee___JR_HL 9...,1,256,256
7,005f183c-0a73-4738-91f7-c0a0e02cd9e3___JR_HL 9...,1,256,256
8,0086a8c7-1440-423e-939d-d0567b1fc4e4___JR_HL 4...,1,256,256
9,0086a8c7-1440-423e-939d-d0567b1fc4e4___JR_HL 4...,1,256,256


In [23]:
# Summary statistics of the measured image heights.
images_dataframe["height"].describe()

count    4208.0
mean      256.0
std         0.0
min       256.0
25%       256.0
50%       256.0
75%       256.0
max       256.0
Name: height, dtype: float64

In [24]:
# Summary statistics of the measured image widths.
images_dataframe["width"].describe()

count    4208.0
mean      256.0
std         0.0
min       256.0
25%       256.0
50%       256.0
75%       256.0
max       256.0
Name: width, dtype: float64

## Split train validation test set

In [26]:
import os
import random
import shutil

import joblib


def split_train_validation_test_images(
    train_dir,
    val_dir,
    test_dir,
    images_directory,
    train_set_ratio,
    validation_set_ratio,
    test_set_ratio,
):
    """Copy the labelled images into train / validation / test folders.

    For each class folder (``healthy`` and ``powdery_mildew``) inside
    ``images_directory``, the file names are sorted, shuffled with a fixed
    seed, and then copied (the originals are left in place) into
    ``<train_dir>/<label>``, ``<val_dir>/<label>`` and ``<test_dir>/<label>``.
    Destination folders are created when missing.

    Parameters
    ----------
    train_dir, val_dir, test_dir : str
        Root folders of the three splits. A trailing slash is optional.
    images_directory : str
        Folder containing one sub-folder per class.
    train_set_ratio, validation_set_ratio, test_set_ratio : float
        Fractions of each class assigned to each split; they must sum to 1.0.
        The train and validation counts are rounded down and the test split
        receives all remaining files.

    Returns
    -------
    None
        If the ratios do not sum to 1.0 a message is printed and nothing is
        copied.
    """
    # Compare with a tolerance: sums of decimal fractions are often not exactly
    # 1.0 in binary floating point (e.g. 0.6 + 0.3 + 0.1).
    ratio_total = train_set_ratio + validation_set_ratio + test_set_ratio
    if abs(ratio_total - 1.0) > 1e-9:
        print("train_set_ratio + validation_set_ratio + test_set_ratio should sum 1.0")
        return

    # class labels (one sub-folder per label)
    labels = ("healthy", "powdery_mildew")

    for label in labels:
        source_dir = os.path.join(images_directory, label)

        # os.listdir returns entries in arbitrary order; sorting first makes
        # the seeded shuffle produce the same split on every machine.
        files = sorted(os.listdir(source_dir))
        random.Random(12345).shuffle(files)

        train_end = int(len(files) * train_set_ratio)
        validation_end = train_end + int(len(files) * validation_set_ratio)

        partitions = (
            (train_dir, files[:train_end]),
            (val_dir, files[train_end:validation_end]),
            (test_dir, files[validation_end:]),
        )

        for split_root, split_files in partitions:
            target_dir = os.path.join(split_root, label)
            # shutil.copy does not create missing parent folders.
            os.makedirs(target_dir, exist_ok=True)
            for file_name in split_files:
                shutil.copy(
                    os.path.join(source_dir, file_name),
                    os.path.join(target_dir, file_name),
                )

The data is split using the following conventional ratios:
* Training set: 70% of the data.
* Validation set: 10% of the data.
* Test set: 20% of the data.

In [28]:
# Copy the images of each class into the train / validation / test folders
# using a 70 / 10 / 20 split.
split_train_validation_test_images(
    images_directory=images_directory,
    train_dir=train_directory,
    val_dir=val_directory,
    test_dir=test_directory,
    train_set_ratio=0.7,
    validation_set_ratio=0.1,
    test_set_ratio=0.2,
)

---

---