# Data Collection

In [1]:
# Import packages

import numpy
import os


In [2]:
# Record the directory the kernel is currently running in (displayed as the
# cell output). The following cell changes the working directory to its parent.

current_dir = os.getcwd()
current_dir

'/workspace/Mildew-Detection-in-Cherry-Leaves-P5/jupyter_notebooks'

In [3]:
# Make the parent of the directory stored in `current_dir` the working directory.
# NOTE: `current_dir` is reassigned in the next cell, so re-running this cell
# after that one moves the working directory up one more level.
os.chdir(os.path.dirname(current_dir))
print("You set a new current directory")

You set a new current directory


In [4]:
# Re-read the working directory to confirm the change made by the previous cell
current_dir = os.getcwd()
current_dir

'/workspace/Mildew-Detection-in-Cherry-Leaves-P5'

In [None]:
# install kaggle
!pip install kaggle

In [8]:
# Setting the kaggle configuration directory to the current working directory
# (presumably where the Kaggle CLI then looks for kaggle.json - confirm against the Kaggle docs).

os.environ["KAGGLE_CONFIG_DIR"] = os.getcwd()
# Restrict kaggle.json in the working directory to owner read/write (mode 600)
! chmod 600 kaggle.json

In [11]:
# Download the Kaggle dataset archive.
# KaggleDatasetPath identifies the dataset; DestinationFolder is the download
# target, relative to the current working directory. Both names are reused by
# the unzip cell that follows.

KaggleDatasetPath = "codeinstitute/cherry-leaves"
DestinationFolder = "inputs/cherry_leaves-dataset"
! kaggle datasets download -d {KaggleDatasetPath} -p {DestinationFolder}

Downloading cherry-leaves.zip to inputs/cherry_leaves-dataset
 75%|████████████████████████████▎         | 41.0M/55.0M [00:01<00:00, 25.7MB/s]
100%|██████████████████████████████████████| 55.0M/55.0M [00:01<00:00, 30.7MB/s]


In [None]:
# Unzip the downloaded archive(s) into DestinationFolder, then delete the
# zip file(s) - the `&&` means removal only runs if unzip succeeded.

! unzip {DestinationFolder}/*.zip -d {DestinationFolder} \
    && rm {DestinationFolder}/*.zip

# Data preparation

## Data cleaning
* Check for and remove non-image files

In [5]:
def remove_non_image_file(my_data_dir):
    """
    Delete non-image files from every sub-folder of `my_data_dir`.

    A file counts as an image when its name ends with .png, .jpg or .jpeg
    (case-insensitive). For each sub-folder, the number of image files kept
    and the number of non-image files removed are printed.

    Parameters
    ----------
    my_data_dir : str
        Directory whose immediate sub-folders contain the files to check.

    Notes
    -----
    Entries of `my_data_dir` that are not directories, and entries inside a
    sub-folder that are not regular files, are skipped.
    """
    image_extension = ('.png', '.jpg', '.jpeg')

    for folder in os.listdir(my_data_dir):
        folder_path = os.path.join(my_data_dir, folder)
        if not os.path.isdir(folder_path):
            continue

        image_count = 0
        removed_count = 0
        for given_file in os.listdir(folder_path):
            # The file lives inside the sub-folder, so the path must include it
            file_location = os.path.join(folder_path, given_file)
            if not os.path.isfile(file_location):
                continue

            if given_file.lower().endswith(image_extension):
                image_count += 1
            else:
                os.remove(file_location)  # removes non image files
                removed_count += 1

        print(f"Folder: {folder} - has image file", image_count)
        print(f"Folder: {folder} - has non-image file", removed_count)

In [3]:
# Clean the extracted dataset in place; per-folder counts are printed below
remove_non_image_file(
    my_data_dir='/workspace/Mildew-Detection-in-Cherry-Leaves-P5/inputs/cherry_leaves-dataset/cherry-leaves'
)

Folder: healthy - has image file 2104
Folder: healthy - has non-image file 0
Folder: powdery_mildew - has image file 2104
Folder: powdery_mildew - has non-image file 0


## Split into train, validation and test sets

In [11]:
import os
import shutil
import random
import joblib

def split_train_validation_test_images(my_data_dir, train_set_ratio, validation_set_ratio, test_set_ratio):
    """
    Randomly move the images of each class folder into train/validation/test folders.

    `my_data_dir` is expected to contain one sub-folder per class label. After
    the call the layout is `my_data_dir/<train|validation|test>/<label>/<file>`
    and the original `my_data_dir/<label>` folders have been removed.

    Parameters
    ----------
    my_data_dir : str
        Directory containing one sub-folder per class label.
    train_set_ratio, validation_set_ratio, test_set_ratio : float
        Fractions of each class assigned to the respective set; they must sum
        to 1.0. Train and validation sizes are truncated to whole files and the
        test set receives whatever remains.

    Returns
    -------
    None
        If the ratios do not sum to 1.0 a message is printed and nothing is
        moved. If `my_data_dir` already contains a 'test' entry the data is
        treated as already split and nothing is moved.
    """
    ratio_total = train_set_ratio + validation_set_ratio + test_set_ratio
    # Compare with a tolerance: exact float equality rejects valid inputs,
    # e.g. 0.7 + 0.2 + 0.1 evaluates to 0.9999999999999999.
    if abs(ratio_total - 1.0) > 1e-9:
        print("train_set_ratio + validation_set_ratio + test_set_ratio should sum 1.0")
        return

    entries = os.listdir(my_data_dir)
    if 'test' in entries:
        # A previous run already created the split folders
        return

    # get classes labels - only folders are class labels, stray files are ignored
    labels = [entry for entry in entries
              if os.path.isdir(os.path.join(my_data_dir, entry))]

    # create train, validation, test folders with classes labels sub-folder
    for folder in ['train', 'validation', 'test']:
        for label in labels:
            os.makedirs(name=os.path.join(my_data_dir, folder, label))

    for label in labels:
        label_dir = os.path.join(my_data_dir, label)

        # Sort first so the shuffle is reproducible when the caller seeds `random`
        files = sorted(os.listdir(label_dir))
        random.shuffle(files)

        train_set_files_qty = int(len(files) * train_set_ratio)
        validation_set_files_qty = int(len(files) * validation_set_ratio)
        validation_upper_bound = train_set_files_qty + validation_set_files_qty

        for index, file_name in enumerate(files):
            if index < train_set_files_qty:
                target_set = 'train'
            elif index < validation_upper_bound:
                target_set = 'validation'
            else:
                target_set = 'test'

            shutil.move(os.path.join(label_dir, file_name),
                        os.path.join(my_data_dir, target_set, label, file_name))

        # Every file has been moved out, so the original class folder is empty
        os.rmdir(label_dir)

* The training set receives 70% of the data.
* The validation set receives 10% of the data.
* The test set receives 20% of the data.

In [14]:
# Split each class folder 70/10/20 into train/validation/test.
# (Plain string literal: the path has no placeholders, so no f-prefix is needed.)
split_train_validation_test_images(
    my_data_dir="/workspace/Mildew-Detection-in-Cherry-Leaves-P5/inputs/cherry_leaves-dataset/cherry-leaves",
    train_set_ratio=0.7,
    validation_set_ratio=0.1,
    test_set_ratio=0.2,
)