# Data Collection

## Objectives

1. Add kaggle.json file
2. Successfully pull data from Kaggle via API
3. Add json file to git.ignore

## Inputs

- Cherry leaves data from Kaggle API


## Outputs

- Unzipped cherry leaves image folders

## Set Directory

In [1]:
import os
current_dir = os.getcwd()
current_dir

'/workspaces/mildew-detector/jupyter_notebooks'

In [2]:
os.chdir(os.path.dirname(current_dir))
print(current_dir)

/workspaces/mildew-detector/jupyter_notebooks


In [3]:
current_dir = os.getcwd()
current_dir

'/workspaces/mildew-detector'

## Import Packages

In [4]:
import numpy
from PIL import Image

In [None]:
%pip install -r /workspaces/mildew-detector/requirements.txt

## Install Kaggle

In [None]:
%pip install kaggle==1.5.12

#### Configure Kaggle

In [None]:
os.environ['KAGGLE_CONFIG_DIR'] = os.getcwd()
! chmod 600 kaggle.json

#### Set Kaggle Dataset and Download

In [None]:
KaggleDatasetPath = "codeinstitute/cherry-leaves"
DestinationFolder = "inputs/mildew_dataset"   
! kaggle datasets download -d {KaggleDatasetPath} -p {DestinationFolder}

Unzip downloaded file and delete zip file.

In [None]:
import zipfile
with zipfile.ZipFile(DestinationFolder + '/cherry-leaves.zip', 'r') as zip_ref:
    zip_ref.extractall(DestinationFolder)

os.remove(DestinationFolder + '/cherry-leaves.zip')

## Data Preparation

#### Data Cleaning

Check and remove non-image files

In [5]:
def remove_non_image_file(my_data_dir):
    image_extension = ('.png', '.jpg', '.jpeg')
    folders = os.listdir(my_data_dir)
    for folder in folders:
        files = os.listdir(my_data_dir + '/' + folder)
        # print(files)
        i = []
        j = []
        for given_file in files:
            if not given_file.lower().endswith(image_extension):
                file_location = my_data_dir + '/' + folder + '/' + given_file
                os.remove(file_location)  # remove non image file
                i.append(1)
            else:
                j.append(1)
                pass
        print(f"Folder: {folder} - has image file", len(j))
        print(f"Folder: {folder} - has non-image file", len(i))

In [6]:
remove_non_image_file(my_data_dir='inputs/mildew_dataset/cherry-leaves/')

Folder: powdery_mildew - has image file 2104
Folder: powdery_mildew - has non-image file 0


IsADirectoryError: [Errno 21] Is a directory: 'inputs/mildew_dataset/cherry-leaves//resized/powdery_mildew'

#### Resize Images to Smaller Pixel Size
This is to avoid issues with a too-large slug for deployment later down the line. 

Note - Function in its current form requires the newPath variable to be amended to send the resized images to the correct folder.

In [None]:
import os
from os import listdir

In [None]:
def image_resize(path):
    print(path)
    newPath = 'inputs/mildew_dataset/cherry-leaves/resized/powdery_mildew/'
    print ('newpathtest', newPath)

    for item in os.listdir(path):
        pathway = path + item
        if os.path.isfile(pathway):
            usePath = newPath + item
            this = Image.open(pathway)
            resized = this.resize((100, 100))
            resized.save(usePath, 'JPEG')

        

In [None]:
image_resize(path='inputs/mildew_dataset/cherry-leaves/powdery_mildew/')

In [60]:
def folders_comparison(my_data_dir):
    print(my_data_dir)
    count = []
    for item in os.listdir(my_data_dir):
        pathway = my_data_dir + item
        if os.path.isfile(pathway):
            count.append('item')
            x = len(count)
    print('Number of files -', x)
    print('Folder size in bytes -', os.path.getsize(my_data_dir))
    
            


In [68]:
folders_comparison(my_data_dir='inputs/mildew_dataset/cherry-leaves/resized/healthy/')

inputs/mildew_dataset/cherry-leaves/resized/healthy/
Number of files - 2104
Folder size in bytes - 282624


In [51]:
def folder_size(path):
    for folder in os.listdir(path):
        total = 0
        with os.scandir(path) as it:
            for entry in it:
                if entry.is_file():
                    total += entry.stat().st_size
    return total

In [52]:
folder_size('inputs/mildew_dataset/cherry-leaves/healthy/')

KeyboardInterrupt: 

### Split Train and Validation Test Set

Training, validation and test sets split by 0.7, 0.1 and 0.2 ratios. 

In [None]:
import os
import shutil
import random
import joblib


def split_train_validation_test_images(my_data_dir, train_set_ratio, validation_set_ratio, test_set_ratio):

    if train_set_ratio + validation_set_ratio + test_set_ratio != 1.0:
        print("train_set_ratio + validation_set_ratio + test_set_ratio should sum to 1.0")
        return

    # gets classes labels
    labels = os.listdir(my_data_dir)  # it should get only the folder name
    if 'test' in labels:
        pass
    else:
        # create train, test folders with classes labels sub-folder
        for folder in ['train', 'validation', 'test']:
            for label in labels:
                os.makedirs(name=my_data_dir + '/' + folder + '/' + label)

        for label in labels:

            files = os.listdir(my_data_dir + '/' + label)
            random.shuffle(files)

            train_set_files_qty = int(len(files) * train_set_ratio)
            validation_set_files_qty = int(len(files) * validation_set_ratio)

            count = 1
            for file_name in files:
                if count <= train_set_files_qty:
                    # move a given file to the train set
                    shutil.move(my_data_dir + '/' + label + '/' + file_name,
                                my_data_dir + '/train/' + label + '/' + file_name)

                elif count <= (train_set_files_qty + validation_set_files_qty):
                    # move a given file to the validation set
                    shutil.move(my_data_dir + '/' + label + '/' + file_name,
                                my_data_dir + '/validation/' + label + '/' + file_name)

                else:
                    # move given file to test set
                    shutil.move(my_data_dir + '/' + label + '/' + file_name,
                                my_data_dir + '/test/' + label + '/' + file_name)

                count += 1

            os.rmdir(my_data_dir + '/' + label)


In [None]:
split_train_validation_test_images(my_data_dir=f"inputs/mildew_dataset/cherry-leaves/",
                                   train_set_ratio=0.7,
                                   validation_set_ratio=0.1,
                                   test_set_ratio=0.2
                                   )