# Data Collection

## Objectives

* Fetch data from Kaggle, save it as raw data, clean it and split it into train, validation and test sets

## Inputs

* kaggle.json authentication token

## Outputs

* Data divided into train, test and validation datasets in inputs/cherry-leaves-data/cherry-leaves


---

# Change working directory

* We are assuming you will store the notebooks in a subfolder, therefore when running the notebook in the editor, you will need to change the working directory

We need to change the working directory from its current folder to its parent folder
* We access the current directory with os.getcwd()

In [None]:
# Capture the notebook's current working directory; the bare expression on the
# last line is there so the notebook displays the value.
import os
current_dir = os.getcwd()
current_dir

We want to make the parent of the current directory the new current directory
* os.path.dirname() gets the parent directory
* os.chdir() defines the new current directory

In [None]:
# Make the parent of the directory stored in `current_dir` the working directory.
# NOTE: once `current_dir` has been refreshed (the confirmation cell below does
# that), re-running this cell climbs one more level up.
os.chdir(os.path.dirname(current_dir))
print("You set a new current directory")

Confirm the new current directory

In [None]:
# Re-read the working directory so the change can be checked in the cell output.
current_dir = os.getcwd()
current_dir

# Install Packages

Install the project requirements and the Kaggle client, then download and unzip the cherry-leaves dataset.

In [None]:
# Install the project's dependencies listed in requirements.txt.
# NOTE(review): the path is absolute and tied to a checkout under
# /workspaces/mildew-detection-cherry-leaves; it will not resolve if the
# repository lives anywhere else.
pip install -r /workspaces/mildew-detection-cherry-leaves/requirements.txt

In [None]:
import numpy

In [None]:
# Install the Kaggle package, pinned to version 1.5.12; the `kaggle` command is
# used further down to download the dataset.
pip install kaggle==1.5.12

In [None]:
# Set KAGGLE_CONFIG_DIR to the current working directory (presumably so the
# Kaggle client looks for kaggle.json there - confirm against the Kaggle docs).
os.environ['KAGGLE_CONFIG_DIR'] = os.getcwd()
# Restrict the token file to owner read/write only.
! chmod 600 kaggle.json

In [None]:
# Kaggle dataset identifier and the local folder the download is written to.
KaggleDatasetPath = "codeinstitute/cherry-leaves"
DestinationFolder = "inputs/cherry-leaves-data"   
# Shell out to the Kaggle client; the {…} placeholders are filled from the
# Python variables above.
! kaggle datasets download -d {KaggleDatasetPath} -p {DestinationFolder}

In [None]:
import zipfile

# Path of the archive that was downloaded into DestinationFolder.
archive_path = DestinationFolder + '/cherry-leaves.zip'

# Unpack the whole archive into the same folder it was downloaded to.
with zipfile.ZipFile(archive_path, mode='r') as archive:
    archive.extractall(path=DestinationFolder)

# Delete the archive once its contents have been extracted.
os.remove(archive_path)

---

In [None]:
# Print the version of the `python` executable that the shell resolves.
! python --version

# Data Cleaning

Remove any non-image files from the dataset folders, then split the images into train, validation and test sets.

In [None]:
def remove_non_image_files(image_data_dir):
    """Delete every non-image file from the folders of an image dataset.

    Each sub-directory of ``image_data_dir`` is scanned. Files whose names do
    not end in ``.png``, ``.jpg`` or ``.jpeg`` (case-insensitive) are deleted.
    For every scanned folder, the number of image files kept and non-image
    files removed is printed.

    Entries the scan cannot process are left untouched instead of raising:
    plain files directly inside ``image_data_dir``, and directories nested
    inside a scanned folder (which is what the folders contain once the
    dataset has been split into train/validation/test).

    Args:
        image_data_dir: Path of the directory holding one folder per class.

    Returns:
        None.
    """
    image_extensions = ('.png', '.jpg', '.jpeg')

    for folder in os.listdir(image_data_dir):
        folder_path = os.path.join(image_data_dir, folder)
        if not os.path.isdir(folder_path):
            # A stray file in the dataset root is not a folder to scan.
            continue

        image_count = 0
        non_image_count = 0
        for given_file in os.listdir(folder_path):
            file_location = os.path.join(folder_path, given_file)
            if not os.path.isfile(file_location):
                # os.remove cannot delete directories; skip nested folders.
                continue
            if given_file.lower().endswith(image_extensions):
                image_count += 1
            else:
                os.remove(file_location)
                non_image_count += 1

        print(f"The number of image files in '{folder}' folder is", image_count)
        print(f"The number of non-image files in '{folder}' folder is", non_image_count)

In [None]:
# Clean the extracted dataset: remove non-image files from each folder under
# inputs/cherry-leaves-data/cherry-leaves.
remove_non_image_files(image_data_dir='inputs/cherry-leaves-data/cherry-leaves')

## Split train, validation and test sets

In [None]:
import os
import shutil
import random
import joblib


def split_train_validation_test_images(my_data_dir, train_set_ratio, validation_set_ratio, test_set_ratio,
                                       seed=None):
    """Split a folder-per-class image dataset into train/validation/test sets.

    ``my_data_dir`` is expected to contain one sub-directory per class label.
    The files of each label are shuffled and moved into
    ``<my_data_dir>/train/<label>``, ``<my_data_dir>/validation/<label>`` and
    ``<my_data_dir>/test/<label>``; the emptied label directory is then
    removed. The train and validation sizes are the truncated product of the
    file count and the corresponding ratio; all remaining files go to test.

    If the ratios do not sum to 1.0 a message is printed and nothing is done.
    If ``my_data_dir`` already holds a ``train``, ``validation`` or ``test``
    directory the dataset is treated as already split and nothing is done.

    Args:
        my_data_dir: Directory containing one sub-directory per class label.
        train_set_ratio: Fraction of each label's files for the train set.
        validation_set_ratio: Fraction for the validation set.
        test_set_ratio: Fraction for the test set (used only in the sum check;
            the test set receives whatever is left over).
        seed: Optional seed. When given, each label's files are sorted and
            shuffled with a dedicated generator so the split is reproducible.
            When ``None`` the module-level ``random`` state is used.

    Returns:
        None.
    """
    ratio_sum = train_set_ratio + validation_set_ratio + test_set_ratio
    # Compare with a tolerance: binary floating point makes valid splits such
    # as 0.7 + 0.2 + 0.1 evaluate to 0.9999999999999999 rather than 1.0.
    if abs(ratio_sum - 1.0) > 1e-9:
        print("train_set_ratio + validation_set_ratio + test_set_ratio should sum to 1.0")
        return

    split_names = ('train', 'validation', 'test')

    # Only directories are class labels; stray files are left alone.
    labels = [entry for entry in os.listdir(my_data_dir)
              if os.path.isdir(os.path.join(my_data_dir, entry))]

    # Any split folder means a previous run already happened; do not treat
    # 'train'/'validation'/'test' as class labels and split them again.
    if any(split_name in labels for split_name in split_names):
        return

    # Create <split>/<label> for every combination up front.
    for split_name in split_names:
        for label in labels:
            os.makedirs(os.path.join(my_data_dir, split_name, label), exist_ok=True)

    shuffler = random if seed is None else random.Random(seed)

    for label in labels:
        label_dir = os.path.join(my_data_dir, label)
        files = os.listdir(label_dir)
        if seed is not None:
            # os.listdir order is arbitrary; sort so the seed fully
            # determines the outcome.
            files.sort()
        shuffler.shuffle(files)

        train_set_files_qty = int(len(files) * train_set_ratio)
        validation_set_files_qty = int(len(files) * validation_set_ratio)
        validation_end = train_set_files_qty + validation_set_files_qty

        for position, file_name in enumerate(files):
            if position < train_set_files_qty:
                split_name = 'train'
            elif position < validation_end:
                split_name = 'validation'
            else:
                split_name = 'test'
            shutil.move(os.path.join(label_dir, file_name),
                        os.path.join(my_data_dir, split_name, label, file_name))

        # Every file has been moved out, so the label folder is now empty.
        os.rmdir(label_dir)


**Per convention:**
* The training set is 70% of data
* The validation set is 10% of data
* The test set is 20% of data

In [None]:
# Split the cleaned dataset: 70% train, 10% validation, 20% test.
split_train_validation_test_images(
    my_data_dir="inputs/cherry-leaves-data/cherry-leaves",
    train_set_ratio=0.7,
    validation_set_ratio=0.1,
    test_set_ratio=0.2,
)

---

NOTE

* You may add as many sections as you want, as long as it supports your project workflow.
* All notebook cells should be run top-down (do not create a workflow in which, at a given point, you need to go back to a previous cell to execute some task, such as refreshing a variable's content)