# **Data Collection Notebook**

## Objectives

* Fetch data from Kaggle and save as raw data.
* Prepare data for future processes.

## Inputs

* Kaggle JSON file - authentication token for dataset access.

## Outputs

* Generate Dataset: inputs/cherry-leaves-dataset/cherry-leaves (split into train, validation and test sets)

## Additional Comments

* Python 3.8.18 used as kernel during runtime.



---

# Set up notebook workspace

* change working directory to parent for requirements access
* confirm python version for continuity (3.8.18)
* install requirements

In [None]:
# Notebooks are saved in a subdirectory, while requirements.txt is read from
# the parent directory, so the working directory has to be moved up before
# installing packages.
import os

# Capture the current working directory (echoed as the cell output)
current_dir = os.getcwd() 
current_dir

In [None]:
# Move up one level, to the parent of the directory captured in current_dir.
# NOTE: current_dir is refreshed by the next cell, so re-running this cell
# afterwards moves up one more level each time.
os.chdir(os.path.dirname(current_dir))  # change to parent dir
print("You set a new current directory")

In [None]:
# Re-read the working directory after the chdir and echo it for confirmation
current_dir = os.getcwd()
current_dir

### Confirm Python Version

In [None]:
# Print the interpreter version found on PATH; this notebook was run with
# Python 3.8.18, so the same version is expected here for continuity.
! python --version

### Install packages

In [None]:
# Show the working directory so it is clear which requirements.txt is used
!pwd
# Install the project requirements into the running kernel's environment
%pip install -r requirements.txt


# Collect Data via Kaggle API

Install Kaggle

In [None]:
# Install the Kaggle API client, pinned to version 1.5.12
%pip install kaggle==1.5.12

Configure Kaggle to use the auth JSON by setting its config directory to the current directory, and restrict the token file's permissions.

In [8]:
# Point the Kaggle client's config directory at the current working
# directory, which is where kaggle.json is expected to be.
os.environ['KAGGLE_CONFIG_DIR'] = os.getcwd()
# Restrict the token file to owner read/write only
! chmod 600 kaggle.json

Set Kaggle Dataset and download it.

In [None]:
# Kaggle dataset identifier passed to the CLI's -d option
KaggleDatasetPath = "codeinstitute/cherry-leaves"
DestinationFolder = "inputs/cherry-leaves-dataset"  # download target, relative to the working directory
# Download the dataset into DestinationFolder; later cells expect the
# archive to be named cherry-leaves.zip there.
! kaggle datasets download -d {KaggleDatasetPath} -p {DestinationFolder}

Unzip downloaded dataset file

In [10]:
import zipfile

try:
    # Archive written by the Kaggle download step
    archive_path = f'{DestinationFolder}/cherry-leaves.zip'
    # Unpack everything next to the archive
    with zipfile.ZipFile(archive_path, mode='r') as archive:
        archive.extractall(path=DestinationFolder)
except Exception as error:
    # Any failure is only reported, so the notebook carries on to the next cell
    print(error)


Delete redundant zip file

In [11]:
# The archive has been unpacked, so the zip itself is no longer needed
os.remove(f'{DestinationFolder}/cherry-leaves.zip')

---

# Data Preparation

# Data Cleaning

* remove any non-image files

In [None]:
def remove_non_image_files(my_data_dir):
    """
    Delete every file that is not a png, jpg or jpeg image from each
    sub-folder of my_data_dir and print a per-folder summary.

    File names are compared case-insensitively against the suffixes
    '.png', '.jpg' and '.jpeg'. Entries of my_data_dir that are not
    directories are left untouched. A file that cannot be removed has
    its error printed and is not counted as removed.

    Args:
        my_data_dir: path of the directory whose sub-folders are scanned.

    Returns:
        None

    """
    print('Removing non image files...\n')
    # Every suffix carries its leading dot so that a name such as
    # 'notjpeg' is not mistaken for an image.
    image_extension = ('.png', '.jpg', '.jpeg')

    for folder in os.listdir(my_data_dir):
        folder_path = f'{my_data_dir}/{folder}'
        if not os.path.isdir(folder_path):
            # A stray file next to the class folders cannot be listed;
            # skip it instead of failing.
            continue

        image_count = 0
        non_image_count = 0

        # iterate through all files in each folder
        for given_file in os.listdir(folder_path):
            if given_file.lower().endswith(image_extension):
                image_count += 1
                continue

            file_location = f'{folder_path}/{given_file}'
            try:
                os.remove(file_location)  # remove non image file
            except OSError as e:
                # e.g. a nested directory or a permission problem:
                # report it and keep going with the remaining files
                print(e)
            else:
                non_image_count += 1

        print(f'Folder: {folder} has - {image_count} image files')
        print(f'Folder: {folder} has - {non_image_count} non image files, which have been removed')

In [None]:
# Clean the extracted dataset: one pass over every folder inside cherry-leaves
remove_non_image_files('inputs/cherry-leaves-dataset/cherry-leaves')

# Split train validation test sets

In [None]:
import os
import shutil
import random

def split_train_validation_test_images(my_data_dir, train_set_ratio, validation_set_ratio, test_set_ratio):
    """
    Split images into train, validation and test sets defined in ratio params.
    (expected: train (70%), validation (10%), test (20%))

    Each class folder <my_data_dir>/<label> is emptied into
    <my_data_dir>/train/<label>, <my_data_dir>/validation/<label> and
    <my_data_dir>/test/<label>, then removed. If my_data_dir already
    contains a 'test' entry the data is treated as already split and
    nothing is moved.

    Args:
        my_data_dir: directory holding one sub-folder per class label.
        train_set_ratio: fraction of each class moved to the train set.
        validation_set_ratio: fraction moved to the validation set.
        test_set_ratio: fraction for the test set; the test set receives
            whatever is left after the train and validation files.

    Returns:
        None. Progress and errors are reported with print().
    """
    # confirm ratios total 1.0; compare with a tolerance because sums such
    # as 0.6 + 0.3 + 0.1 are not exactly 1.0 in binary floating point
    ratio_total = train_set_ratio + validation_set_ratio + test_set_ratio
    if abs(ratio_total - 1.0) > 1e-9:
        print('Ratios should total 1.0.')
        print('You entered:\n')
        print(f'Train ratio: {train_set_ratio}')
        print(f'Validation ratio: {validation_set_ratio}')
        print(f'Test ratio: {test_set_ratio}')
        return

    entries = os.listdir(my_data_dir)
    if 'test' in entries:
        # already split on a previous run
        print('Done!')
        return

    # get classes labels: only directories can be class folders
    labels = [entry for entry in entries
              if os.path.isdir(f'{my_data_dir}/{entry}')]

    try:
        # create train, validation, test folders with classes labels sub-folder
        for folder in ['train', 'validation', 'test']:
            for label in labels:
                os.makedirs(name=f'{my_data_dir}/{folder}/{label}')

        for label in labels:
            label_dir = f'{my_data_dir}/{label}'

            # os.listdir order is arbitrary, so sort first: the fixed seed
            # then yields the same split on every machine. A private RNG
            # keeps the global random state untouched.
            files = sorted(os.listdir(label_dir))
            random.Random(42).shuffle(files)

            train_end = int(len(files) * train_set_ratio)
            validation_end = train_end + int(len(files) * validation_set_ratio)

            for index, file_name in enumerate(files):
                if index < train_end:
                    target_set = 'train'
                elif index < validation_end:
                    target_set = 'validation'
                else:
                    target_set = 'test'
                shutil.move(f'{label_dir}/{file_name}',
                            f'{my_data_dir}/{target_set}/{label}/{file_name}')

            # the class folder is empty now
            os.rmdir(label_dir)

    except OSError as e:
        # report the failure and stop without claiming success
        print(e)
        return
    print('Done!')





Conventional ratios will be followed here:
* The training set receives 0.70 of the data.
* The validation set receives 0.10 of the data.
* The test set receives 0.20 of the data.

In [None]:
# Split every class folder of the cleaned dataset 70/10/20 into
# train, validation and test sub-folders
split_train_validation_test_images(
    my_data_dir='inputs/cherry-leaves-dataset/cherry-leaves',
    train_set_ratio=0.7,
    validation_set_ratio=0.1,
    test_set_ratio=0.2
)

---

# Prepare for push to repository

* Ensure the downloaded files are not added to the repo by including them in the .gitignore file (they can be redownloaded in the workspace if necessary)

In [None]:
import os

# Add dataset inputs folder to gitignore if not already:
# grep -qxF succeeds only when the exact line is already present, so the
# echo after || appends the entry only when it is missing.
!grep -qxF "/inputs/cherry-leaves-dataset/cherry-leaves/" .gitignore || echo "/inputs/cherry-leaves-dataset/cherry-leaves/" >> .gitignore


Confirm the dataset folder is listed in .gitignore before pushing to the repo

In [None]:
# Print the ignore file so the dataset entry can be checked by eye
cat .gitignore

All files are now prepared for data exploration.
Only images are in the dataset according to defined file types. 