# Data collection

## Objectives

* Fetch data from Kaggle and save as raw data

## Inputs

* Kaggle JSON file - the authentication token.

## Outputs

* Generate Dataset: inputs/datasets/raw/cherry-leaves, split into train, validation and test sub-folders.

## Additional Comments

* No additional comments.



---

# Change working directory

* We are assuming you will store the notebooks in a subfolder, therefore when running the notebook in the editor, you will need to change the working directory

We need to change the working directory from its current folder to its parent folder
* We access the current directory with os.getcwd()

In [3]:
import os
# Record the directory the notebook is running from; the following cell
# derives the new working directory from this value.
current_dir = os.getcwd()
# Bare expression as the last line so the notebook displays the path.
current_dir

'/workspace/mildew-detection/jupyter_notebooks'

We want to make the parent of the current directory the new current directory
* os.path.dirname() gets the parent directory
* os.chdir() defines the new current directory

In [4]:
# Make the parent of the recorded directory the working directory, so the
# relative paths used later in the notebook resolve from there.
os.chdir(os.path.dirname(current_dir))
print("You set a new current directory")

You set a new current directory


Confirm the new current directory

In [5]:
# Re-read the working directory to confirm the change took effect.
current_dir = os.getcwd()
# Bare expression as the last line so the notebook displays the path.
current_dir

'/workspace/mildew-detection'

# Install Kaggle

In [4]:
# Install the Kaggle package, pinned to version 1.5.12.
%pip install kaggle==1.5.12

Collecting kaggle==1.5.12
  Downloading kaggle-1.5.12.tar.gz (58 kB)
  Preparing metadata (setup.py) ... [?25ldone
Collecting tqdm (from kaggle==1.5.12)
  Downloading tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Collecting python-slugify (from kaggle==1.5.12)
  Downloading python_slugify-8.0.4-py2.py3-none-any.whl.metadata (8.5 kB)
Collecting text-unidecode>=1.3 (from python-slugify->kaggle==1.5.12)
  Downloading text_unidecode-1.3-py2.py3-none-any.whl.metadata (2.4 kB)
Downloading python_slugify-8.0.4-py2.py3-none-any.whl (10 kB)
Downloading tqdm-4.67.1-py3-none-any.whl (78 kB)
Downloading text_unidecode-1.3-py2.py3-none-any.whl (78 kB)
Building wheels for collected packages: kaggle
  Building wheel for kaggle (setup.py) ... [?25ldone
[?25h  Created wheel for kaggle: filename=kaggle-1.5.12-py3-none-any.whl size=73077 sha256=a015c23890e4ee329d4612ca321a64c5388565763fc41b80e67b1ea216c3e726
  Stored in directory: /home/gitpod/.cache/pip/wheels/f5/69/4d/d701fc604b9fb09be59718b4056fd55

Change the Kaggle configuration directory to the current working directory and set permissions for the Kaggle authentication JSON.

In [6]:
# Tell the Kaggle CLI to look for its configuration in the current working
# directory.
os.environ['KAGGLE_CONFIG_DIR'] = os.getcwd()
# Restrict kaggle.json to owner read/write only (mode 600), since it holds
# the authentication token.
! chmod 600 kaggle.json

Set the Kaggle Dataset and Download it.

In [7]:
# Kaggle dataset identifier in "<owner>/<dataset>" form.
KaggleDatasetPath = "codeinstitute/cherry-leaves"
# Folder that receives the downloaded archive; reused by the unzip cell.
DestinationFolder = "inputs/datasets/raw"
# -d selects the dataset, -p sets the download location.
! kaggle datasets download -d {KaggleDatasetPath} -p {DestinationFolder}

Downloading cherry-leaves.zip to inputs/datasets/raw
100%|█████████████████████████████████████▉| 55.0M/55.0M [00:01<00:00, 44.3MB/s]
100%|██████████████████████████████████████| 55.0M/55.0M [00:01<00:00, 32.0MB/s]


Unzip the file and delete the zip file.

In [8]:
import zipfile

# Location of the archive that the download cell saved.
zip_path = DestinationFolder + '/cherry-leaves.zip'

# Extract the whole archive into the folder it was downloaded to.
with zipfile.ZipFile(zip_path, 'r') as archive:
    archive.extractall(DestinationFolder)

# The archive is no longer needed once its contents are extracted.
os.remove(zip_path)

---

# Data preparation

Data cleaning

Check for and remove any non-image files in the folder.

In [9]:
def remove_non_image_file(my_data_dir):
    """Delete every non-image file found in the class folders of a dataset.

    Each immediate sub-directory of ``my_data_dir`` is scanned one level
    deep. Files whose name does not end in ``.png``, ``.jpg`` or ``.jpeg``
    (case-insensitive) are deleted, and the number of image and non-image
    files is printed for each folder.

    Entries of ``my_data_dir`` that are not directories are ignored, and so
    are directories nested inside a class folder.

    Args:
        my_data_dir: Path of the directory that contains one sub-folder per
            class label.

    Returns:
        None. The per-folder counts are reported with ``print``.
    """
    # Allowed image extensions (compared against the lower-cased file name)
    image_extensions = ('.png', '.jpg', '.jpeg')

    for folder in os.listdir(my_data_dir):
        folder_path = os.path.join(my_data_dir, folder)

        # Only class folders are processed; loose files at the top level
        # are left alone.
        if not os.path.isdir(folder_path):
            continue

        image_count = 0
        non_image_count = 0

        for file_name in os.listdir(folder_path):
            file_path = os.path.join(folder_path, file_name)

            # os.remove() cannot delete a directory and would raise, so a
            # nested directory is skipped rather than treated as a file.
            if os.path.isdir(file_path):
                continue

            if file_name.lower().endswith(image_extensions):
                image_count += 1
            else:
                os.remove(file_path)  # Remove non-image file
                non_image_count += 1

        # Print results for each folder
        print(f"Folder: {folder} - has image files: {image_count}")
        print(f"Folder: {folder} - has non-image files: {non_image_count}")

In [10]:
# Pass the dataset root rather than the individual class folders:
# remove_non_image_file() scans each sub-directory of the path it is given,
# so pointing it at a class folder (whose entries are files) cleans nothing.
remove_non_image_file(my_data_dir='inputs/datasets/raw/cherry-leaves')

Split the data into train, validation and test sets

In [15]:
import os
import shutil
import random
import joblib


def split_train_validation_test_images(my_data_dir, train_set_ratio, validation_set_ratio, test_set_ratio):
    """Split the images of each class folder into train/validation/test sets.

    Every sub-directory of ``my_data_dir`` is treated as a class label. Its
    files are shuffled and moved into ``<my_data_dir>/train/<label>``,
    ``<my_data_dir>/validation/<label>`` and ``<my_data_dir>/test/<label>``.
    The train and validation sizes are the file count multiplied by the
    respective ratio, truncated to an integer; the test set receives the
    remaining files. The emptied label folder is removed afterwards.

    If ``my_data_dir`` already contains an entry named ``test`` the data is
    assumed to be split already and nothing is done.

    Args:
        my_data_dir: Directory that contains one sub-folder per class label.
        train_set_ratio: Fraction of each class assigned to the train set.
        validation_set_ratio: Fraction assigned to the validation set.
        test_set_ratio: Fraction assigned to the test set. Only used to
            validate that the three ratios sum to 1.0.

    Returns:
        None. If the ratios do not sum to 1.0 a message is printed and no
        file is touched.
    """
    import math

    # Compare with a tolerance: exact float equality rejects valid inputs,
    # e.g. 0.7 + 0.2 + 0.1 evaluates to 0.9999999999999999.
    ratio_sum = train_set_ratio + validation_set_ratio + test_set_ratio
    if not math.isclose(ratio_sum, 1.0, abs_tol=1e-9):
        print("train_set_ratio + validation_set_ratio + test_set_ratio should sum to 1.0")
        return

    entries = os.listdir(my_data_dir)
    if 'test' in entries:
        # Already split on a previous run; leave the data as it is.
        return

    # Only directories are class labels; stray files next to the class
    # folders are ignored instead of being listed as if they were folders.
    labels = [
        entry for entry in entries
        if os.path.isdir(os.path.join(my_data_dir, entry))
    ]
    subset_names = ('train', 'validation', 'test')

    # create train, validation, test folders with classes labels sub-folder
    for subset in subset_names:
        for label in labels:
            os.makedirs(name=os.path.join(my_data_dir, subset, label))

    for label in labels:
        label_dir = os.path.join(my_data_dir, label)
        files = os.listdir(label_dir)
        random.shuffle(files)

        train_end = int(len(files) * train_set_ratio)
        validation_end = train_end + int(len(files) * validation_set_ratio)

        # Consecutive slices of the shuffled list; whatever is left after
        # train and validation goes to the test set.
        files_by_subset = {
            'train': files[:train_end],
            'validation': files[train_end:validation_end],
            'test': files[validation_end:],
        }

        for subset in subset_names:
            for file_name in files_by_subset[subset]:
                shutil.move(os.path.join(label_dir, file_name),
                            os.path.join(my_data_dir, subset, label, file_name))

        # Every file has been moved out, so the original folder is empty.
        os.rmdir(label_dir)

In [16]:
# Split every class folder: 70% train, 10% validation, 20% test.
split_train_validation_test_images(
    my_data_dir="inputs/datasets/raw/cherry-leaves",
    train_set_ratio=0.7,
    validation_set_ratio=0.1,
    test_set_ratio=0.2,
)


---

# Push files to Repo

* If you don't need to push files to Repo, you may replace this section with "Conclusions and Next Steps" and state your conclusions and next steps.

In [18]:
import os

# exist_ok=True keeps the cell safe to re-run: an already existing folder is
# not an error. Any other OS-level failure (e.g. missing permissions) is
# still reported instead of being silently lost.
try:
    os.makedirs(name='outputs/datasets/collection', exist_ok=True)
except OSError as e:
    print(e)
