# Data Collection

## Objectives

* Fetch data from Kaggle and prepare it for further processes.

## Inputs

*   Kaggle JSON file - the authentication token. 

## Outputs

* Generate Dataset: inputs/datasets/cherry-leaves/


# Import packages

In [2]:
# Use the %cd magic command to change the current directory to the location of requirements.txt
# NOTE(review): this absolute path is specific to one machine; adjust it when running elsewhere.
%cd /Users/simonpaske/Desktop/Project5-Mildew-Detection-in-Cherry-Leaves/
# Use the %pip magic command to install the dependencies from the requirements.txt file
%pip install -r requirements.txt

/Users/simonpaske/Desktop/Project5-Mildew-Detection-in-Cherry-Leaves
Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [3]:
import numpy
import os

# Change working directory

In [4]:
# Record the current working directory; the bare expression on the last line
# makes the notebook display it as the cell output.
current_dir = os.getcwd()
current_dir

'/Users/simonpaske/Desktop/Project5-Mildew-Detection-in-Cherry-Leaves'

We want to make the parent of the current directory the new current directory.
* os.path.dirname() gets the parent directory
* os.chdir() defines the new current directory

In [5]:
# Set the working directory to the project folder using an explicit path.
# NOTE(review): hard-coded absolute path — only valid on the machine it was
# written on; update it (or derive it from os.getcwd()) before running elsewhere.
os.chdir('/Users/simonpaske/Desktop/Project5-Mildew-Detection-in-Cherry-Leaves/')
print("You set a new current directory")

You set a new current directory


In [6]:
# Re-read the working directory to confirm the os.chdir() call took effect.
current_dir = os.getcwd()
current_dir

'/Users/simonpaske/Desktop/Project5-Mildew-Detection-in-Cherry-Leaves'

Run the cell below **to change the Kaggle configuration directory to the current working directory and set permissions for the Kaggle authentication JSON**.

In [7]:
# Point KAGGLE_CONFIG_DIR at the current working directory, which is where
# kaggle.json (the authentication token) is expected to be.
os.environ['KAGGLE_CONFIG_DIR'] = os.getcwd()

# Restrict kaggle.json to owner read/write only (mode 600).
!chmod 600 kaggle.json

Set the Kaggle Dataset destination directory

In [8]:
# Folder that holds the downloaded archive and its extracted contents.
DestinationFolder = "inputs/datasets/"
# exist_ok=True keeps this cell safe to re-run when the folder already exists.
os.makedirs(DestinationFolder, exist_ok=True)

Download the dataset from Kaggle and add it to the destination directory
https://www.kaggle.com/codeinstitute/cherry-leaves

Unzip the downloaded file, and delete the zip file.

In [9]:
import zipfile

# Build the archive path once with os.path.join. DestinationFolder already ends
# with a separator, so plain string concatenation produced a doubled slash
# ("inputs/datasets//cherry_leaves.zip") and the path was spelled out twice.
zip_path = os.path.join(DestinationFolder, 'cherry_leaves.zip')

# Extract the whole archive into the destination folder.
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(DestinationFolder)

# The archive is no longer needed once its contents are extracted.
os.remove(zip_path)

# Data Preparation

## Data cleaning

### Check and remove non-image files

In [10]:
def remove_non_image_files(data_dir):
    """Delete every non-image file from the immediate sub-folders of *data_dir*.

    A file counts as an image when its name ends with .png, .jpg or .jpeg
    (case-insensitive). Only regular files directly inside each sub-folder
    are considered; nested directories and files placed directly in
    *data_dir* are left untouched. For each sub-folder two lines are
    printed: the number of image files kept and the number of non-image
    files removed.
    """
    image_extensions = ('.png', '.jpg', '.jpeg')

    for folder in os.listdir(data_dir):
        folder_path = os.path.join(data_dir, folder)
        if not os.path.isdir(folder_path):
            continue

        kept = 0
        removed = 0

        for entry in os.listdir(folder_path):
            entry_path = os.path.join(folder_path, entry)

            # Sub-directories (and anything else that is not a file) are skipped.
            if not os.path.isfile(entry_path):
                continue

            if entry.lower().endswith(image_extensions):
                kept += 1
            else:
                os.remove(entry_path)
                removed += 1

        print(f"Folder: {folder} - has image files: {kept}")
        print(f"Folder: {folder} - has non-image files: {removed}")


# Clean the extracted dataset: each class sub-folder keeps only its
# .png/.jpg/.jpeg files, and the per-folder counts are printed.
data_dir = 'inputs/datasets/cherry-leaves'
remove_non_image_files(data_dir)

Folder: powdery_mildew - has image files: 2104
Folder: powdery_mildew - has non-image files: 0
Folder: healthy - has image files: 2104
Folder: healthy - has non-image files: 0


### Resize images

In [14]:
from PIL import Image
import os

data_dir = 'inputs/datasets/cherry-leaves'
labels = ['healthy', 'powdery_mildew']
# Only the first two entries are used below; index 0 is passed as the height
# and index 1 as the width.
target_shape = (100, 100, 3)
# PIL's resize() expects (width, height), hence the swapped indices.
target_size = (target_shape[1], target_shape[0])

for label in labels:
    label_path = os.path.join(data_dir, label)

    for img_file in os.listdir(label_path):
        img_path = os.path.join(label_path, img_file)

        # Open inside a context manager so the source file handle is released
        # deterministically before the same path is overwritten below.
        with Image.open(img_path) as img:
            img_resized = img.resize(target_size)

        # Save the resized image back to the original path, overwriting the original
        img_resized.save(img_path)

print("Image resizing and overwriting completed.")


Image resizing and overwriting completed.


In [15]:
from PIL import Image

# Re-open the last image processed by the resize loop (label_path and img_file
# still hold the values from its final iteration) to verify the new size.
img_path = os.path.join(label_path, img_file)

# The context manager closes the file handle as soon as the size is read.
with Image.open(img_path) as img:
    # Image.size is a (width, height) tuple
    img_size = img.size

print("Image size:", img_size)


Image size: (100, 100)


## Split into train, validation and test sets and delete empty folders

In [16]:
import os
import shutil
import random

def is_image_file(file_name):
    """Return True when *file_name* ends with .png, .jpg or .jpeg (case-insensitive)."""
    return file_name.lower().endswith(('.png', '.jpg', '.jpeg'))


def split_dataset(data_dir, output_path, train_size=0.7, valid_size=0.1, test_size=0.2, preserve_structure=True, seed=None):
    """Shuffle the images under *data_dir* and move them into train/validation/test folders.

    Every immediate sub-directory of *data_dir* is treated as one class. Its
    image files are pooled with those of the other classes, shuffled, and
    moved to ``<output_path>/<split>/<class_name>/`` where ``<split>`` is
    ``train``, ``validation`` or ``test``. Class folders left empty in
    *data_dir* are removed afterwards.

    Args:
        data_dir: Folder containing one sub-folder per class.
        output_path: Folder in which the split folders are created.
        train_size: Fraction of the images placed in ``train``.
        valid_size: Fraction of the images placed in ``validation``.
        test_size: Fraction of the images placed in ``test``; this split
            receives whatever remains after the other two are cut.
        preserve_structure: Accepted for backward compatibility; not used.
        seed: Optional seed for a reproducible shuffle. When ``None`` the
            module-level ``random`` generator is used.

    Returns:
        Three lists (train, validation, test) holding the paths the images
        had *before* they were moved.

    Raises:
        ValueError: If the three fractions do not sum to 1.
    """
    import math  # local import keeps this block self-contained

    # Compare with a tolerance: exact float equality rejects valid ratios
    # such as 0.7 + 0.2 + 0.1, which evaluates to 0.9999999999999999.
    if not math.isclose(train_size + valid_size + test_size, 1.0):
        raise ValueError("train_size, valid_size and test_size must sum to 1")

    # Class folders are the immediate sub-directories; stray files such as
    # .DS_Store are ignored. Sorting makes a seeded shuffle reproducible
    # regardless of the order os.listdir happens to return.
    classes = sorted(
        item for item in os.listdir(data_dir)
        if os.path.isdir(os.path.join(data_dir, item))
    )

    # Pair every image with the class folder it came from, so the label never
    # has to be guessed from the path text later on.
    labelled_images = []
    for class_name in classes:
        class_path = os.path.join(data_dir, class_name)
        for file_name in sorted(os.listdir(class_path)):
            if is_image_file(file_name):
                labelled_images.append((os.path.join(class_path, file_name), class_name))

    if seed is None:
        random.shuffle(labelled_images)
    else:
        random.Random(seed).shuffle(labelled_images)

    train_end_index = int(train_size * len(labelled_images))
    valid_end_index = int(valid_size * len(labelled_images)) + train_end_index

    splits = {
        'train': labelled_images[:train_end_index],
        'validation': labelled_images[train_end_index:valid_end_index],
        'test': labelled_images[valid_end_index:],
    }

    for split, members in splits.items():
        split_path = os.path.join(output_path, split)
        os.makedirs(split_path, exist_ok=True)

        for image_path, class_name in members:
            class_path = os.path.join(split_path, class_name)
            os.makedirs(class_path, exist_ok=True)
            shutil.move(image_path, os.path.join(class_path, os.path.basename(image_path)))

    # Remove class folders that are now empty
    for class_name in classes:
        class_path = os.path.join(data_dir, class_name)
        if not os.listdir(class_path):
            os.rmdir(class_path)

    train_images = [path for path, _ in splits['train']]
    valid_images = [path for path, _ in splits['validation']]
    test_images = [path for path, _ in splits['test']]
    return train_images, valid_images, test_images

# Split in place: output_path equals data_dir, so the train/validation/test
# folders are created inside the dataset folder itself and the original class
# folders are removed once they are empty.
data_dir = 'inputs/datasets/cherry-leaves'
output_path = 'inputs/datasets/cherry-leaves'
train_images, valid_images, test_images = split_dataset(data_dir, output_path)


In [17]:
# Report how many images ended up in each split.
print(
    'Train set:', len(train_images),
    '\ntest images:', len(test_images),
    '\nvalidation images:', len(valid_images),
)

Train set: 2945 
test images: 843 
validation images: 420
