## 01 - Data Collection

### Objective:

- Install and configure the Kaggle API
- Download and unzip the "Cherry Leaves" dataset
- Remove non-image files and split the images into train, validation, and test sets

In [None]:
# Install every package listed in the project's requirements.txt.
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install -r /workspaces/cherry-leaves-health/requirements.txt

Import packages

In [2]:
import numpy
import os

In [3]:
# Make the project root the working directory so that the relative paths used
# later in this notebook (kaggle.json, inputs/...) resolve against it.
os.chdir('/workspaces/cherry-leaves-health')
print("You set a new current directory")

You set a new current directory


In [4]:
# Read the working directory back to confirm the change took effect;
# the bare expression on the last line displays it as the cell output.
current_dir = os.getcwd()
current_dir

'/workspaces/cherry-leaves-health'

### Step 1: Install Kaggle and Set Up Environment

In [5]:
# Install the Kaggle API client, which provides the `kaggle` command used
# below to download the dataset.
%pip install kaggle

Note: you may need to restart the kernel to use updated packages.


### Step 2: Set File Permissions for kaggle.json

In [7]:
# Point the Kaggle client at the current working directory for its
# credentials file (kaggle.json is expected to sit in the project root).
os.environ['KAGGLE_CONFIG_DIR'] = os.getcwd()
# Restrict kaggle.json to owner read/write only, since it holds the API token.
! chmod 600 kaggle.json

### Step 3: Download Dataset from Kaggle

In [8]:
# Kaggle dataset identifier in "<owner>/<dataset>" form.
KaggleDatasetPath = "codeinstitute/cherry-leaves"
# Folder (relative to the working directory) that receives the downloaded archive.
DestinationFolder = "inputs/cherry_leaves_dataset"   
# {...} interpolates the Python variables above into the shell command.
! kaggle datasets download -d {KaggleDatasetPath} -p {DestinationFolder}

Dataset URL: https://www.kaggle.com/datasets/codeinstitute/cherry-leaves
License(s): unknown


### Step 4: Unzip Dataset

In [9]:
import zipfile

# Archive that the Kaggle download step wrote into DestinationFolder.
zip_path = f"{DestinationFolder}/cherry-leaves.zip"

# Unpack the whole archive next to where it was downloaded.
with zipfile.ZipFile(zip_path, mode='r') as archive:
    archive.extractall(path=DestinationFolder)

# The archive is no longer needed once its contents are extracted.
os.remove(zip_path)

## Data Preparation

### Data cleaning

### Check and remove non-image files

In [10]:
import os

# Absolute paths to the two class folders inside the extracted dataset:
# one for healthy leaves and one for leaves with powdery mildew.
healthy_dir = "/workspaces/cherry-leaves-health/inputs/cherry_leaves_dataset/cherry-leaves/healthy"
mildew_dir = "/workspaces/cherry-leaves-health/inputs/cherry_leaves_dataset/cherry-leaves/powdery_mildew"

def remove_non_images(directory):
    """Delete every file in ``directory`` that lacks an image extension.

    An entry is kept when its extension, compared case-insensitively, is one
    of ``.jpg``, ``.jpeg`` or ``.png``. Only the top level of ``directory`` is
    scanned, and subdirectories are left untouched.

    Args:
        directory: Path of the folder to clean.

    Returns:
        list[str]: Names (not full paths) of the files that were deleted, in
        the order ``os.listdir`` returned them.

    Raises:
        OSError: If ``directory`` cannot be listed or a file cannot be removed.
    """
    valid_ext = {".jpg", ".jpeg", ".png"}
    removed_files = []

    for filename in os.listdir(directory):
        file_path = os.path.join(directory, filename)

        # os.remove() raises on a directory (e.g. .ipynb_checkpoints), so real
        # directories are skipped instead of aborting the whole clean-up.
        if os.path.isdir(file_path) and not os.path.islink(file_path):
            continue

        ext = os.path.splitext(filename)[1].lower()  # Normalize extension to lowercase
        if ext not in valid_ext:
            os.remove(file_path)
            removed_files.append(filename)

    return removed_files

# Clean both folders; each call returns the names of the files it deleted
removed_healthy = remove_non_images(healthy_dir)
removed_mildew = remove_non_images(mildew_dir)

# Report what was deleted (an empty list means nothing was removed from that folder)
print("Data cleaning complete.")
print(f"Removed from healthy: {removed_healthy}")
print(f"Removed from mildew: {removed_mildew}")

Data cleaning complete.
Removed from healthy: []
Removed from mildew: []


## Split into train, validation, and test sets

In [11]:
import os
import shutil
import random
from pathlib import Path

# Define source folders (one per class)
healthy_src = Path("/workspaces/cherry-leaves-health/inputs/cherry_leaves_dataset/cherry-leaves/healthy")
mildew_src = Path("/workspaces/cherry-leaves-health/inputs/cherry_leaves_dataset/cherry-leaves/powdery_mildew")

# Define destination for split dataset; split_and_copy writes its files to
# <base_output>/<split>/<class>
base_output = Path("/workspaces/cherry-leaves-health/inputs/cherry_leaves_split")
# NOTE(review): `classes` is not referenced by the cells visible here - confirm it is used elsewhere
classes = ["healthy", "powdery_mildew"]

# Define split ratios (fractions of each class's images)
train_ratio = 0.7
val_ratio = 0.15
# NOTE(review): split_and_copy never reads test_ratio; the test split is whatever
# remains after the train and validation files have been taken.
test_ratio = 0.15

def split_and_copy(class_name, src_dir, seed=42):
    """Copy one class's images into train/val/test folders under ``base_output``.

    Image files (``.jpg``, ``.jpeg``, ``.png``, case-insensitive) found at the
    top level of ``src_dir`` are shuffled and partitioned using the
    module-level ``train_ratio`` and ``val_ratio``; the test split receives
    whatever remains. Files are copied to
    ``base_output/<split>/<class_name>/`` and the originals are left in place.

    The split is reproducible: the file list is sorted and then shuffled with
    a private RNG seeded by ``seed``, so re-running the cell assigns every
    image to the same split. Without this, a re-run would copy images into
    different splits while the earlier copies stayed behind, putting the same
    image in more than one split.

    Existing files under ``base_output`` are never removed, so delete a
    previously generated split before re-running with a different seed.

    Args:
        class_name: Name of the class sub-folder to create in each split.
        src_dir: Folder holding the class's images (``str`` or ``Path``).
        seed: Seed for the shuffle. ``None`` seeds from system entropy and
            gives a different split on each call.

    Returns:
        None. A one-line summary of the split sizes is printed.
    """
    src_dir = Path(src_dir)  # accept plain strings as well as Path objects

    # os.listdir order is arbitrary, so sort before shuffling to make the
    # seeded shuffle produce the same permutation on every machine and run.
    files = sorted(
        f for f in os.listdir(src_dir)
        if f.lower().endswith((".jpg", ".jpeg", ".png"))
    )

    if not files:
        print(f"No images found in {src_dir}. Skipping {class_name}.")
        return

    # A dedicated Random instance leaves the global random state untouched.
    random.Random(seed).shuffle(files)

    n_total = len(files)
    n_train = int(n_total * train_ratio)
    n_val = int(n_total * val_ratio)

    train_files = files[:n_train]
    val_files = files[n_train:n_train + n_val]
    test_files = files[n_train + n_val:]  # remainder, absorbs rounding losses

    for split_name, split_files in zip(["train", "val", "test"], [train_files, val_files, test_files]):
        split_dir = base_output / split_name / class_name
        split_dir.mkdir(parents=True, exist_ok=True)

        for file in split_files:
            src_file = src_dir / file
            dst_file = split_dir / file

            if not dst_file.exists():  # Skip files already copied by an earlier run
                shutil.copy(src_file, dst_file)

    print(f"{class_name}: {n_total} images → {n_train} train, {n_val} val, {len(test_files)} test")

# Split each class in turn, pairing the class label with its source folder
for label, folder in (("healthy", healthy_src), ("powdery_mildew", mildew_src)):
    split_and_copy(label, folder)

healthy: 2104 images → 1472 train, 315 val, 317 test
powdery_mildew: 2104 images → 1472 train, 315 val, 317 test
