# Data Collection Notebook

---

## Objectives

* Fetch data from Kaggle and save as raw data

## Inputs

* Kaggle JSON file - the authentication token.

## Outputs

* Generate Dataset: inputs/datasets/raw/cherry-leaves.

## Additional Comments

* No additional comments.



---

# Change working directory

We need to change the working directory from its current folder to its parent folder
* We access the current directory with os.getcwd()

In [1]:
import os
current_dir = os.getcwd()
current_dir

'/workspace/cherry-leaves/jupyter_notebooks'

We want to make the parent of the current directory the new current directory
* os.path.dirname() gets the parent directory
* os.chir() defines the new current directory

In [2]:
os.chdir(os.path.dirname(current_dir))
print("You set a new current directory")

You set a new current directory


Confirm the new current directory

In [3]:
current_dir = os.getcwd()
current_dir

'/workspace/cherry-leaves'

# Install Kaggle

In [4]:
%pip install kaggle==1.5.12

Collecting kaggle==1.5.12
  Downloading kaggle-1.5.12.tar.gz (58 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.0/59.0 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Collecting tqdm (from kaggle==1.5.12)
  Obtaining dependency information for tqdm from https://files.pythonhosted.org/packages/d0/30/dc54f88dd4a2b5dc8a0279bdd7270e735851848b762aeb1c1184ed1f6b14/tqdm-4.67.1-py3-none-any.whl.metadata
  Downloading tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.7/57.7 kB[0m [31m11.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting python-slugify (from kaggle==1.5.12)
  Obtaining dependency information for python-slugify from https://files.pythonhosted.org/packages/a4/62/02da182e544a51a5c3ccf4b03ab79df279f9c60c5e82d5e8bec7ca26ac11/python_slugify-8.0.4-py2.py3-none-any.whl.metadata
  Downloading python_slugify-8.0.4-py2.py3-none-any.whl.metadata (8.5 kB)


Change the Kaggle configuration directory to the current working directory and set permissions for the Kaggle authentication JSON.

In [5]:
os.environ['KAGGLE_CONFIG_DIR'] = os.getcwd()
! chmod 600 kaggle.json

chmod: cannot access 'kaggle.json': No such file or directory


Set the Kaggle Dataset and Download it.

In [8]:
KaggleDatasetPath = "codeinstitute/cherry-leaves"
DestinationFolder = "inputs/datasets/raw"
! kaggle datasets download -d {KaggleDatasetPath} -p {DestinationFolder}

Downloading cherry-leaves.zip to inputs/datasets/raw
100%|█████████████████████████████████████▉| 55.0M/55.0M [00:02<00:00, 33.3MB/s]
100%|██████████████████████████████████████| 55.0M/55.0M [00:02<00:00, 25.0MB/s]


Unzip the file and delete the zip folder.

In [9]:
import zipfile
with zipfile.ZipFile(DestinationFolder + '/cherry-leaves.zip', 'r') as zip_ref:
    zip_ref.extractall(DestinationFolder)

os.remove(DestinationFolder + '/cherry-leaves.zip')

---

# Data preparation

## Data cleaning

Check for and remove any non-image files in the folder.

In [10]:
def remove_non_image_file(my_data_dir):
    # Allowed image extensions
    image_extension = ('.png', '.jpg', '.jpeg')
    
    # List all items in the base directory
    folders = os.listdir(my_data_dir)
    
    for folder in folders:
        folder_path = os.path.join(my_data_dir, folder)
        
        # Ensure the item is a directory before processing
        if os.path.isdir(folder_path):
            files = os.listdir(folder_path)
            image_count = 0
            non_image_count = 0
            
            for given_file in files:
                file_path = os.path.join(folder_path, given_file)
                
                # Check if the file has a valid image extension
                if not given_file.lower().endswith(image_extension):
                    os.remove(file_path)  # Remove non-image file
                    non_image_count += 1
                else:
                    image_count += 1
            
            # Print results for each folder
            print(f"Folder: {folder} - has image files: {image_count}")
            print(f"Folder: {folder} - has non-image files: {non_image_count}")

In [11]:
remove_non_image_file(my_data_dir='inputs/datasets/raw/cherry-leaves/healthy')
remove_non_image_file(my_data_dir='inputs/datasets/raw/cherry-leaves/powdery_mildew')

Split train validation test set

In [12]:
import os
import shutil
import random
import joblib


def split_train_validation_test_images(my_data_dir, train_set_ratio, validation_set_ratio, test_set_ratio):
    if train_set_ratio + validation_set_ratio + test_set_ratio != 1.0:
        print("train_set_ratio + validation_set_ratio + test_set_ratio should sum to 1.0")
        return

    # Define labels explicitly
    labels = ["Healthy", "Powdery Mildew"]

    # Ensure train, validation, and test folders with class label subfolders exist
    for folder in ['train', 'validation', 'test']:
        for label in labels:
            os.makedirs(name=f"{my_data_dir}/{folder}/{label}", exist_ok=True)

    for label in labels:
        # Convert label to directory name for file operations (e.g., lowercase with underscores)
        label_dir_name = label.lower().replace(" ", "_")

        files = os.listdir(f"{my_data_dir}/{label_dir_name}")
        random.shuffle(files)

        train_set_files_qty = int(len(files) * train_set_ratio)
        validation_set_files_qty = int(len(files) * validation_set_ratio)

        count = 1
        for file_name in files:
            if count <= train_set_files_qty:
                # Move a given file to the train set
                shutil.move(f"{my_data_dir}/{label_dir_name}/{file_name}",
                            f"{my_data_dir}/train/{label}/{file_name}")

            elif count <= (train_set_files_qty + validation_set_files_qty):
                # Move a given file to the validation set
                shutil.move(f"{my_data_dir}/{label_dir_name}/{file_name}",
                            f"{my_data_dir}/validation/{label}/{file_name}")

            else:
                # Move a given file to the test set
                shutil.move(f"{my_data_dir}/{label_dir_name}/{file_name}",
                            f"{my_data_dir}/test/{label}/{file_name}")

            count += 1

        os.rmdir(f"{my_data_dir}/{label_dir_name}")


In [13]:
split_train_validation_test_images(my_data_dir="inputs/datasets/raw/cherry-leaves",
    train_set_ratio=0.7,
    validation_set_ratio=0.1,
    test_set_ratio=0.2
)

---

# Push files to Repo

In [14]:
import os
try:    
    os.makedirs(name='outputs/datasets/collection')
except Exception as e:
    print(e)
