# Image Dataset Preparation
This notebook contains the code for cleaning and preparing the dataset to be used for the project

Configurable Parameters:
- **root (str)**: Root directory of the project (commonly set to '..').
- **data_set_directory (str)**: Name of the directory containing all datasets (e.g., 'input').
- **raw_data_set_name (str)**: Name of the raw ZIP file or raw dataset folder (e.g., 'kaggle_raw_data').
- **expanded_folder_name (str)**: Name of the expanded folder after extracting raw ZIP data (e.g., 'Dog Emotion').
- **final_data_set (str)**: Folder name of the final processed and augmented dataset used for training.
- **google_drive_id (str)**: ID used for Google Drive downloads (if applicable).
- **log_file_name (str)**: Filename for saving preprocessing logs or debug information.
- **split_ratios (dict)**: Dictionary defining the ratios for splitting the dataset into 'train', 'test', and 'eval' subsets.
- **seed (int)**: Random seed that ensures reproducibility when shuffling or splitting the dataset.

In [None]:
# Control Panel: every tunable value used by this notebook is set here

# --- Project layout -------------------------------------------------------
root = ".."
data_set_directory = "input"
raw_data_set_name = "kaggle_raw_data"
expanded_folder_name = "Dog Emotion"
final_data_set = "final_split_15Apr2025"

# --- Download source and logging ------------------------------------------
google_drive_id = "15vCDXS-3GtNHxgL4EczcxMAGQDfVCYe5"
log_file_name = "Data_preprocessing_no_dogs_detected"

# --- Splitting ------------------------------------------------------------
split_ratios = dict(train=0.7, test=0.15, eval=0.15)
seed = 42

## Imports
Used to handle the imports

In [None]:
# Standard library
import os
import random
import shutil
import sys
import zipfile

# Third-party
from PIL import Image
import numpy as np
import gdown
from ultralytics import YOLO
import torch

# The project root has to be on sys.path BEFORE anything is imported from the
# local `utils` package; otherwise the imports below fail on a fresh kernel
# whose working directory is not the project root.
PROJECT_ROOT = os.path.abspath(os.path.join(os.getcwd(), root))
if PROJECT_ROOT not in sys.path:
    sys.path.insert(0, PROJECT_ROOT)

# Project-local helpers (resolved through PROJECT_ROOT added above)
from utils.DataHandler import download_dataset
from utils.helperFunctions import is_rgb

## File Location Variables

Global Variables Created:
- **RAW_IMAGE_ZIP_NAME (str)**: Path (without the `.zip` extension) of the ZIP file used when downloading the unprocessed dataset from Google Drive.


In [None]:
# Seed Python's global RNG with the value chosen in the Control Panel
random.seed(seed)

# Path of the raw archive, built without the ".zip" suffix
RAW_IMAGE_ZIP_NAME = os.path.join(
    root,
    data_set_directory,
    raw_data_set_name,
)
print(RAW_IMAGE_ZIP_NAME)

..\input\kaggle_raw_data
..\input\Dog Emotion
..\input\final_split_training_augmented
..\logs/Data_preprocessing_no_dogs_detected.txt


In [None]:
# Download the Kaggle dataset that is mirrored on Google Drive.
# The last argument is derived from the Control Panel values instead of a
# hard-coded "../input", so changing `root` or `data_set_directory` is enough.
download_dataset(
    expanded_folder_name,
    f"https://drive.google.com/uc?id={google_drive_id}",
    f"{RAW_IMAGE_ZIP_NAME}.zip",
    os.path.join(root, data_set_directory),
)

Dataset already exists at Dog Emotion


## Creating Dataset

In [None]:
def create_dataset(root_dir: str, model_location: str, model_name: str, data_root_dir: str, output_data_dir_name: str, raw_data_dir: str, log_file_dir: str, log_file_name: str, split_ratios: dict):
    """
    Build the final dataset: filter the raw images, crop each one to the first
    detected dog, resize to 224x224, then shuffle and split into train/test/eval.

    Nothing is done (a message is printed) when the raw data folder is missing
    or when the output folder already exists.

    Args:
        root_dir (str): Project root that every other directory argument is joined onto
        model_location (str): Directory of the detection model, relative to root_dir
        model_name (str): File name of the detection model used to locate the dog (our team used YOLOv8)
        data_root_dir (str): Directory holding the datasets, relative to root_dir
        output_data_dir_name (str): Folder name for the final dataset inside data_root_dir (created if missing)
        raw_data_dir (str): Folder name of the raw images inside data_root_dir, one sub-folder per class
        log_file_dir (str): Directory for log files, relative to root_dir (created if missing)
        log_file_name (str): Name of the log file, without the ".txt" extension
        split_ratios (dict): Proportions under the keys 'train' and 'test'; every
            image left over after those two slices goes to 'eval'

    Raises:
        KeyError: If split_ratios lacks the 'train' or 'test' key.
    """
    # Class id treated as "dog" in the detector output (COCO index used by the
    # pretrained YOLOv8 weights).
    dog_class_id = 16

    data_folder_root = os.path.join(root_dir, data_root_dir)
    input_data_folder = os.path.join(data_folder_root, raw_data_dir)
    if not os.path.exists(input_data_folder):
        print("Please run the code cell above to download and extract the dataset")
        return

    # An existing output folder is taken to mean the dataset was already built
    output_data_folder = os.path.join(data_folder_root, output_data_dir_name)
    if os.path.exists(output_data_folder):
        print(f"Dataset already exists at {output_data_folder}")
        return

    # Read the ratios up front so a malformed dict fails before the slow
    # detection pass instead of after it.
    train_ratio = split_ratios['train']
    test_ratio = split_ratios['test']

    # Loading the model which will be used to identify areas of interest and perform subject focusing
    print(F"Loading {model_name} model...")
    model = YOLO(os.path.join(root_dir, model_location, model_name))
    model.to("cuda" if torch.cuda.is_available() else "cpu")
    print(f"{model_name} loaded.\n")

    # The log directory may not exist yet; without it every log write would
    # raise and be swallowed by the per-image exception handler below.
    log_directory = os.path.join(root_dir, log_file_dir)
    os.makedirs(log_directory, exist_ok=True)
    log_file_location = os.path.join(log_directory, f"{log_file_name}.txt")

    split_counts = {'train': 0, 'test': 0, 'eval': 0}
    image_pool = []  # List of (cropped_img, class_label, original_filename)

    # Mode "w" starts each run with a clean log (replacing any previous one);
    # the file is opened once rather than once per skipped image.
    with open(log_file_location, "w", encoding="utf-8") as log_file:
        # os.listdir returns entries in arbitrary order; sorting makes the
        # seeded shuffle below give the same split on every machine.
        for subfolder in sorted(os.listdir(input_data_folder)):
            class_path = os.path.join(input_data_folder, subfolder)

            # Skips over non-folder items
            if not os.path.isdir(class_path):
                print(f"Skipping non-folder: {subfolder}")
                continue

            print(f"Processing class: {subfolder}")

            # Iterates through each image, finding the area of the subject and focusing there
            for filename in sorted(os.listdir(class_path)):
                if not filename.lower().endswith(('.jpg', '.jpeg', '.png')):
                    continue

                filepath = os.path.join(class_path, filename)

                try:
                    # Removing images that are black and white
                    if not is_rgb(filepath):
                        log_file.write(f"Grayscale or low-color image skipped: {filepath}\n")
                        print(f"Skipped grayscale/low-color image: {filename}")
                        continue

                    # convert() returns an independent in-memory image, so the
                    # file handle can be released immediately.
                    with Image.open(filepath) as raw_img:
                        img = raw_img.convert("RGB")
                    results = model(filepath, conf=0.15)[0]

                    # Only the first dog detection of each image is used
                    dog_box = next(
                        (box for box in results.boxes if int(box.cls) == dog_class_id),
                        None,
                    )
                    if dog_box is None:
                        log_file.write(f"{filepath}\n")
                        print(f"No dog detected in {filename}")
                        continue

                    x1, y1, x2, y2 = map(int, dog_box.xyxy[0])
                    cropped = img.crop((x1, y1, x2, y2)).resize((224, 224))
                    image_pool.append((cropped, subfolder, filename))
                    print(f"Cropped dog from {filename} [{subfolder}]")

                except Exception as e:
                    # Best effort: one unreadable image must not abort the whole run
                    print(f"Error processing {filepath}: {e}")

    # Split and save data according to proportions
    random.shuffle(image_pool)
    total = len(image_pool)
    n_train = int(total * train_ratio)
    n_test = int(total * test_ratio)

    splits = {
        'train': image_pool[:n_train],
        'test': image_pool[n_train:n_train + n_test],
        'eval': image_pool[n_train + n_test:]
    }

    for split, items in splits.items():
        for img, label, original_filename in items:
            save_dir = os.path.join(output_data_folder, split, label)
            os.makedirs(save_dir, exist_ok=True)
            img.save(os.path.join(save_dir, original_filename))
            split_counts[split] += 1

    # Summary
    print("\nAll-in-One Summary:")
    print(f"Total cropped dog images : {total}")
    for split in split_counts:
        print(f"{split.capitalize()} images            : {split_counts[split]}")
    print(f"Final dataset saved to: {output_data_folder}")
    print(f"No-dog log file saved at: {log_file_location}")

In [8]:
# Echo the configuration in use, then build the dataset (skipped if it already exists)
print(root, data_set_directory, expanded_folder_name, final_data_set, log_file_name)
create_dataset(
    root_dir=root,
    model_location="models",
    model_name="yolov8x.pt",
    data_root_dir=data_set_directory,
    output_data_dir_name=final_data_set,
    raw_data_dir=expanded_folder_name,
    log_file_dir="logs",
    log_file_name=log_file_name,
    split_ratios=split_ratios,
)

.. input Dog Emotion final_split_training_augmented Data_preprocessing_no_dogs_detected
Dataset already exists at ..\input\final_split_training_augmented
