# Image Dataset Preparation
This notebook contains the code for cleaning and preparing the dataset used by the project.

## Imports
Modules required by the setup, download, and filtering cells below.

In [7]:
import os
from PIL import Image
import shutil
import numpy as np
import gdown
import zipfile

from utils.helperFunctions import is_rgb

ModuleNotFoundError: No module named 'gdown'

## File Location Variables

In [2]:
# Folder name of the raw input images; joined onto `root` to build RAW_IMAGE_INPUT_DIR.
data_set_directory = "inputs"
# Folder name of the working area; joined onto `root` to build IMAGE_PROCESSING_SUBFOLDER.
data_set_creation_working_dir = "imageMaker"
# Base path that the two folder names above are resolved against.
root = ".."
# NOTE(review): the two names below are not referenced by any cell visible in this
# notebook — presumably dataset folder names; confirm they are still needed.
raw_data_folder = "dog_emotion"
processed_data_folder = "dog_emotion_rgb"

In [5]:
# Resolve the raw-input and working directories against `root`.
RAW_IMAGE_INPUT_DIR = os.path.join(root, data_set_directory)
IMAGE_PROCESSING_SUBFOLDER = os.path.join(root, data_set_creation_working_dir)

# Show both resolved paths, one per line.
print(RAW_IMAGE_INPUT_DIR, IMAGE_PROCESSING_SUBFOLDER, sep="\n")

# Make sure the working directory exists before later cells copy files into it.
os.makedirs(IMAGE_PROCESSING_SUBFOLDER, exist_ok=True)


..\inputs
..\imageMaker


In [None]:
## Downloading dataset from Gdrive
# Fetch and unpack the split archive only when the extracted folder is absent,
# so re-running the cell does not download it again.
# NOTE(review): this cell works under "../input" while RAW_IMAGE_INPUT_DIR resolves
# to "../inputs" — confirm which location is intended.
if not os.path.exists("../input/final_split"):
    # The parent directory must exist before the archive can be written into it.
    os.makedirs("../input", exist_ok=True)
    output_path = "../input/final_split.zip"
    downloaded_path = gdown.download(
        "https://drive.google.com/uc?id=11t8m703wcNss3w5diJSUGBA_vXnCKChr",
        output_path,
        quiet=False,
    )
    # Stop with a clear message instead of failing later on a missing zip file.
    if downloaded_path is None:
        raise RuntimeError(f"Download failed, nothing was written to {output_path}")
    with zipfile.ZipFile(output_path, 'r') as zip_ref:
        zip_ref.extractall("../input")


## Step 1: Remove Black & White Images
Filters out grayscale images to improve training quality.

In [6]:
# Counters
total_images = 0
bw_images = 0
copied_images = 0
skipped_images = 0

# Scan and filter images.
# The walk variable is called `dirpath` rather than `root` so that it does not
# overwrite the `root` path setting defined in the configuration cell.
for dirpath, _, files in os.walk(RAW_IMAGE_INPUT_DIR):
    for filename in files:
        if not filename.lower().endswith(('.jpg', '.jpeg', '.png')):
            continue

        total_images += 1
        path = os.path.join(dirpath, filename)
        try:
            if is_rgb(path):
                # Preserve subfolder structure
                relative_path = os.path.relpath(path, RAW_IMAGE_INPUT_DIR)
                destination = os.path.join(IMAGE_PROCESSING_SUBFOLDER, relative_path)
                os.makedirs(os.path.dirname(destination), exist_ok=True)
                shutil.copy(path, destination)
                copied_images += 1
            else:
                print(f"Removed (Not RGB): {filename}")
                bw_images += 1
        except OSError as error:
            # Unreadable or uncopyable files are counted as skipped so that one bad
            # file neither aborts the scan nor disappears from the summary.
            print(f"⚠️ Skipped (could not read or copy): {path}: {error}")
            skipped_images += 1

# Summary
print("\n📊 Preprocessing Summary:")
print(f"Total images scanned      : {total_images}")
print(f"Black & white images      : {bw_images}")
print(f"Images removed            : {bw_images + skipped_images}")
print(f"Images successfully kept  : {copied_images}")
print(f"Images skipped (corrupted): {skipped_images}")
print(f"✅ RGB images saved in: {IMAGE_PROCESSING_SUBFOLDER}")



📊 Preprocessing Summary:
Total images scanned      : 0
Black & white images      : 0
Images removed            : 0
Images successfully kept  : 0
Images skipped (corrupted): 0
✅ RGB images saved in: ..\imageMaker


## Step 2: Detect, Crop, and Resize Dog Faces
Uses YOLOv8 to detect dogs (class 16), crops each image to the first dog bounding box found, and resizes the crop to 224x224.

In [None]:
import os
from PIL import Image
from ultralytics import YOLO
import torch

# Directories
input_dir = "datasets/dog_emotion_rgb"
output_dir = "datasets/dog_faces_224"
no_detect_log = "no_dogs_detected.txt"

# Load YOLOv8 model and place it on the GPU when one is available
print("📦 Loading YOLOv8 model...")
model = YOLO("yolov8x.pt")
model.to("cuda" if torch.cuda.is_available() else "cpu")
print("✅ YOLOv8 loaded.\n")

# Counters (total == detected + no_dog + errors)
total = 0
detected = 0
no_dog = 0
errors = 0

# Clear old log if exists, because the log is opened in append mode below
if os.path.exists(no_detect_log):
    os.remove(no_detect_log)

# Start processing.
# The walk variable is called `dirpath` rather than `root` so that it does not
# overwrite the `root` path setting defined in the configuration cell.
for dirpath, _, files in os.walk(input_dir):
    for file in files:
        if not file.lower().endswith((".jpg", ".jpeg", ".png")):
            continue

        total += 1
        img_path = os.path.join(dirpath, file)
        # Mirror the input subfolder structure under output_dir
        relative_path = os.path.relpath(img_path, input_dir)
        save_path = os.path.join(output_dir, relative_path)

        print(f"📷 Processing: {img_path}")

        try:
            results = model(img_path, conf=0.15)  # lowered threshold
            result = results[0]
            dog_found = False

            for box in result.boxes:
                class_id = int(box.cls)
                if class_id == 16:  # class 16 = dog
                    x1, y1, x2, y2 = map(int, box.xyxy[0])
                    os.makedirs(os.path.dirname(save_path), exist_ok=True)

                    # Context manager releases the source file handle deterministically
                    with Image.open(img_path) as source:
                        cropped = source.convert("RGB").crop((x1, y1, x2, y2)).resize((224, 224))
                    cropped.save(save_path)

                    print(f"✅ Saved cropped dog to {save_path}")
                    detected += 1
                    dog_found = True
                    break  # only the first dog box is kept per image

            if not dog_found:
                with open(no_detect_log, "a") as f:
                    f.write(f"{img_path}\n")
                print("🐶 No dog class (16) detected.")
                no_dog += 1

        except Exception as e:
            # Best-effort: report and count the failure, then continue with the next image
            print(f"⚠️ Error processing {img_path}: {e}")
            errors += 1

# Summary
print("\n📊 Step 2 Summary:")
print(f"Total images processed : {total}")
print(f"Dog detections         : {detected}")
print(f"No detections          : {no_dog}")
print(f"Errors                 : {errors}")
print(f"Saved to folder        : {output_dir}")
print(f"No-dog image log       : {no_detect_log}")


## Step 3: Split Dataset
Splits the cleaned dataset into 70% training, 15% validation (test), and 15% final evaluation.

In [None]:
import os
import shutil
import random

input_folder = 'datasets/dog_faces_224/Dog Emotion'
output_base = 'datasets/final_split'

# Splits
split_ratios = {
    'train': 0.7,
    'test': 0.15,
    'eval': 0.15
}

# Seed for reproducibility
random.seed(42)

# Validate the input before deleting anything: otherwise a missing input folder
# would wipe the previous split and only then fail.
if not os.path.isdir(input_folder):
    raise FileNotFoundError(f"Input folder not found: {input_folder}")

# Clear and recreate output folders
for split in split_ratios:
    split_path = os.path.join(output_base, split)
    if os.path.exists(split_path):
        shutil.rmtree(split_path)
    os.makedirs(split_path)

total_images = 0
split_counts = {'train': 0, 'test': 0, 'eval': 0}

# For each class folder.
# os.listdir returns entries in arbitrary order, so both the folder list and the
# image list are sorted; without that the seeded shuffle is not reproducible.
for class_folder in sorted(os.listdir(input_folder)):
    class_path = os.path.join(input_folder, class_folder)
    if not os.path.isdir(class_path):
        continue
    print(f"📂 Found folder: {class_folder}")

    images = sorted(
        img for img in os.listdir(class_path)
        if img.lower().endswith(('.jpg', '.jpeg', '.png'))
    )
    print(f"🖼️  Found {len(images)} images in {class_folder}")

    random.shuffle(images)

    # Train and test sizes are rounded down; eval receives whatever remains
    n = len(images)
    n_train = int(n * split_ratios['train'])
    n_test = int(n * split_ratios['test'])

    split_lists = {
        'train': images[:n_train],
        'test': images[n_train:n_train + n_test],
        'eval': images[n_train + n_test:]
    }

    for split, split_imgs in split_lists.items():
        dst_dir = os.path.join(output_base, split, class_folder)
        for img_name in split_imgs:
            src = os.path.join(class_path, img_name)
            os.makedirs(dst_dir, exist_ok=True)
            shutil.copy(src, os.path.join(dst_dir, img_name))
            split_counts[split] += 1

    total_images += n

# Summary
print("\n📊 Step 3 Summary:")
print(f"Total images processed : {total_images}")
for split in split_ratios:
    print(f"{split.capitalize()} images        : {split_counts[split]}")
print(f"✅ Data split saved in: {output_base}")


## Step 4: Augment Training Data
Applies horizontal flip, rotation, and color jitter to boost training diversity.

In [None]:
import os
from PIL import Image
from torchvision import transforms
import random
import torch

# Input/output folders
input_folder = 'datasets/final_split/train'
output_folder = 'datasets/final_split/train_aug_15apr2025'

# Seed torch's default generator so the random flips, rotations and colour jitter
# come out the same on every run.
torch.manual_seed(42)

# Define augmentation transforms
augmentation_pipeline = transforms.Compose([
    transforms.RandomHorizontalFlip(p=0.5),
    transforms.RandomRotation(degrees=10),
    transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.05),
    transforms.Resize((224, 224))
])

# Plain resize applied to the un-augmented copy of every image
resize_only = transforms.Resize((224, 224))

# Make sure output folder exists
os.makedirs(output_folder, exist_ok=True)

# Set number of augmentations per image
NUM_AUGMENTATIONS = 2

total_images = 0
augmented_images = 0

# The walk variable is called `dirpath` rather than `root` so that it does not
# overwrite the `root` path setting defined in the configuration cell.
for dirpath, _, files in os.walk(input_folder):
    # Reconstruct class subfolder path (identical for every file in this directory)
    relative_path = os.path.relpath(dirpath, input_folder)
    class_output_dir = os.path.join(output_folder, relative_path)

    for file in files:
        if not file.lower().endswith(('.jpg', '.jpeg', '.png')):
            continue

        total_images += 1
        input_path = os.path.join(dirpath, file)

        # Load image; the context manager releases the file handle deterministically
        try:
            with Image.open(input_path) as source:
                img = source.convert('RGB')
        except Exception as e:
            print(f"⚠️ Skipping {input_path} due to error: {e}")
            continue

        os.makedirs(class_output_dir, exist_ok=True)

        # Save original (resized) image
        original_resized = resize_only(img)
        original_resized.save(os.path.join(class_output_dir, file))

        # Generate augmented images
        for i in range(NUM_AUGMENTATIONS):
            augmented = augmentation_pipeline(img)
            aug_filename = f"{os.path.splitext(file)[0]}_aug{i+1}.jpg"
            augmented.save(os.path.join(class_output_dir, aug_filename))
            augmented_images += 1

print("\n📊 Step 4 Summary:")
print(f"Total original images     : {total_images}")
print(f"Augmented images created  : {augmented_images}")
print(f"✅ Augmented data saved in: {output_folder}")
