In [1]:
import os
import shutil
import cv2
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from tqdm import tqdm

In [2]:

# Paths
input_image_dir = "../artifacts/rendered_images"  # Original images
mask_dir = "../artifacts/refined_masks"  # Processed masks
output_dataset_dir = "../artifacts/output_dataset_dir"  # Final dataset

# Split configuration. Fractions are expressed relative to the WHOLE dataset.
VAL_FRACTION = 0.20
TEST_FRACTION = 0.10
SPLIT_SEED = 42

# Ensure dataset structure: <output>/<split>/{images,masks}
split_dirs = ['train', 'val', 'test']
for split in split_dirs:
    os.makedirs(os.path.join(output_dataset_dir, split, "images"), exist_ok=True)
    os.makedirs(os.path.join(output_dataset_dir, split, "masks"), exist_ok=True)

# Load all filenames. os.listdir() returns entries in arbitrary order, so sort
# them: otherwise the fixed seed below does not give the same split on every run/machine.
image_filenames = sorted(f for f in os.listdir(input_image_dir) if f.endswith((".png", ".jpg")))

# Keep only images that have a mask with the same filename in mask_dir
image_filenames = [f for f in image_filenames if os.path.exists(os.path.join(mask_dir, f))]

# Each of the three splits needs at least one sample; fail early with a clear message
# instead of an opaque error from inside train_test_split.
n_total = len(image_filenames)
if n_total < 3:
    raise ValueError(
        f"Need at least 3 image/mask pairs to build train/val/test splits, "
        f"found {n_total} in {input_image_dir!r} with masks in {mask_dir!r}"
    )

# Train-Validation-Test Split (70-20-10)
# Absolute sample counts are passed to train_test_split. Taking 20% of the 90% left
# after the test split would give a 72-18-10 split rather than the intended 70-20-10.
n_test = max(1, round(n_total * TEST_FRACTION))
n_val = max(1, round(n_total * VAL_FRACTION))

train_val_files, test_files = train_test_split(image_filenames, test_size=n_test, random_state=SPLIT_SEED)
train_files, val_files = train_test_split(train_val_files, test_size=n_val, random_state=SPLIT_SEED)

# Resize dimensions (output images and masks are IMG_SIZE x IMG_SIZE)
IMG_SIZE = 256

def process_and_save(files, split):
    """Resize image/mask pairs to IMG_SIZE x IMG_SIZE and write them into a split folder.

    For every filename, the image is read from ``input_image_dir`` and the mask
    (same filename) from ``mask_dir``. Both are resized and written to
    ``<output_dataset_dir>/<split>/images`` and ``<output_dataset_dir>/<split>/masks``
    under the original filename. Images are stored as 8-bit files; no value
    scaling is baked into the saved output.

    Args:
        files: Iterable of filenames present in both ``input_image_dir`` and ``mask_dir``.
        split: Name of the destination split sub-directory (e.g. "train", "val", "test").

    Raises:
        IOError: If an image or mask cannot be read, or an output file cannot be written.
    """
    target_size = (IMG_SIZE, IMG_SIZE)

    for file in tqdm(files, desc=f"Processing {split} data"):
        img_path = os.path.join(input_image_dir, file)
        mask_path = os.path.join(mask_dir, file)

        # Load image (default colour mode) and mask (single-channel grayscale)
        img = cv2.imread(img_path)
        mask = cv2.imread(mask_path, cv2.IMREAD_GRAYSCALE)

        # cv2.imread signals failure by returning None rather than raising
        if img is None:
            raise IOError(f"Could not read image: {img_path}")
        if mask is None:
            raise IOError(f"Could not read mask: {mask_path}")

        # Resize. Masks use nearest-neighbour so only pixel values already present
        # in the source mask appear in the output; the default bilinear
        # interpolation would blend 0/255 into intermediate gray levels at edges.
        img_resized = cv2.resize(img, target_size)
        mask_resized = cv2.resize(mask, target_size, interpolation=cv2.INTER_NEAREST)
        mask_resized = mask_resized.astype(np.uint8)

        # Save processed images & masks
        img_output_path = os.path.join(output_dataset_dir, split, "images", file)
        mask_output_path = os.path.join(output_dataset_dir, split, "masks", file)

        # The image is written as-is (already uint8). The previous
        # float32 /255 -> *255 -> astype(uint8) round trip changed nothing at best
        # and could lose one intensity level to truncation at worst.
        # cv2.imwrite reports failure via its return value, so check it.
        if not cv2.imwrite(img_output_path, img_resized):
            raise IOError(f"Could not write image: {img_output_path}")
        if not cv2.imwrite(mask_output_path, mask_resized):
            raise IOError(f"Could not write mask: {mask_output_path}")

# Run the preprocessing once per split, in train -> val -> test order
splits_to_process = (
    ("train", train_files),
    ("val", val_files),
    ("test", test_files),
)
for split_name, split_files in splits_to_process:
    process_and_save(split_files, split_name)

# Report where the structured dataset was written
print("✅ Dataset preparation complete. Structured data saved in:", output_dataset_dir)


Processing train data: 100%|██████████| 7/7 [00:00<00:00, 101.51it/s]
Processing val data: 100%|██████████| 2/2 [00:00<00:00, 140.29it/s]
Processing test data: 100%|██████████| 1/1 [00:00<00:00, 164.66it/s]

✅ Dataset preparation complete. Structured data saved in: ../artifacts/output_dataset_dir



