In [None]:
# !pip install scikit-learn

In [None]:
import os
import glob
from sklearn.model_selection import train_test_split

In [7]:
# Constants
# Output root: main() creates this directory and passes it to
# split_and_save_data(), which writes the "splits" subfolder into it.
BASE_DIR = "./data"
# Dataset root handed to get_image_and_mask_paths(), which scans its
# "patches" subfolder for image/mask pairs.
PATCHES_DIR = "./output_batches"

def create_directory(path):
    """
    Ensure a directory exists at the given location.

    Missing intermediate directories are created as well; a directory
    that is already present is left as it is.

    Args:
        path (str): Location of the directory to create.
    """
    os.makedirs(name=path, exist_ok=True)

def get_image_and_mask_paths(dataset_path):
    """
    Collects image patches together with their matching mask files.

    Every immediate subdirectory of ``<dataset_path>/patches`` is scanned for
    ``*.tif`` files. A file ``<stem>.tif`` is treated as an image and kept only
    when a sibling ``<stem>_cl.tif`` exists; files whose own name ends in
    ``_cl.tif`` are masks and are never returned as images.

    Subdirectories and files are visited in sorted order, so the returned
    lists are identical across runs and platforms (``os.listdir`` gives no
    ordering guarantee). This keeps a seeded split of the result reproducible.

    Args:
        dataset_path (str): Root directory containing the "patches" folder.

    Returns:
        tuple: ``(image_paths, mask_paths)`` - two lists of equal length where
        ``mask_paths[i]`` is the mask belonging to ``image_paths[i]``.

    Raises:
        FileNotFoundError: If ``<dataset_path>/patches`` does not exist.
    """
    patches_dir = os.path.join(dataset_path, "patches")
    print(f"🔍 Searching for patches in: {os.path.abspath(patches_dir)}")

    image_paths = []
    mask_paths = []

    for subfolder in sorted(os.listdir(patches_dir)):
        subfolder_path = os.path.join(patches_dir, subfolder)
        if not os.path.isdir(subfolder_path):
            continue

        # Escape the directory part so folder names containing glob
        # metacharacters ("[", "]", "*", "?") are matched literally.
        pattern = os.path.join(glob.escape(subfolder_path), "*.tif")
        for img_path in sorted(glob.glob(pattern)):
            # Work on the file stem only: a ".tif" or "_cl.tif" fragment
            # inside a directory name must not influence the pairing.
            stem, _ = os.path.splitext(img_path)
            if stem.endswith("_cl"):
                continue
            mask_path = stem + "_cl.tif"
            if os.path.exists(mask_path):
                image_paths.append(img_path)
                mask_paths.append(mask_path)

    return image_paths, mask_paths

def split_and_save_data(image_paths, mask_paths, output_dir):
    """
    Splits data into training, validation, and test sets, and saves paths to text files.

    The pairs are split 80% / 10% / 10% with a fixed random seed. Six text
    files are written to ``<output_dir>/splits`` (``train_X.txt``,
    ``val_X.txt``, ``test_X.txt``, ``train_masks.txt``, ``val_masks.txt``,
    ``test_masks.txt``), each holding one absolute path per line, UTF-8
    encoded.

    Args:
        image_paths (list): List of image file paths.
        mask_paths (list): List of corresponding mask file paths, aligned
            index-by-index with ``image_paths``.
        output_dir (str): Directory where split path files will be saved.

    Raises:
        ValueError: If the two lists differ in length or are empty. Nothing
            is written in that case. NOTE(review): very small datasets may
            still be rejected by ``train_test_split`` itself - confirm the
            minimum size you need to support.
    """
    # Fail early with a clear message instead of an opaque error from the
    # splitter, and before any directory or file is created.
    if len(image_paths) != len(mask_paths):
        raise ValueError(
            f"image_paths and mask_paths must have the same length "
            f"(got {len(image_paths)} and {len(mask_paths)})."
        )
    if len(image_paths) == 0:
        raise ValueError("No image/mask pairs to split: image_paths is empty.")

    # Split into 80% train and 20% temp (val + test)
    train_imgs, temp_imgs, train_masks, temp_masks = train_test_split(
        image_paths, mask_paths, test_size=0.2, random_state=42
    )
    # Split temp into 50% val and 50% test (10% each of total)
    val_imgs, test_imgs, val_masks, test_masks = train_test_split(
        temp_imgs, temp_masks, test_size=0.5, random_state=42
    )

    print(f"📊 Dataset Split - Train: {len(train_imgs)}, Val: {len(val_imgs)}, Test: {len(test_imgs)}")

    # Save split data
    splits_dir = os.path.join(output_dir, "splits")
    os.makedirs(splits_dir, exist_ok=True)

    def save_paths(file_path, paths):
        # Explicit UTF-8 so paths with non-ASCII characters are written the
        # same way regardless of the platform's default encoding.
        with open(file_path, "w", encoding="utf-8") as f:
            f.writelines(os.path.abspath(path) + "\n" for path in paths)

    # File name -> paths; image lists first, then the matching mask lists.
    outputs = (
        ("train_X.txt", train_imgs),
        ("val_X.txt", val_imgs),
        ("test_X.txt", test_imgs),
        ("train_masks.txt", train_masks),
        ("val_masks.txt", val_masks),
        ("test_masks.txt", test_masks),
    )
    for file_name, paths in outputs:
        save_paths(os.path.join(splits_dir, file_name), paths)

    print("✅ Data paths successfully saved!")

def main():
    """
    Entry point: create the output directory, discover image/mask pairs
    under PATCHES_DIR and write the train/val/test path lists to BASE_DIR.
    """
    create_directory(BASE_DIR)
    images, masks = get_image_and_mask_paths(PATCHES_DIR)
    split_and_save_data(
        image_paths=images,
        mask_paths=masks,
        output_dir=BASE_DIR,
    )

# Run the pipeline only when this file is executed directly, not on import.
if __name__ == "__main__":
    main()

Looking for patches in directory: /Users/nouri/repos/my-projects/plastic_detection/TriesteItalyChapter_PlasticDebrisDetection/task3-cleaning-and-preprocessing/notebooks/S2_L1C_SAFE_ACOLITE_complete_pipeline/output_batches/patches
📊 Dataset Split: Train=1221, Val=153, Test=153
✅ Successfully split and saved data paths!
