In [None]:
# This is the main notebook we use for augmentation: blurring, light-exposure changes, splitting the dataset, and writing copies of the augmented images.

In [1]:
import os
import random
import shutil
import tempfile
import zlib
from pathlib import Path

import albumentations as A
import numpy as np
from PIL import Image

def set_global_seed(seed):
    """Seed Python's ``random`` module and NumPy's legacy global RNG.

    Args:
        seed: Value passed unchanged to ``random.seed``. For NumPy, integer
            seeds are first reduced modulo ``2**32`` because
            ``np.random.seed`` rejects integers outside ``[0, 2**32 - 1]``;
            non-integer values (e.g. ``None``) are forwarded unchanged.
    """
    random.seed(seed)

    # np.random.seed raises ValueError for ints outside [0, 2**32 - 1], so
    # wrap integer seeds into that range instead of crashing.
    if isinstance(seed, (int, np.integer)):
        seed = int(seed) % 2**32
    np.random.seed(seed)

def _derive_aug_seed(seed, image_name, aug_index):
    """Return a reproducible seed in ``[0, 2**32 - 1]`` for one augmentation.

    The built-in ``hash`` is salted per interpreter process for strings, so it
    cannot be used for seeds that must survive a kernel restart. CRC32 of the
    (file name, augmentation index) pair is stable across runs.
    """
    key = f"{image_name}:{aug_index}".encode("utf-8")
    # Final modulo keeps the value inside the range np.random.seed accepts.
    return (seed + zlib.crc32(key) % 10_000_000) % 2**32


def create_augmented_dataset(
    source_dir,
    output_dir,
    split_ratios=(0.7, 0.20, 0.10),
    seed=42,
    augment_per_image=5,
    image_size=(224, 224)
):
    """Split a class-per-folder image dataset and write augmented copies.

    Each sub-directory of ``source_dir`` is treated as one class. Its files
    are shuffled and divided into train/val/test; the split is materialised
    as symlinks in a fresh temporary directory. Every image is then written
    to ``output_dir/<split>/<class>/`` as ``<stem>_orig.jpg`` (resized only)
    plus ``augment_per_image`` files named ``<stem>_aug<k>.jpg`` (motion blur,
    brightness/contrast, colour jitter, resize).

    Args:
        source_dir: Directory containing one sub-directory per class.
        output_dir: Root directory that receives the augmented dataset.
        split_ratios: ``(train, val, test)`` fractions. Only the first two
            are used for counting; the test split takes the remaining files.
        seed: Base seed for the split shuffle and the per-augmentation seeds.
        augment_per_image: Number of augmented copies written per image.
        image_size: ``(width, height)`` of every saved image.

    Returns:
        Path of the temporary directory holding the symlinked split. It is
        not removed here; the caller is responsible for deleting it.
    """
    set_global_seed(seed)

    source_dir = Path(source_dir)
    output_dir = Path(output_dir)
    tmp_dir = Path(tempfile.mkdtemp())
    print(f"Temporary split directory: {tmp_dir}")

    # Sorted so the shared RNG is consumed in the same class order on every
    # machine; directory listing order is filesystem-dependent.
    classes = sorted(d.name for d in source_dir.iterdir() if d.is_dir())
    splits = ["train", "val", "test"]

    rng = np.random.default_rng(seed)

    # Create symlinked split directory
    for cls in classes:
        # Sort before shuffling so a given seed always yields the same split,
        # and skip nested directories, which cannot be opened as images.
        images = sorted(p for p in (source_dir / cls).glob("*") if p.is_file())
        rng.shuffle(images)

        n = len(images)
        n_train = int(split_ratios[0] * n)
        n_val = int(split_ratios[1] * n)

        split_data = {
            "train": images[:n_train],
            "val": images[n_train:n_train + n_val],
            "test": images[n_train + n_val:]
        }

        for split in splits:
            split_path = tmp_dir / split / cls
            split_path.mkdir(parents=True, exist_ok=True)
            for img in split_data[split]:
                os.symlink(os.path.abspath(img), split_path / img.name)

    # The plain resize has no random component, so build it once.
    orig_resize = A.Resize(height=image_size[1], width=image_size[0])

    # Perform augmentation
    for split in splits:
        for cls in classes:
            split_path = tmp_dir / split / cls
            images = sorted(split_path.glob("*"))

            save_dir = output_dir / split / cls
            save_dir.mkdir(parents=True, exist_ok=True)

            for img_path in images:
                # Context manager closes the file handle once pixels are copied.
                with Image.open(img_path) as pil_img:
                    img = np.array(pil_img.convert("RGB"))
                base_name = img_path.stem

                # Save original resized
                orig_img = orig_resize(image=img)["image"]
                Image.fromarray(orig_img).save(save_dir / f"{base_name}_orig.jpg")

                for i in range(augment_per_image):
                    # Stable per-(image, augmentation) seed, valid for NumPy.
                    aug_seed = _derive_aug_seed(seed, img_path.name, i)
                    set_global_seed(aug_seed)

                    # Built after seeding, as in the original flow.
                    # NOTE(review): newer Albumentations releases may draw from
                    # a per-Compose RNG instead of the global state — confirm
                    # the installed version honours global seeding.
                    transform = A.Compose([
                        A.MotionBlur(blur_limit=(3, 15), p=1.0),
                        A.RandomBrightnessContrast(brightness_limit=0.4, contrast_limit=0.3, p=1.0),
                        A.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.1, p=1.0),
                        A.Resize(height=image_size[1], width=image_size[0])
                    ])

                    augmented = transform(image=img)
                    aug_img = augmented["image"]
                    Image.fromarray(aug_img).save(save_dir / f"{base_name}_aug{i+1}.jpg")

    print(f"Augmented dataset saved to: {output_dir}")
    print(f"Temporary symlink split in: {tmp_dir} (can delete manually after use)")
    return tmp_dir


  check_for_updates()


In [2]:
import random

# Root of the class-per-folder dataset; each run writes to "<root>_split<k>".
source_dir = "/home/esidserver/datasets/1-original-dataset"

# Build five independent augmented splits, each from its own derived seed.
for split_number in range(1, 6):
    base_seed = 12345 + split_number
    rng = random.Random(base_seed)
    seed = rng.randint(0, 2**32 - 1)

    tmp_dir = create_augmented_dataset(
        source_dir=source_dir,
        output_dir=source_dir + '_split' + str(split_number),
        split_ratios=(0.7, 0.2, 0.1),
        seed=seed,
        augment_per_image=5
    )

    # The returned directory only holds symlinks used while augmenting; the
    # real outputs live in output_dir, so remove it rather than leaving one
    # temporary tree behind per split.
    shutil.rmtree(tmp_dir, ignore_errors=True)


Temporary split directory: /tmp/tmpm35nti4u
Augmented dataset saved to: /home/esidserver/datasets/1-original-dataset_split1
Temporary symlink split in: /tmp/tmpm35nti4u (can delete manually after use)
Temporary split directory: /tmp/tmpul3qv7o3
Augmented dataset saved to: /home/esidserver/datasets/1-original-dataset_split2
Temporary symlink split in: /tmp/tmpul3qv7o3 (can delete manually after use)
Temporary split directory: /tmp/tmpp6deq8fp
Augmented dataset saved to: /home/esidserver/datasets/1-original-dataset_split3
Temporary symlink split in: /tmp/tmpp6deq8fp (can delete manually after use)
Temporary split directory: /tmp/tmpnw698sw7
Augmented dataset saved to: /home/esidserver/datasets/1-original-dataset_split4
Temporary symlink split in: /tmp/tmpnw698sw7 (can delete manually after use)
Temporary split directory: /tmp/tmpjthzw20s
Augmented dataset saved to: /home/esidserver/datasets/1-original-dataset_split5
Temporary symlink split in: /tmp/tmpjthzw20s (can delete manually after 