In [None]:
# Mount Google Drive (Colab only) at /content/drive so the Drive-based
# input/output paths used later in this cell are reachable.
from google.colab import drive
drive.mount('/content/drive')

import os
import math
from tensorflow.keras.preprocessing.image import ImageDataGenerator, load_img, img_to_array

# ========== Configuration ==========
# Edit the two paths below so they point at your own Drive folders.

# Source dataset root; it must contain one subfolder per entry in `classes`
# ('malignant', 'benign' and 'normal').
input_base = "/content/drive/MyDrive/Dataset_BUSI_without_mask"

# Destination root for originals + augmented images; the per-class
# subfolders are created automatically further down.
output_base = "/content/drive/MyDrive/BUSI_aug"

# Class subfolder names, processed in this order.
classes = ["malignant", "benign", "normal"]

# Number of images each class should end up with (originals + augmented).
target_images = 2000

# ========== Augmentation Settings ==========
# Random geometric transforms applied when generating augmented samples.
# Tune the values to suit the ultrasound images being processed.
augmentation_settings = dict(
    rotation_range=20,        # random rotation of up to 20 degrees
    width_shift_range=0.1,    # horizontal shift of up to 10% of the width
    height_shift_range=0.1,   # vertical shift of up to 10% of the height
    # NOTE(review): Keras documents shear_range as an angle in degrees, so
    # 0.1 yields almost no visible shear -- confirm this is the intended value.
    shear_range=0.1,
    zoom_range=0.1,           # zoom in/out by up to 10%
    horizontal_flip=True,     # randomly mirror images left/right
    fill_mode='nearest',      # pad uncovered pixels with the nearest valid value
)
datagen = ImageDataGenerator(**augmentation_settings)

# ========== Create Output Directories ==========
# One subfolder per class under output_base. exist_ok=True makes re-running
# the cell safe and avoids the check-then-create race of a separate
# os.path.exists() test.
for cls in classes:
    os.makedirs(os.path.join(output_base, cls), exist_ok=True)

# ========== Process Each Class ==========
# For every class: copy the originals into the output folder, then write
# exactly enough augmented variants to bring the class up to `target_images`.
#
# Augmented files are written under deterministic names
# (aug_<image index>_<variant index>.png) rather than through
# datagen.flow(save_to_dir=...), whose file names contain a random integer:
# with thousands of files per class two names can collide and silently
# overwrite each other, leaving fewer files on disk than `count` reports.
# Deterministic names also mean that re-running the cell overwrites the same
# files instead of piling new ones on top of the previous run.

# Needed to turn augmented arrays back into images for saving.
from tensorflow.keras.preprocessing.image import array_to_img

IMAGE_EXTENSIONS = ('.png', '.jpg', '.jpeg', '.bmp')

for cls in classes:
    print(f"\nProcessing class: {cls}")
    input_dir = os.path.join(input_base, cls)
    output_dir = os.path.join(output_base, cls)

    # Sorted so the image index used in the output names (and which originals
    # receive the one extra variant) is reproducible between runs;
    # os.listdir() order is arbitrary.
    image_files = sorted(
        f for f in os.listdir(input_dir)
        if f.lower().endswith(IMAGE_EXTENSIONS)
    )

    # First, copy the original images to the output folder.
    # They are loaded through Keras and re-saved (rather than byte-copied),
    # as in the original code, so the stored copies use load_img's channel layout.
    count = 0
    for filename in image_files:
        img = load_img(os.path.join(input_dir, filename))
        img.save(os.path.join(output_dir, filename))
        count += 1

    print(f"  {count} original images copied.")

    # Nothing to augment from; this also guards the division below.
    if count == 0:
        print(f"  No images found in '{input_dir}'. Skipping augmentation for this class.")
        continue

    # Calculate how many augmented images are needed.
    required = target_images - count
    if required <= 0:
        print(f"  Already have {target_images} or more images. Skipping augmentation for this class.")
        continue

    n_orig = len(image_files)
    aug_per_image = math.ceil(required / n_orig)
    print(f"  Generating ~{aug_per_image} augmentations per original image to add at least {required} images.")

    # Spread the required variants evenly: every original gets `base_quota`
    # variants and the first `remainder` originals get one more, so the total
    # is exactly `required` and no original is left without augmentations
    # (unless fewer variants than originals are needed).
    base_quota, remainder = divmod(required, n_orig)

    for idx, filename in enumerate(image_files):
        quota = base_quota + (1 if idx < remainder else 0)
        if quota == 0:
            break  # quotas never increase with idx, so the rest are zero too

        x = img_to_array(load_img(os.path.join(input_dir, filename)))

        for variant in range(quota):
            # Same per-sample pipeline datagen.flow() applies: random
            # transform followed by the generator's standardization.
            augmented = datagen.standardize(datagen.random_transform(x))
            out_name = f"aug_{idx:05d}_{variant:03d}.png"
            array_to_img(augmented).save(os.path.join(output_dir, out_name))
            count += 1

    print(f"  Final count for class '{cls}': {count} images.")

print("\nData augmentation completed. Check your output folder in Google Drive.")


Mounted at /content/drive

Processing class: malignant
  210 original images copied.
  Generating ~9 augmentations per original image to add at least 1790 images.
  Final count for class 'malignant': 2000 images.

Processing class: benign
  447 original images copied.
  Generating ~4 augmentations per original image to add at least 1553 images.
  Final count for class 'benign': 2000 images.

Processing class: normal
  133 original images copied.
  Generating ~15 augmentations per original image to add at least 1867 images.
  Final count for class 'normal': 2000 images.

Data augmentation completed. Check your output folder in Google Drive.
