In [1]:
pip install tensorflow tqdm pillow scipy

Note: you may need to restart the kernel to use updated packages.


In [2]:
import os
import shutil 
import random
import logging
import scipy
import numpy as np
from tqdm import tqdm
from PIL import Image
from pathlib import Path
from tensorflow.keras.preprocessing.image import ImageDataGenerator, load_img, img_to_array


In [3]:
# Configure logging level and format
logging.basicConfig(
    level=logging.INFO,
    format= '%(asctime)s - %(levelname)s - %(message)s'
)

# **Analyze The Dataset :**

In [12]:
def analyze_dataset(dir):
    
    cls_counts={}
    img_ext=('.jpg', '.jpeg', '.png')
    
    if not os.path.exists(dir):
        logging.error(f"Directory [{dir}] can not found.")
        return None
    
    for cls_name in os.listdir(dir):
        cls_path = os.path.join(dir, cls_name)    
        
        if os.path.isdir(cls_path):
            try:
                img_files=[f for f in os.listdir(cls_path) if f.lower().endswith(img_ext)]
                cls_counts[cls_name]=len(img_files)
            except Exception as e:
                logging.error(f"Error occurred during get the fils from the directory [{dir}]: {e}")
                cls_counts[cls_name]=0
                
    total_imgs=sum(cls_counts.values())
    logging.info(f"Total Images : {total_imgs}")
    
    for cls_name, count in sorted(cls_counts.items()):
        percentage =(count/total_imgs*100) if total_imgs>0 else 0
        logging.info(f"[{cls_name}] : {count} Images, [{percentage:.1f}%]")
    
    return cls_counts

In [13]:
analyze_dataset(dir="F:/MobileNetV2/Dataset/Skin diseases data")

2025-06-11 13:43:06,998 - INFO - Total Images : 755
2025-06-11 13:43:06,999 - INFO - [Chickenpox] : 75 Images, [9.9%]
2025-06-11 13:43:07,000 - INFO - [Cowpox] : 66 Images, [8.7%]
2025-06-11 13:43:07,002 - INFO - [HFMD] : 161 Images, [21.3%]
2025-06-11 13:43:07,002 - INFO - [Healthy] : 114 Images, [15.1%]
2025-06-11 13:43:07,002 - INFO - [Measles] : 55 Images, [7.3%]
2025-06-11 13:43:07,002 - INFO - [Monkeypox] : 284 Images, [37.6%]


{'Chickenpox': 75,
 'Cowpox': 66,
 'Healthy': 114,
 'HFMD': 161,
 'Measles': 55,
 'Monkeypox': 284}

# **Balance The Classes Using Data Augmentation :**

In [6]:
def augmentation(input_dir, output_dir, target_images, save_format, random_seed):
   
    if not os.path.exists(input_dir):
        logging.error(f"Directory [{input_dir}] cannot be found.")
        return None    
    
    # For reproducibility
    np.random.seed(random_seed)
    img_ext = ('.jpg', '.jpeg', '.png')
    
    # Create data generator 
    data_gen= ImageDataGenerator(rotation_range=5,
                                 horizontal_flip=False,
                                 vertical_flip=False,
                                 zoom_range=0.1,
                                 brightness_range=[0.95, 1.05],
                                 width_shift_range=0.05,
                                 height_shift_range=0.05,
                                 fill_mode='nearest' # fill new pixels with nearest value                                   
    )
    
    try:
        img_files = [f for f in os.listdir(input_dir) if f.lower().endswith(img_ext)]
        current_images = len(img_files)
        logging.info(f"Found [{current_images}] images in [{input_dir}] path.")
    except Exception as e:
        logging.error(f"Error occurred during getting the fils from the directory [{input_dir}]: {e}")
        return None
    
    # Check if any images were found 
    if current_images == 0:
        logging.error(f"No images found in directory [{input_dir}].")
        return None
    
    # Check augmentation is needed or not 
    if current_images >= target_images:
        logging.warning(f"Current images [{current_images}] >= target [{target_images}], So no augmentation needed.")
        return 0
    
    # Create output directory 
    try:
        os.makedirs(output_dir, exist_ok=True)
        logging.info(f"Output directory created or verified : [{output_dir}].")
    except Exception as e:
        logging.error(f"Error occurred during create the directory [{output_dir}]: {e}")
        return None
        
    # Calculate images to generate
    imgs_to_gen = target_images - current_images
    logging.info(f"Need to generate [{imgs_to_gen}] additional images to balance the dataset.")
    
    # Calculate distribution of augmented images per original image 
    imgs_per_original = imgs_to_gen // current_images 
    remainder = imgs_to_gen % current_images
    
    logging.info(f"Generating [{imgs_per_original}] images per original, with [{remainder}] images getting one extra.")
    total_generated = 0
    
    # Process each image from directory
    for idx, img_name in enumerate(tqdm(img_files, desc="Augmenting images")):
        try:
            img_path = os.path.join(input_dir, img_name)
            
            # load and preprocess image
            img=load_img(img_path)
            img_array=img_to_array(img)
            img_array=img_array.reshape((1,) + img_array.shape)
            
            # calculate how many augmented images to generate for this original
            target_per_img = imgs_per_original + (1 if idx < remainder else 0)
            
            # skip if no images needed for this original
            if target_per_img == 0:
                continue
            
            #Generate augmented images
            generated_count = 0
            base_name=os.path.splitext(img_name)[0]
            
            # Create a generator and iterate through it
            aug_iter = data_gen.flow(
                img_array,batch_size=1,
                save_to_dir=output_dir,
                save_prefix=f'aug_{base_name}_',
                save_format=save_format
            ) 

            # Generate the required number of augmented images 
            for i in range(target_per_img):
                try:
                    next(aug_iter) # Generate one augmented image
                    generated_count += 1
                    total_generated += 1
                
                except StopIteration:
                    logging.warning(f"Generator exhausted for [{img_name}] after [{generated_count}] images.")
                    break
                
        except Exception as e:
            logging.error(f"Error occurred during processing [{img_name}]: {str(e)}")
            continue
        
    logging.info(f"Augmentation complete! Generated [{total_generated}] new images.")
    logging.info(f"Total images now [{current_images + total_generated}] new images.")  
    
    return total_generated  

In [7]:
# Augmentation multiple classes together 
def multi_aug_classes(input_dir, output_dir, class_targets, save_format, random_seed):
    
    for class_name, target_count in class_targets.items():
        input_path = os.path.join(input_dir, class_name)
        output_path = os.path.join(output_dir, class_name)
        
        logging.info(f"Processing class: [{class_name}]")
        
        try:
            augmentation(input_dir = input_path,
                         output_dir = output_path,
                         target_images = target_count,
                         save_format = save_format,
                         random_seed = random_seed)
            
            # Verify results (Save or Not)
            img_ext = ('.jpg', '.jpeg', '.png')
            if os.path.exists(output_path):
                final_count = len([f for f in os.listdir(output_path) if f.lower().endswith(img_ext)])
                logging.info(f"Verification: [{final_count}] files in output directory.")
            else:
                logging.warning(f"Output directory [{output_path}] does not exist for class [{class_name}]")
        
        except Exception as e:
            logging.error(f"Failed to process class: [{class_name}]: {str(e)}")
            continue    

In [20]:
# Target distribution for classes
# Since 'Monkeypox' class has 284 images 
class_targets = {'Chickenpox': 284,
                 'Cowpox': 284,
                 'Healthy': 284,
                 'HFMD': 284,
                 'Measles': 284,     
                }

In [21]:
# Call and run the multi class augmentation funtion
multi_aug_classes(input_dir='F:/MobileNetV2/Dataset/Skin diseases data',
                  output_dir='F:/MobileNetV2/Dataset/augmented images', 
                  class_targets=class_targets,
                  save_format='.jpg', 
                  random_seed=42)

2025-06-11 13:51:27,526 - INFO - Processing class: [Chickenpox]
2025-06-11 13:51:27,528 - INFO - Found [75] images in [F:/MobileNetV2/Dataset/Skin diseases data\Chickenpox] path.
2025-06-11 13:51:27,530 - INFO - Output directory created or verified : [F:/MobileNetV2/Dataset/augmented images\Chickenpox].
2025-06-11 13:51:27,530 - INFO - Need to generate [209] additional images to balance the dataset.
2025-06-11 13:51:27,530 - INFO - Generating [2] images per original, with [59] images getting one extra.
Augmenting images: 100%|██████████| 75/75 [00:03<00:00, 20.46it/s]
2025-06-11 13:51:31,195 - INFO - Augmentation complete! Generated [209] new images.
2025-06-11 13:51:31,195 - INFO - Total images now [284] new images.
2025-06-11 13:51:31,195 - INFO - Verification: [209] files in output directory.
2025-06-11 13:51:31,195 - INFO - Processing class: [Cowpox]
2025-06-11 13:51:31,195 - INFO - Found [66] images in [F:/MobileNetV2/Dataset/Skin diseases data\Cowpox] path.
2025-06-11 13:51:31,19

In [22]:
# Analyze dataset after agumentation 
analyze_dataset(dir="F:/MobileNetV2/Dataset/Skin diseases data")

2025-06-11 14:00:07,187 - INFO - Total Images : 1704
2025-06-11 14:00:07,187 - INFO - [Chickenpox] : 284 Images, [16.7%]
2025-06-11 14:00:07,187 - INFO - [Cowpox] : 284 Images, [16.7%]
2025-06-11 14:00:07,187 - INFO - [HFMD] : 284 Images, [16.7%]
2025-06-11 14:00:07,187 - INFO - [Healthy] : 284 Images, [16.7%]
2025-06-11 14:00:07,187 - INFO - [Measles] : 284 Images, [16.7%]
2025-06-11 14:00:07,187 - INFO - [Monkeypox] : 284 Images, [16.7%]


{'Chickenpox': 284,
 'Cowpox': 284,
 'Healthy': 284,
 'HFMD': 284,
 'Measles': 284,
 'Monkeypox': 284}

# **Split Raw Image Data Into Train & Vaild :**

In [23]:
def split_dataset(input_dir, output_dir, train_ratio=0.8, seed=42):
    if not input_dir or not os.path.exists(input_dir):
        logging.error("Input directory can not found.")
        return False
    else:
        logging.info("Input directory found sucessfully and continue the process.")
    
    random.seed(seed)
    input_dir=Path(input_dir)
    output_dir=Path(output_dir)
    total_files=0
    processed_files=0
    
    # Create the output directory
    output_dir.mkdir(parents=True, exist_ok=True)
    
    # Iterate through class directories
    for cls_dir in input_dir.iterdir():
        if cls_dir.is_dir():
            img_ext = ['*.jpg', '*.jpeg', '*.png']
            imgs=[]
            
            for ext in img_ext:
                imgs.extend(cls_dir.glob(ext))
                imgs.extend(cls_dir.glob(ext.upper())) # For upper case ext.
            
            if not imgs:
                logging.warning(f"No image files found in [{cls_dir.name}].")
                continue 
            
            logging.info(f"Processing class [{cls_dir.name}] with [{len(imgs)}] images.")
            total_files += len(imgs)

            # Shuffle images randomly
            random.shuffle(imgs)
            
            # calculate split index
            split_idx=int(len(imgs) * train_ratio)
            train_imgs=imgs[:split_idx]
            val_imgs=imgs[split_idx:]
            
            logging.info(f"Train - [{len(train_imgs)}] , Validation - [{len(val_imgs)}].")
            
            # Copy files to output directories
            for split, split_imgs in [("train", train_imgs), ("val", val_imgs)]:
                target_dir= output_dir / split / cls_dir.name
                target_dir.mkdir(parents=True, exist_ok=True)
                
                for img_path in split_imgs:
                    try:
                        shutil.copy(img_path, target_dir / img_path.name)
                        processed_files +=1
                    except Exception as e:
                        logging.error(f"Error occurred during copying [{img_path.name}]: {e}")
                        
    logging.info(f"Dataset split completed. Processed [{processed_files}/{total_files}] files.")
    
    return processed_files == total_files
                        

In [24]:
split_dataset(input_dir="F:/MobileNetV2/Dataset/Skin diseases data",
              output_dir="F:/MobileNetV2/Dataset/Skin diseases splited Dataset",
              train_ratio=0.8, 
              seed=42)

2025-06-11 14:01:00,037 - INFO - Input directory found sucessfully and continue the process.
2025-06-11 14:01:00,052 - INFO - Processing class [Chickenpox] with [568] images.
2025-06-11 14:01:00,052 - INFO - Train - [454] , Validation - [114].
2025-06-11 14:01:02,779 - INFO - Processing class [Cowpox] with [568] images.
2025-06-11 14:01:02,779 - INFO - Train - [454] , Validation - [114].
2025-06-11 14:01:05,485 - INFO - Processing class [Healthy] with [568] images.
2025-06-11 14:01:05,501 - INFO - Train - [454] , Validation - [114].
2025-06-11 14:01:07,797 - INFO - Processing class [HFMD] with [568] images.
2025-06-11 14:01:07,797 - INFO - Train - [454] , Validation - [114].
2025-06-11 14:01:09,681 - INFO - Processing class [Measles] with [568] images.
2025-06-11 14:01:09,681 - INFO - Train - [454] , Validation - [114].
2025-06-11 14:01:12,675 - INFO - Processing class [Monkeypox] with [568] images.
2025-06-11 14:01:12,676 - INFO - Train - [454] , Validation - [114].
2025-06-11 14:01:1

True