In [3]:
import tensorflow as tf
from tensorflow.keras.preprocessing.image import ImageDataGenerator
import cv2
import imutils
import matplotlib.pyplot as plt
from os import listdir
import time    

%matplotlib inline

In [4]:
# Nicely formatted time string
def hms_string(sec_elapsed):
    """Format a duration given in seconds as an ``h:m:s`` string.

    Hours and minutes are truncated to whole numbers and printed without
    zero padding; the seconds component is rounded to one decimal place.

    Arguments:
        sec_elapsed: Elapsed time in seconds (int or float).

    Returns:
        A string such as ``"1:1:1.5"``.
    """
    seconds_per_hour = 60 * 60
    whole_hours = int(sec_elapsed / seconds_per_hour)
    whole_minutes = int((sec_elapsed % seconds_per_hour) / 60)
    leftover_seconds = round(sec_elapsed % 60, 1)
    return "{}:{}:{}".format(whole_hours, whole_minutes, leftover_seconds)

In [5]:
def augment_data(file_dir, n_generated_samples, save_to_dir):
    """
    Write randomly augmented copies of every image found in a directory.

    Each readable image in `file_dir` is run through a Keras
    `ImageDataGenerator` and the generated variants are saved as JPEG files
    in `save_to_dir`, named with the prefix 'aug_<original file stem>'.

    Arguments:
        file_dir: A string representing the directory where images that we want to augment are found.
        n_generated_samples: An integer giving the number of samples to generate from each image.
            Zero or a negative value generates nothing.
        save_to_dir: A string representing the directory in which the generated images will be saved.
            It is created if it does not exist yet.

    NOTE(review): Keras appends a random suffix to every saved file name; if
    two suffixes collide a file may be overwritten, so fewer than
    `n_generated_samples` files per image could end up on disk -- confirm.
    """
    # Local import: only `listdir` is imported from `os` at the top of the notebook.
    import os

    os.makedirs(save_to_dir, exist_ok=True)

    data_gen = ImageDataGenerator(rotation_range=10,
                                  width_shift_range=0.1,
                                  height_shift_range=0.1,
                                  shear_range=0.1,
                                  brightness_range=(0.3, 1.0),
                                  horizontal_flip=True,
                                  vertical_flip=True,
                                  fill_mode='nearest'
                                 )

    for filename in os.listdir(file_dir):
        # os.path.join keeps this portable (the separator used to be a hard-coded '\\').
        file_path = os.path.join(file_dir, filename)
        if not os.path.isfile(file_path):
            continue

        # load the image; cv2.imread returns None for files it cannot decode
        image = cv2.imread(file_path)
        if image is None:
            continue

        # OpenCV loads pixels in BGR order while Keras saves arrays as RGB;
        # convert so that the written files keep the original colours.
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

        # add a leading batch axis, as expected by ImageDataGenerator.flow
        image = image.reshape((1,) + image.shape)

        # prefix of the names for the generated samples (works for any extension length)
        save_prefix = 'aug_' + os.path.splitext(filename)[0]

        # Every batch pulled from the iterator writes one image to save_to_dir,
        # so pull exactly 'n_generated_samples' batches.
        batches = iter(data_gen.flow(x=image, batch_size=1, save_to_dir=save_to_dir,
                                     save_prefix=save_prefix, save_format='jpg'))
        for _ in range(n_generated_samples):
            next(batches)

In [6]:
import os
import cv2
import time
import random
import shutil
from tensorflow.keras.preprocessing.image import ImageDataGenerator

def augment_data(file_dir, n_generated_samples, save_to_dir):
    """
    Augments images in the specified directory and saves them to another directory.

    Every regular file in `file_dir` that OpenCV can decode is passed through a
    Keras `ImageDataGenerator`; the generated variants are written as JPEG
    files to `save_to_dir` with the name prefix 'aug_<original file stem>'.
    Files that cannot be read as images are skipped.

    Arguments:
        file_dir: Directory containing the images to augment.
        n_generated_samples: Number of augmented samples to generate per image (integer).
            Zero or a negative value generates nothing.
        save_to_dir: Directory to save the augmented images. Created if missing.

    NOTE(review): Keras appends a random suffix to every saved file name; if
    two suffixes collide a file may be overwritten, so fewer than
    `n_generated_samples` files per image could end up on disk -- confirm.
    """
    # exist_ok avoids the check-then-create race of a separate os.path.exists test
    os.makedirs(save_to_dir, exist_ok=True)

    data_gen = ImageDataGenerator(rotation_range=10,
                                  width_shift_range=0.1,
                                  height_shift_range=0.1,
                                  shear_range=0.1,
                                  brightness_range=(0.3, 1.0),
                                  horizontal_flip=True,
                                  vertical_flip=True,
                                  fill_mode='nearest')

    for filename in os.listdir(file_dir):
        file_path = os.path.join(file_dir, filename)
        if not os.path.isfile(file_path):
            continue

        # Load the image
        image = cv2.imread(file_path)
        if image is None:
            continue  # Skip files that cannot be read as images

        # OpenCV loads pixels in BGR order while Keras saves arrays as RGB;
        # convert so that the written files keep the original colours.
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

        # Add a leading batch axis, as expected by ImageDataGenerator.flow
        image = image.reshape((1,) + image.shape)

        # Prefix of the names for the generated samples
        save_prefix = 'aug_' + os.path.splitext(filename)[0]

        # Every batch pulled from the iterator writes one image to save_to_dir.
        # Pulling exactly 'n_generated_samples' batches also handles a count of
        # zero, where a for/break loop would still have written one image.
        batches = iter(data_gen.flow(x=image, batch_size=1, save_to_dir=save_to_dir,
                                     save_prefix=save_prefix, save_format='jpg'))
        for _ in range(n_generated_samples):
            next(batches)

def hms_string(seconds):
    """Render a duration in seconds as a zero-padded ``HH:MM:SS`` string.

    Each component is truncated to a whole number; hours are not capped, so
    durations of 100 hours or more simply use more digits.
    """
    hour_part = int(seconds / 3600)
    minute_part = int((seconds % 3600) / 60)
    second_part = int(seconds % 60)
    return "{:02}:{:02}:{:02}".format(hour_part, minute_part, second_part)

# Main script: build the 'augmented_data' directory from the 'yes' and 'no' image folders
start_time = time.time()

# Manually specify the current directory path
current_path = os.getcwd()  # Use the current working directory instead of __file__
augmented_data_path = os.path.join(current_path, 'augmented_data')
yes_path = os.path.join(current_path, 'yes')
no_path = os.path.join(current_path, 'no')

# Output directories, built once and reused below
augmented_yes_path = os.path.join(augmented_data_path, 'yes')
augmented_no_path = os.path.join(augmented_data_path, 'no')

# Create the directories for augmented data
# NOTE(review): existing contents are kept, so files from earlier runs accumulate.
# The recorded summary shows 380 negatives although at most 100 are copied here,
# which suggests leftovers from a previous run -- confirm and clear if unintended.
os.makedirs(augmented_yes_path, exist_ok=True)
os.makedirs(augmented_no_path, exist_ok=True)

# Augment data for the examples with label equal to 'yes' representing tumorous examples
augment_data(file_dir=yes_path, n_generated_samples=30, save_to_dir=augmented_yes_path)

# Manage negative examples: reduce them to a target count
target_negative_count = 100
negative_sampling_seed = 42  # fixed seed so the same subset is chosen on every run

# Keep regular files only (shutil.copy cannot copy a sub-directory) and sort them,
# because os.listdir order is arbitrary and would defeat the fixed seed.
negative_files = sorted(
    name for name in os.listdir(no_path)
    if os.path.isfile(os.path.join(no_path, name))
)
if len(negative_files) > target_negative_count:
    # A private Random instance leaves the global random state untouched.
    selected_files = random.Random(negative_sampling_seed).sample(
        negative_files, target_negative_count
    )
else:
    selected_files = negative_files

# Copy selected negative examples to the new directory
for filename in selected_files:
    src_path = os.path.join(no_path, filename)
    dst_path = os.path.join(augmented_no_path, filename)
    shutil.copy(src_path, dst_path)

end_time = time.time()
execution_time = (end_time - start_time)
print(f"Elapsed time: {hms_string(execution_time)}")
    

Elapsed time: 00:00:19


In [7]:
# Data Summary
def data_summary(main_path):
    """
    Print how many entries the 'yes' and 'no' sub-directories contain.

    The count of each directory is the number of entries returned by
    os.listdir; the share of each class is printed as a percentage of the
    combined total (0 when both directories are empty). If either directory
    is missing, an error message is printed and nothing else happens.

    Arguments:
        main_path: A string representing the main directory path containing 'yes' and 'no' subdirectories.
    """
    # Positive class first, negative class second
    class_dirs = [os.path.join(main_path, label) for label in ('yes', 'no')]

    # Bail out early unless both directories are present
    if not all(os.path.exists(class_dir) for class_dir in class_dirs):
        print("Error: One or both directories do not exist.")
        return

    # Number of entries (images) per class and overall
    m_pos, m_neg = [len(os.listdir(class_dir)) for class_dir in class_dirs]
    m = m_pos + m_neg

    def share_of_total(count):
        # Guard against division by zero when both folders are empty
        if m > 0:
            return (count * 100.0) / m
        return 0

    report_lines = [
        f"Number of examples: {m}",
        f"Percentage of positive examples: {share_of_total(m_pos):.2f}%, number of positive examples: {m_pos}",
        f"Percentage of negative examples: {share_of_total(m_neg):.2f}%, number of negative examples: {m_neg}",
    ]
    print("\n".join(report_lines))

# Print data summary for the augmented dataset.
# `augmented_data_path` is the module-level variable assigned by the main
# script in an earlier cell, so that cell must have been run first.
data_summary(augmented_data_path)


Number of examples: 1814
Percentage of positive examples: 79.05%, number of positive examples: 1434
Percentage of negative examples: 20.95%, number of negative examples: 380
