# Data Augmentation

In [None]:
# Colab-only step: attach Google Drive so the dataset folders under
# /content/drive are readable from this runtime.
from google.colab import drive

drive.mount(mountpoint='/content/drive')

Mounted at /content/drive


In [None]:
import tensorflow as tf
from keras.preprocessing.image import ImageDataGenerator
import cv2
import imutils
import matplotlib.pyplot as plt
import os
import time
from os import listdir


In [None]:
def augment_data(file_dir, n_generated_samples, save_to_dir):
    '''
    Write randomly augmented copies of every readable image in a directory.

    file_dir: Directory path where the original images are located.
    n_generated_samples: Controls how many augmented images are written for
        each original image. The generation loop stops only once its counter
        exceeds this value, so n_generated_samples + 1 files are written per
        image (kept as-is so existing dataset sizes do not change).
    save_to_dir: Directory where the augmented images will be saved. It is
        created if it does not exist yet.

    Entries of file_dir that cv2.imread cannot decode are skipped.
    '''
    data_gen = ImageDataGenerator(
        rotation_range=10,
        width_shift_range=0.1,
        height_shift_range=0.1,
        shear_range=0.1,
        brightness_range=(0.3, 1.0),
        horizontal_flip=True,
        vertical_flip=True,
        # Other accepted values: 'constant', 'reflect', 'wrap'.
        # 'nearest' pads with the closest edge pixel: aaaaaa|abcd|dddddd
        fill_mode='nearest',
    )

    # The generator writes straight into save_to_dir, so make sure it exists.
    if save_to_dir is not None:
        os.makedirs(save_to_dir, exist_ok=True)

    for filename in os.listdir(file_dir):
        image = cv2.imread(os.path.join(file_dir, filename))
        if image is None:
            # Not a decodable image (sub-directory, text file, ...): skip it.
            continue

        # cv2.imread yields BGR channel order, while the generator saves the
        # array as RGB; convert so colours are not swapped in the output.
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

        # Reshape the image to have a batch dimension of 1.
        image = image.reshape((1,) + image.shape)

        # splitext drops the extension whatever its length ('.jpg', '.jpeg', ...).
        save_prefix = 'aug_' + os.path.splitext(filename)[0]

        # x=image      -> the original image
        # batch_size=1 -> one augmented sample is produced per iteration
        # flow() yields augmented batches endlessly and saves each one to
        # save_to_dir as a side effect, so the loop must be stopped by hand.
        i = 0
        for batch in data_gen.flow(x=image, batch_size=1, save_to_dir=save_to_dir,
                                   save_prefix=save_prefix, save_format='jpg'):
            i += 1
            if i > n_generated_samples:
                break

In [None]:
# Wall-clock timing of the whole augmentation run.
start_time = time.time()

# Output root (relative to the working directory) and the source folders on Drive.
augmented_data_path = 'augmented_data/'
yes_path = '/content/drive/MyDrive/yes'
no_path = '/content/drive/MyDrive/no'

# One entry per class: (sub-folder name, source directory, n_generated_samples).
# NOTE(review): the differing sample counts look intended to even out the
# class sizes - confirm against the raw dataset.
augmentation_jobs = [
    ('yes', yes_path, 6),
    ('no', no_path, 9),
]

# Create every output folder up front; makedirs also creates the root.
for label, _, _ in augmentation_jobs:
    os.makedirs(os.path.join(augmented_data_path, label), exist_ok=True)

for label, source_dir, n_samples in augmentation_jobs:
    augment_data(
        file_dir=source_dir,
        n_generated_samples=n_samples,
        save_to_dir=os.path.join(augmented_data_path, label),
    )

end_time = time.time()
execution_time = end_time - start_time
print(f"Elapsed time: {execution_time:.1f} seconds")

Elapsed time: 156.3 seconds


In [None]:
def data_summary(main_path):
    '''
    Print the size and class balance of a dataset stored in two folders.

    main_path: Directory containing a 'yes' sub-folder (tumorous / positive
        examples) and a 'no' sub-folder (non-tumorous / negative examples).
        A trailing path separator is optional.

    Every directory entry is counted as one example. Nothing is returned;
    the summary is printed. When both folders are empty the percentages are
    reported as 0.0 instead of raising ZeroDivisionError.
    '''
    # os.path.join works whether or not main_path ends with a separator.
    yes_path = os.path.join(main_path, 'yes')
    no_path = os.path.join(main_path, 'no')

    # number of files (images) in the 'yes' folder: positive examples
    m_pos = len(os.listdir(yes_path))
    # number of files (images) in the 'no' folder: negative examples
    m_neg = len(os.listdir(no_path))
    # number of all examples
    m = m_pos + m_neg

    if m == 0:
        # Empty dataset: there is no meaningful ratio to compute.
        pos_prec = neg_prec = 0.0
    else:
        pos_prec = (m_pos * 100.0) / m
        neg_prec = (m_neg * 100.0) / m

    print(f"Number of examples: {m}")
    print(f"Percentage of positive examples: {pos_prec}%, number of pos examples: {m_pos}")
    print(f"Percentage of negative examples: {neg_prec}%, number of neg examples: {m_neg}")

In [None]:
# Report how many augmented examples each class ended up with.
data_summary(main_path=augmented_data_path)

Number of examples: 2065
Percentage of positive examples: 52.54237288135593%, number of pos examples: 1085
Percentage of negative examples: 47.45762711864407%, number of neg examples: 980
