# Data Augmentation

In [None]:
# Mount Google Drive into the Colab filesystem; the dataset paths used later in this notebook live under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


**About the data:** <br>
The dataset contains 2 folders, yes and no, which together contain 253 Brain MRI Images. The folder yes contains 155 Brain MRI Images that are tumorous and the folder no contains 98 Brain MRI Images that are non-tumorous. You can find it [here](https://www.kaggle.com/navoneel/brain-mri-images-for-brain-tumor-detection).

Since this is a small dataset, I used data augmentation in order to create more images.

Also, we could solve the data imbalance issue (since 61% of the data belongs to the tumorous class) using data augmentation.

## Import Necessary Modules

In [None]:
import tensorflow as tf
from tensorflow.keras.preprocessing.image import ImageDataGenerator
import cv2
import imutils
import matplotlib.pyplot as plt
from os import listdir
import time
import os

%matplotlib inline

In [None]:
# Nicely formatted time string
def hms_string(sec_elapsed):
    """
    Format a duration given in seconds as 'h:m:s'.

    Arguments:
        sec_elapsed: A non-negative number of seconds (int or float).

    Returns:
        A string 'h:m:s' with unpadded hours and minutes and the seconds
        rounded to one decimal place, e.g. 3661.5 -> '1:1:1.5'.
    """
    # Round once, before splitting, so that e.g. 59.96 s carries into the
    # minutes ('0:1:0.0') instead of being rendered as '0:0:60.0'.
    sec_elapsed = round(sec_elapsed, 1)
    h = int(sec_elapsed // (60 * 60))
    m = int((sec_elapsed % (60 * 60)) // 60)
    s = sec_elapsed % 60
    # The modulo can reintroduce float noise (5.400000000000006), so round again for display.
    return f"{h}:{m}:{round(s,1)}"

In [None]:
def augment_data(file_dir, n_generated_samples, save_to_dir):
    """
    Write augmented copies of every image found in a directory.

    Each readable image in `file_dir` is fed through a Keras ImageDataGenerator
    (small rotations, shifts and shear, brightness scaling, horizontal and
    vertical flips) and exactly `n_generated_samples` augmented variants are
    saved to `save_to_dir` as JPEG files whose names start with
    'aug_<original file name without extension>'.

    Arguments:
        file_dir: A string representing the directory where images that we want to augment are found.
        n_generated_samples: An integer, the number of augmented samples to generate from each image.
        save_to_dir: A string representing the directory in which the generated images will be saved.
            It is created if it does not already exist.
    """

    data_gen = ImageDataGenerator(rotation_range=10,
                                  width_shift_range=0.1,
                                  height_shift_range=0.1,
                                  shear_range=0.1,
                                  brightness_range=(0.3, 1.0),
                                  horizontal_flip=True,
                                  vertical_flip=True,
                                  fill_mode='nearest'
                                 )

    # make sure the output directory exists before the generator writes into it
    os.makedirs(save_to_dir, exist_ok=True)

    for filename in listdir(file_dir):
        # load the image; OpenCV returns None (it does not raise) when the
        # entry is not a readable image, e.g. a sub-directory or a stray file
        image = cv2.imread(os.path.join(file_dir, filename))
        if image is None:
            print(f"Skipping unreadable file: {filename}")
            continue
        # OpenCV loads channels as BGR, while the generator saves arrays as RGB;
        # convert so that colours are not swapped in the saved files
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        # add a leading batch axis: (h, w, c) -> (1, h, w, c)
        image = image.reshape((1,)+image.shape)
        # prefix of the names for the generated samples; splitext handles
        # extensions of any length ('.jpg', '.jpeg', ...)
        save_prefix = 'aug_' + os.path.splitext(filename)[0]
        print(save_prefix)
        # every batch drawn from the iterator writes one image to disk, so draw
        # exactly 'n_generated_samples' batches (the iterator itself is endless)
        batches = data_gen.flow(x=image, batch_size=1, save_to_dir=save_to_dir,
                                save_prefix=save_prefix, save_format='jpg')
        for _ in range(n_generated_samples):
            next(batches)

Remember that 61% of the data (155 images) are tumorous. And, 39% of the data (98 images) are non-tumorous.<br>
So, in order to balance the data, we can generate 9 new images for every image that belongs to the 'no' class and 6 new images for every image that belongs to the 'yes' class.<br>

In [None]:
start_time = time.time()

# Project folder on the mounted Drive, defined once so the input and output paths cannot drift apart
project_dir = '/content/drive/MyDrive/DSE project/Brain-Tumor-Detection'
augmented_data_path = project_dir + '/Aug_data'

# (label, samples to generate per original image):
#   'yes' = tumorous examples, 'no' = non-tumorous examples.
# The minority 'no' class gets more samples per image to offset the class imbalance.
# NOTE: re-running this cell presumably adds further files to the same folders
# rather than replacing them - clear Aug_data first for a clean run.
for label, n_new in (('yes', 6), ('no', 9)):
    out_dir = augmented_data_path + '/' + label
    # the generated images are written here, so the folder has to exist up front
    os.makedirs(out_dir, exist_ok=True)
    augment_data(file_dir=project_dir + '/' + label, n_generated_samples=n_new, save_to_dir=out_dir)

end_time = time.time()
execution_time = (end_time - start_time)
print(f"Elapsed time: {hms_string(execution_time)}")

aug_Y47
aug_Y183
aug_Y102
aug_Y79
aug_Y91
aug_Y114
aug_Y113
aug_Y75
aug_Y38
aug_Y147
aug_Y21
aug_Y148
aug_Y56
aug_Y254
aug_Y163
aug_Y249
aug_Y100
aug_Y108
aug_Y22
aug_Y161
aug_Y112
aug_Y184
aug_Y253
aug_Y247
aug_Y86
aug_Y185
aug_Y117
aug_Y65
aug_Y24
aug_Y167
aug_Y165
aug_Y192
aug_Y162
aug_Y182
aug_Y187
aug_Y92
aug_Y245
aug_Y40
aug_Y81
aug_Y67
aug_Y15
aug_Y71
aug_Y41
aug_Y1
aug_Y66
aug_Y45
aug_Y8
aug_Y106
aug_Y9
aug_Y153
aug_Y154
aug_Y258
aug_Y193
aug_Y255
aug_Y51
aug_Y12
aug_Y27
aug_Y50
aug_Y46
aug_Y62
aug_Y33
aug_Y103
aug_Y186
aug_Y32
aug_Y92
aug_Y97
aug_Y61
aug_Y194
aug_Y160
aug_Y170
aug_Y52
aug_Y146
aug_Y115
aug_Y54
aug_Y17
aug_Y70
aug_Y59
aug_Y244
aug_Y11
aug_Y26
aug_Y4
aug_Y19
aug_Y10
aug_Y18
aug_Y257
aug_Y95
aug_Y49
aug_Y60
aug_Y259
aug_Y89
aug_Y44
aug_Y25
aug_Y159
aug_Y251
aug_Y73
aug_Y120
aug_Y104
aug_Y36
aug_Y109
aug_Y188
aug_Y35
aug_Y195
aug_Y39
aug_Y7
aug_Y77
aug_Y2
aug_Y74
aug_Y78
aug_Y30
aug_Y55
aug_Y20
aug_Y31
aug_Y105
aug_Y166
aug_Y37
aug_Y181
aug_Y96
aug_Y76
aug_Y69
aug

Let's see how many tumorous and non-tumorous examples there are after performing data augmentation:

In [None]:
def data_summary(main_path):
    """
    Print how many examples each class has under `main_path`.

    Counts the directory entries in `main_path + '/yes'` (tumorous, positive
    examples) and `main_path + '/no'` (non-tumorous, negative examples), then
    prints the total together with each class's share and count. Every
    directory entry is counted; nothing checks that it is an image.

    Arguments:
        main_path: A string representing the directory that contains the 'yes' and 'no' folders.
    """

    yes_path = main_path+'/yes'
    no_path = main_path+'/no'

    # number of files (images) in the folder named 'yes' that represent tumorous (positive) examples
    m_pos = len(listdir(yes_path))
    # number of files (images) in the folder named 'no' that represent non-tumorous (negative) examples
    m_neg = len(listdir(no_path))
    # number of all examples
    m = (m_pos+m_neg)

    # with two empty folders there is nothing to divide by; report 0% instead
    # of failing with ZeroDivisionError
    pos_perc = (m_pos* 100.0)/ m if m else 0.0
    neg_perc = (m_neg* 100.0)/ m if m else 0.0

    print(f"Number of examples: {m}")
    print(f"Percentage of positive examples: {pos_perc}%, number of pos examples: {m_pos}")
    print(f"Percentage of negative examples: {neg_perc}%, number of neg examples: {m_neg}")

In [None]:
# Print the class counts and percentages for the augmented dataset folder
data_summary(augmented_data_path)

Number of examples: 2065
Percentage of positive examples: 52.54237288135593%, number of pos examples: 1085
Percentage of negative examples: 47.45762711864407%, number of neg examples: 980


That's it for this notebook. Now, we can use the augmented data to train our convolutional neural network.