# Data Augmentation
### For Job v0.2:
* 29 original images & annotations are excluded, and the remaining 140 images & annotations are in 'ex_x' and 'ex_y' folders.
* Use the 140 original images & annotations from 'ex_x' and 'ex_y' to generate 2000 augmented images & annotations, and then store them in 'ex_x_aug' and 'ex_y_aug'.

### For Job v0.3-separating-glands:
* 27 original images & annotations are excluded, and the remaining images & annotations are in 'v0.3-separating-glands-filtered2-x' and 'v0.3-separating-glands-filtered2-y' folders.
* Use these remaining original images & annotations to generate 2000 augmented images & annotations, and then store them in 'v0.3-separating-glands-aug-x' and 'v0.3-separating-glands-aug-y'.

In [2]:
# !pip install scikit-image
# !pip install albumentations
# !pip install scikit-learn

Collecting albumentations
  Downloading albumentations-1.1.0-py3-none-any.whl (102 kB)
[K     |████████████████████████████████| 102 kB 12.0 MB/s eta 0:00:01
Collecting opencv-python-headless>=4.1.1
  Downloading opencv_python_headless-4.5.3.56-cp38-cp38-manylinux2014_x86_64.whl (37.1 MB)
[K     |████████████████████████████████| 37.1 MB 24.6 MB/s eta 0:00:01
Collecting qudida>=0.0.4
  Downloading qudida-0.0.4-py3-none-any.whl (3.5 kB)
Installing collected packages: opencv-python-headless, qudida, albumentations
Successfully installed albumentations-1.1.0 opencv-python-headless-4.5.3.56 qudida-0.0.4
You should consider upgrading via the '/usr/bin/python -m pip install --upgrade pip' command.[0m


In [3]:
import numpy as np
from matplotlib import pyplot as plt
from skimage.transform import AffineTransform, warp
from skimage import io, img_as_ubyte
import random
import os
import cv2
import glob
import shutil
from scipy.ndimage import rotate
import albumentations as A
# Total number of augmented image/mask pairs written by the augmentation loop.
images_to_generate=2000

In [3]:
## File paths for Job v0.2
# images_path="/data/Unet_data/ex_x/"
# masks_path="/data/Unet_data/ex_y/"
# img_augmented_path="/data/Unet_data/ex_x_aug"
# msk_augmented_path="/data/Unet_data/ex_y_aug"

In [4]:
# # File paths for Job v0.3-separating-glands
# img_path="/data/Unet_data/v0.3-separating-glands-filtered3-x/"
# msk_path="/data/Unet_data/v0.3-separating-glands-filtered3-y/"
# img_augmented_path="/data/Unet_data/v0.3-separating-glands-aug-x"
# msk_augmented_path="/data/Unet_data/v0.3-separating-glands-aug-y"

In [4]:
# Job v0.4-separating-glands-2: source folders ('x' for images, 'y' for masks)
# and the folders that receive the augmented copies.
_job_root = "/data/Unet_data/v0.4-separating-glands-2"
img_path = _job_root + "/x/"
msk_path = _job_root + "/y/"
img_augmented_path = _job_root + "/x-aug"
msk_augmented_path = _job_root + "/y-aug"

In [5]:
# Full paths of every entry in img_path whose name ends with 'bmp' and every
# entry in msk_path whose name ends with 'jpg', in the order os.listdir
# returns them.
images = [
    os.path.join(img_path, entry)
    for entry in os.listdir(img_path)
    if entry.endswith('bmp')
]
masks = [
    os.path.join(msk_path, entry)
    for entry in os.listdir(msk_path)
    if entry.endswith('jpg')
]

In [6]:
# Number of image paths collected from img_path (entries ending in 'bmp')
len(images)

315

In [7]:
# Number of mask paths collected from msk_path (entries ending in 'jpg')
len(masks)

315

### Below is a sanity check to make sure that the image and mask files are in matching order before augmentation.

In [8]:
# Build two index-aligned lists: images_path[k] is an image file and
# masks_path[k] is the mask whose file name (text before the first '.')
# matches that image.

# Capture original image paths as a list, and remember each image's position
# keyed by its file name without extension.
images_path = []
dict_x = {}

for position, image_file in enumerate(glob.glob(img_path + "*.bmp")):
    # Read the name from the last path component instead of a fixed index
    # into the split path, so the cell does not depend on how many directory
    # levels img_path has.
    image_name = os.path.basename(image_file).split('.')[0]
    dict_x[image_name] = position
    images_path.append(image_file)

# Map each image position to the path of the mask sharing the image's name.
train_masks2 = []  # left in place (never filled) in case other cells refer to it
dict_y = {}

for mask_file in glob.glob(msk_path + "*.jpg"):
    mask_name = os.path.basename(mask_file).split('.')[0]
    if mask_name in dict_x:
        dict_y[dict_x[mask_name]] = mask_file

# Every image position must have a mask. Otherwise the list built below would
# be shorter than images_path and every later index would pair an image with
# the wrong mask.
images_without_mask = [
    images_path[position]
    for position in range(len(images_path))
    if position not in dict_y
]
if images_without_mask:
    raise ValueError(
        "No matching mask found for %d image(s), e.g. %s"
        % (len(images_without_mask), images_without_mask[0])
    )

# Store the mask paths in image order so that masks_path[k] pairs with
# images_path[k].
masks_path = [dict_y[position] for position in sorted(dict_y)]

In [9]:
# Image file name (without extension) -> position of that image in images_path
dict_x

{'2020_02_05_09_46_10_46374990': 0,
 '2020_02_05_09_46_10_46384083': 1,
 '2020_02_05_09_46_10_46067444': 2,
 '2020_02_05_09_46_10_46197185': 3,
 '2020_02_05_09_46_10_46550909': 4,
 '2020_02_05_09_45_00_46374990': 5,
 '2020_02_05_09_45_00_46547941': 6,
 '2020_02_05_09_45_00_46197504': 7,
 '2020_02_05_09_45_00_46546857': 8,
 '2020_02_05_09_45_00_46547885': 9,
 '2020_02_05_13_29_05_46374990': 10,
 '2020_02_05_13_29_05_46067444': 11,
 '2020_02_05_13_29_05_46067465': 12,
 '2020_02_05_13_29_05_46384083': 13,
 '2020_02_05_13_29_05_46197185': 14,
 '2020_02_05_13_30_09_46547923': 15,
 '2020_02_05_13_30_09_46374990': 16,
 '2020_02_05_13_30_09_46197185': 17,
 '2020_02_05_13_30_09_46067465': 18,
 '2020_02_05_13_30_09_46067444': 19,
 '2020_02_05_13_31_17_46374990': 20,
 '2020_02_05_13_31_17_46067444': 21,
 '2020_02_05_13_31_17_46067465': 22,
 '2020_02_05_13_31_17_46384083': 23,
 '2020_02_05_13_31_17_46197185': 24,
 '2020_02_05_13_32_02_46547923': 25,
 '2020_02_05_13_32_02_46374990': 26,
 '2020_02_0

In [10]:
# Image position -> path of the mask file with the same name
dict_y

{290: '/data/Unet_data/v0.4-separating-glands-2/y/2020_02_05_10_25_51_46197185.jpg',
 291: '/data/Unet_data/v0.4-separating-glands-2/y/2020_02_05_10_25_51_46067444.jpg',
 292: '/data/Unet_data/v0.4-separating-glands-2/y/2020_02_05_10_25_51_46067465.jpg',
 293: '/data/Unet_data/v0.4-separating-glands-2/y/2020_02_05_10_25_51_46074771.jpg',
 294: '/data/Unet_data/v0.4-separating-glands-2/y/2020_02_05_10_25_51_46547929.jpg',
 250: '/data/Unet_data/v0.4-separating-glands-2/y/2020_02_05_10_27_43_46067465.jpg',
 251: '/data/Unet_data/v0.4-separating-glands-2/y/2020_02_05_10_27_43_46067444.jpg',
 252: '/data/Unet_data/v0.4-separating-glands-2/y/2020_02_05_10_27_43_46197185.jpg',
 253: '/data/Unet_data/v0.4-separating-glands-2/y/2020_02_05_10_27_43_46374990.jpg',
 254: '/data/Unet_data/v0.4-separating-glands-2/y/2020_02_05_10_27_43_46197504.jpg',
 220: '/data/Unet_data/v0.4-separating-glands-2/y/2020_02_05_10_29_50_46197185.jpg',
 221: '/data/Unet_data/v0.4-separating-glands-2/y/2020_02_05_10_2

### Verify that file names in 'masks_path' and 'images_path' are in the matching order. 

In [11]:
# First ten mask paths, to compare by eye against images_path[:10]
masks_path[:10]

['/data/Unet_data/v0.4-separating-glands-2/y/2020_02_05_09_46_10_46374990.jpg',
 '/data/Unet_data/v0.4-separating-glands-2/y/2020_02_05_09_46_10_46384083.jpg',
 '/data/Unet_data/v0.4-separating-glands-2/y/2020_02_05_09_46_10_46067444.jpg',
 '/data/Unet_data/v0.4-separating-glands-2/y/2020_02_05_09_46_10_46197185.jpg',
 '/data/Unet_data/v0.4-separating-glands-2/y/2020_02_05_09_46_10_46550909.jpg',
 '/data/Unet_data/v0.4-separating-glands-2/y/2020_02_05_09_45_00_46374990.jpg',
 '/data/Unet_data/v0.4-separating-glands-2/y/2020_02_05_09_45_00_46547941.jpg',
 '/data/Unet_data/v0.4-separating-glands-2/y/2020_02_05_09_45_00_46197504.jpg',
 '/data/Unet_data/v0.4-separating-glands-2/y/2020_02_05_09_45_00_46546857.jpg',
 '/data/Unet_data/v0.4-separating-glands-2/y/2020_02_05_09_45_00_46547885.jpg']

In [12]:
# Number of masks that were matched to an image
len(masks_path)

315

In [13]:
# First ten image paths, to compare by eye against masks_path[:10]
images_path[:10]

['/data/Unet_data/v0.4-separating-glands-2/x/2020_02_05_09_46_10_46374990.bmp',
 '/data/Unet_data/v0.4-separating-glands-2/x/2020_02_05_09_46_10_46384083.bmp',
 '/data/Unet_data/v0.4-separating-glands-2/x/2020_02_05_09_46_10_46067444.bmp',
 '/data/Unet_data/v0.4-separating-glands-2/x/2020_02_05_09_46_10_46197185.bmp',
 '/data/Unet_data/v0.4-separating-glands-2/x/2020_02_05_09_46_10_46550909.bmp',
 '/data/Unet_data/v0.4-separating-glands-2/x/2020_02_05_09_45_00_46374990.bmp',
 '/data/Unet_data/v0.4-separating-glands-2/x/2020_02_05_09_45_00_46547941.bmp',
 '/data/Unet_data/v0.4-separating-glands-2/x/2020_02_05_09_45_00_46197504.bmp',
 '/data/Unet_data/v0.4-separating-glands-2/x/2020_02_05_09_45_00_46546857.bmp',
 '/data/Unet_data/v0.4-separating-glands-2/x/2020_02_05_09_45_00_46547885.bmp']

In [14]:
# Number of '.bmp' image paths found by the glob over img_path
len(images_path)

315

In [15]:
# Augmentation pipeline. Each step's p is the probability that the step is
# applied: the first two steps use 0.5, the last three use 1.
augmentation_steps = [
    A.VerticalFlip(p=0.5),
    A.RandomRotate90(p=0.5),
    A.HorizontalFlip(p=1),
    A.Transpose(p=1),
    A.GridDistortion(p=1),
]
aug = A.Compose(augmentation_steps)

In [16]:
# Write images_to_generate augmented image/mask pairs. Each pair is produced
# by running a randomly chosen original image and its mask through `aug`.

# The random index is used on both lists, so they must be non-empty and of
# equal length for every draw to yield a valid image/mask pair.
if not images_path:
    raise ValueError("images_path is empty; nothing to augment")
if len(images_path) != len(masks_path):
    raise ValueError(
        "images_path has %d entries but masks_path has %d"
        % (len(images_path), len(masks_path))
    )

# cv2.imwrite reports failure through its return value, so create the target
# folders up front rather than letting every write fail unnoticed.
os.makedirs(img_augmented_path, exist_ok=True)
os.makedirs(msk_augmented_path, exist_ok=True)

for i in range(1, images_to_generate + 1):
    # Draw from the length of the list that is actually indexed below.
    number = random.randrange(len(images_path))
    image = images_path[number]
    mask = masks_path[number]

    # cv2.imread returns None for a missing or unreadable file.
    original_image = cv2.imread(image)
    if original_image is None:
        raise IOError("Could not read image %s" % image)
    original_mask = cv2.imread(mask)
    if original_mask is None:
        raise IOError("Could not read mask %s" % mask)

    # Passing image and mask in one call applies the same transform to both.
    augmented = aug(image=original_image, mask=original_mask)
    transformed_image = augmented['image']
    transformed_mask = augmented['mask']

    new_image_path="%s/augmented_image_%s.bmp" %(img_augmented_path, i)
    new_mask_path="%s/augmented_mask_%s.jpg" %(msk_augmented_path, i)
    # NOTE(review): masks are written as JPEG, which is lossy - confirm that
    # whatever reads these masks tolerates compression artifacts.
    if not cv2.imwrite(new_image_path, transformed_image):
        raise IOError("Could not write %s" % new_image_path)
    if not cv2.imwrite(new_mask_path, transformed_mask):
        raise IOError("Could not write %s" % new_mask_path)

### Verify that each folder contains 2000 augmented images, and delete any files (e.g. '.ipynb_checkpoints') or folders whose names do not end with 'bmp' or 'jpg'.

In [17]:
# Count the augmented images in the output folder and delete every other
# entry (files or sub-folders such as '.ipynb_checkpoints').
count=0
img_aug_path = img_augmented_path + '/'

for file in os.listdir(img_aug_path):
    if file.endswith('bmp'):
        # Only entries that are kept contribute to the total, so the printed
        # number is the number of images left in the folder.
        count+=1
        continue

    # Anything else is reported and removed.
    print(file)
    entry = os.path.join(img_aug_path, file)
    if os.path.isdir(entry) and not os.path.islink(entry):
        shutil.rmtree(entry)
    else:
        # Regular files, and symlinks (which shutil.rmtree refuses to remove).
        os.remove(entry)

print(count)

2000


In [18]:
# Count the augmented masks in the output folder and delete every other
# entry (files or sub-folders such as '.ipynb_checkpoints').
count=0
msk_aug_path = msk_augmented_path + '/'

for file in os.listdir(msk_aug_path):
    if file.endswith('jpg'):
        # Only entries that are kept contribute to the total, so the printed
        # number is the number of masks left in the folder.
        count+=1
        continue

    # Anything else is reported and removed.
    print(file)
    entry = os.path.join(msk_aug_path, file)
    if os.path.isdir(entry) and not os.path.islink(entry):
        shutil.rmtree(entry)
    else:
        # Regular files, and symlinks (which shutil.rmtree refuses to remove).
        os.remove(entry)

print(count)

2000
