In [1]:
import numpy as np
import glob
import skimage.io as io
import os.path
import tensorflow as tf

In [2]:
def fimg_to_fmask(img_path):
    # convert an image file path into a corresponding mask file path 
    dirname, basename = os.path.split(img_path)
    maskname = basename.replace(".tif", "_mask.tif")
    return os.path.join(dirname, maskname)

In [3]:
origin_images_subset = [img for img in glob.glob("train_subset/*.tif") if 'mask' not in img]
paired_images_subset = [(img, fimg_to_fmask(img)) for img in origin_images_subset]
print("number of image segmentation pairs: ", len(paired_images_subset))

number of image segmentation pairs:  599


In [4]:
origin_images_full = [img for img in glob.glob("train/*.tif") if 'mask' not in img]
paired_images_full = [(img, fimg_to_fmask(img)) for img in origin_images_full]
print("number of image segmentation pairs: ", len(paired_images_full))

number of image segmentation pairs:  5635


In [None]:
%matplotlib inline
# check an image instance
img = io.imread('train_subset/1_1.tif')
mask =io.imread('train_subset/1_1_mask.tif')
print(type(img))
print(img.shape)
io.imshow(img)
io.show()
io.imshow(mask)
io.show()

In [5]:
from keras.preprocessing.image import ImageDataGenerator
import numpy as np

def image_augmentation(img, save_dir):
    datagen = ImageDataGenerator(
            rotation_range=180,
            horizontal_flip=True,
            fill_mode='nearest')
    img = np.expand_dims(img, 0)
    img = np.expand_dims(img, -1)
    
    i = 0
    for batch in datagen.flow(img, batch_size=1,
                          save_to_dir=save_dir, save_prefix="mask", save_format='jpeg'):
        i += 1
        if i > 10:
            break

Using TensorFlow backend.


In [6]:
def images_split(paired_images, full=False, train=True):
    
    fcn_img = "data_fcn"
    fcn_mask = "data_fcn"
    simple_cnn_img = "data_simple_cnn"
    simple_cnn_mask = "data_simple_cnn"
    
    if full:
        fcn_img += "_full"
        fcn_mask += "_full"
        simple_cnn_img += "_full"
        simple_cnn_mask += "_full"
    
    if train:
        fcn_img += "/train/images/images/"
        fcn_mask += "/train/masks/masks/"
        simple_cnn_img += "/train/no_mask/"
        simple_cnn_mask += "/train/mask/"
    else: 
        fcn_img += "/validation/images/images/"
        fcn_mask += "/validation/masks/masks/"
        simple_cnn_img += "/validation/no_mask/"
        simple_cnn_mask += "/validation/mask/"
        
    
    count_no_mask = 1
    count_mask = 1
    count_fcn = 1
    for raw_img, raw_mask in paired_images:
        img = io.imread(raw_img)
        mask = io.imread(raw_mask) / 255
        for i in range(6):
            for j in range(5):
                small_img = img[i*70:(i+1)*70, j*116:(j+1)*116]
                small_mask = mask[i*70:(i+1)*70, j*116:(j+1)*116]
                io.imsave(fcn_img + str(count_fcn) + ".jpg", small_img / 255)
                io.imsave(fcn_mask + str(count_fcn) + "_mask.jpg", small_mask)
                count_fcn += 1
                if np.sum(mask[i*70:(i+1)*70, j*116:(j+1)*116]) >= 400:
                    io.imsave(simple_cnn_mask + str(count_mask) + "_mask.jpg", small_img)
                    count_mask += 1
                else:
                    io.imsave(simple_cnn_img + str(count_no_mask) + ".jpg", small_img)
                    count_no_mask += 1
    print("Finished splitting and saving images and segmentations")

In [None]:
images_split(paired_images_subset[:480], full=False, train=True)
images_split(paired_images_subset[480:], full=False, train=False)

In [7]:
imgs_no_mask = [img for img in glob.glob("data_simple_cnn/train/no_mask/*")]
imgs_mask = [img for img in glob.glob("data_simple_cnn/train/mask/*")]

print("images with no mask: ", len(imgs_no_mask))
print("images with mask: ", len(imgs_mask))
print("mask to no mask ratio", float(len(imgs_mask))/len(imgs_no_mask))

images with no mask:  13982
images with mask:  10275
mask to no mask ratio 0.7348734086682878


In [None]:
# 5635
images_split(paired_images_full[:5000], full=True, train=True)

In [None]:
images_split(paired_images_full[5000:], full=True, train=False)

In [8]:
imgs_no_mask = [img for img in glob.glob("data_simple_cnn_full/train/no_mask/*")]
imgs_mask = [img for img in glob.glob("data_simple_cnn_full/train/mask/*")]

print("images with no mask: ", len(imgs_no_mask))
print("images with mask: ", len(imgs_mask))
print("mask to no mask ratio", float(len(imgs_mask))/len(imgs_no_mask))

images with no mask:  323227
images with mask:  350497
mask to no mask ratio 1.0843679519347085


In [9]:
from keras.preprocessing.image import ImageDataGenerator
import numpy as np

def image_augmentation(img, save_dir, save_prefix):
    datagen = ImageDataGenerator(
            rotation_range=180,
            horizontal_flip=True,
            fill_mode='nearest')
    img = np.expand_dims(img, 0)
    img = np.expand_dims(img, -1)
    
    i = 0
    for batch in datagen.flow(img, batch_size=1,
                          save_to_dir=save_dir, save_prefix=save_prefix, save_format='jpg'):
        i += 1
        if i > 5:
            break

### augment train/mask

In [None]:
import random

for count in range((len(imgs_no_mask) - len(imgs_mask))//5):
    n = len(imgs_mask)
    i = random.randint(1, n-1)
    small_img = io.imread(imgs_mask[i])
    image_augmentation(small_img, "data_simple_cnn_full/train/mask/", "aug_"+str(count))

#### check image numbers

In [10]:
imgs_no_mask = [img for img in glob.glob("data_simple_cnn_full/train/no_mask/*")]
imgs_mask = [img for img in glob.glob("data_simple_cnn_full/train/mask/*")]

print("images with no mask: ", len(imgs_no_mask))
print("images with mask: ", len(imgs_mask))
print("mask to no mask ratio", float(len(imgs_mask))/len(imgs_no_mask))

images with no mask:  323227
images with mask:  350497
mask to no mask ratio 1.0843679519347085


### augment both train/mask and train/no_mask

In [12]:
import random

for count in range(300000//5):
    i = random.randint(1, len(imgs_mask)-1)
    small_img = io.imread(imgs_mask[i])
    image_augmentation(small_img, "data_simple_cnn_full/train/mask/", "even_more_aug_"+str(count))
    j = random.randint(1, len(imgs_no_mask)-1)
    small_img = io.imread(imgs_no_mask[j])
    image_augmentation(small_img, "data_simple_cnn_full/train/no_mask/", "even_more_aug_"+str(count))

#### check image numbers

In [14]:
imgs_no_mask = [img for img in glob.glob("data_simple_cnn_full/train/no_mask/*")]
imgs_mask = [img for img in glob.glob("data_simple_cnn_full/train/mask/*")]

print("images with no mask: ", len(imgs_no_mask))
print("images with mask: ", len(imgs_mask))
print("mask to no mask ratio", float(len(imgs_mask))/len(imgs_no_mask))

images with no mask:  683136
images with mask:  710408
mask to no mask ratio 1.0399217725313847


### check validation image numbers

In [15]:
imgs_no_mask_val = [img for img in glob.glob("data_simple_cnn_full/validation/no_mask/*")]
imgs_mask_val = [img for img in glob.glob("data_simple_cnn_full/validation/mask/*")]

print("images with no mask: ", len(imgs_no_mask_val))
print("images with mask: ", len(imgs_mask_val))
print("mask to no mask ratio", float(len(imgs_mask_val))/len(imgs_no_mask_val))

images with no mask:  43955
images with mask:  47440
mask to no mask ratio 1.0792856330337846


### augment validation/mask

In [None]:
for count in range((len(imgs_no_mask_val) - len(imgs_mask_val))//5):
    n = len(imgs_mask_val)
    i = random.randint(1, n-1)
    small_img = io.imread(imgs_mask_val[i])
    image_augmentation(small_img, "data_simple_cnn_full/validation/mask/", "aug_"+str(count))

### check validation image numbers

In [16]:
imgs_no_mask_val = [img for img in glob.glob("data_simple_cnn_full/validation/no_mask/*")]
imgs_mask_val = [img for img in glob.glob("data_simple_cnn_full/validation/mask/*")]

print("images with no mask: ", len(imgs_no_mask_val))
print("images with mask: ", len(imgs_mask_val))
print("mask to no mask ratio", float(len(imgs_mask_val))/len(imgs_no_mask_val))

images with no mask:  43955
images with mask:  47440
mask to no mask ratio 1.0792856330337846


### augment both validation/mask and validation/no_mask

In [17]:
for count in range(60000//5):
    i = random.randint(1, len(imgs_mask_val)-1)
    small_img = io.imread(imgs_mask_val[i])
    image_augmentation(small_img, "data_simple_cnn_full/validation/mask/", "more_aug_"+str(count))
    j = random.randint(1, len(imgs_no_mask_val)-1)
    small_img = io.imread(imgs_no_mask_val[j])
    image_augmentation(small_img, "data_simple_cnn_full/validation/no_mask/", "more_aug_"+str(count))

#### check image numbers

In [18]:
imgs_no_mask_val = [img for img in glob.glob("data_simple_cnn_full/validation/no_mask/*")]
imgs_mask_val = [img for img in glob.glob("data_simple_cnn_full/validation/mask/*")]

print("images with no mask: ", len(imgs_no_mask_val))
print("images with mask: ", len(imgs_mask_val))
print("mask to no mask ratio", float(len(imgs_mask_val))/len(imgs_no_mask_val))

images with no mask:  115932
images with mask:  119411
mask to no mask ratio 1.030008970775972


# Augment data_fcn and data_fcn_full for FCN model training

## augment images and corresponding masks if defected

In [None]:
def defected(mask):
    return np.sum(mask[:,:]) >= 400

train_masks = [ma for ma in glob.glob("data_fcn_full/train/masks/masks/*")]
defected_masks = [defected(io.imread(ma)) for ma in train_masks]
print("No. of train masks: ", len(train_masks))
print("No. of defected masks: ", sum(defected_masks))
print("defected percentage: ", float(sum(defected_masks))/len(train_masks))

In [None]:
def image_augmentation(img, save_dir, save_prefix, seed):
    datagen = ImageDataGenerator(rotation_range=180, horizontal_flip=True, fill_mode='nearest')
    img = np.expand_dims(img, 0)
    img = np.expand_dims(img, -1)
    i = 0
    for batch in datagen.flow(img, batch_size=1, save_to_dir=save_dir, 
                              save_prefix=save_prefix, save_format='jpg', seed=seed):
        i += 1
        if i > 5:
            break

In [20]:
def fimg_to_fmask(img_path, mask_folder):
    dirname, basename = os.path.split(img_path)
    maskname = basename.replace(".jpg", "_mask.jpg")
    return os.path.join(mask_folder, maskname)

img_folder = "data_fcn_full/train/images/images/"
mask_folder = "data_fcn_full/train/masks/masks/"

fcn_train_imgs = [img for img in glob.glob(img_folder+"*")]
fcn_train_pairs = [(img, fimg_to_fmask(img, mask_folder)) for img in fcn_train_imgs]

n = len(fcn_train_pairs)
print(n)

307371


### augmentation process

In [None]:
import random
count = 1
for _ in range(n):
    i = random.randint(1, n-1)
    seed = random.randint(1, 10000001)
    small_mask = io.imread(fcn_train_pairs[i][1])
    if defected(small_mask):
        image_augmentation(small_mask, mask_folder, "mask_aug_defect_"+str(count), seed)
        small_img = io.imread(fcn_train_pairs[i][0])
        image_augmentation(small_img, img_folder, "aug_defect_"+str(count), seed)
        count += 1
        if count > 1000:
            break

### check numbers

In [None]:
train_masks = [ma for ma in glob.glob("data_fcn_full/train/masks/masks/*")]
defected_masks = [defected(io.imread(ma)) for ma in train_masks]
print("No. of train masks: ", len(train_masks))
print("No. of defected masks: ", sum(defected_masks))
print("defected percentage: ", float(sum(defected_masks))/len(train_masks))

### augment validation data

In [21]:
img_folder = "data_fcn_full/validation/images/images/"
mask_folder = "data_fcn_full/validation/masks/masks/"

def fimg_to_fmask(img_path, mask_folder):
    dirname, basename = os.path.split(img_path)
    maskname = basename.replace(".jpg", "_mask.jpg")
    return os.path.join(mask_folder, maskname)

fcn_val_imgs = [img for img in glob.glob(img_folder+"*")]
fcn_val_pairs = [(img, fimg_to_fmask(img, mask_folder)) for img in fcn_val_imgs]

n = len(fcn_val_pairs)
print(n)

19050


In [26]:
import random
count = 1

def defected(mask):
    return np.sum(mask[:,:]) >= 400

def image_augmentation(img, save_dir, save_prefix, seed):
    datagen = ImageDataGenerator(rotation_range=180, horizontal_flip=True, fill_mode='nearest')
    img = np.expand_dims(img, 0)
    img = np.expand_dims(img, -1)
    i = 0
    for batch in datagen.flow(img, batch_size=1, save_to_dir=save_dir, 
                              save_prefix=save_prefix, save_format='jpg', seed=seed):
        i += 1
        if i > 5:
            break

for _ in range(n):
    i = random.randint(1, n-1)
    seed = random.randint(1, 10000001)
    small_mask = io.imread(fcn_val_pairs[i][1])
    if defected(small_mask):
        image_augmentation(small_mask, mask_folder, "mask_aug_defect_"+str(count), seed)
        small_img = io.imread(fcn_val_pairs[i][0])
        image_augmentation(small_img, img_folder, "aug_defect_"+str(count), seed)
        count += 1
        if count > 2000:
            break

In [27]:
val_masks = [ma for ma in glob.glob("data_fcn_full/validation/masks/masks/*")]
defected_masks = [defected(io.imread(ma)) for ma in val_masks]
print("No. of validation masks: ", len(val_masks))
print("No. of defected masks: ", sum(defected_masks))
print("defected percentage: ", float(sum(defected_masks))/len(val_masks))

No. of validation masks:  32264
No. of defected masks:  13025
defected percentage:  0.40370071906769156


## augment both train/masks and train/no_masks to match fcn parameter numbers

In [33]:
img_folder = "data_fcn_full/train/images/images/"
mask_folder = "data_fcn_full/train/masks/masks/"


def fimg_to_fmask(img_path, mask_folder):
    dirname, basename = os.path.split(img_path)
    if basename[0].isalpha():
        maskname = "mask_"+basename
    else:
        maskname = basename.replace(".jpg", "_mask.jpg")
    return os.path.join(mask_folder, maskname)

fcn_train_imgs = [img for img in glob.glob(img_folder+"*")]
fcn_train_masks = [mask for mask in glob.glob(mask_folder+"*")]
fcn_train_pairs = [(img, fimg_to_fmask(img, mask_folder)) for img in fcn_train_imgs]

n = len(fcn_train_pairs)
print(n)
print(fcn_train_pairs[3458])
print(len(fcn_train_imgs))
print(len(fcn_train_masks))

307377
('data_fcn_full/train/images/images/aug_defect_2026_0_2482.jpg', 'data_fcn_full/train/masks/masks/mask_aug_defect_2026_0_2482.jpg')
307377
307377


In [36]:
import random
count = 1
for _ in range(n):
    i = random.randint(1, n-1)
    seed = random.randint(1, 10000001)
    small_mask = io.imread(fcn_train_pairs[i][1])
    image_augmentation(small_mask, mask_folder, "mask_aug_defect_"+str(count), seed)
    small_img = io.imread(fcn_train_pairs[i][0])
    image_augmentation(small_img, img_folder, "aug_defect_"+str(count), seed)
    count += 1
    if count > 100000:
        break

In [37]:
train_masks = [ma for ma in glob.glob("data_fcn_full/train/masks/masks/*")]
defected_masks = [defected(io.imread(ma)) for ma in train_masks]
print("No. of train masks: ", len(train_masks))
print("No. of defected masks: ", sum(defected_masks))
print("defected percentage: ", float(sum(defected_masks))/len(train_masks))

No. of train masks:  1506512
No. of defected masks:  709056
defected percentage:  0.47066070499272494


## augment both validation/masks and validation/no_masks

In [38]:
img_folder = "data_fcn_full/validation/images/images/"
mask_folder = "data_fcn_full/validation/masks/masks/"

fcn_val_imgs = [img for img in glob.glob(img_folder+"*")]
fcn_val_pairs = [(img, fimg_to_fmask(img, mask_folder)) for img in fcn_val_imgs]

n = len(fcn_val_pairs)
print(n)

32264


In [41]:
import random
count = 1
for _ in range(n):
    i = random.randint(1, n-1)
    seed = random.randint(1, 10000001)
    small_mask = io.imread(fcn_val_pairs[i][1])
    image_augmentation(small_mask, mask_folder, "mask_aug_defect_"+str(count), seed)
    small_img = io.imread(fcn_val_pairs[i][0])
    image_augmentation(small_img, img_folder, "aug_defect_"+str(count), seed)
    count += 1
    if count > 10000:
        break

In [42]:
val_masks = [ma for ma in glob.glob("data_fcn_full/validation/masks/masks/*")]
defected_masks = [defected(io.imread(ma)) for ma in val_masks]
print("No. of validation masks: ", len(val_masks))
print("No. of defected masks: ", sum(defected_masks))
print("defected percentage: ", float(sum(defected_masks))/len(val_masks))

No. of validation masks:  152184
No. of defected masks:  58921
defected percentage:  0.3871694790516743
