## Libraries

In [None]:
import os
import shutil
import numpy as np
import cv2
from keras.preprocessing.image import ImageDataGenerator
from google.colab.patches import cv2_imshow

## Data locations

### Mount google drive - provide google drive access

In [None]:
# Mount Google Drive at /gdrive; all dataset paths defined below start with this mount point
from google.colab import drive
drive.mount('/gdrive')

Mounted at /gdrive


### Set paths correctly

In [None]:
#Train

#Original cancer images - source folder for the cancer augmentation generators
ORIGINAL_CANCER_TRAIN_PATH = '/gdrive/My Drive/projects/cancer_detection/train/original/cancer'
#Original non cancer images - source folder for the non cancer augmentation generators
ORIGINAL_NON_CANCER_TRAIN_PATH = '/gdrive/My Drive/projects/cancer_detection/train/original/non cancer'
#Augmented images are saved here and later loaded back as the train dataset
AUGMENTED_TRAIN_PATH = '/gdrive/My Drive/projects/cancer_detection/train/augmented/images'
#Masks created from the train dataset are saved here
MASK_TRAIN_PATH = '/gdrive/My Drive/projects/cancer_detection/train/augmented/labels/'

#Test
#Test images are loaded from here
ORIGINAL_TEST_PATH = "/gdrive/My Drive/projects/cancer_detection/test/images"
#Masks created from the test dataset are saved here
MASK_TEST_PATH = "/gdrive/My Drive/projects/cancer_detection/test/labels"

## **Create dataset**

### Augmentation of Train data

Augment original cancer images and create augmented train images, which will be used for training. 

In [None]:
def delete_folder_contents(folder):
  """Empty ``folder`` while keeping the folder itself.

  Files and symbolic links directly inside ``folder`` are unlinked and
  sub-directories are removed recursively. A failure on one entry is
  printed and the remaining entries are still processed.
  """
  for entry_name in os.listdir(folder):
    entry_path = os.path.join(folder, entry_name)
    try:
      # Pick the removal routine that matches the kind of entry
      if os.path.isfile(entry_path) or os.path.islink(entry_path):
        remove = os.unlink
      elif os.path.isdir(entry_path):
        remove = shutil.rmtree
      else:
        # Neither file, link nor directory - leave it untouched
        continue
      remove(entry_path)
    except Exception as err:
      print('Failed to delete %s. Reason: %s' % (entry_path, err))

770 images of cancer. Below are the augmentations applied:

* image rotation
* Horizontal flip
* Vertical flip
* Shift image height
* Shift image width
* Zoom image
* shear image - slant image by some angle clockwise or anti clockwise

330 images of non cancer. Below are the augmentations applied:

* image rotation
* Horizontal flip
* Vertical flip
* Zoom image

The imbalance in the data (more cancer than non cancer images) is intentional: it is required to avoid a bias towards non cancer during segmentation.

https://www.tensorflow.org/api_docs/python/tf/keras/preprocessing/image/ImageDataGenerator


Data augmentation example
https://machinelearningmastery.com/how-to-configure-image-data-augmentation-when-training-deep-learning-neural-networks/


In [None]:
# Settings shared by every augmentation generator in this cell
AUGMENTED_IMAGE_SIZE = (720, 720)  #target_size every loaded image is resized to
NUM_CANCER_IMAGES = 770            #augmented cancer images to generate
NUM_NON_CANCER_IMAGES = 330        #augmented non cancer images to generate


def _make_augmented_flow(image_generator, source_dir, prefix):
  """Return a directory iterator that augments the images found under ``source_dir``.

  The iterator yields one image per batch (batch_size=1). It is configured
  with save_to_dir=AUGMENTED_TRAIN_PATH and save_prefix=``prefix``, so the
  file name prefix records which class the source images belong to.
  """
  return image_generator.flow_from_directory(
      source_dir,
      target_size=AUGMENTED_IMAGE_SIZE,
      save_to_dir=AUGMENTED_TRAIN_PATH,
      save_prefix=prefix,
      batch_size=1)


# ImageDataGenerator - Used for data augmentation of the cancer images
datagen = ImageDataGenerator(
    #random rotation of the image within a 20 degree range
    rotation_range=20,
    height_shift_range=0.1,
    width_shift_range=0.1,
    horizontal_flip=True,
    vertical_flip=True,
    zoom_range=0.2,
    shear_range=0.2,
    #fill the newly created pixels based on nearest neighbour basis
    fill_mode='nearest'
)

#Non cancer images use fewer augmentations: no height/width shift and no shear
non_cancer_datagen = ImageDataGenerator(
    rotation_range=20,
    fill_mode='nearest',
    horizontal_flip=True,
    vertical_flip=True,
    zoom_range=0.2,
)

#Clean the folder before adding fresh images
delete_folder_contents(AUGMENTED_TRAIN_PATH)

#Load data from ORIGINAL_CANCER_TRAIN_PATH and save in AUGMENTED_TRAIN_PATH
cancer_data_generator_1 = _make_augmented_flow(
    datagen, ORIGINAL_CANCER_TRAIN_PATH, 'cancer')

#Second cancer generator - not iterated in this cell.
#Its prefix must stay 'cancer' because it reads from the cancer folder.
cancer_data_generator_2 = _make_augmented_flow(
    datagen, ORIGINAL_CANCER_TRAIN_PATH, 'cancer')

#Load data from ORIGINAL_NON_CANCER_TRAIN_PATH and save in AUGMENTED_TRAIN_PATH
non_cancer_data_generator_1 = _make_augmented_flow(
    non_cancer_datagen, ORIGINAL_NON_CANCER_TRAIN_PATH, 'non_cancer')

#Second non cancer generator - not iterated in this cell.
#Its prefix must stay 'non_cancer' because it reads from the non cancer folder.
non_cancer_data_generator_2 = _make_augmented_flow(
    non_cancer_datagen, ORIGINAL_NON_CANCER_TRAIN_PATH, 'non_cancer')

#NOTE(review): no seed is passed to the generators, so every run of this cell
#produces a different set of augmented images.

#Generate the cancer images - each next() call yields one augmented image
for i in range(NUM_CANCER_IMAGES):
  next(cancer_data_generator_1)

#Generate the non cancer images
for i in range(NUM_NON_CANCER_IMAGES):
  next(non_cancer_data_generator_1)



Found 125 images belonging to 1 classes.
Found 125 images belonging to 1 classes.
Found 23 images belonging to 1 classes.
Found 23 images belonging to 1 classes.


'for i in range(220):\n  next(non_cancer_data_generator_2)'

### Load dataset

Each mask image uses the same file name as its train image, so the file names of the images are stored alongside the image data for this purpose.

In [None]:
def create_dataset(path):
  """Load every readable image found directly inside ``path``.

  Args:
    path: folder whose entries are passed to cv2.imread one by one.

  Returns:
    (dataset, filenames): two lists of equal length where filenames[i] is
    the file name of the image stored in dataset[i]. Entries that cannot be
    read as an image are reported and left out of BOTH lists, so the two
    lists always stay index-aligned.
  """
  #Image dataset
  dataset = []
  #Corresponding image file names
  filenames = []

  for img in os.listdir(path):  # iterate over each entry of the folder
    img_path = os.path.join(path, img)
    try:
      #read image into an array
      img_array = cv2.imread(img_path)
    except Exception as e:
      #Best effort: report the problem and carry on with the next file
      print('Failed to read %s. Reason: %s' % (img_path, e))
      continue

    #cv2.imread returns None when the file cannot be decoded as an image
    if img_array is None:
      print("\n",img_path," is not an image file...not added to the datasets\n")
      continue

    #Record the name only once the image is known to be valid. Otherwise the
    #names drift out of step with the images and masks get saved under the
    #wrong file name.
    dataset.append(img_array)
    filenames.append(img)

  return dataset , filenames

### Train

In [None]:
#train - images loaded from the augmented train folder
#train_filenames - file names of the train images, reused later to name the saved masks
train , train_filenames = create_dataset(AUGMENTED_TRAIN_PATH)

In [None]:
#Show the first five train file names as a sample
train_filenames[0:5]

['cancer_64_113161.png',
 'cancer_7_9000135.png',
 'cancer_68_3965186.png',
 'cancer_92_5605225.png',
 'cancer_65_3927776.png']

Verify number of train images

In [None]:
#Expected to equal the number of augmented images generated earlier (770 + 330)
print(len(train))

1100


Create Test dataset

In [None]:
#test - images loaded from the test image folder; test_filenames - their file names
test , test_filenames = create_dataset(ORIGINAL_TEST_PATH)

Verify number of test images

In [None]:
#Number of test images that were loaded
print(len(test))

31


### Create mask

Create label dataset

In [None]:
#Create mask for given image
def create_mask(img_array):
  """Build a binary mask of the pixels of ``img_array`` that fall inside a fixed HSV window.

  Steps: Gaussian blur -> BGR to HSV conversion -> HSV range test ->
  morphological opening -> Otsu binarization.

  Args:
    img_array: image passed to cv2.GaussianBlur; it is converted with
      cv2.COLOR_BGR2HSV, so it is presumably in BGR channel order.

  Returns:
    The thresholded mask image returned by cv2.threshold.
  """
  # HSV window: (hue, saturation, value) lower and upper limits
  # NOTE(review): for 8-bit images OpenCV's hue channel tops out at 179, so an
  # upper hue of 255 presumably just means "no upper hue limit" - confirm.
  hsv_lower = np.array([58, 132, 186])
  hsv_upper = np.array([255, 255, 255])

  # 5x5 structuring element used by the opening step
  opening_kernel = np.ones((5, 5), np.uint8)

  # Smooth the image first, then move from BGR to HSV
  smoothed = cv2.GaussianBlur(img_array, (5, 5), 0)
  hsv_image = cv2.cvtColor(smoothed, cv2.COLOR_BGR2HSV)

  # Keep only the pixels whose HSV value lies inside the window
  in_window = cv2.inRange(hsv_image, hsv_lower, hsv_upper)

  # Opening = erosion followed by dilation.
  # Erosion removes small white noise (depending on the kernel size) but also
  # shrinks the mask; the dilation grows the mask area back afterwards.
  opened = cv2.morphologyEx(in_window, cv2.MORPH_OPEN, opening_kernel)

  # Turn every pixel into black or white. Otsu's method picks the threshold
  # automatically, so the 0 passed as threshold is only a placeholder.
  # https://docs.opencv.org/3.4/d7/d4d/tutorial_py_thresholding.html
  _otsu_level, binary_mask = cv2.threshold(
      opened, 0, 255, cv2.THRESH_BINARY+cv2.THRESH_OTSU)

  return binary_mask


### Create mask ground truth images Dataset

In [None]:
#Return masked images
def create_mask_dataset(dataset ):
  """Return a list holding the create_mask() result of every image in ``dataset``, in the same order."""
  return [create_mask(image) for image in dataset]

### Train

In [None]:
#Create mask dataset from the loaded train images (one mask per image, in the same order)
train_mask = create_mask_dataset(train)

In [None]:
#Should equal len(train) - one mask is created per train image
len(train_mask)

1100

### Test

In [None]:
#Mask for test dataset (one mask per image, in the same order)
test_mask = create_mask_dataset(test)

In [None]:
#Should equal len(test) - one mask is created per test image
len(test_mask)

31

### Save mask datasets

In [None]:
#Saves the masks created in google drive
#File names ensure both train image and its corresponding mask have same filename.This will be required to create segmentation dataset
def save_dateset(mask_dataset , filenames , path):
  """Write every mask to ``path`` using the file name at the same position in ``filenames``.

  The existing contents of ``path`` are deleted first, so the folder only
  holds the masks written by this call.

  Args:
    mask_dataset: mask images, one per file name.
    filenames: file names, index-aligned with ``mask_dataset``.
    path: destination folder.

  Raises:
    ValueError: if the number of masks and file names differ. This is checked
      before anything is deleted, because a mismatch means masks would be
      saved under the wrong names.
  """
  mask_dataset = list(mask_dataset)
  filenames = list(filenames)
  if len(mask_dataset) != len(filenames):
    raise ValueError(
        'Got %d masks but %d file names; refusing to save misaligned masks'
        % (len(mask_dataset), len(filenames)))

  delete_folder_contents(path)
  for filename, img in zip(filenames, mask_dataset):
    target = os.path.join(path ,filename)
    #cv2.imwrite reports failure through its return value instead of raising
    if not cv2.imwrite(target, img):
      print('Failed to write %s' % target)

### Train

In [None]:
#Save train masks - the existing contents of MASK_TRAIN_PATH are deleted first
save_dateset(train_mask ,train_filenames, MASK_TRAIN_PATH)

### Test

In [None]:
#Save test masks - the existing contents of MASK_TEST_PATH are deleted first
save_dateset(test_mask ,test_filenames, MASK_TEST_PATH)