# Breast Cancer Detection Model U-Net

## 0. Import Modules

- [OpenSlide](https://openslide.org/api/python/#module-openslide)

In [2]:
%matplotlib inline
import os
import csv
import cv2
import openslide
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf

from PIL import Image
from skimage.filters import threshold_otsu
from sklearn.model_selection import StratifiedShuffleSplit
from tensorflow.keras import layers, models
from tensorflow.keras import backend as K
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.utils import to_categorical
from openslide.deepzoom import DeepZoomGenerator

## 1. Patch Generation DataFrame

In [3]:
def find_patches_from_slide(slide_path, 
                            truth_path, 
                            patch_size=256, 
                            filter_non_tissue=True,
                            filter_only_all_tumor=True):
    '''Returns a DataFrame describing the patches (tiles) of one slide.

    The slide is read at pyramid level ``log2(patch_size)``, so every pixel of
    that low-resolution image is treated as one candidate patch. Each pixel is
    labelled as tissue / non-tissue with an Otsu threshold and, for positive
    slides, as tumor / non-tumor from the truth mask.

        Args:
            - slide_path: path of slide; a slide is treated as tumor-positive
              when the path contains the substring 'pos'
            - truth_path: path of truth(mask); only opened for positive slides
            - patch_size: patch size for samples
            - filter_non_tissue: remove samples no tissue detected
            - filter_only_all_tumor: if True, keep only rows that are either
              non-tumor or whose mask value is exactly 255, dropping rows with
              an intermediate mask value, and reset the index to 0..n-1
        Returns:
            - DataFrame with columns 'is_tissue', 'slide_path', 'is_tumor',
              'is_all_tumor' and 'tile_loc' (the (row, col) position of the
              patch). When filter_only_all_tumor is False the (row, col)
              MultiIndex is kept as the index.'''
    # A slide is considered positive when its path contains 'pos'.
    slide_contains_tumor = 'pos' in slide_path

    # Properties holding the offset of the non-empty region of the slide.
    bounds_offset_props = (openslide.PROPERTY_NAME_BOUNDS_X, openslide.PROPERTY_NAME_BOUNDS_Y)

    with openslide.open_slide(slide_path) as slide:
        # start, level and size are the arguments of read_region below.
        start = (0, 0)
        level = int(np.log2(patch_size))
        size = tuple(int(dim) for dim in slide.level_dimensions[level])

        if slide_contains_tumor:
            start = (int(slide.properties.get(bounds_offset_props[0], 0)),
                     int(slide.properties.get(bounds_offset_props[1], 0)))

            # Build a halving pyramid of the mask dimensions down to 1x1.
            with openslide.open_slide(truth_path) as truth:
                z_size = truth.dimensions
                z_dimensions = [z_size]
                while z_size[0] > 1 or z_size[1] > 1:
                    z_size = tuple(max(1, int(np.ceil(z / 2))) for z in z_size)
                    z_dimensions.append(z_size)
            # NOTE(review): the offset of 4 levels presumably reflects a mask
            # stored at 1/16 of the slide resolution - confirm against the data.
            size = z_dimensions[level - 4]

        slide4 = slide.read_region(start, level, size)

    # Tissue detection on the grey-scale low-resolution image.
    slide4_grey = np.array(slide4.convert('L'))

    # Pure black (0) pixels are background and are excluded from the Otsu fit.
    slide4_not_black = slide4_grey[slide4_grey > 0]
    if slide4_not_black.size:
        ret, _ = cv2.threshold(slide4_not_black, 0, 255,
                               cv2.THRESH_BINARY + cv2.THRESH_OTSU)
    else:
        # Nothing but background: every pixel ends up as non-tissue below.
        ret = 0

    # Tissue = not black and not brighter than the Otsu threshold.
    binary = (slide4_grey > 0) & (slide4_grey <= ret)

    # One row per pixel, indexed by (row, col).
    patches = pd.DataFrame(pd.DataFrame(binary).stack(), columns=['is_tissue'])
    patches.loc[:, 'slide_path'] = slide_path

    # Tumor labels.
    if slide_contains_tumor:
        with openslide.open_slide(truth_path) as truth:
            thumbnail_truth = truth.get_thumbnail(size)

        truth_values = pd.DataFrame(np.array(thumbnail_truth.convert('L'))).stack()
        patches_y = pd.DataFrame({
            'is_tumor': truth_values > 0,
            # Values strictly between 0 and 255 mark ambiguous mask pixels.
            'is_all_tumor': truth_values == 255,
        })
        samples = pd.concat([patches, patches_y], axis=1)
    else:
        samples = patches.assign(is_tumor=False, is_all_tumor=False)

    if filter_non_tissue:  # keep tissue patches only
        samples = samples[samples['is_tissue'] == True]

    # Work on an explicit copy so that adding a column does not write into a
    # slice of the unfiltered frame (avoids SettingWithCopyWarning).
    samples = samples.copy()
    samples['tile_loc'] = list(samples.index)

    if not filter_only_all_tumor:
        return samples

    # '== False' / '== True' (rather than '~') keep rows with missing labels
    # out of both groups, which can occur if the two frames do not align.
    non_tumor = samples[samples['is_tumor'] == False]
    all_tumor = samples[samples['is_all_tumor'] == True]
    return pd.concat([non_tumor, all_tumor], ignore_index=True)

In [4]:
%%time
# Example positive case: one whole-slide image and its tumor mask.
slide_path = '../data/train/pos/16-S-042893_A1.mrxs'  # whole-slide image
truth_path = '../data/train/pos/Mask_16-S-042893_A1.png'  # ground-truth mask

all_tissue_samples = find_patches_from_slide(slide_path=slide_path,
                                             truth_path=truth_path)

CPU times: user 1.38 s, sys: 72 ms, total: 1.45 s
Wall time: 1.45 s


In [5]:
# Class balance of the extracted patches (non-tumor vs. tumor).
all_tissue_samples.loc[:, 'is_tumor'].value_counts()

False    74490
True     30723
Name: is_tumor, dtype: int64

## 2. Training Data Generator

In [6]:
def generator(samples,
              slide_paths,
              truth_paths,
              batch_size,
              patch_size=256,
              shuffle=True):
    '''The generator for DataSet

    Yields batches forever. Every slide in ``slide_paths`` is opened once when
    the generator starts and closed again when the generator is closed or
    garbage collected.

        Args:
            - samples: DataFrame of samples; must provide the columns
              'slide_path' and 'tile_loc', where tile_loc is a (row, col) pair
            - slide_paths: paths of all slides (any number; if a path occurs
              more than once, its first occurrence is used)
            - truth_paths: paths of all truth(masks), index-aligned with
              slide_paths; only read for slides whose path contains 'pos'
            - batch_size: mini-batch size
            - patch_size: patch size for samples
            - shuffle: bool, if True shuffle samples
        Returns(yield):
            - train_x: train dataset → [batch_size, patch_size, patch_size, 3]
            - train_y: train labelset → [batch_size, patch_size, patch_size, 2]
        Raises:
            - ValueError: a sample refers to a slide not in slide_paths'''
    # NOTE(review): truth tiles are 16x16 and every mask pixel is blown up to a
    # 16x16 block, which only fills a whole patch when patch_size == 256.
    # Presumably the masks are stored at 1/16 of the slide resolution - confirm.
    mask_tile_size = 16
    mask_upscale = 16

    opened = []   # every OpenSlide handle, so that all of them get closed
    sources = {}  # slide path -> (tiles, start_x, start_y, truth_tiles or None)
    try:
        for idx in range(len(slide_paths)):
            path = slide_paths[idx]
            if path in sources:
                continue

            slide = openslide.open_slide(path)
            opened.append(slide)
            tiles = DeepZoomGenerator(slide, tile_size=patch_size, overlap=0, limit_bounds=False)

            start_x, start_y = 0, 0
            truth_tiles = None
            if 'pos' in path:
                # True division on purpose: the offset is expressed in tiles
                # and may be fractional when the bounds are not tile-aligned.
                start_x = int(slide.properties.get('openslide.bounds-x', 0)) / patch_size
                start_y = int(slide.properties.get('openslide.bounds-y', 0)) / patch_size
                truth = openslide.open_slide(truth_paths[idx])
                opened.append(truth)
                truth_tiles = DeepZoomGenerator(truth, tile_size=mask_tile_size,
                                                overlap=0, limit_bounds=False)

            sources[path] = (tiles, start_x, start_y, truth_tiles)

        num_samples = len(samples)
        while 1:
            if shuffle:
                samples = samples.sample(frac=1)  # shuffling

            for offset in range(0, num_samples, batch_size):
                batch_samples = samples.iloc[offset:offset+batch_size]

                batch_tiles, batch_masks = [], []
                for slide_path, (y, x) in zip(batch_samples['slide_path'].values,
                                              batch_samples['tile_loc'].values):
                    if slide_path not in sources:
                        raise ValueError(
                            'sample refers to a slide that is not in slide_paths: %r' % (slide_path,))
                    tiles, start_x, start_y, truth_tiles = sources[slide_path]

                    img = tiles.get_tile(tiles.level_count-1, (x+start_x, y+start_y))

                    # Slides without a truth mask keep an all-background mask.
                    mask_tile_zoom = np.zeros((patch_size, patch_size))
                    if truth_tiles is not None:
                        mask_tile = truth_tiles.get_tile(truth_tiles.level_count-1, (x, y))
                        mask_tile = (cv2.cvtColor(np.array(mask_tile), cv2.COLOR_RGB2GRAY) > 0).astype(int)
                        # Up-sample the mask: every mask pixel becomes a
                        # mask_upscale x mask_upscale block.
                        k, l = mask_tile.shape
                        mask_tile_zoom[:k*mask_upscale, :l*mask_upscale] = (
                            mask_tile.repeat(mask_upscale, axis=0).repeat(mask_upscale, axis=1))

                    # Tiles on the slide border are smaller than a full patch;
                    # they are replaced by a black image with an empty mask.
                    if img.size != (patch_size, patch_size):
                        img = Image.new('RGB', (patch_size, patch_size))
                        mask_tile_zoom = np.zeros((patch_size, patch_size))

                    batch_tiles.append(np.array(img))
                    batch_masks.append(mask_tile_zoom)

                # train_x & train_y
                train_x = np.array(batch_tiles)
                train_y = to_categorical(np.array(batch_masks), num_classes=2)

                # data augmentation
#                 train_x, train_y = next(
#                     ImageDataGenerator(rotation_range=90,
#                                        horizontal_flip=True,
#                                        vertical_flip=True,
#                                        brightness_range=(0.25, 1.)).flow(train_x, y=train_y, batch_size=batch_size))
                yield train_x, train_y
    finally:
        # Runs when the generator is closed or collected; releases the files.
        for handle in opened:
            handle.close()

## 3. Modeling

In [22]:
K.clear_session()


def _conv_block(tensor, filters, dropout_rate):
    '''Two 3x3 'same' ReLU convolutions with a Dropout layer in between.'''
    tensor = layers.Conv2D(filters, 3, padding='same',
                           activation='relu', kernel_initializer='he_normal')(tensor)
    tensor = layers.Dropout(dropout_rate)(tensor)
    return layers.Conv2D(filters, 3, padding='same',
                         activation='relu', kernel_initializer='he_normal')(tensor)


def _up_block(tensor, skip, filters, dropout_rate):
    '''2x transposed-conv up-sampling, concatenation with the skip tensor, then a conv block.'''
    tensor = layers.Conv2DTranspose(filters, 2, strides=2, padding='same')(tensor)
    tensor = layers.concatenate([tensor, skip])
    return _conv_block(tensor, filters, dropout_rate)


def create_model(patch_size=256, pre_trained_path=False):
    '''Create the U-Net segmentation model, or load a saved one.

        Args:
            - patch_size: height and width of the RGB input patches
            - pre_trained_path: path of a saved model; when truthy the model is
              loaded from that path and no new network is built
        Returns:
            - model: a model mapping [patch_size, patch_size, 3] inputs to a
              2-class softmax map [patch_size, patch_size, 2]; a freshly built
              model is compiled with adam / categorical_crossentropy / acc'''
    # Load first: building and compiling a network that is immediately
    # replaced by the loaded one would be wasted work.
    if pre_trained_path:
        return models.load_model(pre_trained_path)

    # Build U-Net model
    inputs = layers.Input(shape=(patch_size, patch_size, 3), dtype='float32', name='inputs')
    # Scale pixel values from [0, 255] to [-0.5, 0.5].
    inputs_norm = layers.Lambda(lambda x: x/255. - .5)(inputs)

    # Conv layers (contracting path); layers are created in the same order as
    # a hand-unrolled definition so that auto-generated layer names match.
    conv1 = _conv_block(inputs_norm, 16, 0.1)
    pool1 = layers.MaxPooling2D()(conv1)

    conv2 = _conv_block(pool1, 32, 0.1)
    pool2 = layers.MaxPooling2D()(conv2)

    conv3 = _conv_block(pool2, 64, 0.2)
    pool3 = layers.MaxPooling2D()(conv3)

    conv4 = _conv_block(pool3, 128, 0.2)
    pool4 = layers.MaxPooling2D()(conv4)

    conv5 = _conv_block(pool4, 256, 0.3)

    # Up-Conv layers (expanding path) with skip connections
    conv6 = _up_block(conv5, conv4, 128, 0.2)
    conv7 = _up_block(conv6, conv3, 64, 0.2)
    conv8 = _up_block(conv7, conv2, 32, 0.1)
    conv9 = _up_block(conv8, conv1, 16, 0.1)

    # Per-pixel 2-class prediction
    outputs = layers.Conv2D(2, 1, activation='softmax', 
                           kernel_initializer='he_normal')(conv9)

    model = models.Model(inputs, outputs)
    model.compile(optimizer='adam',
                  loss='categorical_crossentropy', 
                  metrics=['acc'])

    return model


model = create_model()

In [23]:
# from tensorflow.keras.models import Sequential
# def simple_model(pretrained_weights = None):
#     model = Sequential()
#     model.add(layers.Lambda(lambda x: x / 255.0 - 0.5, input_shape=(256, 256, 3)))
#     model.add(layers.Convolution2D(100, (3, 3), strides=(2, 2), activation='elu', padding='same'))
#     model.add(layers.MaxPooling2D())
#     model.add(layers.Convolution2D(200, (3, 3), strides=(2, 2), activation='elu', padding='same'))
#     model.add(layers.MaxPooling2D())
#     model.add(layers.Convolution2D(300, (3, 3), activation='elu', padding='same'))
#     model.add(layers.Convolution2D(300, (3, 3), activation='elu',  padding='same'))
#     model.add(layers.Dropout(0.2))
#     model.add(layers.Convolution2D(2, (1, 1))) # this is called upscore layer for some reason?
#     model.add(layers.Conv2DTranspose(2, (31, 31), strides=(16, 16), activation='softmax', padding='same'))

#     model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    
#     if(pretrained_weights):
#         model.load_weights(pretrained_weights)
        
#     return model

# model = simple_model()

In [24]:
def get_data_path():
    '''Read the slide and mask path lists from the working directory.

        Returns:
            - slide_paths: dict {line index: line of './train.txt'}
            - mask_paths: dict {line index: line of './train_mask.txt'}
        Only the trailing newline of each line is stripped.'''
    def _read_indexed(list_file):
        # Map the 0-based line number to the line content.
        with open(list_file, 'r') as handle:
            return {line_no: row.rstrip('\n') for line_no, row in enumerate(handle)}

    return _read_indexed('./train.txt'), _read_indexed('./train_mask.txt')

In [25]:
# Two dicts keyed by 0-based line index: slide paths from ./train.txt and
# mask paths from ./train_mask.txt.
slide_paths, mask_paths = get_data_path()

In [26]:
# Groups of four slide indices; each index is a key into slide_paths / mask_paths.
# NOTE(review): how the slides were assigned to the three lists is not
# recorded in this notebook - confirm with the author.
slide_4_list_1 = [
    [102, 104, 29, 44],
    [144, 55, 30, 18],
    [54, 65, 21, 36],
    [139, 82, 1, 49],
    [105, 151, 15, 2],
    [75, 100, 41, 9],
    [156, 113, 32, 37],
]
slide_4_list_2 = [
    [109, 58, 14, 28],
    [101, 69, 11, 43],
    [94, 74, 3, 20],
    [64, 140, 17, 16],
    [92, 154, 8, 26],
    [99, 60, 0, 33],
    [86, 146, 25, 19],
    [68, 112, 38, 51],
    [71, 136, 31, 4],
    [59, 91, 12, 6],
]
slide_4_list_3 = [
    [143, 132, 124, 85],
    [95, 120, 81, 77],
    [97, 96, 110, 83],
    [152, 128, 149, 155],
    [153, 111, 57, 138],
    [134, 135, 114, 76],
    [123, 90, 121, 61],
    [147, 148, 119, 142],
    [66, 137, 63, 80],
    [70, 79, 115, 133],
    [129, 141, 127, 145],
]
# Small group used for a quick trial run.
slide_4_test = [
    [55, 55, 0, 0],
]

# Column layout of the per-slide sample DataFrames.
columns = [
    'is_tissue',
    'slide_path',
    'is_tumor',
    'is_all_tumor',
    'tile_loc',
]

In [27]:
%%time
batch_size = 32
n_epochs = 10
for slides in slide_4_test:
    # Collect the per-slide frames and concatenate once: DataFrame.append was
    # removed in pandas 2.0 and re-copies the whole frame on every call.
    group_frames, group_slide_path, group_mask_path = [], [], []
    for idx in slides:
        slide_path, truth_path = slide_paths[idx], mask_paths[idx]
        # The listed paths are prefixed with '.' before use.
        group_frames.append(find_patches_from_slide('.'+slide_path, '.'+truth_path))
        group_slide_path.append('.'+slide_path)
        group_mask_path.append('.'+truth_path)
    sample_group_df = pd.concat(group_frames)

    # Use at most 5000 patches per slide group.
    num_samples = min(len(sample_group_df), 5000)

    samples = sample_group_df.sample(num_samples, random_state=42).reset_index(drop=True)

    # Single 80/20 split that keeps the tumor / non-tumor ratio in both parts.
    split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
    train_index, test_index = next(split.split(samples, samples["is_tumor"]))
    train_samples = samples.loc[train_index]
    validation_samples = samples.loc[test_index]

    train_gen = generator(train_samples, group_slide_path, group_mask_path, batch_size)
    val_gen = generator(validation_samples, group_slide_path, group_mask_path, batch_size)

    # Step counts are passed as plain ints rather than numpy floats.
    # NOTE(review): fit_generator is deprecated in newer TensorFlow releases in
    # favour of model.fit; kept for the TF version this notebook targets.
    model.fit_generator(train_gen, 
                        steps_per_epoch=int(np.ceil(len(train_samples)/batch_size)),
                        epochs=n_epochs,
                        validation_data=val_gen,
                        validation_steps=int(np.ceil(len(validation_samples)/batch_size)))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Epoch 1/10
Epoch 2/10
Epoch 3/10
 28/125 [=====>........................] - ETA: 47s - loss: 7.1492 - acc: 0.9181

KeyboardInterrupt: 

In [219]:
# val_x, val_y = next(val_gen)

# f, axes = plt.subplots(4, 8, figsize=(20, 4));
# ax = axes.flatten()
# for i in range(0, val_x.shape[0]):
#     _ = ax[i].imshow(val_x[i]);
#     _ = ax[i].axis('off');
# f.suptitle('Batch of Patches 32x256x256x3');
    
# f, axes = plt.subplots(4, 8, figsize=(20, 4));
# ax = axes.flatten()    
# for i in range(0, val_x.shape[0]):
#     _ = ax[i].imshow(val_y[i].argmax(axis=2), cmap='gray', vmin=0, vmax=1);
#     _ = ax[i].axis('off');
# f.suptitle('Batch of Truth Masks 32x256x256x1');