In [20]:
import os
from import glob # added
import os.path as osp
import openslide
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

from skimage.filters import threshold_otsu

from openslide.deepzoom import DeepZoomGenerator

import cv2
from keras.utils.np_utils import to_categorical

from keras.models import Sequential
from keras.layers import Lambda, Dropout
from keras.layers.convolutional import Convolution2D, Conv2DTranspose

from sklearn.model_selection import StratifiedShuffleSplit
from datetime import datetime

import matplotlib.gridspec as gridspec
from sklearn.metrics import confusion_matrix
from tqdm import tqdm

import math
from PIL import Image
from xml.etree.ElementTree import ElementTree, Element, SubElement

SyntaxError: invalid syntax (<ipython-input-20-4bf852a27150>, line 2)

In [5]:
# Check whether GPU is availalbe
from tensorflow.python.client import device_lib
print(device_lib.list_local_devices())
import keras.backend.tensorflow_backend as K

[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 10318548999558967659
, name: "/device:GPU:0"
device_type: "GPU"
memory_limit: 6612275691
locality {
  bus_id: 1
  links {
  }
}
incarnation: 2002980221518989664
physical_device_desc: "device: 0, name: GeForce RTX 2070, pci bus id: 0000:23:00.0, compute capability: 7.5"
]


---
### Data Structure

- SlideXXX에서 숫자는 중복이 없음 (pos, neg 겹치는게 없음)

```
|-- data
    |-- train
          |-- image
                |-- positive
                      |-- Slide001.mrxs
                      |-- Slide001
                          |-- Data00.dat
                          ...
                          |-- Data21.dat
                          |-- Index.dat
                      ...
                |-- negative
                      |-- Slide002.mrxs
                      |-- Slide002
                          |-- Data00.dat
                          ...
                          |-- Data21.dat
                          |-- Index.dat
                      ...
          |-- mask
                |-- positive
                      |-- Slide001.png
                      ...
                |-- negative
                      |-- Slide002.png
                      ...
    |-- test
          |-- Slide158.mrxs
          |-- Slide158
              |-- Data00.dat
              ...
              |-- Data21.dat
              |-- Index.dat
              |-- Slidedat.ini
          ...
```
---
### Class Distribution
| - | Positive class | Negative class |
|:-------------|:--------:|:--------:|
| Training | 103 | 54 |
| Validation | 19 | 21 |
| Test | ?? | ?? |

---

In [24]:
# Data Load (edited)
pos_slide_pathes = glob.glob('data/train/image/positive/*.mrxs')
pos_mask_pathes = glob.glob('data/train/mask/positive/*.png')
neg_slide_pathes = glob.glob('data/train/image/negative/*.mrxs')
neg_mask_pathes = glob.glob('data/train/mask/negative/*.png')

pos_slide_sample = openslide.open_slide(pos_slide_pathes[0])
pos_mask_sample = openslide.open_slide(pos_mask_pathes[0])

print("original slide dimensions %dx%d" % pos_slide_sample.dimensions)
print("original mask dimensions %dx%d" % pos_mask_sample.dimensions)

original slide dimensions 93970x234042
original mask dimensions 5316x10007


In [38]:
# Patch Generator
PATCH_SIZE = 256
IS_TRAIN = True
def find_patches_from_slide(slide_path, mask_path, pos_or_neg = True, patch_size=PATCH_SIZE, filter_non_tissue=True):
    """Return a dataframe of all patches in slide
    Input: slide_path - path to WSI file
    Output:
    """
    BOUNDS_OFFSET_PROPS = (openslide.PROPERTY_NAME_BOUNDS_X, openslide.PROPERTY_NAME_BOUNDS_Y)
    BOUNDS_SIZE_PROPS = (openslide.PROPERTY_NAME_BOUNDS_WIDTH, openslide.PROPERTY_NAME_BOUNDS_HEIGHT)

    #### start, level, size for read_region ####
    with openslide.open_slide(slide_path) as slide:
        start = (int(slide.properties.get('openslide.bounds-x')), int(slide.properties.get('openslide.bounds-y')))
        level = int(np.log2(patch_size)) # power of 2
        
        size_scale = tuple(int(slide.properties.get(prop, l0_dim)) / l0_dim
                          for prop, l0_dim in zip(BOUNDS_SIZE_PROPS, slide.dimensions))
        _l_dimensions = tuple(tuple(int(math.ceil(l_dim * scale))
                                   for l_dim, scale in zip(l_size, size_scale))
                             for l_size in slide.level_dimensions)
        size = _l_dimensions[level]
        
        slide4 = slide.read_region(start, level, size)
    #############################################
    
    slide4_grey = np.array(slide4.convert('L'))
    binary = slide4_grey > 0 # black is zero
    
    # threshold_otsu: http://scikit-image.org/docs/dev/auto_examples/xx_applications/plot_thresholding.html
    slide4_not_black = slide4_grey[slide4_grey>0]
    thresh = threshold_otsu(slide4_not_black)
    
    I, J = slide4_grey.shape
    for i in range(I):
        for j in range(J):
            if slide4_grey[i, j] > thresh:
                binary[i, j] = False
    patches = pd.DataFrame(pd.DataFrame(binary).stack())
    patches['is_tissue'] = patches[0]
    patches.drop(0, axis=1, inplace=True)
    patches['slide_path'] = slide_path
    
    if pos_or_neg: # is_tumor
        with openslide.open_slide(mask_path) as truth:
            thumbnail_truth = truth.get_thumbnail(size)
            
        patches_y = pd.DataFrame(pd.DataFrame(np.array(thumbnail_truth.convert('L'))).stack())
        patches_y['is_tumor'] = patches_y[0] > 0
        
        patches_y['is_all_tumor'] = patches_y[0] == 255
        patches_y.drop(0, axis=1, inplace=True)
        samples = pd.concat([patches, patches_y], axis=1)
    else: # isnt_tumor
        samples = patches
        samples['is_tumor'] = False
        
    if filter_non_tissue:
        samples = samples[samples.is_tissue == True]
    
    filter_only_all_tumor = True
    
    samples['tile_loc'] = list(samples.index)
    all_tissue_samples1 = samples[samples.is_tumor==False]
    all_tissue_samples1 =all_tissue_samples1.append(samples[samples.is_all_tumor==True])
    
    all_tissue_samples1.reset_index(inplace=True, drop=True)
    
    return all_tissue_samples1
        

In [41]:
sample_all_tissue_samples = find_patches_from_slide(pos_slide_pathes[0], pos_mask_pathes[0], pos_or_neg=True)
print('Total patches in slide: %d' % len(sample_all_tissue_samples)) 
sample_all_tissue_samples.iloc[:10]
sample_all_tissue_samples.is_tumor.value_counts() 

Total patches in slide: 105213


False    74490
True     30723
Name: is_tumor, dtype: int64

In [42]:
sample_all_tissue_samples.head(3)

Unnamed: 0,is_tissue,slide_path,is_tumor,is_all_tumor,tile_loc
0,True,data/train/image/positive\Slide001.mrxs,False,False,"(17, 165)"
1,True,data/train/image/positive\Slide001.mrxs,False,False,"(17, 166)"
2,True,data/train/image/positive\Slide001.mrxs,False,False,"(17, 167)"


In [None]:
# Train Data Generator
NUM_CLASSES = 2 # not_tumor, tumor

def gen_imgs(samples, batch_size, patch_size=PATCH_SIZE, base_truth_dir=BASE_TRUTH_DIR, shuffle=True):
    num_samples = len(samples)
    
    slide_path = pos_slide_pathes[0]
    slide_contains_tumor = True
    
    slide = openslide.opens_slide(slide_path)
    tiles = DeppZoomGenerator(slide, tile_size=patch_size, overlap=0, limit_bounds=False)
    
    start_x = int(slide.properties.get('openslide.bounds-x'))
    start_y int(slide.properties.get('openslide.bounds-y'))
    start_x /= patch_size
    start_y /= patch_size
    
    if slide_contains_tumor:
        mask_path = pos_mask_pathes[0]
        mask = openslide.open_slide(mask_path)
        mask_tiles = DeepZoomGenerator(mask, tile_size=16. overlap=0, limit_bounds=False)
        
    while True: # Loop forever so the generator never terminates
        if shuffle:
            smaples = samples.sampl(frac=1)
            
        for offset in range(0, num_samples, batch_size):
            batch_samples = samples.iloc[offset : offset + batch_size]
            
            images = []
            masks = []
            for _, batch_sample in batch_samples.iterrows():
                x, y = batch_sample.tile_loc[::-1]
                x += start_x
                y += start_y
                img = tiles.get_tile(tiles.level_count - 1, (x, y))
    