# 1-1. Data Load

 - 이미지 샘플에서는 mask(암을 표시한 영역)에 대한 min_offset, max_offset이 주어져 있지 않음
 
 - mask의 dimension은 slide의 최고 레벨(max_size)의 dimensions의 1/16
 
 - 따라서 mask와 최고 레벨 슬라이드의 사이즈를 맞춰 주어야 함.
  - mask 이미지를 upsample한 뒤 slide와 비교 (정확하게 맞춰야 함. slide 이미지를 downsample할 때 어떤 방식을 쓰는지 참고)
 - 아래의 예제는 level 4 이미지에 대해서만 실행

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [4]:
import os.path as osp
import openslide
from pathlib import Path

# Directory that holds the ground-truth mask image(s); also the default
# `base_truth_dir` of the helper functions defined further down.
BASE_TRUTH_DIR = Path('mask(level4)')
# Whole-slide image in MIRAX (.mrxs) format.
slide_path = 'slide/16-S-042893_A1.mrxs'
# PNG image of the slide (the directory name suggests it is the level-4 render).
slide_png_path = 'image(level4)/Tumor_16-S-042893_A1(positive class).png'
# Mask PNG for the same slide, stored under BASE_TRUTH_DIR.
truth_path = str(BASE_TRUTH_DIR/'Mask_16-S-042893_A1(positive class).png')

# open_slide is used for both the .mrxs file and the plain PNG images.
# NOTE(review): these handles are never closed in this cell.
slide = openslide.open_slide(slide_path)
slide_png = openslide.open_slide(slide_png_path)
truth = openslide.open_slide(truth_path)

# `dimensions` is a (width, height) pair, so it fills both %d placeholders.
print("original slide dimensions %dx%d" % slide.dimensions)
print("original slide png dimensions %dx%d" % slide_png.dimensions)
print("original truth dimensions %dx%d" % truth.dimensions)

original slide dimensions 93970x234042
original slide png dimensions 5316x10007
original truth dimensions 5316x10007


# 1-2. Patch Generate

In [14]:
def find_patches_from_slide(slide_path, base_truth_dir=BASE_TRUTH_DIR, filter_non_tissue=True):
    """Returns a dataframe of all patches in slide.

    The slide is reduced to a thumbnail requested at 1/256 of its size, so
    one thumbnail pixel stands for one patch position. Otsu thresholding of
    the greyscale thumbnail flags pixels at or below the threshold as tissue.
    For tumor slides the truth mask is reduced to the same grid and every
    position whose mask value is non-zero is flagged as tumor.

    input: slide_path: path to WSI file; a file name starting with 'Tumor_'
        marks a slide that has a truth mask
    output: samples: dataframe with the following columns:
        is_tissue: sample contains tissue
        slide_path: path of slide
        is_tumor: truth status of sample
        tile_loc: (row, col) position of the sample in the thumbnail grid

    option: base_truth_dir: directory of truth masks; the mask file is looked
        up under the slide's file name with the leading 'Tumor_' replaced by
        'Mask_'
    option: filter_non_tissue: Remove samples no tissue detected

    raises: ValueError: the truth thumbnail does not have the same shape as
        the slide thumbnail, so the two grids cannot be joined per patch
    """
    base_truth_dir = Path(base_truth_dir)
    slide_name = osp.basename(slide_path)
    slide_contains_tumor = slide_name.startswith('Tumor_') # true or false

    with openslide.open_slide(slide_path) as slide:
        # The same requested size is reused for the truth mask below so that
        # both thumbnails describe the same patch grid.
        thumbnail_size = (slide.dimensions[0] / 256, slide.dimensions[1] / 256)
        thumbnail = slide.get_thumbnail(thumbnail_size)

    thumbnail_grey = np.array(thumbnail.convert('L')) # convert to grayscale
    thresh = threshold_otsu(thumbnail_grey)
    binary = thumbnail_grey > thresh # True = brighter than threshold (background)

    # stack() turns the 2-D array into one row per (row, col) position.
    patches = pd.DataFrame(pd.DataFrame(binary).stack())
    patches['is_tissue'] = ~patches[0]
    patches = patches.drop(columns=0)
    patches['slide_path'] = slide_path

    if slide_contains_tumor:
        # Derive the mask from the slide name instead of a fixed file so the
        # function works for any 'Tumor_*' slide and honours base_truth_dir.
        truth_slide_path = base_truth_dir / slide_name.replace('Tumor_', 'Mask_', 1)
        with openslide.open_slide(str(truth_slide_path)) as truth:
            thumbnail_truth = truth.get_thumbnail(thumbnail_size)

        truth_grey = np.array(thumbnail_truth.convert("L"))
        if truth_grey.shape != binary.shape:
            # Joining differently sized grids would pair tissue and tumor
            # flags of unrelated positions (or leave gaps), so fail loudly.
            raise ValueError(
                'truth thumbnail shape %s does not match slide thumbnail shape %s for %s'
                % (truth_grey.shape, binary.shape, truth_slide_path)
            )

        patches_y = pd.DataFrame(pd.DataFrame(truth_grey).stack())
        patches_y['is_tumor'] = patches_y[0] > 0
        patches_y = patches_y.drop(columns=0)

        # Both frames share the same (row, col) index, so this is a plain
        # column-wise join.
        samples = pd.concat([patches, patches_y], axis=1)
    else:
        samples = patches
        samples['is_tumor'] = False

    if filter_non_tissue:
        # copy() so that adding tile_loc below does not write into a slice
        samples = samples[samples.is_tissue].copy() # remove patches with no tissue
    samples['tile_loc'] = list(samples.index)
    samples = samples.reset_index(drop=True)
    return samples

In [7]:
from skimage.filters import threshold_otsu

In [15]:
# Build the patch table for the PNG slide. filter_non_tissue defaults to True,
# so the count printed below covers tissue patches only.
all_tissue_samples = find_patches_from_slide(slide_png_path)
print('Total patches in slide: %d' % len(all_tissue_samples))
# Tumor vs. non-tumor patch counts; as the last expression this is what the
# cell displays.
all_tissue_samples.is_tumor.value_counts()

Total patches in slide: 25261256


False    17571817
True      7689439
Name: is_tumor, dtype: int64

In [17]:
# Preview the first ten rows of the patch table.
all_tissue_samples.head(10)

Unnamed: 0,is_tissue,slide_path,is_tumor,tile_loc
0,True,image(level4)/Tumor_16-S-042893_A1(positive cl...,False,"(70, 3703)"
1,True,image(level4)/Tumor_16-S-042893_A1(positive cl...,False,"(70, 3704)"
2,True,image(level4)/Tumor_16-S-042893_A1(positive cl...,False,"(70, 3705)"
3,True,image(level4)/Tumor_16-S-042893_A1(positive cl...,False,"(71, 3702)"
4,True,image(level4)/Tumor_16-S-042893_A1(positive cl...,False,"(71, 3703)"
5,True,image(level4)/Tumor_16-S-042893_A1(positive cl...,False,"(71, 3704)"
6,True,image(level4)/Tumor_16-S-042893_A1(positive cl...,False,"(71, 3705)"
7,True,image(level4)/Tumor_16-S-042893_A1(positive cl...,False,"(72, 3702)"
8,True,image(level4)/Tumor_16-S-042893_A1(positive cl...,False,"(72, 3704)"
9,True,image(level4)/Tumor_16-S-042893_A1(positive cl...,False,"(72, 3705)"


In [22]:
# !pip install opencv-python

Collecting opencv-python
  Downloading https://files.pythonhosted.org/packages/96/30/99bd865802cd5f425c42efd2ee4e10bd3bc605640008f03e3c72a1dbe320/opencv_python-4.0.0.21-cp36-cp36m-win_amd64.whl (30.4MB)
Installing collected packages: opencv-python
Successfully installed opencv-python-4.0.0.21


In [23]:
import cv2
from keras.utils.np_utils import to_categorical

Using TensorFlow backend.


In [27]:
from openslide.deepzoom import DeepZoomGenerator 

In [24]:
NUM_CLASSES = 2 # not_tumor, tumor

def gen_imgs(samples, batch_size, base_truth_dir=BASE_TRUTH_DIR, shuffle=True):
    """This function returns a generator that
    yields tuples of (
        X: array - [batch_size, 256, 256, 3], the RGB tiles
        y: array - [batch_size, 256, 256, NUM_CLASSES], one-hot mask
           as produced by to_categorical
    )
    The last batch of a pass is shorter when len(samples) is not a multiple
    of batch_size.

    input: samples: samples dataframe; the columns slide_path and tile_loc
        are read
    input: batch_size: The number of images to return for each pull
    output: yield (X_train, y_train): generator of X, y tensors

    option: base_truth_dir: path, directory of truth masks; for a slide whose
        file name starts with 'Tumor_' the mask is looked up under the same
        name with that prefix replaced by 'Mask_'
    option: shuffle: bool, if True shuffle samples

    raises: ValueError: samples is empty (raised on the first pull)
    """
    base_truth_dir = Path(base_truth_dir) # accept str as well as Path
    num_samples = len(samples)
    if num_samples == 0:
        # Without this guard the endless loop below would spin forever
        # without ever yielding a batch.
        raise ValueError('samples is empty; no batches can be generated')

    while True: # Loop forever so the generator never terminates
        if shuffle:
            samples = samples.sample(frac=1) # shuffle samples

        for offset in range(0, num_samples, batch_size):
            batch_samples = samples.iloc[offset:offset+batch_size]

            images = []
            masks = []
            for _, batch_sample in batch_samples.iterrows():
                slide_name = osp.basename(batch_sample.slide_path)
                slide_contains_tumor = slide_name.startswith('Tumor_')
                # tile_loc is reversed to form the DeepZoom tile address
                # (presumably stored as (row, col) - confirm against the
                # code that builds the samples frame).
                tile_address = batch_sample.tile_loc[::-1]

                with openslide.open_slide(batch_sample.slide_path) as slide:
                    tiles = DeepZoomGenerator(slide, tile_size=256, overlap=0, limit_bounds=False)
                    img = tiles.get_tile(tiles.level_count-1, tile_address)

                # only load truth mask for tumor slides
                if slide_contains_tumor:
                    # Use the mask that belongs to THIS sample's slide, not
                    # whatever the notebook-level `slide_path` points to.
                    truth_slide_path = base_truth_dir / slide_name.replace('Tumor_', 'Mask_', 1)
                    with openslide.open_slide(str(truth_slide_path)) as truth:
                        truth_tiles = DeepZoomGenerator(truth, tile_size=256, overlap=0, limit_bounds=False)
                        mask = truth_tiles.get_tile(truth_tiles.level_count-1, tile_address)
                        # any non-zero grey value counts as tumor
                        mask = (cv2.cvtColor(np.array(mask), cv2.COLOR_RGB2GRAY) > 0).astype(int)
                else:
                    mask = np.zeros((256, 256))

                images.append(np.array(img))
                masks.append(mask)

            X_train = np.array(images)
            y_train = np.array(masks)
            y_train = to_categorical(y_train, num_classes=NUM_CLASSES).reshape(y_train.shape[0], 256, 256, NUM_CLASSES)
            yield X_train, y_train

In [29]:
# Reproducible draw of 32 patches (random_state=42), served unshuffled with a
# batch size of 32, i.e. the whole draw forms one batch.
sample_gen = gen_imgs(all_tissue_samples.sample(32, random_state=42), 32, shuffle=False)

In [30]:
# Pull the first batch from the generator and time the call.
%time example_X, example_y  = next(sample_gen)

ValueError: Invalid address