# Breast Cancer Detection Model Ver01

## 0. Import Modules

- [OpenSlide](https://openslide.org/api/python/#module-openslide)

In [116]:
%matplotlib inline
import os
import csv
import cv2
import openslide
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf

from skimage.filters import threshold_otsu
from tensorflow.keras import layers, models
from tensorflow.keras import backend as K
from tensorflow.keras.utils import to_categorical
from openslide.deepzoom import DeepZoomGenerator

## 1. Data Load

In [125]:
# Locations of the positive (tumor) training slide and of its ground-truth mask.
slide_path, truth_path = (
    '../data/train/pos/16-S-042893_A1.mrxs',
    '../data/train/pos/Mask_16-S-042893_A1.png',
)
# Negative-sample counterparts, currently disabled:
# neg_slide_path = '../data/train/16-S-042725_A.mrxs'
# neg_mask_path = '../data/train/Mask_16-S-042725_A.png'

# Open both files through OpenSlide: first the slide, then its mask.
slide = openslide.open_slide(slide_path)
truth = openslide.open_slide(truth_path)
# neg_slide = openslide.open_slide(neg_slide_path)
# neg_mask = openslide.open_slide(neg_mask_path)

In [126]:
# Show the dimensions OpenSlide reports for the slide and for its mask.
for caption, handle in (('origin slide.dimensions :', slide),
                        ('origin truth.dimensions :', truth)):
    print(caption, handle.dimensions)
# Negative-sample counterparts, currently disabled:
# print('origin neg_slide.dimensions :', neg_slide.dimensions)
# print('origin neg_mask.dimensions :', neg_mask.dimensions)

origin slide.dimensions : (93970, 234042)
origin truth.dimensions : (5316, 10007)


## 2. Patch Generation

### 1) Find Patches from Slide

In [15]:
patch_size = 256
is_train = True

# Decide whether this slide is a positive (tumor) sample from its path.
slide_contains_tumor = 'pos' in slide_path

# Property names used to compute start, level and size for read_region.
bounds_offset_props = (openslide.PROPERTY_NAME_BOUNDS_X, openslide.PROPERTY_NAME_BOUNDS_Y)
bounds_size_props = (openslide.PROPERTY_NAME_BOUNDS_WIDTH, openslide.PROPERTY_NAME_BOUNDS_HEIGHT)

# The temporary handle gets its own name: binding it to `slide` would replace
# the notebook-level `slide` with a handle that is closed when this block exits.
with openslide.open_slide(slide_path) as bounded_slide:
    # Offset of the bounded region; falls back to (0, 0) when the slide does
    # not expose the bounds properties.
    start = (int(bounded_slide.properties.get(bounds_offset_props[0], 0)),
             int(bounded_slide.properties.get(bounds_offset_props[1], 0)))

    # Level index derived from the patch size (log2(256) == 8).
    # NOTE(review): this presumably relies on every level halving the previous
    # one, so that one pixel at this level covers one patch - confirm.
    level = int(np.log2(patch_size))
    if level >= bounded_slide.level_count:
        # Same exception type as the bare list lookup below would raise,
        # but with an explanation of what went wrong.
        raise IndexError(
            f'slide has {bounded_slide.level_count} levels; '
            f'level {level} (from patch_size={patch_size}) does not exist')

    # Fraction of the full width/height taken up by the bounded region
    # (1.0 when the bounds-size properties are missing).
    size_scale = tuple(int(bounded_slide.properties.get(prop, dim)) / dim
                       for prop, dim in zip(bounds_size_props, bounded_slide.dimensions))
    # Dimensions of every level, shrunk by that fraction and rounded up.
    l_dimensions = [(int(np.ceil(dim_x * size_scale[0])), int(np.ceil(dim_y * size_scale[1])))
                    for dim_x, dim_y in bounded_slide.level_dimensions]
    size = l_dimensions[level]

    # Low-resolution image of the bounded region at the chosen level.
    slide4 = bounded_slide.read_region(start, level, size)

In [124]:
# is_tissue 부분
slide4_grey = np.array(slide4.convert('L'))

# background에 대한 작업
slide4_not_black = slide4_grey[slide4_grey > 0]
# thresh = threshold_otsu(slide4_not_black)
ret, th = cv2.threshold(slide4_not_black, 0, 255, 
                        cv2.THRESH_BINARY+cv2.THRESH_OTSU)

binary = slide4_grey > 0  # black == 0
h, w = slide4_grey.shape
for i in range(h):
    for j in range(w):
        if slide4_grey[i, j] > ret:
            binary[i, j] = False
            
# patch_df
patches = pd.DataFrame(pd.DataFrame(binary).stack(), columns=['is_tissue'])
patches['slide_path'] = slide_path
patches.head()

Unnamed: 0,Unnamed: 1,is_tissue,slide_path
0,0,False,../data/train/pos/16-S-042893_A1.mrxs
0,1,False,../data/train/pos/16-S-042893_A1.mrxs
0,2,False,../data/train/pos/16-S-042893_A1.mrxs
0,3,False,../data/train/pos/16-S-042893_A1.mrxs
0,4,False,../data/train/pos/16-S-042893_A1.mrxs


In [127]:
# is_tumor 부분
if slide_contains_tumor:
    with openslide.open_slide(truth_path) as truth:
        thumbnail_truth = truth.get_thumbnail(size)
        
    # truth pathes_df
    patches_y = pd.DataFrame(
            pd.DataFrame(np.array(thumbnail_truth.convert('L'))).stack())
    patches_y['is_tumor'] = patches_y[0] > 0
    
    # mask된 영역이 애매한 경우
    patches_y['is_all_tumor'] = patches_y[0] == 255
    patches_y.drop(0, axis=1, inplace=True)
    samples = pd.concat([patches, patches_y], axis=1)
else: 
    samples = patches
    samples['is_tumor'] = False

samples.head()

Unnamed: 0,Unnamed: 1,is_tissue,slide_path,is_tumor,is_all_tumor
0,0,False,../data/train/pos/16-S-042893_A1.mrxs,False,False
0,1,False,../data/train/pos/16-S-042893_A1.mrxs,False,False
0,2,False,../data/train/pos/16-S-042893_A1.mrxs,False,False
0,3,False,../data/train/pos/16-S-042893_A1.mrxs,False,False
0,4,False,../data/train/pos/16-S-042893_A1.mrxs,False,False


In [128]:
# Switch: when enabled, keep only the positions flagged as tissue.
filter_non_tissue = True
if filter_non_tissue:
    samples = samples.loc[samples['is_tissue'].eq(True)]

samples.head()

Unnamed: 0,Unnamed: 1,is_tissue,slide_path,is_tumor,is_all_tumor
17,165,True,../data/train/pos/16-S-042893_A1.mrxs,False,False
17,166,True,../data/train/pos/16-S-042893_A1.mrxs,False,False
17,167,True,../data/train/pos/16-S-042893_A1.mrxs,False,False
17,201,True,../data/train/pos/16-S-042893_A1.mrxs,False,False
17,205,True,../data/train/pos/16-S-042893_A1.mrxs,False,False


In [130]:
# NOTE(review): this flag is defined but never consulted; the selection below
# always runs. Confirm whether it was meant to be conditional.
filter_only_all_tumor = True

# Record each row's (row, column) index as an ordinary column. assign() builds
# a new frame, so nothing is written into a filtered slice of another frame
# (which would trigger SettingWithCopyWarning).
samples = samples.assign(tile_loc=list(samples.index))

# Keep rows with no tumor at all plus rows that are entirely tumor; rows with
# is_tumor == True but is_all_tumor == False are dropped.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
all_tissue_samples = pd.concat([
    samples[samples['is_tumor'] == False],
    samples[samples['is_all_tumor'] == True],
])
all_tissue_samples.head()

Unnamed: 0,Unnamed: 1,is_tissue,slide_path,is_tumor,is_all_tumor,tile_loc
17,165,True,../data/train/pos/16-S-042893_A1.mrxs,False,False,"(17, 165)"
17,166,True,../data/train/pos/16-S-042893_A1.mrxs,False,False,"(17, 166)"
17,167,True,../data/train/pos/16-S-042893_A1.mrxs,False,False,"(17, 167)"
17,201,True,../data/train/pos/16-S-042893_A1.mrxs,False,False,"(17, 201)"
17,205,True,../data/train/pos/16-S-042893_A1.mrxs,False,False,"(17, 205)"


In [133]:
# Class balance of the selected samples: number of rows per is_tumor value.
all_tissue_samples.is_tumor.value_counts()

False    74490
True     30723
Name: is_tumor, dtype: int64