In [1]:
import math
import os

import numpy as np
import openslide
from PIL import Image
from openslide import OpenSlideError
from openslide.deepzoom import DeepZoomGenerator
import pandas as pd

from pyspark.ml.linalg import Vectors
import pyspark.sql.functions as F
from scipy.ndimage.morphology import binary_fill_holes
from skimage.color import rgb2gray
from skimage.feature import canny
from skimage.morphology import binary_closing, binary_dilation, disk

In [2]:
tile_size = 3000
sample_size = 3000
grayscale = False
num_partitions = 200
training = True
save_jpegs = True
convert2DF = False
row_indices = False
train_frac = 0.8
sample_frac=0.01
seed = 42
overlap = 0

In [4]:
# open slide image
slide = openslide.open_slide('/home/ss4yd/test/data/C17-83_04.svs')

generator = DeepZoomGenerator(slide, tile_size=tile_size, overlap=overlap, limit_bounds=True)

highest_zoom_level = generator.level_count - 1
# mag = int(slide.properties[openslide.PROPERTY_NAME_OBJECTIVE_POWER])
zoom_level = highest_zoom_level

slide_num = os.path.join('./data', 'C17-83_04.svs').split('.')[-2].split('/')[-1]

In [5]:
cols, rows = generator.level_tiles[zoom_level]
tile_indices = [(slide_num, tile_size, overlap, zoom_level, col, row)
                  for col in range(cols) for row in range(rows)]

In [6]:
tiles = []
for index in tile_indices:
    slide_num, tile_size, overlap, zoom_level, col, row = index
    tile = np.asarray(generator.get_tile(zoom_level, (col, row)))
    tiles.append(tile)

In [8]:
tiles[0].shape

(3000, 3000, 3)

In [58]:
def optical_density(tile):
    tile = tile.astype(np.float64)
    od = -np.log((tile+1)/240)
    return od

In [62]:
def keep_tile(tile, tile_size, tissue_threshold = 0.50):
    if tile.shape[0:2] == (tile_size, tile_size):
        print("inside if")
        tile_orig = tile
        tile = rgb2gray(tile)
        tile = 1 - tile
        
        tile = canny(tile)
        
        tile = binary_closing(tile, disk(10))
        tile = binary_dilation(tile, disk(10))
        tile = binary_fill_holes(tile)
        percentage = tile.mean()
        
        check1 = percentage >= tissue_threshold

        # Check 2
        # Convert to optical density values
        tile = optical_density(tile_orig)
        # Threshold at beta
        beta = 0.15
        tile = np.min(tile, axis=2) >= beta
        # Apply morphology for same reasons as above.
        tile = binary_closing(tile, disk(2))
        tile = binary_dilation(tile, disk(2))
        tile = binary_fill_holes(tile)
        percentage = tile.mean()
        check2 = percentage >= tissue_threshold
        print(check1, check2)
        return check1 and check2
    else:
        return False


In [64]:
keep_tile(tiles[0], 3000, tissue_threshold=50)

inside if


  safe = ((np.issubdtype(dt, int) and dt.itemsize <= int_size) or


False False


False

In [65]:
def process_tile(tile, sample_size, grayscale):
    
    """
    Process a tile into a group of smaller samples.
    Cut up a tile into smaller blocks of sample_size x sample_size pixels,
    change the shape of each sample from (H, W, channels) to
    (channels, H, W), then flatten each into a vector of length
    channels*H*W.
    Args:
      tile_tuple: A (slide_num, tile) tuple, where slide_num is an
        integer, and tile is a 3D NumPy array of shape
        (tile_size, tile_size, channels).
      sample_size: The new width and height of the square samples to be
        generated.
      grayscale: Whether or not to generate grayscale samples, rather
        than RGB.
    Returns:
      A list of (slide_num, sample) tuples representing cut up tiles,
      where each sample is a 3D NumPy array of shape
      (sample_size_x, sample_size_y, channels).
    """
    if grayscale:
        tile = rgb2gray(tile)[:, :, np.newaxis]  # Grayscale
        # Save disk space and future IO time by converting from [0,1] to [0,255],
        # at the expense of some minor loss of information.
        tile = np.round(tile * 255).astype("uint8")
    x, y, ch = tile.shape
    # 1. Reshape into a 5D array of (num_x, sample_size_x, num_y, sample_size_y, ch), where
    # num_x and num_y are the number of chopped tiles on the x and y axes, respectively.
    # 2. Swap sample_size_x and num_y axes to create
    # (num_x, num_y, sample_size_x, sample_size_y, ch).
    # 3. Combine num_x and num_y into single axis, returning
    # (num_samples, sample_size_x, sample_size_y, ch).
    samples = (tile.reshape((x // sample_size, sample_size, y // sample_size, sample_size, ch))
                   .swapaxes(1, 2)
                   .reshape((-1, sample_size, sample_size, ch)))
    samples = [(slide_num, sample) for sample in list(samples)]
    return samples


In [66]:
filter_tiles = [tile for tile in tiles if keep_tile(tile,tile_size, 0.50)]

inside if


  safe = ((np.issubdtype(dt, int) and dt.itemsize <= int_size) or


False False
inside if
False False
inside if
False False
inside if
False False
inside if
True True
inside if
False False
inside if
True True
inside if
True True


In [67]:
keep_tile(tiles[3], 4000, tissue_threshold=50)

False

In [68]:
samples = [process_tile(tile, sample_size, False) for tile in filter_tiles]

In [69]:
def save_nonlabelled_sample_2_jpeg(sample, save_dir):
    """
    Save the sample without labels into JPEG
    Args:
      sample_element: a sample tuple without labels, e.g. (slide_num, sample)
      save_dir: the file directory at which to save JPEGs
    """
    slide_num, img_value = sample
    filename = '{slide_num}_{hash}.jpeg'.format(
        slide_num=slide_num, hash=np.random.randint(1e4))
    filepath = os.path.join(save_dir, filename)
    save_jpeg_help(img_value, filepath)


def save_jpeg_help(img_value, filepath):
    """
     Save data into JPEG
     Args:
       img_value: the image value with the size (img_size_x, img_size_y, channels)
       file path: the file path at which to save JPEGs
     """
    dir = os.path.dirname(filepath)
    os.makedirs(dir, exist_ok=True)
    img = Image.fromarray(img_value.astype(np.uint8), 'RGB')
    img.save(filepath)


In [70]:
for sample in samples:
    save_nonlabelled_sample_2_jpeg(sample[0], './')

In [16]:
samples

[]