In [None]:
#################################################################################
# Written by Sean Harris
# 
# Problem: 
#   It is not easy to efficiently sample training patches from a gigapixel 
#   biopsy slide. A majority of the slide is non-tissue background, and most of 
#   the tissue is noncancerous.
#
# Solution: 
#   Ahead of training, generate and save a list of indices pointing to patches 
#   containing tissue (normal and tumorous). It is important not to merely 
#   iterate through the entire slide, as it's mostly white background. 
#
#   Instead, the entire slide is rendered in a lower magnification level (6), 
#   and then large empty regions are filtered out with edge detection. We keep 
#   regions with lots of edges, as they contain the complex organic tissue.
#
#   Finally we exhaustively iterate through the remaining regions, filtering
#   out patches with little tissue (determined by greyscale intensity), and
#   labelling patches containing tumor-annotated pixels as tumorous.
#
#
# NOTE: 
#   This file works with slides with pixel tumor annotations in TIF form.
#################################################################################

In [None]:
#IMPORTS

!apt-get install openslide-tools
!pip install openslide-python
 
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
from openslide import open_slide, __library_version__ as openslide_version
import os
from PIL import Image
from skimage.color import rgb2gray
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import numpy as np
import matplotlib.pyplot as plt
import os
import gdown
from zipfile import ZipFile
import math
import random
import sklearn
from skimage.transform import resize
import cv2
from tensorflow.keras.applications.inception_v3 import preprocess_input
import random
import copy
import scipy
import torch
import torch.nn.functional as F
from skimage import feature
import xml.etree.ElementTree as ET

 
#from google.colab import drive
#drive.mount("/content/drive")
slide_root = '/path/to/slide/directory/'

In [None]:
#FUNCTIONS

def read_slide(slide, x, y, level, width, height, as_float=False):
    """Read a rectangular region of a slide and return it as an RGB array.

    Args:
        slide: Object exposing ``read_region(location, level, size)`` whose
            result has a ``convert`` method (e.g. an OpenSlide handle).
        x, y: Passed unchanged to ``read_region`` as the ``(x, y)`` location.
            NOTE(review): OpenSlide interprets this location in level-0
            coordinates -- confirm callers scale by the level downsample.
        level: Pyramid level passed to ``read_region``.
        width, height: Size of the region, passed as ``(width, height)``.
        as_float: When true the array is converted to ``np.float32``;
            otherwise the dtype produced by ``np.asarray`` is kept.

    Returns:
        Array of shape ``(height, width, 3)``.

    Raises:
        AssertionError: If the region does not have the expected shape.
    """
    # Converting to RGB discards the alpha channel of the region image.
    region = slide.read_region((x, y), level, (width, height)).convert('RGB')
    pixels = np.asarray(region, dtype=np.float32) if as_float else np.asarray(region)
    assert pixels.shape == (height, width, 3)
    return pixels

def find_tissue_pixels(image, intensity=0.8):
    """Locate pixels whose grayscale value is at or below ``intensity``.

    Args:
        image: RGB image array; it is converted with ``rgb2gray``.
        intensity: Grayscale threshold; pixels with a value ``<= intensity``
            are reported. NOTE(review): the 0.8 default presumably assumes
            grayscale values in [0, 1] -- confirm for float inputs.

    Returns:
        List of ``(row, col)`` index pairs of the selected pixels.

    Raises:
        AssertionError: If the grayscale image is not 2-D with the same
            height and width as ``image``.
    """
    gray = rgb2gray(image)
    assert gray.shape == (image.shape[0], image.shape[1])
    # Darker-than-threshold pixels are treated as tissue.
    rows, cols = np.nonzero(gray <= intensity)
    return list(zip(rows, cols))

In [None]:
# Build, for every training slide, a grid of 299x299 patches read at `level`.
# Each grid cell of `bin_map` is:
#   0 -> patch skipped (less than 20% of its pixels pass the tissue test)
#   1 -> tissue patch with no tumor-annotated pixel
#   2 -> tissue patch with at least one tumor-annotated pixel
# The map is saved per slide and the shuffled (x, y) grid indices of the
# tumorous / normal patches are collected in `index_dict`.
level = 2
pos_patch_count = 0
index_dict = {}
input_shape = (299,299,3)
save_root = '/path/for/saved/coords/'

for k in range(150):

    if k > 90: # Saved for testing
        continue

    # Slide numbers are zero-padded to three digits in the file names.
    slide_stem = slide_root + 'Copy of tumor_' + str(k).zfill(3)
    slide_path = slide_stem + '.tif'
    tumor_mask_path = slide_stem + '_mask.tif'

    if not os.path.exists(slide_path) or not os.path.exists(tumor_mask_path):
        continue
    print(slide_path)

    # The context managers close both handles once the slide has been scanned,
    # instead of leaving one open handle pair per processed slide.
    with open_slide(slide_path) as slide, open_slide(tumor_mask_path) as tumor_mask:
        # Number of whole patches that fit along each axis at this level.
        n_x = math.floor(slide.level_dimensions[level][0] / input_shape[0])
        n_y = math.floor(slide.level_dimensions[level][1] / input_shape[1])
        bin_map = np.zeros((n_x,n_y))
        n = n_x * n_y
        print(n, end='')

        downsample = slide.level_downsamples[level]
        patch_area = input_shape[0] * input_shape[1]

        # Progress counter. It must NOT share a name with the row index `i`:
        # incrementing `i` inside the inner loop shifted the y offset of every
        # read and could index `bin_map` out of bounds.
        patches_seen = 0
        for i in range(n_y):
            # Patch origin scaled by the level downsample before being handed
            # to read_slide (same offsets are used for slide and mask).
            y0 = int(i * input_shape[1] * downsample)
            for j in range(n_x):
                patches_seen += 1
                if patches_seen % 100 == 0:
                    print(patches_seen, end='')

                x0 = int(j * input_shape[0] * downsample)

                patch = read_slide(slide,
                                   x=x0,
                                   y=y0,
                                   level=level,
                                   width=input_shape[0],
                                   height=input_shape[1])

                # Skip patches where fewer than 20% of the pixels are tissue.
                tissue_indices = find_tissue_pixels(patch)
                if len(tissue_indices) / patch_area < .2:
                    continue

                patch_mask = read_slide(tumor_mask,
                                        x=x0,
                                        y=y0,
                                        level=level,
                                        width=input_shape[0],
                                        height=input_shape[1])

                # Any non-zero mask pixel marks the patch as tumorous. The
                # label is normalised to a Python int (0 or 1) so that neither
                # `label + 1` nor the running count can wrap around in uint8.
                label = int(np.max(patch_mask[:,:,0]) > 0)
                bin_map[j,i] = label + 1
                pos_patch_count += label

    # Make sure the output directory exists before writing the map.
    os.makedirs(save_root, exist_ok=True)
    np.save(save_root + str(k),bin_map)
    pos_indices = np.argwhere(bin_map == 2).tolist()
    neg_indices = np.argwhere(bin_map == 1).tolist()
    np.random.shuffle(pos_indices)
    np.random.shuffle(neg_indices)
    index_dict[slide_path] = (pos_indices,neg_indices)
