# Applied Deep Learning Project
### Peter Grantcharov (pdg2116)
### Po-Chieh Liu (pl2441)


#### INSTALL SLIDE READING PACKAGES

In [1]:
# Install the OpenSlide C library and Python bindings
!apt-get install openslide-tools
!pip install openslide-python


Reading package lists... Done
Building dependency tree       
Reading state information... Done
The following package was automatically installed and is no longer required:
  libnvidia-common-430
Use 'apt autoremove' to remove it.
The following additional packages will be installed:
  libopenslide0
Suggested packages:
  libtiff-tools
The following NEW packages will be installed:
  libopenslide0 openslide-tools
0 upgraded, 2 newly installed, 0 to remove and 7 not upgraded.
Need to get 92.5 kB of archives.
After this operation, 268 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu bionic/universe amd64 libopenslide0 amd64 3.4.1+dfsg-2 [79.8 kB]
Get:2 http://archive.ubuntu.com/ubuntu bionic/universe amd64 openslide-tools amd64 3.4.1+dfsg-2 [12.7 kB]
Fetched 92.5 kB in 2s (53.1 kB/s)
Selecting previously unselected package libopenslide0.
(Reading database ... 145655 files and directories currently installed.)
Preparing to unpack .../libopenslide0_3.4.1+dfsg-2

In [2]:
# import necessary packages
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from openslide import open_slide
from skimage.color import rgb2gray
from tqdm._tqdm_notebook import tnrange, tqdm
from sklearn.model_selection import train_test_split
%tensorflow_version 2.x
import tensorflow as tf


TensorFlow 2.x selected.


### Loading 1 Image

In [3]:
# Download an example slide and tumor mask
# Both files are fetched into the current working directory; each download
# is skipped when a file of that name already exists there.
slide_path = 'tumor_091.tif' # only this file is available
tumor_mask_path = 'tumor_091_mask.tif' # only this file is available
slide_url = 'https://storage.googleapis.com/applied-dl/%s' % slide_path
mask_url = 'https://storage.googleapis.com/applied-dl/%s' % tumor_mask_path

# Download the whole slide image
# ($slide_url is filled in by IPython from the Python variable above)
if not os.path.exists(slide_path):
  !curl -O $slide_url

# Download the tumor mask
if not os.path.exists(tumor_mask_path):
  !curl -O $mask_url
  

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  521M  100  521M    0     0  81.2M      0  0:00:06  0:00:06 --:--:-- 91.7M
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 14.6M  100 14.6M    0     0  45.3M      0 --:--:-- --:--:-- --:--:-- 45.3M


In [0]:
def verify_validity(top_left, dims, image_dims):
    """
    Sanity-checks a window request before it is read from a slide.

    top_left and dims must each have exactly two entries, and top_left
    must be strictly smaller than image_dims on both axes. Raises
    AssertionError otherwise; returns None when everything is fine.
    """
    length_checks = (
        (top_left, "Top left corner needs to have length 2"),
        (dims, 'Dims have to be length 2'),
    )
    for pair, complaint in length_checks:
        assert len(pair) == 2, complaint

    corner_x, corner_y = top_left[0], top_left[1]
    inside = corner_x < image_dims[0] and corner_y < image_dims[1]
    assert inside, f"Top left corner {top_left} is outside image {image_dims}"


In [0]:
def get_x_y(top_left, level):
    """
    Multiplies both coordinates of top_left by 2 ** level and returns
    the result as an (x, y) tuple.

    Presumably this maps a corner given in `level` pixels onto the
    full-resolution grid - confirm against the slide's downsample factors.
    """
    corner = np.array(top_left)
    factor = 2 ** level
    scaled = corner * factor
    return tuple(scaled)

def get_width_height(top_left, dims, image_dims):
    """
    Returns the (width, height) to read for a window starting at
    top_left: the requested dims, shrunk on either axis to whatever
    is left between top_left and the image border.
    """
    remaining_x = image_dims[0] - top_left[0]
    remaining_y = image_dims[1] - top_left[1]
    return min(remaining_x, dims[0]), min(remaining_y, dims[1])


In [0]:
def read_slide(slide, top_left, level, dims):
    """
    Reads one window of `slide` at `level` and returns it as a
    numpy array of RGB pixels.

    top_left is the window corner in pixels of the requested level
    (not of the highest resolution image); dims is the (x, y) number
    of pixels to include. The window is validated against the level
    dimensions first and cropped where it would cross the border.
    """
    level_size = slide.level_dimensions[level]
    verify_validity(top_left, dims, level_size)

    left, top = get_x_y(top_left, level)
    n_cols, n_rows = get_width_height(top_left, dims, level_size)

    region = slide.read_region((left, top), level, (n_cols, n_rows))
    rgb_region = region.convert('RGB')
    # copy() returns an independent array rather than the result of np.asarray
    return np.asarray(rgb_region).copy()


# Build pipeline

# Scan dataset


In [0]:
def valid_window(slide_window, intensity_threshold=0.5, std_threshold=35):
    """
    Decides whether a slide window is worth keeping.

    A window passes when both hold:
      * the fraction of pixels whose grayscale value is <= 0.8
        is above intensity_threshold
      * the standard deviation of the raw pixel values is above
        std_threshold
    """
    gray = rgb2gray(slide_window)
    dark_fraction = (gray <= 0.8).mean()

    has_enough_dark_pixels = dark_fraction > intensity_threshold
    has_enough_variation = slide_window.std() > std_threshold
    return has_enough_dark_pixels and has_enough_variation


In [0]:
def label_window(mask_window, threshold=0.2, label_ambiguous=True):
    """
    Turns a mask window into an integer label, based on the mean of
    its first channel:
        0 - the mean is exactly zero
        1 - the mean is strictly above threshold
        2 - anything in between (0 instead if label_ambiguous is False)
    """
    tumor_fraction = mask_window[:, :, 0].mean()

    if tumor_fraction == 0:
        return 0
    if tumor_fraction > threshold:
        return 1
    # some tumor pixels, but not enough to cross the threshold
    return 2 if label_ambiguous else 0


In [0]:
def generate_corners(slide, level, shape):
    """
    Generator for the top left corners pixel values of windows 
    when applying a non-overlapping sliding window of given shape.

    Only corners whose full window fits inside the level are yielded,
    column by column (x is the outer loop, y the inner one). Nothing 
    is yielded when the level is smaller than shape on either axis.
    """
    width, height = slide.level_dimensions[level]

    # The `+ 1` makes the stop inclusive of the last corner whose window
    # still fits; without it the final full window is dropped whenever a
    # dimension is an exact multiple of the window size.
    x_locs = np.arange(0, width - shape[0] + 1, shape[0])
    y_locs = np.arange(0, height - shape[1] + 1, shape[1])

    for x_loc in x_locs:
        for y_loc in y_locs:
            yield (x_loc, y_loc)


In [0]:
def load_slide_mask(slide_loc):
    """
    Opens '<slide_loc>.tif' and '<slide_loc>_mask.tif' and returns 
    the two handles as a (slide, mask) tuple.
    """
    slide_file = slide_loc + '.tif'
    mask_file = slide_loc + '_mask.tif'

    slide_handle = open_slide(slide_file)
    mask_handle = open_slide(mask_file)
    return slide_handle, mask_handle


In [0]:
def get_windows(slide, mask, level=2, shape=(300, 300)):
    """
    Scans a slide with a non-overlapping sliding window of the given
    shape and keeps the windows that pass valid_window().

    Returns (windows, labels):
        windows - array of shape (n, 2) holding the top left (x, y)
                  corner of every kept window, in pixels of `level`
        labels  - array of length n with the label_window() result
                  for the matching mask window

    When no window qualifies, windows still has shape (0, 2), so 
    column indexing such as windows[:, 0] keeps working.

    Takes maybe 5 min for level 0 image, 20 sec for level 2.
    """
    windows, labels = [], []
    for top_left_corner in generate_corners(slide, level, shape):
        slide_window = read_slide(slide, top_left_corner, level, shape)
        if not valid_window(slide_window):
            continue

        # the mask is only read for windows that are kept
        mask_window = read_slide(mask, top_left_corner, level, shape)
        windows.append(top_left_corner)
        labels.append(label_window(mask_window))

    # reshape(-1, 2) is a no-op for n > 0 and turns the empty
    # result from shape (0,) into (0, 2)
    return np.array(windows).reshape(-1, 2), np.array(labels)
    

In [0]:
import shutil

def make_dir():
    """
    Creates the 'data' directory in the current working directory.

    If it already exists it is deleted (with everything in it) and 
    created again. Errors during deletion are ignored, so the second
    os.mkdir raises FileExistsError when the removal did not succeed.
    """
    target = 'data'
    try:
        os.mkdir(target)
    except FileExistsError:
        already_there = True
    else:
        already_there = False

    if already_there:
        shutil.rmtree(target, ignore_errors=True)
        os.mkdir(target)
        

In [0]:
def save_windows(slide, windows, labels, slide_loc, save_folder='data', 
                 level=2, shape=(300, 300)):
    """
    Re-reads every window from the slide and writes it to
    <save_folder>/<slide name><idx>.jpeg, where idx is the position
    of the window in `windows`.

    labels is only paired with windows through zip(), so it merely 
    limits how many windows get written; its values are not used.
    """
    slide_name = slide_loc.rsplit('/', 1)[-1]
    for idx, (corner, _label) in enumerate(zip(windows, labels)):
        target = f"{save_folder}/{slide_name}{idx}.jpeg"
        pixels = read_slide(slide, corner, level, shape)
        tf.keras.preprocessing.image.save_img(target, pixels)


In [0]:
def save_archive(windows, labels, slide_loc):
    """
    Writes 'archive_<slide name>.csv' to the current working directory
    with one row per window and the columns slide_name, x, y, labels
    and suffix (a running index 0..n-1 over the windows).
    """
    slide_name = slide_loc.rsplit('/', 1)[-1]
    columns = {
        'slide_name': slide_name,
        'x': windows[:, 0],
        'y': windows[:, 1],
        'labels': labels,
        'suffix': np.arange(len(labels)),
    }
    table = pd.DataFrame(data=columns)
    table.to_csv(f'archive_{slide_name}.csv', index=False)
    
def merge_archives():
    """
    Concatenates every per-slide 'archive_<slide>.csv' in the current
    working directory into a single 'archive.csv' and then deletes 
    the per-slide files.

    Only names starting with 'archive_' are picked up, so an 
    'archive.csv' left over from an earlier run is overwritten instead
    of being merged into itself. Files are read in sorted order so 
    the row order of the result is reproducible.

    Raises ValueError (from pd.concat) when there is nothing to merge.
    """
    archive_files = sorted(f for f in os.listdir() if f.startswith('archive_'))
    merged = pd.concat([pd.read_csv(f, index_col=None) for f in archive_files])
    merged.to_csv('archive.csv', index=False)

    # per-slide files are removed only after the merged file was written
    for archive_file in archive_files:
        os.remove(archive_file)
    

In [0]:
import multiprocessing as mp

def process_image(slide_loc):
    """
    Runs the whole extraction for one slide: opens the slide and its
    mask, collects the valid windows with their labels, saves the 
    window images and the per-slide archive CSV, and prints a summary.
    """
    slide_handle, mask_handle = load_slide_mask(slide_loc)
    corners, corner_labels = get_windows(slide_handle, mask_handle)

    save_windows(slide_handle, corners, corner_labels, slide_loc)
    save_archive(corners, corner_labels, slide_loc)

    n_windows = len(corners)
    slide_name = slide_loc.split('/')[-1]
    print(f"Completed {n_windows} windows from {slide_name}.")

def process_images(slide_locs):
    """
    Processes all given slides in parallel (one worker per CPU core).

    Recreates the 'data' directory first, runs process_image() on 
    every entry of slide_locs and finally merges the per-slide 
    archives into one CSV.
    """
    make_dir()
    # the context manager shuts the worker processes down once map()
    # has returned, and also when one of the slides raises
    with mp.Pool(processes=mp.cpu_count()) as pool:
        pool.map(process_image, slide_locs)
    merge_archives()
    

# Load all images

In [0]:
def get_valid_images(loc):
    """
    Scans all images in loc/ folder and returns file
    names if there is both an image & mask file.

    A slide counts as valid when both '<name>.tif' and 
    '<name>_mask.tif' exist in loc. Returns a sorted list of 
    loc joined with <name> (no file extension).
    """
    mask_suffix = '_mask.tif'
    all_images = set(os.listdir(loc))

    valid_images = []
    for mask in sorted(all_images):
        if not mask.endswith(mask_suffix):
            continue
        # strip the suffix instead of assuming names are 9 characters long
        prefix = mask[:-len(mask_suffix)]
        if prefix + '.tif' in all_images:
            valid_images.append(os.path.join(loc, prefix))
    return valid_images


In [0]:
# Processes all images 
# The call below sits inside a bare string literal, so it does not run
# when the cell is executed; the trailing `;` hides the string from the
# cell output.
"""
valid_images = get_valid_images('drive/My Drive/ADL_images/')
process_images(valid_images)
""";

# Save all files to Drive

In [0]:
# Mount Google Drive at /content/drive (Colab asks for an authorization code).
# NOTE(review): other cells in this notebook read 'drive/My Drive/...' paths,
# so this mount has to run before them - consider moving it to the top.
from google.colab import drive
drive.mount('/content/drive')


Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
# Moved everything to Google Drive
# The commands are kept inside a bare string literal and therefore do not
# run. Without a trailing `;` the notebook echoes the string as cell output.
"""
!cp -r data drive/My\ Drive/ADL_windows/
!cp archive.csv drive/My\ Drive/
"""


'\n!cp -r data drive/My\\ Drive/ADL_windows/\n!cp archive.csv drive/My\\ Drive/\n'

# Surrounding region processing


In [0]:
archive = pd.read_csv('drive/My Drive/ADL project/archive.csv', index_col=None)


In [0]:
slide = open_slide('drive/My Drive/ADL_images/tumor_078.tif')
mask = open_slide('drive/My Drive/ADL_images/tumor_078_mask.tif')


In [0]:
def get_middle_pixel(corner, shape=(300, 300)):
    """
    Returns the (x, y) middle pixel of the window whose top left 
    corner is `corner`.

    shape is the (x, y) size of the window; the default matches the 
    300 x 300 windows used elsewhere in this notebook.
    """
    return (corner[0] + shape[0] // 2, corner[1] + shape[1] // 2)
    

In [0]:
def get_low_res_corner(mid_x, mid_y, multiple):
    """
    Maps a window centre onto a coarser grid and returns the top left
    corner of the window around it.

    mid_x, mid_y = centre of the window on the finer grid
    multiple     = factor by which the finer grid is larger than the 
                   coarser one

    The centre is floor-divided by `multiple` and 150 pixels are
    subtracted on both axes. Returns an (x, y) tuple; the values can
    be negative for windows close to the slide border.
    """
    coarse_centre = np.array([mid_x, mid_y]) // multiple
    corner_x, corner_y = coarse_centre - 150
    return corner_x, corner_y


In [0]:
def get_low_res_window(window_corner, low_res_level, high_res_level=2):
    """
    For a window of level high_res_level given by its top left 
    (x, y) corner, returns the top left (x, y) corner of the window
    of level low_res_level that shares its centre point.

    The two levels are assumed to differ by a factor of 
    2 ** (low_res_level - high_res_level).
    """
    level_gap = low_res_level - high_res_level
    centre_x, centre_y = get_middle_pixel(window_corner)
    return get_low_res_corner(centre_x, centre_y, 2 ** level_gap)


In [0]:
# Make dict with slide dimensions of all resolutions of all images
#
# IMAGE_LEVEL_DIMS['tumor_001'][3] will return tuple of (x, y)
# dimensions of level 3 of tumor_001.
IMAGE_LEVEL_DIMS = {} 
valid_images = get_valid_images('drive/My Drive/ADL_images/')
for slide_loc in valid_images:
    s = open_slide(slide_loc + '.tif')
    try:
        slide_name = slide_loc.split('/')[-1]
        # keys are the level numbers 0 .. level_count - 1
        IMAGE_LEVEL_DIMS[slide_name] = dict(enumerate(s.level_dimensions))
    finally:
        # only the dimensions are needed, so release the file handle right away
        s.close()
    

In [0]:
def process_corners(slide_names, corners, low_res_level):
    """
    Shifts window corners, in place, back towards the inside of 
    their slide and returns the same `corners` array.

    For every (x, y) row, with the slide dimensions looked up in
    IMAGE_LEVEL_DIMS[slide_names[idx]][low_res_level]:
      * a negative coordinate is raised to 0
      * a coordinate whose +300 exceeds the slide dimension is moved
        back by the excess
    """
    for idx, corner in enumerate(corners):
        # snapshot the coordinates before the row is edited
        x, y = corner
        limits = IMAGE_LEVEL_DIMS[slide_names[idx]][low_res_level]

        for axis, (value, limit) in enumerate(zip((x, y), limits)):
            if value < 0:
                corners[idx, axis] -= value
            overshoot = (value + 300) - limit
            if overshoot > 0:
                corners[idx, axis] -= overshoot

    return corners


In [0]:
def add_low_res(archive, low_res_level):
    """
    Adds the columns 'x_<low_res_level>' and 'y_<low_res_level>' to 
    archive (modified in place and also returned).

    For every row they hold the top left corner of the window of the
    desired low resolution level that is centred on the row's (x, y) 
    window, after process_corners() has adjusted it to the slide.
    """
    def matching_corner(window):
        return get_low_res_window((window.x, window.y), low_res_level)

    raw_corners = archive.apply(matching_corner, axis=1)
    adjusted = process_corners(slide_names=archive.slide_name.values,
                               corners=np.array(list(raw_corners)),
                               low_res_level=low_res_level)

    archive[f'x_{low_res_level}'] = adjusted[:, 0]
    archive[f'y_{low_res_level}'] = adjusted[:, 1]
    return archive


In [0]:
# This appends the low level resolution windows to archive
# (kept inside a bare string literal, so it does not run as-is).
# index=False keeps the DataFrame index out of the CSV; index_label=None 
# still wrote it, which shows up as an 'Unnamed: 0' column on reload.
"""
for low_res_level in range(3, 8):
    archive = add_low_res(archive, low_res_level)
archive.to_csv('drive/My Drive/ADL project/archive.csv', index=False)
""";

# Make datasets of low resolution images
Will use naming convention:
    **\<slide_name>_\<x corner>_\<y corner>**

In [0]:
def save_low_res_windows(archive, low_res_level):
    """
    Saves the low resolution windows listed in archive as jpeg files
    in the folder 'data_level_<low_res_level>', named
    <slide_name>_<x corner>_<y corner>.jpeg.

    The corners are read from the archive columns 'x_<low_res_level>' 
    and 'y_<low_res_level>'; every distinct corner of a slide is 
    saved once.
    """
    # make directory to save windows (nothing happens if it already exists)
    save_folder = f'data_level_{low_res_level}'
    os.makedirs(save_folder, exist_ok=True)

    x_col, y_col = f'x_{low_res_level}', f'y_{low_res_level}'

    # loop unique slides (more efficient so we only load each slide once)
    for slide_name in archive.slide_name.unique():
        # only the slide is needed here, so the mask file is not opened
        slide = open_slide('drive/My Drive/ADL_images/' + slide_name + '.tif')
        try:
            # loop unique (x, y) locations for that slide
            slide_archive = archive.loc[archive.slide_name == slide_name]
            unique_windows = set(zip(slide_archive[x_col], slide_archive[y_col]))
            for x, y in unique_windows:
                save_loc = f"{save_folder}/{slide_name}_{x}_{y}.jpeg"
                image = read_slide(slide, (x, y), low_res_level, (300, 300))
                tf.keras.preprocessing.image.save_img(save_loc, image)
        finally:
            # release the file handle before moving on to the next slide
            slide.close()

        print(f'Completed {len(unique_windows)} windows for {slide_name}')


In [0]:
# This saves all the unique windows for each slide level 
# and then copies them to Google Drive
# (kept inside a bare string literal, so nothing below runs as-is)
"""
for low_res_level in range(3, 8):
    save_low_res_windows(archive, low_res_level)
    !cp -r data_level_$low_res_level drive/My\ Drive/ADL_windows/
""";

# Make rotated images of class 1 images
This will help balance the dataset

In [0]:
archive = pd.read_csv('drive/My Drive/ADL project/archive.csv', index_col=None)

In [0]:
tumors = archive.loc[archive.labels == 1]


In [0]:
tumors.head()

Unnamed: 0,slide_name,x,y,labels,suffix,x_3,y_3,x_4,y_4,x_5,y_5,x_6,y_6,x_7,y_7,x_N,y_N,x_NE,y_NE,x_E,y_E,x_SE,y_SE,x_S,y_S,x_SW,y_SW,x_W,y_W,x_NW,y_NW
45,tumor_078,4200,13200,1,45,2025,6525,937,3187,393,1518,121,684,0,267,4200,12900,4500,12900,4500,13200,4500,13500,4200,13500,3900,13500,3900,13200,3900,12900
46,tumor_078,4200,13500,1,46,2025,6675,937,3262,393,1556,121,703,0,276,4200,13200,4500,13200,4500,13500,4500,13800,4200,13800,3900,13800,3900,13500,3900,13200
47,tumor_078,4200,13800,1,47,2025,6825,937,3337,393,1593,121,721,0,285,4200,13500,4500,13500,4500,13800,4500,14100,4200,14100,3900,14100,3900,13800,3900,13500
48,tumor_078,4200,14100,1,48,2025,6975,937,3412,393,1631,121,740,0,295,4200,13800,4500,13800,4500,14100,4500,14400,4200,14400,3900,14400,3900,14100,3900,13800
49,tumor_078,4200,14400,1,49,2025,7125,937,3487,393,1668,121,759,0,304,4200,14100,4500,14100,4500,14400,4500,14700,4200,14700,3900,14700,3900,14400,3900,14100


In [0]:
# Disabled augmentation step: the code sits inside a bare string literal and
# does not run as-is. It saves three rotated copies (np.rot90 applied one, two
# and three times) of every window labelled 1 and appends a matching row to
# `archive` for each copy.
# NOTE(review): `archive.loc[len(archive)] = ...` presumably relies on the
# index being 0..n-1; if rows were dropped earlier it could overwrite an
# existing row - confirm before re-enabling.
"""
# loop slides with tumors
for slide_name in tumors.slide_name.unique():
    slide, mask = load_slide_mask('drive/My Drive/ADL_images/' + slide_name)

    # loop unique (x, y) locations for that slide
    sub_tumors = tumors.loc[tumors.slide_name == slide_name]
    for row_idx, row in sub_tumors.iterrows():
        image = read_slide(slide, (row.x, row.y), 2, (300, 300))

        # prepare copy of row to append to archive
        row_copy = row.copy()

        # rotate image three times, save each time, and append to archive
        for idx, rotations in enumerate(['one', 'two', 'three']):
            suffix = str(row.suffix) + '_' + rotations
            save_loc = f"drive/My Drive/ADL_windows/data/{slide_name}_{suffix}.jpeg"
            tf.keras.preprocessing.image.save_img(save_loc, np.rot90(image, idx+1))
            
            # add row to archive
            row_copy['suffix'] = suffix
            archive.loc[len(archive)] = list(row_copy)
""";

# Last thing
## Add reference for each level 2 window to the windows around it (also level 2)

In [0]:
# (x_delta, y_delta) pixel offsets for the eight compass directions.
# Every offset is a multiple of 300; 'N' has a negative y offset and 'W' a 
# negative x offset.
# NOTE(review): 300 presumably mirrors the 300 x 300 window size - confirm.
DIRECTION_MAP = {
    'N': (0, -300),
    'NE': (300, -300), 
    'E': (300, 0), 
    'SE': (300, 300), 
    'S': (0, 300), 
    'SW': (-300, 300), 
    'W': (-300, 0), 
    'NW': (-300, -300)
} 

In [0]:
# Adds, for every compass direction, the columns x_<direction> and
# y_<direction>: the row's (x, y) shifted by that direction's offset.
# The loop variable is `direction` rather than `dir`, so the builtin dir()
# is not shadowed in the notebook namespace afterwards.
for direction, (x_delta, y_delta) in DIRECTION_MAP.items():
    archive[f'x_{direction}'] = archive.x + x_delta
    archive[f'y_{direction}'] = archive.y + y_delta
    

# Last Last thing!
One slide has some noise that tricks our valid_window() check, so here we just remove those windows from the archive

In [0]:
# Index labels of all tumor_059 rows whose y is below 20000 or above 43000;
# those rows are then removed from `archive` in place.
# NOTE(review): the two y bounds are hard-coded, presumably picked by 
# inspecting the slide - confirm before reusing them on other data.
bad_rows = archive.loc[(archive.slide_name == 'tumor_059') & 
                       ((archive.y < 20000) | (archive.y > 43000))].index
archive.drop(bad_rows, inplace=True)


In [0]:
# Write the cleaned archive back to Drive without the DataFrame index
# (`index` is a boolean flag, so pass False explicitly instead of None).
archive.to_csv('drive/My Drive/ADL project/archive.csv', index=False)
