# Applied Deep Learning Project
### Peter Grantcharov (pdg2116)
### Po-Chieh Liu (pl2441)


#### INSTALL SLIDE READING PACKAGES

In [0]:
# Install the OpenSlide C library and Python bindings.
# apt-get provides the system package (the log below shows it pulling in
# libopenslide0); pip provides the `openslide` module imported further down.
!apt-get install openslide-tools
!pip install openslide-python


Reading package lists... Done
Building dependency tree       
Reading state information... Done
The following package was automatically installed and is no longer required:
  libnvidia-common-430
Use 'apt autoremove' to remove it.
The following additional packages will be installed:
  libopenslide0
Suggested packages:
  libtiff-tools
The following NEW packages will be installed:
  libopenslide0 openslide-tools
0 upgraded, 2 newly installed, 0 to remove and 7 not upgraded.
Need to get 92.5 kB of archives.
After this operation, 268 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu bionic/universe amd64 libopenslide0 amd64 3.4.1+dfsg-2 [79.8 kB]
Get:2 http://archive.ubuntu.com/ubuntu bionic/universe amd64 openslide-tools amd64 3.4.1+dfsg-2 [12.7 kB]
Fetched 92.5 kB in 1s (172 kB/s)
Selecting previously unselected package libopenslide0.
(Reading database ... 145655 files and directories currently installed.)
Preparing to unpack .../libopenslide0_3.4.1+dfsg-2_

In [0]:
# import necessary packages
import os
import time
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from openslide import open_slide
from skimage.color import rgb2gray
from tqdm._tqdm_notebook import tnrange, tqdm
from sklearn.model_selection import train_test_split
%tensorflow_version 2.x
import tensorflow as tf


TensorFlow 2.x selected.


# Loading images

In [0]:
def verify_validity(top_left, dims, image_dims):
    """
    Sanity-check a requested window before it is read from a slide.

    top_left   -- (x, y) corner of the window; must have length 2
    dims       -- (width, height) of the window; must have length 2
    image_dims -- (width, height) of the image the window is cut from

    Raises AssertionError when either pair has the wrong length or when
    the corner is not strictly below the image size on both axes.
    """
    assert len(top_left) == 2, "Top left corner needs to have length 2"
    assert len(dims) == 2, 'Dims have to be length 2'

    corner_inside = top_left[0] < image_dims[0] and top_left[1] < image_dims[1]
    assert corner_inside, (
        f"Top left corner {top_left} is outside image {image_dims}"
    )


In [0]:
def get_x_y(top_left, level):
    """
    Scale a corner given at `level` by 2 ** level.

    Returns a tuple with one entry per coordinate of `top_left`; the
    entries are NumPy scalars because the multiplication is done on an
    array.
    """
    scaled_corner = np.asarray(top_left) * (2 ** level)
    return tuple(scaled_corner)

def get_width_height(top_left, dims, image_dims):
    """
    Clip the requested window size so it stays inside the image.

    On each axis the result is the smaller of the requested size
    (`dims`) and the distance from `top_left` to the image border.
    Returns (width, height).
    """
    clipped = [
        min(image_dims[axis] - top_left[axis], dims[axis])
        for axis in (0, 1)
    ]
    return clipped[0], clipped[1]


In [0]:
def read_slide(slide, top_left, level, dims):
    """
    Read a window from `slide` at the given pyramid `level`.

    top_left -- (x, y) corner expressed in pixels of the requested level
                (not of the highest-resolution image); it is rescaled by
                get_x_y before being handed to `slide.read_region`.
    dims     -- (x, y) number of pixels to include; clipped by
                get_width_height so the window stays inside the level.

    Returns a writable NumPy copy of the region converted to RGB.
    """
    level_size = slide.level_dimensions[level]
    verify_validity(top_left, dims, level_size)

    origin_x, origin_y = get_x_y(top_left, level)
    window_size = get_width_height(top_left, dims, level_size)

    region = slide.read_region((origin_x, origin_y), level, window_size)
    rgb_region = region.convert('RGB')
    return np.asarray(rgb_region).copy()


In [0]:
# Mount Google Drive at /content/drive; the archive CSV, image folders and
# checkpoints used below are all read from paths under 'drive/My Drive/'.
# Mounting is interactive (see the authorization prompt in the output).
from google.colab import drive
drive.mount('/content/drive')


Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
# Index of image windows: per the preview below, each row carries the slide
# name, window coordinates, a label, a file suffix and per-level coordinates.
archive = pd.read_csv('drive/My Drive/ADL project/archive.csv', index_col=None)


In [0]:
archive.head()

Unnamed: 0,slide_name,x,y,labels,suffix,x_3,y_3,x_4,y_4,x_5,y_5,x_6,y_6,x_7,y_7,x_N,y_N,x_NE,y_NE,x_E,y_E,x_SE,y_SE,x_S,y_S,x_SW,y_SW,x_W,y_W,x_NW,y_NW
0,tumor_078,3000,10200,0,0,1425,5025,637,2437,243,1143,46,496,0,173,3000,9900,3300,9900,3300,10200,3300,10500,3000,10500,2700,10500,2700,10200,2700,9900
1,tumor_078,3300,9900,0,1,1575,4875,712,2362,281,1106,65,478,0,164,3300,9600,3600,9600,3600,9900,3600,10200,3300,10200,3000,10200,3000,9900,3000,9600
2,tumor_078,3300,10200,0,2,1575,5025,712,2437,281,1143,65,496,0,173,3300,9900,3600,9900,3600,10200,3600,10500,3300,10500,3000,10500,3000,10200,3000,9900
3,tumor_078,3300,10500,0,3,1575,5175,712,2512,281,1181,65,515,0,182,3300,10200,3600,10200,3600,10500,3600,10800,3300,10800,3000,10800,3000,10500,3000,10200
4,tumor_078,3300,14100,0,4,1575,6975,712,3412,281,1631,65,740,0,295,3300,13800,3600,13800,3600,14100,3600,14400,3300,14400,3000,14400,3000,14100,3000,13800


# tensorflow pipeline prep

In [0]:
def load_image(path):
    """
    Load the JPEG stored at `path + '.jpeg'`.

    The file is read and decoded with TensorFlow ops and the decoded
    pixels are divided by 255, so the result is a TensorFlow tensor
    (not a NumPy array) -- presumably with values in [0, 1] for 8-bit
    images.
    """
    raw_bytes = tf.io.read_file(path + '.jpeg')
    pixels = tf.io.decode_jpeg(raw_bytes)
    return pixels / 255
    

In [0]:
def create_dataset(main_paths, auxillary_paths, labels, cache_name):
    """
    Build a batched, prefetched tf.data pipeline of images and labels.

    main_paths      -- image paths (without the '.jpeg' extension)
    auxillary_paths -- optional second list of image paths; when given,
                       each element is (main_image, aux_image, label),
                       otherwise (main_image, label)
    labels          -- one label per path
    cache_name      -- file the dataset is cached to; the dataset is
                       shuffled only when this name contains 'train'
    """
    def _images_from(paths):
        # paths -> decoded, rescaled images (loaded in parallel)
        path_ds = tf.data.Dataset.from_tensor_slices(paths)
        return path_ds.map(load_image, num_parallel_calls=AUTOTUNE)

    main_image_ds = _images_from(main_paths)
    label_ds = tf.data.Dataset.from_tensor_slices(labels)

    if auxillary_paths is None:
        components = (main_image_ds, label_ds)
    else:
        components = (main_image_ds, _images_from(auxillary_paths), label_ds)

    # Cache before shuffling so the cached file holds the un-shuffled data
    # and every epoch draws a new order from it.
    dataset = tf.data.Dataset.zip(components).cache(filename=cache_name)
    if 'train' in cache_name:
        dataset = dataset.shuffle(SHUFFLE_SIZE)
    return dataset.batch(BATCH_SIZE).prefetch(buffer_size=AUTOTUNE)


In [0]:
# Settings read by create_dataset at call time (so defining them after the
# function definition is fine as long as this cell runs before the first call).
AUTOTUNE = tf.data.experimental.AUTOTUNE  # used for map parallelism and prefetch buffer
BATCH_SIZE = 32       # elements per batch
SHUFFLE_SIZE = 1000   # shuffle buffer size; only applied to 'train' datasets


### Train-Test Split

In [0]:
def downsample(df, n=None, random_state=None):
    """
    Balance a DataFrame by drawing the same number of rows from class 0
    and class 1 (column `labels`) and shuffling the result.

    df           -- DataFrame with a `labels` column
    n            -- rows to keep per class; when None, the size of the
                    smallest group in `labels` is used (note: the minimum
                    is taken over every label value present in `df`)
    random_state -- optional seed forwarded to every `sample` call so the
                    selection and the final shuffle are reproducible;
                    None keeps the previous unseeded behaviour

    Raises ValueError (from pandas) when a class has fewer than n rows.
    """
    if n is None:
        n = df.groupby('labels').size().min()
    class_0 = df.loc[df.labels == 0].sample(n, random_state=random_state)
    class_1 = df.loc[df.labels == 1].sample(n, random_state=random_state)
    balanced = pd.concat([class_0, class_1])
    # frac=1 returns all rows in random order, i.e. a shuffle.
    return balanced.sample(frac=1, random_state=random_state)


In [0]:
train_images = ['tumor_016', 'tumor_019', 'tumor_059', 'tumor_002', 'tumor_023', 
                'tumor_096', 'tumor_035', 'tumor_078', 'tumor_001']
test_images = ['tumor_012', 'tumor_101', 'tumor_064', 'tumor_110', 'tumor_081', 
               'tumor_091', 'tumor_057', 'tumor_094', 'tumor_031', 'tumor_075', 
               'tumor_005', 'tumor_084']

# The split is done per slide, so no slide may appear on both sides.
assert not set(train_images) & set(test_images), "Slides shared by train and test"

# - split archive dataframe into train and test
# - exclude label 2 from training (hope it will make learning easier)
# - exclude rotated images in test set (no need to evaluate on augmented images)
train_archive = archive.loc[(archive.slide_name.isin(train_images)) & 
                            (archive.labels != 2)].copy()
train_archive = downsample(train_archive)

test_archive = archive.loc[(archive.slide_name.isin(test_images)) &
                           (~archive.suffix.str.startswith('_'))].copy()
# Fold label 2 into class 0 for evaluation. Assign the column explicitly:
# calling replace(..., inplace=True) on the attribute is a chained in-place
# update that pandas warns about and that no longer modifies the frame
# under Copy-on-Write.
test_archive['labels'] = test_archive['labels'].replace(2, 0)
test_archive = downsample(test_archive, n=1000)


In [0]:
# Report size and class balance of both splits (sizes first, then balance).
for split_name, split_df in (('Train', train_archive), ('Test', test_archive)):
    print(f'{split_name} size: {len(split_df)}')
for split_name, split_df in (('Train', train_archive), ('Test', test_archive)):
    print(f'{split_name} class 1 proportion: {(split_df.labels == 1).mean()}')


Train size: 8040
Test size: 2000
Train class 1 proportion: 0.5
Test class 1 proportion: 0.5


In [0]:
def get_paths(archive, aux_level=3, image_dir='drive/My Drive/ADL_windows/'):
    """
    Build the image paths (without file extension) for every row of an
    archive subset.

    Main images:      <image_dir>data/<slide_name><suffix>
    Auxiliary images: <image_dir>data_level_<aux_level>/<slide_name>_<x>_<y>
    where x and y come from the columns x_<aux_level> and y_<aux_level>.

    Returns two lists: (main_paths, aux_paths).
    """
    slide = archive.slide_name
    x_text = archive[f'x_{aux_level}'].astype(str)
    y_text = archive[f'y_{aux_level}'].astype(str)

    main_relative = 'data/' + slide + archive.suffix.astype(str)
    aux_relative = f'data_level_{aux_level}/' + slide + '_' + x_text + '_' + y_text

    main_paths = list(image_dir + main_relative)
    aux_paths = list(image_dir + aux_relative)
    return main_paths, aux_paths


In [0]:
# Path lists for both splits; get_paths is called with its defaults, so the
# auxiliary paths point at the level-3 images.
main_paths_train, aux_paths_train = get_paths(train_archive)
main_paths_test, aux_paths_test = get_paths(test_archive)


# Will first try with single input on InceptionV3

In [0]:
from tensorflow.keras import layers, models


In [0]:
# Single-input datasets: no auxiliary paths, so each element is (image, label).
# The 'train' in the cache file name is what switches shuffling on.
train_ds = create_dataset(main_paths_train, None,
                          train_archive.labels, './train_single_cache.tf-data')
test_ds = create_dataset(main_paths_test, None, 
                          test_archive.labels, './test_single_cache.tf-data')


In [0]:
# Pull one batch to check shapes; this batch is reused by the memorisation
# check further down.
main_batch, labels_batch = next(iter(train_ds))
print(main_batch.shape, labels_batch.shape)


(32, 300, 300, 3) (32,)


In [0]:
# Use inceptionV3
base_model = tf.keras.applications.InceptionV3(include_top=False,
                                               weights='imagenet',
                                               input_shape = (300, 300, 3))
base_model.trainable=False

# bottom layers
x = base_model.output
x = tf.keras.layers.GlobalAveragePooling2D()(x)
x = tf.keras.layers.Dense(512,activation='relu')(x)
outputs = tf.keras.layers.Dense(1, activation='sigmoid')(x)

model = tf.keras.models.Model(inputs=base_model.input, outputs=outputs)


In [0]:
# Metrics passed to every model.compile call below. Together with the loss,
# their order (accuracy, precision, recall, auc) is what update_results
# relies on when it indexes the train_on_batch results.
# NOTE(review): the same metric instances are handed to more than one
# compile call in this notebook — confirm sharing them is intended.
METRICS = [
      tf.keras.metrics.BinaryAccuracy(name='accuracy'),
      tf.keras.metrics.Precision(name='precision'),
      tf.keras.metrics.Recall(name='recall'),
      tf.keras.metrics.AUC(name='auc'),
]

In [0]:
# Binary classification: sigmoid output trained with binary cross-entropy.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=METRICS)


# Check if it can memorize 1 batch

In [0]:
# Sanity check: train repeatedly on the single batch fetched earlier; the
# loss should fall towards zero if the model and pipeline are wired correctly.
steps = 20

for step in range(1, steps + 1):
    # train_on_batch returns [loss, *metrics]; only the loss is printed.
    metrics = model.train_on_batch(x=main_batch, y=labels_batch)
    print(f'Loss for step {step}: {metrics[0]}')
        

Loss for step 1: 1.0567946434020996
Loss for step 2: 0.9733246564865112
Loss for step 3: 0.34825780987739563
Loss for step 4: 0.26688119769096375
Loss for step 5: 0.0934140533208847
Loss for step 6: 0.13588166236877441
Loss for step 7: 0.05031708627939224
Loss for step 8: 0.01774255372583866
Loss for step 9: 0.017326757311820984
Loss for step 10: 0.02578756958246231
Loss for step 11: 0.026178278028964996
Loss for step 12: 0.016966965049505234
Loss for step 13: 0.00896743405610323
Loss for step 14: 0.0054623764008283615
Loss for step 15: 0.004718874581158161
Loss for step 16: 0.005302498582750559
Loss for step 17: 0.006355525925755501
Loss for step 18: 0.007172468584030867
Loss for step 19: 0.007243472151458263
Loss for step 20: 0.006498036906123161


# Try fifteen epochs

In [0]:
# NOTE(review): `model` is not rebuilt here, so it still carries the weights
# fitted on the single batch above; re-compiling presumably only gives it a
# fresh optimizer — confirm that starting from those weights is intended.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=METRICS)


In [0]:
# Train the single-input model; the test split doubles as validation data.
model.fit(train_ds, validation_data=test_ds, epochs=15)


Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<tensorflow.python.keras.callbacks.History at 0x7fafa0c3e710>

# Viable model if all else fails

# Multi-input model

In [0]:
# Root folder for the multi-input run; save_data writes one 'epoch_<n>/'
# sub-folder per epoch here. The trailing slash matters because paths are
# built by string concatenation.
CHECKPOINT_DIR = 'drive/My Drive/ADL project/MultiImageCheckpoints/'


In [0]:
# Two-input datasets: each element is (main_image, aux_image, label).
# These rebind train_ds / test_ds, replacing the single-input datasets.
train_ds = create_dataset(main_paths_train, aux_paths_train, 
                          train_archive.labels, './train_multi_cache.tf-data')
test_ds = create_dataset(main_paths_test, aux_paths_test,
                          test_archive.labels, './test_multi_cache.tf-data')


In [0]:
# Shape check on one batch. Despite its name, `aux_paths` holds the batch of
# auxiliary *images* here (see the printed shape), not file paths.
main_batch, aux_paths, labels_batch = next(iter(train_ds))
print(main_batch.shape, aux_paths.shape, labels_batch.shape)


(32, 300, 300, 3) (32, 300, 300, 3) (32,)


In [0]:
# Use inceptionV3, model 1
base_model1 = tf.keras.applications.InceptionV3(include_top=False,
                                               weights='imagenet',
                                               input_shape = (300, 300, 3))
for layer in tqdm(base_model1.layers):
    layer._name = 'level_2_' + layer.name
base_model1.trainable=False

# model 2
base_model2 = tf.keras.applications.InceptionV3(include_top=False,
                                               weights='imagenet',
                                               input_shape = (300, 300, 3))
for layer in tqdm(base_model2.layers):
    layer._name = 'level_4_' + layer.name
base_model2.trainable=False

# bottom layers1
x1 = base_model1.output
x1 = tf.keras.layers.GlobalAveragePooling2D()(x1)
x1 = tf.keras.layers.Dense(128,activation='relu')(x1)

# bottom layers2
x2 = base_model2.output
x2 = tf.keras.layers.GlobalAveragePooling2D()(x2)
x2 = tf.keras.layers.Dense(128,activation='relu')(x2)

# merge
merged = tf.keras.layers.concatenate([x1, x2])
dense = tf.keras.layers.Dense(128,activation='relu')(merged)
output = tf.keras.layers.Dense(1, activation='sigmoid')(dense)
model = tf.keras.models.Model(inputs=[base_model1.input, base_model2.input],
                              outputs=output)


100%|██████████| 311/311 [00:00<00:00, 44716.62it/s]
100%|██████████| 311/311 [00:00<00:00, 42414.92it/s]


In [0]:
# Same optimizer, loss and metrics as the single-input model, so that
# train_on_batch returns [loss, accuracy, precision, recall, auc].
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=METRICS)


In [0]:
# Making a dict to hold all metrics
def get_metrics():
    """
    Create the metric trackers used by the custom training loop.

    The 'train_*' entries are running means that get fed the per-batch
    values returned by train_on_batch; the 'val_*' entries are Keras
    metrics that are updated directly with (labels, predictions).
    'val_auc' is deliberately the last key: display_results uses it to
    terminate the printed line.
    """
    keras_metrics = tf.keras.metrics

    trackers = {}
    for stat in ('loss', 'accuracy', 'precision', 'recall', 'auc'):
        key = f'train_{stat}'
        trackers[key] = keras_metrics.Mean(name=key)

    trackers['val_loss'] = keras_metrics.BinaryCrossentropy(name='val_loss')
    trackers['val_accuracy'] = keras_metrics.BinaryAccuracy(name='val_accuracy')
    trackers['val_precision'] = keras_metrics.Precision(name='val_precision')
    trackers['val_recall'] = keras_metrics.Recall(name='val_recall')
    trackers['val_auc'] = keras_metrics.AUC(name='val_auc')
    return trackers


In [0]:
# Making a dict to hold history
def get_history(loc=None):
    """
    Return the training history as a dict of lists, one list per metric.

    loc -- checkpoint folder containing 'history.txt' (as written by
           save_data). When None, a fresh history with empty lists for
           the five train_* and five val_* metrics is returned.

    The file holds the `str()` of a dict. It is parsed with
    ast.literal_eval instead of eval so that file contents can never
    execute code. Raises ValueError / SyntaxError when the file does not
    contain a plain literal.
    """
    import ast
    import re

    if loc is None:
        stats = ('loss', 'accuracy', 'precision', 'recall', 'auc')
        return {
            f'{split}_{stat}': []
            for split in ('train', 'val')
            for stat in stats
        }

    # os.path.join works with or without a trailing separator on `loc`.
    with open(os.path.join(loc, 'history.txt')) as file:
        text = file.read()

    # Newer NumPy versions print scalars as e.g. "np.float32(0.1)"; unwrap
    # them to the bare number so the text stays a pure literal.
    text = re.sub(r'\b(?:np|numpy)\.\w+\(([^()]*)\)', r'\1', text)
    return ast.literal_eval(text)


In [0]:
def _checkpoint_order(name):
    """Sort key: folders ending in '_<number>' rank by that number, after all others."""
    suffix = name.rsplit('_', 1)[-1]
    if suffix.isdecimal():
        return (1, int(suffix), name)
    return (0, 0, name)


def get_last_checkpoint(direc):
    """
    Return the path (with trailing '/') of the most recent checkpoint
    folder inside `direc`, or None when there is none.

    Folders are ranked by the number after their last underscore, so
    'epoch_10' comes after 'epoch_9' (a plain string sort would pick
    'epoch_9' as the last one). A missing `direc` is treated like an
    empty one.
    """
    if not os.path.isdir(direc):
        return None
    checkpoints = [
        entry for entry in os.listdir(direc)
        if os.path.isdir(os.path.join(direc, entry))
    ]
    if not checkpoints:
        return None
    latest = max(checkpoints, key=_checkpoint_order)
    return os.path.join(direc, latest) + '/'


In [0]:
def print_status(epoch, idx):
    """
    Print a compact countdown while iterating over batches.

    Every 25th batch prints `10 - idx // 25` (followed by a newline once
    idx reaches 250); every other 5th batch prints a dot. `epoch` is
    accepted but not used.
    """
    blocks_done, offset = divmod(idx, 25)
    if offset == 0:
        line_end = '\n' if idx >= 250 else ''
        print(f'{10 - blocks_done}', end=line_end)
    elif idx % 5 == 0:
        print('.', end='')
        

In [0]:
def update_results(metrics, results):
    """
    Feed the results of one training batch into the running train metrics.

    metrics -- metrics dictionary (see get_metrics)
    results -- sequence of length len(METRICS) + 1 = 5, ordered as
               loss, accuracy, precision, recall, auc

    Returns the same `metrics` dictionary.
    """
    stat_order = ('loss', 'accuracy', 'precision', 'recall', 'auc')
    for position, stat in enumerate(stat_order):
        tracker = metrics['train_' + stat]
        tracker(results[position])
    return metrics


In [0]:
def evaluate_validation(model, ds, metrics):
    """
    Run the two-input model over `ds` and update every metric whose key
    starts with 'val'.

    ds must yield (main_batch, aux_batch, labels_batch) triples. Labels
    are reshaped to a column so they line up with the predictions.
    Returns the same `metrics` dictionary.
    """
    # Keys are not modified during the loop, so select the trackers once.
    val_trackers = [
        tracker for key, tracker in metrics.items() if key.startswith('val')
    ]
    for main_batch, aux_batch, labels_batch in ds:
        targets = tf.reshape(labels_batch, shape=(-1, 1))
        predictions = model.predict_on_batch(x=(main_batch, aux_batch))
        for tracker in val_trackers:
            tracker(targets, predictions)
    return metrics


In [0]:
def display_results(start, metrics):
    """
    Print the elapsed time since `start` and the current value of every
    metric on one line. The line is closed after the 'val_auc' entry.
    """
    elapsed = time.time() - start
    print(f'\tTime: {elapsed:.2f}, ', end='')
    for name, tracker in metrics.items():
        closing = '.\n' if name == 'val_auc' else ', '
        print(f'{name}: {tracker.result():.2f}', end=closing)


In [0]:
def record_history(history, metrics):
    """
    Append the current value of every metric to the matching list in
    `history` (same keys as `metrics`) and return `history`.
    """
    for name in metrics:
        current_value = metrics[name].result().numpy()
        history[name].append(current_value)
    return history


In [0]:
def reset_metric_states(metrics):
    """
    Reset every metric in the dictionary (called once per epoch so the
    next epoch starts from a clean state) and return the dictionary.
    """
    for tracker in metrics.values():
        tracker.reset_states()
    return metrics
    

In [0]:
def save_data(model, history, epoch, checkpoint_dir=None):
    """
    Save the model weights and the training history for one epoch.

    model          -- object with a `save_weights(path)` method
    history        -- history dict; written as `str(history)` to
                      'history.txt' so get_history can read it back
    epoch          -- epoch number, used in the folder name 'epoch_<epoch>/'
    checkpoint_dir -- root folder for checkpoints; defaults to the
                      notebook-level CHECKPOINT_DIR

    The epoch folder (and the root, if it does not exist yet) is created
    on demand; an existing folder is reused and its contents overwritten.
    """
    if checkpoint_dir is None:
        checkpoint_dir = CHECKPOINT_DIR
    folder = os.path.join(checkpoint_dir, f'epoch_{epoch}') + '/'
    # makedirs also creates a missing root and tolerates an existing folder,
    # unlike listing the root and calling mkdir.
    os.makedirs(folder, exist_ok=True)
    model.save_weights(folder)
    with open(folder + 'history.txt', 'w') as file:
        file.write(str(history))


In [0]:
# Load weights and history (if resuming training).
# The number of recorded epochs in the history decides where to resume.
last_checkpoint = get_last_checkpoint(CHECKPOINT_DIR)
if last_checkpoint is not None:
    history = get_history(loc=last_checkpoint)
    starting_epoch = len(history['train_loss'])
    model.load_weights(last_checkpoint)
else:
    print("Checkpoint not found. Starting from scratch")
    history = get_history(loc=None)
    starting_epoch = 0

metrics = get_metrics()
target_epochs = 20

# Train Model: one pass over train_ds per epoch, then validation on test_ds.
for epoch in range(starting_epoch, target_epochs):
    print(f'Epoch: {epoch}')
    start = time.time()

    for idx, (main_batch, aux_batch, labels_batch) in enumerate(train_ds):
        print_status(epoch, idx)
        # results = [loss, accuracy, precision, recall, auc] for this batch
        results = model.train_on_batch(x=(main_batch, aux_batch), y=labels_batch)
        metrics = update_results(metrics, results)

    metrics = evaluate_validation(model, test_ds, metrics)
    display_results(start, metrics)
    history = record_history(history, metrics)
    # Reset after recording so every epoch reports its own averages.
    metrics = reset_metric_states(metrics)

    # Checkpoint after every epoch so an interrupted run can resume.
    save_data(model, history, epoch)
    


Checkpoint not found. Starting from scratch
Epoch: 0
10....9....8....7....6....5....4....3....2....1....0
	Time: 251.18, train_loss: 0.10, train_accuracy: 0.96, train_precision: 0.97, train_recall: 0.96, train_auc: 1.00, val_loss: 1.06, val_accuracy: 0.64, val_precision: 0.59, val_recall: 0.93, val_auc: 0.78.
Epoch: 1
10....9....8....7....6....5....4....3....2....1....0
	Time: 246.71, train_loss: 0.10, train_accuracy: 0.96, train_precision: 0.97, train_recall: 0.96, train_auc: 1.00, val_loss: 0.64, val_accuracy: 0.74, val_precision: 0.74, val_recall: 0.73, val_auc: 0.81.
Epoch: 2
10....9....8....7....6....5....4....3....2....1....0
	Time: 247.94, train_loss: 0.09, train_accuracy: 0.97, train_precision: 0.97, train_recall: 0.97, train_auc: 1.00, val_loss: 0.90, val_accuracy: 0.67, val_precision: 0.62, val_recall: 0.91, val_auc: 0.80.
Epoch: 3
10....9....8....7....6....5....4....3....2....1....0
	Time: 253.31, train_loss: 0.09, train_accuracy: 0.96, train_precision: 0.97, train_recall: 0