In [395]:
import tensorflow as tf
tf.enable_eager_execution()
import numpy as np
from collections import Counter
from pathlib import Path
import timeit
from PIL import Image

In [2]:
# Toy population for the sampling experiment: 10 'c*' items and 1000 'n*' items.
cancer = ['c{}'.format(idx) for idx in range(10)]
normal = ['n{}'.format(idx) for idx in range(1000)]

In [5]:
def sample(ratio_cancer):
    """
    Draws 100 items, with replacement, from the combined ``cancer + normal`` population.

    Each cancer item is given probability ``ratio_cancer / len(cancer)`` and each
    normal item ``(1 - ratio_cancer) / len(normal)``, so ``ratio_cancer`` is the
    total probability mass assigned to the cancer items.
    """
    population = cancer + normal
    weights = np.concatenate([
        np.full(len(cancer), ratio_cancer / len(cancer)),
        np.full(len(normal), (1 - ratio_cancer) / len(normal)),
    ])
    return np.random.choice(population, size=100, replace=True, p=weights)

def count(_sample):
    """Tallies the items of ``_sample`` by their first character and returns the Counter."""
    first_chars = (item[0] for item in _sample)
    return Counter(first_chars)

In [1]:
# Sanity check: with half of the probability mass on the 'c' items, tally one draw by class prefix.
count(sample(0.5))

NameError: name 'count' is not defined

In [None]:
# Root of the preprocessed tile data; edit this single line to relocate the dataset.
PROCESSED_DIR = Path('/home/matejg/Project/crc_ml_model/data/processed')

# One sub-directory per data split.
TRAIN_DIR = PROCESSED_DIR / 'train_slides'
VALID_DIR = PROCESSED_DIR / 'valid_slides'
VISUAL_DIR = PROCESSED_DIR / 'visual_slides'
TEST_DIR = PROCESSED_DIR / 'test_slides'

# Total probability mass given to the cancer slides when sampling training tiles.
CANCER_SAMPLE_PROBA = 0.5
# Sentinel that lets tf.data choose buffer sizes itself.
AUTOTUNE = tf.data.experimental.AUTOTUNE

In [401]:
# Count the PNG tiles of each class across every slide directory of the test split.
cancer_paths = list(map(str, TEST_DIR.glob('*/cancer/*.png')))
normal_paths = list(map(str, TEST_DIR.glob('*/normal/*.png')))
num_cancer, num_normal = len(cancer_paths), len(normal_paths)

print('Found {} cancer tiles.'.format(num_cancer))
print('Found {} normal tiles.'.format(num_normal))

Found 6912 cancer tiles.
Found 9554 normal tiles.


In [397]:
def get_valid_dataset(batch_size, seed=None):
    """
    Builds a batched, un-augmented dataset over every tile found in VALID_DIR.

    Tiles matching ``*/cancer/*.png`` get label 1 and tiles matching
    ``*/normal/*.png`` get label 0. The tiles are shuffled once in numpy,
    then loaded, batched and prefetched with tf.data.

    Arguments:
        batch_size  -   number of tiles per batch
        seed        -   seed for the shuffle (also forwarded to the image loader);
                        when None the global numpy random state drives the shuffle

    Returns:
        (all_paths, all_labels, dataset, steps) - the two numpy arrays are in the
        same shuffled order as the dataset elements, and steps is the number of
        batches needed to see every tile once.
    """
    # Sorted because glob order depends on the filesystem; without it the same
    # seed could produce a different shuffle on another machine.
    cancer_paths = sorted(str(path_) for path_ in VALID_DIR.glob('*/cancer/*.png'))
    normal_paths = sorted(str(path_) for path_ in VALID_DIR.glob('*/normal/*.png'))
    num_cancer = len(cancer_paths)
    num_normal = len(normal_paths)
    num_tiles = num_cancer + num_normal

    print('Found {} cancer tiles.'.format(num_cancer))
    print('Found {} normal tiles.'.format(num_normal))

    # Honour `seed` for the shuffle; fall back to the global numpy state when no
    # seed is given so np.random.seed(...) set by the caller still applies.
    rng = np.random if seed is None else np.random.RandomState(seed)
    order = rng.permutation(num_tiles)
    all_labels = np.array([1] * num_cancer + [0] * num_normal)[order]
    all_paths = np.array(cancer_paths + normal_paths)[order]

    # A parallel map keeps element order, so images stay aligned with labels_ds.
    images_ds = tf.data.Dataset.from_tensor_slices(all_paths).map(
        lambda x: _load_and_preprocess_image(x, seed=seed, apply_color_augmentation=False, apply_flip_augmentation=False),
        num_parallel_calls=AUTOTUNE)
    labels_ds = tf.data.Dataset.from_tensor_slices(all_labels)

    res = tf.data.Dataset.zip((images_ds, labels_ds))
    res = res.batch(batch_size)
    res = res.prefetch(buffer_size=AUTOTUNE)
    return all_paths, all_labels, res, int(np.ceil(num_tiles / batch_size))
# Smoke test with batch size 32 and seed 2; unpacks (paths, labels, dataset, steps).
p, l, ds, steps = get_valid_dataset(32, 2)

Found 1748 cancer tiles.
Found 21690 normal tiles.


In [358]:
def get_test_dataset(batch_size, seed=None):
    """
    Builds one batched, un-augmented dataset per slide directory in VISUAL_DIR.

    Inside each slide directory, tiles matching ``cancer/*.png`` get label 1 and
    tiles matching ``normal/*.png`` get label 0. Entries that are not directories
    and slides without any tile are skipped.

    NOTE(review): despite its name this function reads VISUAL_DIR, not TEST_DIR -
    confirm that this is intended.

    Arguments:
        batch_size  -   number of tiles per batch
        seed        -   seed for the per-slide shuffle (also forwarded to the image
                        loader); when None the global numpy random state is used

    Returns:
        list of (slide_name, all_paths, all_labels, dataset, steps) tuples - the two
        numpy arrays are in the same shuffled order as the dataset elements, and
        steps is the number of batches needed to see every tile of the slide once.
    """
    # Honour `seed` for the shuffles; fall back to the global numpy state when no
    # seed is given so np.random.seed(...) set by the caller still applies.
    rng = np.random if seed is None else np.random.RandomState(seed)

    visual_sets = []
    # Sorted so that a given seed always meets the slides (and tiles) in the same
    # order; directory listing order depends on the filesystem.
    for visual_slide in sorted(VISUAL_DIR.iterdir()):
        if not visual_slide.is_dir():
            continue

        cancer_paths = sorted(str(path_) for path_ in visual_slide.glob('cancer/*.png'))
        normal_paths = sorted(str(path_) for path_ in visual_slide.glob('normal/*.png'))
        num_cancer = len(cancer_paths)
        num_normal = len(normal_paths)
        num_tiles = num_cancer + num_normal
        if num_tiles == 0:
            # np.array([]) is a float array, which the image loader cannot read as a path.
            continue

        order = rng.permutation(num_tiles)
        all_labels = np.array([1] * num_cancer + [0] * num_normal)[order]
        all_paths = np.array(cancer_paths + normal_paths)[order]

        # A parallel map keeps element order, so images stay aligned with labels_ds.
        images_ds = tf.data.Dataset.from_tensor_slices(all_paths).map(
            lambda x: _load_and_preprocess_image(x, seed=seed, apply_color_augmentation=False, apply_flip_augmentation=False),
            num_parallel_calls=AUTOTUNE)
        labels_ds = tf.data.Dataset.from_tensor_slices(all_labels)

        res = tf.data.Dataset.zip((images_ds, labels_ds))
        res = res.batch(batch_size)
        res = res.prefetch(buffer_size=AUTOTUNE)
        visual_sets.append((visual_slide.name, all_paths, all_labels, res, int(np.ceil(num_tiles / batch_size))))
    return visual_sets

# Print every per-slide tuple, each followed by an empty line.
for test_slide in get_test_dataset(32, 2):
    print(test_slide, end='\n\n')

('2019_1427-08-1', array(['/home/matejg/Project/crc_ml_model/data/processed/visual_slides/2019_1427-08-1/normal/2019_1427-08-1-tile-r402-c172-x51132-y119902-w299-h299.png',
       '/home/matejg/Project/crc_ml_model/data/processed/visual_slides/2019_1427-08-1/cancer/2019_1427-08-1-tile-r412-c87-x25715-y122892-w299-h299.png',
       '/home/matejg/Project/crc_ml_model/data/processed/visual_slides/2019_1427-08-1/normal/2019_1427-08-1-tile-r573-c174-x51730-y171033-w299-h299.png',
       ...,
       '/home/matejg/Project/crc_ml_model/data/processed/visual_slides/2019_1427-08-1/cancer/2019_1427-08-1-tile-r586-c150-x44554-y174920-w299-h299.png',
       '/home/matejg/Project/crc_ml_model/data/processed/visual_slides/2019_1427-08-1/normal/2019_1427-08-1-tile-r403-c149-x44255-y120201-w299-h299.png',
       '/home/matejg/Project/crc_ml_model/data/processed/visual_slides/2019_1427-08-1/cancer/2019_1427-08-1-tile-r582-c124-x36779-y173724-w299-h299.png'],
      dtype='<U143'), array([0, 1, 0, ..., 1,

In [327]:
# Walk the (images, labels) batches of `a`, rewriting one progress line with the
# batch index and the image batch shape.
# NOTE(review): `a` is not defined anywhere in the visible notebook - confirm its origin.
for idx, (imgs, labels) in enumerate(a):
    print('{} _ {}\r'.format(idx, imgs.shape), end='')

732 _ (14, 299, 299, 3)

In [344]:
def _slide_tile_datasets(load_dir, class_dir, label, seed):
    """Returns one shuffled, endlessly repeating (path, label) dataset per non-empty ``*/<class_dir>`` directory."""
    datasets = []
    # Sorted so that a given seed meets the directories and tiles in the same order
    # on every machine; glob order depends on the filesystem.
    for slide_dir in sorted(load_dir.glob('*/' + class_dir)):
        # Only PNG tiles, matching the validation/test loaders; the image loader decodes PNG only.
        paths = sorted(str(path_) for path_ in slide_dir.glob('*.png'))
        if not paths:
            # shuffle_and_repeat would be given a buffer size of 0 for an empty directory.
            continue
        ds_paths = tf.data.Dataset.from_tensor_slices(paths)
        ds_labels = tf.data.Dataset.from_tensor_slices([label] * len(paths))
        ds = tf.data.Dataset.zip((ds_paths, ds_labels))
        ds = ds.apply(tf.data.experimental.shuffle_and_repeat(len(paths), count=None, seed=seed))
        datasets.append(ds)
    return datasets


def get_training_dataset(batch_size, seed=None, steps_per_epoch=None, type_='train'):
    """
    Creates an endless stream of minibatches sampled across slides.

    Every ``*/cancer`` and ``*/normal`` directory becomes its own repeating
    dataset. Tiles are then drawn so that the cancer directories share a total
    probability of CANCER_SAMPLE_PROBA and the normal directories share the rest,
    each split evenly among the directories of its class.

    Arguments:
        batch_size      -   number of tiles per minibatch
        seed            -   seed for random operations
        steps_per_epoch -   how many minibatches should be trained on during one epoch;
                            not used to build the dataset, only returned to the caller
        type_           -   'train' (reads TRAIN_DIR, augmentation on) or
                            'valid' (reads VALID_DIR, augmentation off)

    Returns:
        (dataset, steps_per_epoch)

    Raises:
        ValueError  -   if type_ is not recognised, or if either class has no
                        directory containing tiles
    """

    if type_ == 'train':
        load_dir = TRAIN_DIR
        apply_color_augmentation = True
        apply_flip_augmentation = True
    elif type_ == 'valid':
        load_dir = VALID_DIR
        apply_color_augmentation = False
        apply_flip_augmentation = False
    else:
        raise ValueError('Unrecognized \'type\' value: \'{}\''.format(type_))

    cancer_datasets = _slide_tile_datasets(load_dir, 'cancer', 1, seed)
    normal_datasets = _slide_tile_datasets(load_dir, 'normal', 0, seed)
    if not cancer_datasets or not normal_datasets:
        # Without this guard the probability computation below divides by zero.
        raise ValueError('Need tiles of both classes under \'{}\': found {} cancer and {} normal directories with tiles'.format(
            load_dir, len(cancer_datasets), len(normal_datasets)))

    # Calculate sampling distribution on datasets
    per_cancer_proba = CANCER_SAMPLE_PROBA / len(cancer_datasets)
    per_normal_proba = (1 - CANCER_SAMPLE_PROBA) / len(normal_datasets)
    proba_vector = ([per_cancer_proba] * len(cancer_datasets)) + ([per_normal_proba] * len(normal_datasets))

    # Sample tiles
    full_ds = tf.data.experimental.sample_from_datasets(cancer_datasets + normal_datasets, weights=proba_vector, seed=seed)
    full_ds = full_ds.apply(tf.data.experimental.map_and_batch(
        lambda x, y: (_load_and_preprocess_image(x, seed=seed, apply_color_augmentation=apply_color_augmentation, apply_flip_augmentation=apply_flip_augmentation), y),
        batch_size=batch_size))
    full_ds = full_ds.prefetch(buffer_size=AUTOTUNE)
    return full_ds, steps_per_epoch

In [345]:
# NOTE(review): `get_dataset` is not defined anywhere in the visible notebook (the loader
# defined in this notebook is `get_training_dataset`) - presumably a leftover name; confirm.
full_ds, _ = get_dataset(32, None, type_='valid', seed=2)

In [213]:
print('Binary Classifier')

# Binary head: one sigmoid output unit scored with binary cross-entropy.
output_activation = tf.nn.sigmoid
loss_fn = tf.keras.losses.binary_crossentropy
metrics = [
    tf.keras.metrics.BinaryAccuracy(),
    tf.keras.metrics.Precision(),
    tf.keras.metrics.Recall(),
]

# Small CNN: one strided convolution, max-pooling, then three dense layers.
model = tf.keras.Sequential()
model.add(tf.keras.layers.Conv2D(filters=8, kernel_size=5,
                                 strides=4, activation=tf.nn.relu,
                                 input_shape=(299, 299, 3)))
model.add(tf.keras.layers.MaxPooling2D(pool_size=2, strides=2))
model.add(tf.keras.layers.Flatten())
model.add(tf.keras.layers.Dropout(rate=0.5))
model.add(tf.keras.layers.Dense(512, activation=tf.nn.relu))
model.add(tf.keras.layers.Dense(128, activation=tf.nn.relu))
model.add(tf.keras.layers.Dense(1, activation=output_activation))

optimizer = tf.train.RMSPropOptimizer(learning_rate=0.002,
                                      decay=0.001,
                                      momentum=0.001,
                                      epsilon=0.02)
model.compile(loss=loss_fn, metrics=metrics, optimizer=optimizer)

Binary Classifier


In [214]:
# Fit the model on 1000 batches pulled from full_ds.
model.fit(full_ds, steps_per_epoch=1000)



<tensorflow.python.keras.callbacks.History at 0x7fbd05f3fa90>

In [188]:
def display_image(img):
    """
    Turns an image tensor into a PIL image.

    The tensor is converted to uint8, its size-1 dimensions are squeezed out and
    the resulting array is handed to PIL.

    NOTE(review): tiles coming out of _preprocess_image are scaled to [-1, 1];
    presumably callers rescale to [0, 1] before displaying - confirm.
    """
    as_uint8 = tf.image.convert_image_dtype(img, dtype=tf.uint8)
    pixels = tf.squeeze(as_uint8).numpy()
    return Image.fromarray(pixels)

In [33]:
def _load_and_preprocess_image(path, seed, apply_flip_augmentation=True, apply_color_augmentation=True):
    """
    Reads the PNG file at ``path``, decodes it to 3 channels and preprocesses it.

    Arguments:
        path                        -   path of the PNG file
        seed                        -   seed forwarded to the augmentations
        apply_flip_augmentation     -   whether to apply the flip/rotate augmentation
        apply_color_augmentation    -   whether to apply the color augmentation
    """
    raw_bytes = tf.io.read_file(path)
    decoded = tf.image.decode_png(raw_bytes, channels=3)
    return _preprocess_image(
        decoded,
        apply_color_augmentation=apply_color_augmentation,
        apply_flip_augmentation=apply_flip_augmentation,
        seed=seed,
    )


def _preprocess_image(image, apply_color_augmentation=True, apply_flip_augmentation=True, seed=None, width=299, height=299):
    """
    Resizes an image, optionally augments it and rescales it to [-1, 1].

    Arguments:
        image                       -   decoded image tensor; the division by 255
                                        assumes 8-bit pixel values
        apply_color_augmentation    -   whether to apply the color augmentation
        apply_flip_augmentation     -   whether to apply the flip/rotate augmentation
        seed                        -   seed forwarded to the augmentations
        width                       -   target width in pixels
        height                      -   target height in pixels

    Returns:
        float32 image tensor resized to (height, width) with values in [-1, 1]
    """
    image = tf.cast(image, dtype=tf.float32)
    # tf.image.resize expects the size as [new_height, new_width].
    image = tf.image.resize(image, [height, width])
    image = image / 255.0  # normalize to [0,1] range
    if apply_flip_augmentation:
        image = _flip_rotate_augmentation(image, seed)

    if apply_color_augmentation:
        image = _color_augmentation(image, seed)
    image = 2*image-1       # rescale to [-1,1]
    return image


def _color_augmentation(x, seed):
    """
    Randomly perturbs hue, saturation, brightness and contrast of an image.

    Arguments:
        x       -   float image tensor; the final clip assumes values in [0, 1]
        seed    -   seed passed to every random op

    Returns:
        the perturbed image, clipped to [0, 1]
    """
    max_saturation_delta = 0.25
    max_contrast_delta = 0.75

    x = tf.image.random_hue(x, 0.04, seed)
    # random_saturation / random_contrast multiply by a factor drawn from
    # [lower, upper], where 1.0 leaves the image unchanged. The range therefore
    # has to straddle 1.0; the previous [0, delta] range desaturated and
    # flattened every image and could return a fully gray one.
    x = tf.image.random_saturation(x, 1 - max_saturation_delta, 1 + max_saturation_delta, seed)
    x = tf.image.random_brightness(x, 64/255, seed)
    x = tf.image.random_contrast(x, 1 - max_contrast_delta, 1 + max_contrast_delta, seed)
    x = tf.clip_by_value(x, 0, 1)
    return x


def _flip_rotate_augmentation(x, seed):
    """
    Randomly mirrors an image left-right and rotates it by a random multiple of 90 degrees.

    Arguments:
        x       -   image tensor
        seed    -   seed passed to the random ops

    Returns:
        the flipped and rotated image
    """
    static_shape = x.shape
    x = tf.image.random_flip_left_right(x, seed)
    # Draw the rotation with a TF op: a numpy draw would run only once, when
    # Dataset.map traces this function, giving every image the same rotation
    # and ignoring `seed`.
    rotation = tf.random.uniform([], minval=0, maxval=4, dtype=tf.int32, seed=seed)
    x = tf.image.rot90(x, k=rotation)

    # With a tensor-valued k the static height/width are no longer known. For a
    # square image every rotation keeps the shape, so it is safe to restore it.
    dims = static_shape.as_list() if static_shape.ndims is not None else None
    if dims is not None and len(dims) == 3 and dims[0] is not None and dims[0] == dims[1]:
        x.set_shape(static_shape)
    return x