<a href="https://colab.research.google.com/github/TIANBOQIU/AppliedDeepLearning/blob/master/DL_final_data_augmentation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# System packages: OpenSlide command-line tools/library and rsync.
!apt-get install -q openslide-tools
!apt-get install -q rsync
# Python packages. Only tensorflow-gpu is pinned; the others install whatever
# version is current when the cell runs.
!pip install openslide-python
!pip install tensorflow-gpu==2.0.0-alpha
!pip install -q scikit-plot

Reading package lists...
Building dependency tree...
Reading state information...
openslide-tools is already the newest version (3.4.1+dfsg-2).
The following package was automatically installed and is no longer required:
  libnvidia-common-410
Use 'apt autoremove' to remove it.
0 upgraded, 0 newly installed, 0 to remove and 16 not upgraded.
Reading package lists...
Building dependency tree...
Reading state information...
rsync is already the newest version (3.1.2-2.1ubuntu1).
The following package was automatically installed and is no longer required:
  libnvidia-common-410
Use 'apt autoremove' to remove it.
0 upgraded, 0 newly installed, 0 to remove and 16 not upgraded.


In [2]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
from openslide import open_slide, __library_version__ as openslide_version
import os, random, re, time, shutil
from PIL import Image
from skimage.color import rgb2gray
import scikitplot as skplt

import tensorflow as tf

from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras import optimizers, layers, models
from tensorflow.keras.applications import InceptionV3



# Record the TensorFlow version this run actually used.
print(tf.__version__)

2.0.0-alpha0


In [3]:
# Mount Google Drive; the slide copies and saved models used below live under
# '/content/gdrive/My Drive/...'. force_remount=True re-mounts even if a mount
# already exists.
from google.colab import drive
drive.mount('/content/gdrive/', force_remount=True)

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/gdrive/


### Functions

Utility functions for reading slides and locating tissue / tumor pixels

In [0]:
def read_slide(slide, x, y, level, width, height, as_float=False):
    """Read a rectangular region of `slide` as an RGB numpy array.

    Args:
        slide: object exposing ``read_region((x, y), level, (width, height))``.
        x, y: origin of the region, passed to ``read_region`` unchanged.
        level: pyramid level to read from.
        width, height: size of the region.
        as_float: when True the array is returned as float32; otherwise the
            dtype produced by ``np.asarray`` on the RGB image is kept.

    Returns:
        Array of shape ``(height, width, 3)``.

    Raises:
        AssertionError: if the array does not have the expected shape.
    """
    # convert('RGB') drops the alpha channel of the region returned by the slide.
    rgb = slide.read_region((x, y), level, (width, height)).convert('RGB')
    pixels = np.asarray(rgb, dtype=np.float32) if as_float else np.asarray(rgb)
    assert pixels.shape == (height, width, 3)
    return pixels

def find_tumor_pixels(m):
  """Return the (x, y) coordinates of every truthy entry of the 2-D mask `m`.

  Args:
    m: 2-D mask (rows of values); an entry counts as tumor when it is truthy.

  Returns:
    List of ``(column, row)`` tuples of Python ints, in row-major order.
  """
  mask = np.asarray(m)
  if mask.ndim != 2:
    # Not a regular 2-D grid (e.g. an empty list): keep the plain Python scan.
    return [(j, i) for i, row in enumerate(m)
                for j, c in enumerate(row) if c]
  # np.nonzero scans in row-major order, the same order as the nested loops,
  # but without a Python-level iteration over every pixel.
  rows, cols = np.nonzero(mask)
  # tolist() yields plain Python ints, as the original enumerate() indices were.
  return list(zip(cols.tolist(), rows.tolist()))

def find_tissue_pixels(s, intensity=0.8):
  """Locate pixels of `s` whose grayscale value is at most `intensity`.

  Args:
    s: image passed to ``rgb2gray``.
    intensity: inclusive upper bound on the grayscale value of a tissue pixel.

  Returns:
    A one-shot ``zip`` iterator of ``(column, row)`` index pairs.
  """
  hits = np.where(rgb2gray(s) <= intensity)
  # np.where yields (row indices, column indices); emit them as (x, y).
  row_idx, col_idx = hits[0], hits[1]
  return zip(col_idx, row_idx)

def _get_helper(slide_path, mask_path):
  """Open a slide and its mask and read both in full at pyramid level 7.

  Args:
    slide_path: path of the slide file.
    mask_path: path of the matching mask file.

  Returns:
    ``(slide, mask, s, m)``: the two open handles, the level-7 RGB image of
    the slide, and the first channel of the level-7 image of the mask.
    The handles are returned open; closing them is left to the caller.
  """
  def _whole_level7(handle):
    # level_dimensions[7] is indexed as (width, height) of level 7.
    level_w, level_h = handle.level_dimensions[7][0], handle.level_dimensions[7][1]
    return read_slide(handle, x=0, y=0, level=7, width=level_w, height=level_h)

  slide = open_slide(slide_path)
  mask = open_slide(mask_path)
  s = _whole_level7(slide)
  m = _whole_level7(mask)[:,:,0]
  return slide, mask, s, m

def _get_locs(slide_path, mask_path):
  """Compute the tissue, tumor and normal cell coordinates of one slide.

  Args:
    slide_path: path of the slide file.
    mask_path: path of the matching mask file.

  Returns:
    ``(loc_tissue, loc_tumor, loc_normal)``: sets of ``(x, y)`` coordinates in
    the level-7 image, where ``loc_normal`` is tissue that is not tumor.
  """
  slide, mask, s, m = _get_helper(slide_path, mask_path)
  # Only the level-7 arrays are needed here; release the file handles right
  # away instead of leaking two open slides per call.
  slide.close()
  mask.close()
  loc_tissue = set(find_tissue_pixels(s))
  loc_tumor = set(find_tumor_pixels(m))
  return loc_tissue, loc_tumor, loc_tissue - loc_tumor

def get_patch(slide, x, y, width, height):
  """Read a ``width`` x ``height`` RGB patch from level 0 of `slide` at (x, y)."""
  return read_slide(slide, x=x, y=y, level=0, width=width, height=height)

def _print_slide(slide_path, sz=10):
  """Plot the level-7 image of a slide with its tumor mask overlaid.

  Args:
    slide_path: path of the slide; the mask is expected next to it, named
      ``<name up to the first dot>_mask.tif``.
    sz: side length of the square figure, passed to ``figsize``.
  """
  # Derive the mask name from the file name only, so that a dot in a directory
  # component (e.g. './slides/x.tif') cannot truncate the path.
  folder, name = os.path.split(slide_path)
  mask_path = os.path.join(folder, name.split('.')[0] + '_mask.tif')
  slide, mask, s, m = _get_helper(slide_path, mask_path)
  # Only the level-7 arrays are plotted; release the file handles.
  slide.close()
  mask.close()
  plt.figure(figsize=(sz, sz))
  plt.imshow(s)
  plt.imshow(m, cmap='jet', alpha=0.5)
  plt.title(slide_path.split('/')[-1])

Functions for fetching slides and extracting training patches

In [0]:
def _myFetch(filenames,
             root='/content/gdrive/My Drive/DeepLearning/datasets/DL_final/my_slides',
             dest='slides'):
  """Copy slides and their masks from `root` into the local `dest` folder.

  For every ``<name>.tif`` in `filenames`, the files ``Copy of <name>.tif`` and
  ``Copy of <name>_mask.tif`` are copied from `root` to ``dest/<name>.tif`` and
  ``dest/<name>_mask.tif``.

  Args:
    filenames: slide file names (not paths).
    root: folder holding the ``Copy of ...`` source files.
    dest: local destination folder, created if missing.
  """
  os.makedirs(dest, exist_ok=True)
  for filename in filenames:
    mask_name = filename.split('.')[0] + '_mask.tif'
    print('fetching %s ..' %filename)
    for name in (filename, mask_name):
      shutil.copyfile(os.path.join(root, 'Copy of ' + name), os.path.join(dest, name))
  
  

def _ensure_patch_dirs():
  """Create ``patches/tumor`` and ``patches/normal`` if they do not exist yet."""
  for label in ('tumor', 'normal'):
    os.makedirs(os.path.join('patches', label), exist_ok=True)


def _sample_cells(cells, count):
  """Draw up to `count` distinct cells from the set `cells`.

  The set is sorted into a list first: ``random.sample`` no longer accepts a
  set on recent Python versions, and a sorted population makes the draw
  reproducible under a fixed seed. `count` is capped at the population size so
  the call cannot raise ValueError.
  """
  population = sorted(cells)
  return random.sample(population, min(count, len(population)))


def _save_patches(slide, mask, slide_name, samples, label):
  """Save one PNG per sampled cell whose level-0 mask agrees with `label`.

  Each ``(x, y)`` cell is scaled by 128 to level-0 coordinates. The cell is
  kept when its 128x128 mask window contains tumor (``label == 'tumor'``) or
  contains none (``label == 'normal'``). The saved image is the 299x299 window
  starting 85 px up/left of the cell (clamped at 0), i.e. the cell plus
  surrounding context.

  Returns:
    Number of patches written to ``patches/<label>/``.
  """
  want_tumor = (label == 'tumor')
  saved = 0
  for x, y in samples:
    has_tumor = bool(get_patch(mask, x*128, y*128, 128, 128)[:,:,0].any())
    if has_tumor != want_tumor:
      continue
    patch = get_patch(slide, max(0, x*128-85), max(0, y*128-85), 299, 299)
    img = Image.fromarray(patch, 'RGB')
    img.save('patches/{}/{}_{}_{}.png'.format(label, slide_name, x, y))
    saved += 1
  return saved


def _extract_tumor_slides(filenames):
  """Extract tumor and normal training patches from slides that contain tumor.

  For each slide, one third of its tumor cells are sampled and saved as tumor
  patches; then half as many normal cells as tumor patches saved are sampled
  (capped at the number of normal cells) and saved as normal patches.

  Args:
    filenames: slide file names located under ``slides/``.
  """
  _ensure_patch_dirs()
  for filename in filenames:
    slide_path = 'slides/%s' %filename
    mask_path = slide_path.split('.')[0]+'_mask.tif'
    slide_name = slide_path.split('/')[-1].split('.')[0]
    loc_tissue, loc_tumor, loc_normal = _get_locs(slide_path, mask_path)
    # Open the handles directly: the level-7 images were already consumed by
    # _get_locs, so there is no need to read them a second time.
    slide, mask = open_slide(slide_path), open_slide(mask_path)
    try:
      THRESH = len(loc_tumor) // 3
      _num_tumor = _save_patches(slide, mask, slide_name,
                                 _sample_cells(loc_tumor, THRESH), 'tumor')
      _num_normal = _save_patches(slide, mask, slide_name,
                                  _sample_cells(loc_normal, _num_tumor//2), 'normal')
    finally:
      slide.close()
      mask.close()
    print('extracted: {:>20} ||  tumor patches {:<7} || normal patches {:<7}'.format(slide_name, _num_tumor, _num_normal))


def _extract_normal_slides(filenames, THRESH):
  """Extract only normal training patches from the given slides.

  Args:
    filenames: slide file names located under ``slides/``.
    THRESH: maximum number of normal cells sampled per slide.
  """
  _ensure_patch_dirs()
  for filename in filenames:
    slide_path = 'slides/%s' %filename
    mask_path = slide_path.split('.')[0]+'_mask.tif'
    slide_name = slide_path.split('/')[-1].split('.')[0]
    loc_tissue, loc_tumor, loc_normal = _get_locs(slide_path, mask_path)
    slide, mask = open_slide(slide_path), open_slide(mask_path)
    try:
      _num_normal = _save_patches(slide, mask, slide_name,
                                  _sample_cells(loc_normal, THRESH), 'normal')
    finally:
      slide.close()
      mask.close()
    print('extracted: {:>20} ||  tumor patches {:<7} || normal patches {:<7}'.format(slide_name, 0, _num_normal))

Functions for creating, testing, and evaluating models

In [0]:
def create_model():
  """Build and compile a binary classifier on a frozen ImageNet InceptionV3 base.

  The network is InceptionV3 (no top, 299x299x3 input) -> Flatten ->
  Dense(128, relu) -> Dropout(0.3) -> Dense(1, sigmoid). The convolutional
  base is frozen before compiling, so only the dense head is trained.

  Returns:
    The compiled ``Sequential`` model (binary cross-entropy, RMSprop, accuracy).
  """
  conv_base = InceptionV3(weights='imagenet', include_top=False, input_shape=(299, 299, 3))
  # Freeze the pretrained base before it is compiled into the model.
  conv_base.trainable = False

  model = models.Sequential([
      conv_base,
      layers.Flatten(),
      layers.Dense(128, activation='relu'),
      layers.Dropout(0.3),
      layers.Dense(1, activation='sigmoid'),
  ])
  # `learning_rate` is the optimizer's documented argument; `lr` is only a
  # deprecated alias.
  model.compile(loss='binary_crossentropy',
                optimizer=optimizers.RMSprop(learning_rate=2e-5),
                metrics=['acc'])
  return model

def _predict_cells(filename, model, select=None):
  """Predict a tumor probability for tissue cells of one slide.

  Args:
    filename: slide file name located under ``slides/``.
    model: model whose ``predict`` takes a ``(1, 299, 299, 3)`` batch.
    select: optional slice applied to the sorted list of tissue cells;
      ``None`` predicts every tissue cell.

  Returns:
    Float array shaped like the level-7 mask, holding the prediction at every
    predicted cell and 0 everywhere else.
  """
  slide_path = 'slides/' + filename
  mask_path = 'slides/' + filename.split('.')[0] + '_mask.tif'
  slide, mask, s, m = _get_helper(slide_path, mask_path)
  mask.close()  # only the shape of the level-7 mask is needed below
  try:
    # Tissue cells come from the level-7 image already read above, so the
    # slide does not have to be read a second time.
    loc_tissue = set(find_tissue_pixels(s))
    print('predicting on %i tissues' %len(loc_tissue))
    # Sort so that a slice always refers to the same, well-defined cells.
    cells = sorted(loc_tissue)
    if select is not None:
      cells = cells[select]
    s_mask = np.zeros(shape=m.shape, dtype=float)
    for i, (x, y) in enumerate(cells, start=1):
      if i % 2000 == 0:
        print('predicting {:>7} / {} || {:<4f} %'.format(i, len(cells), 100 * i / len(cells)))
      patch = get_patch(slide, max(0, x*128-85), max(0, y*128-85), 299, 299)
      # Scale to [0, 1] like the training generator (rescale=1./255) and add
      # the batch dimension.
      patch = np.expand_dims(patch / 255, axis=0)
      s_mask[y, x] = model.predict(patch)[0][0]
  finally:
    slide.close()
  return s_mask


def test_model(filename, model):
  """Predict every tissue cell of a slide; returns the level-7 heatmap array."""
  return _predict_cells(filename, model)


def test_model_in_parts(filename, model, _slice):
  """Predict only the tissue cells selected by `_slice`.

  `_slice` is applied to the tissue cells in sorted order, and the progress
  output is relative to the number of cells selected. Cells outside the slice
  stay 0 in the returned level-7 heatmap array.
  """
  return _predict_cells(filename, model, select=_slice)

def _print_test(filename, s_mask):
  """Show the ground-truth mask and the predicted heatmap side by side.

  Args:
    filename: slide file name located under ``slides/``.
    s_mask: predicted heatmap to overlay on the level-7 slide image.
  """
  slide_path, mask_path = 'slides/'+filename, 'slides/'+filename.split('.')[0]+'_mask.tif'
  slide, mask, s, m = _get_helper(slide_path, mask_path)
  # Only the level-7 arrays are plotted; release the file handles.
  slide.close()
  mask.close()

  fig, (ax_truth, ax_pred) = plt.subplots(1, 2, figsize=(20, 20))
  # A figure-level title: calling plt.title() before the subplots exist puts
  # the file name on an axes that the subplots then replace.
  fig.suptitle(filename)

  ax_truth.imshow(s)
  ax_truth.imshow(m, cmap='jet', alpha=0.5)
  ax_truth.set_title('ground truth')

  ax_pred.imshow(s)
  ax_pred.imshow(s_mask, cmap='jet', alpha=0.5)
  ax_pred.set_title('predicted heatmap')
  
def _plot_auc(filename, s_mask):
  """Plot ROC curves of the predicted heatmap against the level-7 mask.

  Every level-7 pixel is one sample: its label is the mask value and its
  class probabilities are ``(1 - s_mask, s_mask)``.

  Args:
    filename: slide file name located under ``slides/``.
    s_mask: predicted heatmap with the same shape as the level-7 mask.
  """
  slide_path, mask_path = 'slides/'+filename, 'slides/'+filename.split('.')[0]+'_mask.tif'
  slide, mask, s, m = _get_helper(slide_path, mask_path)
  # Only the level-7 mask array is needed; release the file handles.
  slide.close()
  mask.close()
  y_true = m.reshape((-1,))
  y_tumor = s_mask.reshape((-1,))
  # Two columns (normal, tumor), built without a Python-level loop over pixels.
  y_probas = np.column_stack((1 - y_tumor, y_tumor))
  skplt.metrics.plot_roc_curve(y_true, y_probas)
  plt.show()

### Training
**fetching slides**

In [6]:
USER_NAME = "tianboqiu"
USER_EMAIL = "tianbo@gmail.com"
TOKEN = "f949dc950a158c383ae842980e32030bafcb292a"


!git config --global user.email {USER_EMAIL}
!git config --global user.name {USER_NAME}


repo_path = "DL_final"
if not os.path.exists(os.path.join(os.getcwd(), repo_path)):
  !git clone https://{USER_NAME}:{TOKEN}@github.com/TIANBOQIU/DL_final.git
      

os.chdir(repo_path) # change directory to the cloned repo
!git pull

#!git add .
#!git commit -m "commmit message"
#!git push https://{USER_NAME}:{TOKEN}@github.com/TIANBOQIU/DL_final.git master      

Already up to date.


In [0]:
# Slides used as the source of tumor patches (and some normal patches).
tumor_slides = ['tumor_016.tif', 'tumor_031.tif', 'tumor_064.tif',
                'tumor_078.tif', 'tumor_084.tif', 'tumor_091.tif',
                'tumor_094.tif', 'tumor_101.tif', 'tumor_110.tif']
# Slides used only as a source of normal patches.
# NOTE(review): these are also tumor_* files; the counts printed further down
# show only a few dozen tumor cells for the first two, so they are presumably
# treated as (almost) tumor-free -- confirm.
normal_slides = ['tumor_002.tif', 'tumor_012.tif', 'tumor_035.tif',
                 'tumor_057.tif', 'tumor_059.tif', 'tumor_081.tif']

In [9]:
# Copy the slides of interest (and their masks) from Drive into ./slides;
# this takes about 5 min.
_myFetch(tumor_slides+normal_slides)

fetching tumor_016.tif ..
fetching tumor_031.tif ..
fetching tumor_064.tif ..
fetching tumor_078.tif ..
fetching tumor_084.tif ..
fetching tumor_091.tif ..
fetching tumor_094.tif ..
fetching tumor_101.tif ..
fetching tumor_110.tif ..
fetching tumor_002.tif ..
fetching tumor_012.tif ..
fetching tumor_035.tif ..
fetching tumor_057.tif ..
fetching tumor_059.tif ..
fetching tumor_081.tif ..


**sampling, get balanced training patches**

In [10]:
def _report_cell_counts(filenames):
  """Print the number of tissue, tumor and normal level-7 cells of each slide."""
  for filename in filenames:
    slide_path, mask_path = 'slides/'+filename, 'slides/'+filename.split('.')[0]+'_mask.tif'
    # _get_locs already reads the slide and its mask; a separate _get_helper
    # call here would read both a second time and leave two handles open.
    loc_tissue, loc_tumor, loc_normal = _get_locs(slide_path, mask_path)
    print('\t{} || tissue {:<7}  ||  tumor  {:>7}  ||  normal  {:>7}'.format(filename, len(loc_tissue), len(loc_tumor), len(loc_normal)))


print('THRESH = 1/3 loc_tummor, extract THRESH / 3 tumor patches, THRESH / 6 normal patches')
_report_cell_counts(tumor_slides)
print('\nevenly sample normal patches from normal slides, balance the tumor patches')
_report_cell_counts(normal_slides)

THRESH = 1/3 loc_tummor, extract THRESH / 3 tumor patches, THRESH / 6 normal patches
	tumor_016.tif || tissue 71958    ||  tumor     9811  ||  normal    62203
	tumor_031.tif || tissue 46986    ||  tumor     7602  ||  normal    39423
	tumor_064.tif || tissue 65796    ||  tumor     8159  ||  normal    58875
	tumor_078.tif || tissue 215836   ||  tumor    59291  ||  normal   156554
	tumor_084.tif || tissue 86562    ||  tumor     1994  ||  normal    84571
	tumor_091.tif || tissue 62589    ||  tumor     2924  ||  normal    59667
	tumor_094.tif || tissue 155404   ||  tumor     3833  ||  normal   151571
	tumor_101.tif || tissue 150818   ||  tumor    11756  ||  normal   139078
	tumor_110.tif || tissue 137357   ||  tumor    64266  ||  normal    73124

evenly sample normal patches from normal slides, balance the tumor patches
	tumor_002.tif || tissue 58873    ||  tumor       61  ||  normal    58826
	tumor_012.tif || tissue 84215    ||  tumor       72  ||  normal    84143
	tumor_035.tif || tissue 

In [11]:
# Writes tumor and normal patches for every tumor slide under ./patches.
_extract_tumor_slides(tumor_slides) # takes about 50 min

extracted:            tumor_016 ||  tumor patches 3270    || normal patches 1621   
extracted:            tumor_031 ||  tumor patches 2534    || normal patches 1208   
extracted:            tumor_064 ||  tumor patches 2719    || normal patches 1349   
extracted:            tumor_078 ||  tumor patches 19763   || normal patches 9481   
extracted:            tumor_084 ||  tumor patches 664     || normal patches 330    
extracted:            tumor_091 ||  tumor patches 974     || normal patches 482    
extracted:            tumor_094 ||  tumor patches 1277    || normal patches 637    
extracted:            tumor_101 ||  tumor patches 3918    || normal patches 1951   
extracted:            tumor_110 ||  tumor patches 21422   || normal patches 10332  


In [12]:
# NOTE(review): 4858 looks like the per-slide share needed to balance the
# classes -- (56541 tumor patches - 27391 normal patches already extracted
# from the tumor slides) / 6 normal slides, going by the counts printed in
# this notebook. Recompute it if the sampling above is re-run.
_extract_normal_slides(normal_slides, 4858) # takes about 10 min

extracted:            tumor_002 ||  tumor patches 0       || normal patches 4853   
extracted:            tumor_012 ||  tumor patches 0       || normal patches 4857   
extracted:            tumor_035 ||  tumor patches 0       || normal patches 4858   
extracted:            tumor_057 ||  tumor patches 0       || normal patches 4851   
extracted:            tumor_059 ||  tumor patches 0       || normal patches 4857   
extracted:            tumor_081 ||  tumor patches 0       || normal patches 4857   


**data augmentation**
1. rotate the input patch by one of the 4 multiples of 90°
2. left-right flip

In [7]:
# Pixel values are rescaled to [0, 1]; the prediction code divides patches by
# 255 to match.
# NOTE(review): rotation_range=180 asks for random rotations within a 180
# degree range, not only the multiples of 90 described in the markdown
# above -- confirm which augmentation is intended.
train_datagen = ImageDataGenerator(rescale=1./255,
                                   rotation_range=180,
                                   horizontal_flip=True
                                  )
# batch size is decreased from 32 to 8, otherwise the session will
# run out of RAM and crash
# Class labels are taken from the sub-folders of 'patches' (normal / tumor).
train_generator = train_datagen.flow_from_directory('patches', target_size=(299, 299), batch_size=8, class_mode='binary')

Found 113065 images belonging to 2 classes.


In [8]:
# Count the extracted patch files per class to check the training set is balanced.
_tumorSize, _normalSize = (
    len(os.listdir('patches/' + label)) for label in ('tumor', 'normal')
)
print('training patches\ttumor {:<7} ||  normal {:<7}'.format(_tumorSize, _normalSize))

training patches	tumor 56541   ||  normal 56524  


**train a model from scratch**

In [0]:
def create_model_nopre():
  """Build and compile the InceptionV3 classifier without pretrained weights.

  The network is InceptionV3 (no top, 299x299x3 input, randomly initialised)
  -> Flatten -> Dense(128, relu) -> Dropout(0.3) -> Dense(1, sigmoid). Nothing
  is frozen, so the whole network is trained from scratch.

  Returns:
    The compiled ``Sequential`` model (binary cross-entropy, RMSprop, accuracy).
  """
  conv_base = InceptionV3(weights=None, include_top=False, input_shape=(299, 299, 3))
  model = models.Sequential([
      conv_base,
      layers.Flatten(),
      layers.Dense(128, activation='relu'),
      layers.Dropout(0.3),
      layers.Dense(1, activation='sigmoid'),
  ])
  # `learning_rate` is the optimizer's documented argument; `lr` is only a
  # deprecated alias.
  model.compile(loss='binary_crossentropy',
                optimizer=optimizers.RMSprop(learning_rate=2e-5),
                metrics=['acc'])
  return model

In [0]:
# Fresh, randomly initialised network for the from-scratch experiment.
model = create_model_nopre()

In [0]:
# len(train_generator) is the number of batches needed to see every image once,
# so the step count follows the data set instead of a hard-coded image count
# (113101 did not match the 113065 images the generator found).
history = model.fit_generator(train_generator, steps_per_epoch=len(train_generator), epochs=1) # starts 1041



*One epoch takes more than 200 min, so the session is likely to be terminated for its long runtime. The model is therefore trained one epoch at a time, saving a checkpoint after each epoch so that training can resume after the session is reset.*

In [0]:
# Save the weights (checkpoint format) and the full model (HDF5) locally, then
# copy the whole local folder to Drive.
os.makedirs('models', exist_ok=True)
model.save_weights('models/InceptionV3_nopre_checkpoint1')
model.save('models/InceptionV3_nopre.h5')
# shutil.copytree raises if the destination folder already exists, so this
# cell can only be run once per destination.
shutil.copytree(
    'models',
    '/content/gdrive/My Drive/DeepLearning/datasets/DL_final/my_models/InceptionV3_nopre',
)

'/content/gdrive/My Drive/DeepLearning/datasets/DL_final/my_models/InceptionV3_nopre'

In [11]:
# Drive folder that holds every saved model; reused below when reloading weights.
save_dir = '/content/gdrive/My Drive/DeepLearning/datasets/DL_final/my_models/'
# List what has been saved so far.
!ls '/content/gdrive/My Drive/DeepLearning/datasets/DL_final/my_models/'

InceptionV3_nopre  InceptionV3_pre  InceptionV3_pre_v3_0510_113074.h5


In [12]:
# Check that the first-epoch checkpoint files and the .h5 model reached Drive.
!ls '/content/gdrive/My Drive/DeepLearning/datasets/DL_final/my_models/InceptionV3_nopre'

checkpoint
InceptionV3_nopre_checkpoint1.data-00000-of-00001
InceptionV3_nopre_checkpoint1.index
InceptionV3_nopre.h5


In [13]:
# Rebuild the architecture and restore the weights saved after the first epoch.
checkpoint1_path = os.path.join(save_dir, 'InceptionV3_nopre', 'InceptionV3_nopre_checkpoint1')
model = create_model_nopre()
model.load_weights(checkpoint1_path)

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7fe5869a0240>

In [14]:
# len(train_generator) is the number of batches needed to see every image once,
# so the step count follows the data set instead of a hard-coded image count.
history2 = model.fit_generator(train_generator, steps_per_epoch=len(train_generator), epochs=1)



In [15]:
# Save the weights after the second epoch and copy the local folder to Drive.
os.makedirs('models', exist_ok=True)
model.save_weights('models/InceptionV3_nopre_checkpoint2')
#model.save('models/InceptionV3_nopre_ck2.h5')
# shutil.copytree raises if the destination folder already exists, so this
# cell can only be run once per destination.
shutil.copytree(
    'models',
    '/content/gdrive/My Drive/DeepLearning/datasets/DL_final/my_models/InceptionV3_nopre_ck2',
)

'/content/gdrive/My Drive/DeepLearning/datasets/DL_final/my_models/InceptionV3_nopre_ck2'

In [0]:
#model.save('/content/gdrive/My Drive/DeepLearning/datasets/DL_final/my_models/InceptionV3_nopre_ck2.h5')