In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import cv2
from sklearn.model_selection import train_test_split
# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory
import glob
import random
import os
import imgaug as ia
import imgaug.augmenters as iaa
import tensorflow as tf
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

['histopathologic-cancer-detection', 'mod-hist', 'resnet50']


In [2]:
df_train_paths = glob.glob('../input/histopathologic-cancer-detection/train/*.tif')
df_test_paths = glob.glob('../input/histopathologic-cancer-detection/test/*.tif')

In [3]:
df_train_paths[:5]

['../input/histopathologic-cancer-detection/train/f46f19fc90347d350431da5bfcf955d9c1418b43.tif',
 '../input/histopathologic-cancer-detection/train/330c56d7a3a1a808d711386c136b874a87081526.tif',
 '../input/histopathologic-cancer-detection/train/b7b8babd812d5edbad7dd9b155ee29fbede4ab81.tif',
 '../input/histopathologic-cancer-detection/train/cc55f29b2a76a534585598d48d8f9725abc4cd70.tif',
 '../input/histopathologic-cancer-detection/train/fd6d67cd86a03e3ce6baaa34965c1b5f544efe27.tif']

In [4]:
df_test_paths[:5]

['../input/histopathologic-cancer-detection/test/485b548a7ee70df49fe7cab6d7062fb8d8f172aa.tif',
 '../input/histopathologic-cancer-detection/test/0a13be637bd66856953494747e1a56e11e394acc.tif',
 '../input/histopathologic-cancer-detection/test/1d19c0a74340c60c31b677a0032e710886645bc6.tif',
 '../input/histopathologic-cancer-detection/test/f45110c8ce329fc4292c2dfb93f1bc73b31d0919.tif',
 '../input/histopathologic-cancer-detection/test/e07ddd315e3037431689b18b3631c756a7220102.tif']

In [5]:
df = pd.read_csv('../input/histopathologic-cancer-detection/train_labels.csv')

In [6]:
df.head()

Unnamed: 0,id,label
0,f38a6374c348f90b587e046aac6079959adf3835,0
1,c18f2d887b7ae4f6742ee445113fa1aef383ed77,1
2,755db6279dae599ebb4d39a9123cce439965282d,0
3,bc3f0c64fb968ff4a8bd33af6971ecae77c75e08,0
4,068aba587a4950175d04c680d38943fd488d6a9d,0


In [7]:
id_label_mapping = {k:v for k,v in zip(df.id.values,df.label.values)} 

In [8]:
# df_main = pd.DataFrame({'img_path':df_train_paths})

In [9]:
def get_batch(img_data, batch_size):
    return [img_data[i:i+batch_size] for i in range(0,len(img_data),batch_size)]

In [10]:
def get_id_from_path(img_path):
    return img_path.split('/')[-1].replace('.tif','')

In [11]:
def get_label():
    return id_label_mapping[get_id_from_path(img_path)]

In [12]:
# df_main['id'] = df_main['img_path'].apply(get_id_from_path)

In [13]:
# df = df.merge(df_main,on='id')

In [14]:
df.head()

Unnamed: 0,id,label
0,f38a6374c348f90b587e046aac6079959adf3835,0
1,c18f2d887b7ae4f6742ee445113fa1aef383ed77,1
2,755db6279dae599ebb4d39a9123cce439965282d,0
3,bc3f0c64fb968ff4a8bd33af6971ecae77c75e08,0
4,068aba587a4950175d04c680d38943fd488d6a9d,0


In [15]:
# df0 = df[df['label'] == 0].sample(50000,random_state=42)
# df1 = df[df['label'] == 1].sample(50000,random_state=42)

In [16]:
# df = pd.concat([df0,df1], ignore_index=True)

In [17]:
df.head()

Unnamed: 0,id,label
0,f38a6374c348f90b587e046aac6079959adf3835,0
1,c18f2d887b7ae4f6742ee445113fa1aef383ed77,1
2,755db6279dae599ebb4d39a9123cce439965282d,0
3,bc3f0c64fb968ff4a8bd33af6971ecae77c75e08,0
4,068aba587a4950175d04c680d38943fd488d6a9d,0


In [18]:
df_gen_train, df_gen_val = train_test_split(df_train_paths, test_size=0.25, random_state=42)

In [19]:
# df_gen_val.label.value_counts()

In [20]:
# df_gen_train_imgs = [cv2.imread(img_path) for img_path in list(df_gen_train['img_path'])]
# df_gen_val_imgs = [cv2.imread(img_path) for img_path in list(df_gen_val['img_path'])]

In [21]:
# df_gen_train_imgs = np.array(df_gen_train_imgs)
# df_gen_val_imgs = np.array(df_gen_val_imgs)

In [22]:
ia.seed(1)

# Sometimes(0.5, ...) applies the given augmenter in 50% of all cases,
# e.g. Sometimes(0.5, GaussianBlur(0.3)) would blur roughly every second
# image.
sometimes = lambda aug: iaa.Sometimes(0.5, aug)
def get_seq():
    # Define our sequence of augmentation steps that will be applied to every image.
    seq = iaa.Sequential([
        iaa.Fliplr(0.5), #horizontally flip 50% images
        iaa.Flipud(0.2), #vertically flip 20% images
        
        # crop some of the images by 0-10% of their height/width
        sometimes(iaa.Crop(percent=(0,0.1))),
        
        # Apply affine transformations to some of the images
        # - scale to 80-120% of image height/width (each axis independently)
        # - translate by -20 to +20 relative to height/width (per axis)
        # - rotate by -45 to +45 degrees
        # - shear by -16 to +16 degrees
        # - order: use nearest neighbour or bilinear interpolation (fast)
        # - mode: use any available mode to fill newly created pixels
        #         see API or scikit-image for which modes are available
        # - cval: if the mode is constant, then use a random brightness
        #         for the newly created pixels (e.g. sometimes black,
        #         sometimes white)
        sometimes(iaa.Affine(
            scale = {"x":(0.8,1.2), "y":(0.8,1.2)},
            translate_percent = {"x":(-0.2,0.2), "y":(-0.2,0.2)},
            rotate = (-45,45),
            shear = (-16,16),
            order = [0,1],
            cval = (0,255),
            mode = ia.ALL
        )),
        
        #
        # Execute 0 to 5 of the following (less important) augmenters per
        # image. Don't execute all of them, as that would often be way too
        # strong.
        #
        iaa.SomeOf((0,5),
                   [
                       # Convert some images into their superpixel representation,
                       # sample between 20 and 200 superpixels per image, but do
                       # not replace all superpixels with their average, only
                       # some of them (p_replace).
                       sometimes(
                           iaa.Superpixels(
                           p_replace = (0, 1.0),
                           n_segments = (20, 200)
                           )
                       ),
                       
                       # Blur each image with varying strength using
                       # gaussian blur (sigma between 0 and 3.0),
                       # average/uniform blur (kernel size between 2x2 and 7x7)
                       # median blur (kernel size between 3x3 and 11x11).
                       iaa.OneOf([
                           iaa.GaussianBlur((0,3.0)),
                           iaa.AverageBlur(k=(2,6)),
                           iaa.MedianBlur(k=(3,7))
                       ]),
                       
                       # Sharpen each image, overlay the result with the original
                       # image using an alpha between 0 (no sharpening) and 1
                       # (full sharpening effect).
                       iaa.Sharpen(alpha=(0,1.0), lightness = (0.75, 1.5)),
                       
                       # Same as sharpen, but for an embossing effect.
                       iaa.Emboss(alpha=(0,1.0), strength=(0,2.0)),
                       
                       # Search in some images either for all edges or for
                       # directed edges. These edges are then marked in a black
                       # and white image and overlayed with the original image
                       # using an alpha of 0 to 0.7.
                       sometimes(iaa.OneOf([
                           iaa.EdgeDetect(alpha = (0,0.7)),
                           iaa.DirectedEdgeDetect(alpha=(0,0.7), direction=(0.0,1.0))
                       ])),
                       
                       # Add gaussian noise to some images.
                       # In 50% of these cases, the noise is randomly sampled per
                       # channel and pixel.
                       # In the other 50% of all cases it is sampled once per
                       # pixel (i.e. brightness change).
                       iaa.AdditiveGaussianNoise(loc=0,scale=(0.0,0.05*255),per_channel=0.5),

                       # Either drop randomly 1 to 10% of all pixels (i.e. set
                       # them to black) or drop them on an image with 2-5% percent
                       # of the original size, leading to large dropped
                       # rectangles.
                       iaa.OneOf([
                           iaa.Dropout((0.01, 0.1),per_channel=0.5),
                           iaa.CoarseDropout((0.03,0.15),size_percent=(0.02,0.05), per_channel=0.2)
                       ]),
                       
                       # Invert each image's chanell with 5% probability.
                       # This sets each pixel value v to 255-v.
                       iaa.Invert(0.05, per_channel=True), #Invert colour channels
                       
                       # Add a value of -10 to 10 to each pixel.
                       iaa.Add((-10,10), per_channel=0.5),
                       
                       # Change brightness of images (50-150% of original value).
                       iaa.Multiply((0.5,1.5), per_channel=0.5),
                       
                       # Improve or worsen the contrast of images.
                       iaa.ContrastNormalization((0.5,2.0),per_channel=0.5),
                       
                       # Convert each image to grayscale and then overlay the
                       # result with the original with random alpha. I.e. remove
                       # colors with varying strengths.
                       iaa.Grayscale(alpha=(0.0,1.0)),
                       
                       # In some images move pixels locally around (with random
                       # strengths).
                       sometimes(iaa.ElasticTransformation(alpha=(0.5,3.5),sigma=0.25)),
                       
                       # In some images distort local areas with varying strength.
                       sometimes(iaa.PiecewiseAffine(scale=(0.01,0.05)))
                   ],
                   #do all the above augmentations in random order
                   random_order = True
                  )
            ],
            random_order = True
        )
    return seq

In [23]:
def data_gen(img_data, id_label_mapping, batch_size, augment=False):
    seq = get_seq()
    while True:
        random.shuffle(img_data)
        for batch in get_batch(img_data,batch_size):
            imgs = [cv2.imread(img_path) for img_path in batch]
            y = [id_label_mapping[get_id_from_path(img_path)] for img_path in batch]
            #imgs = [cv2.resize(img,(224,224)) for img in imgs]
            if augment == True:
                imgs = seq.augment_images(imgs)
            
            imgs = [preprocess_input(img) for img in imgs]
            yield np.array(imgs), np.array(y)

In [24]:
from keras.layers import Conv2D, MaxPooling2D, GlobalAveragePooling2D, GlobalMaxPooling2D, BatchNormalization
from keras.layers import Dropout, Flatten, Dense, Dropout, Input, Concatenate
from keras.callbacks import *
from keras.models import Sequential, Model
from tensorflow import set_random_seed
from keras.applications.resnet50 import ResNet50
from keras.applications.resnet50 import preprocess_input
from keras.optimizers import *

Using TensorFlow backend.


In [25]:
base_model = ResNet50(weights='imagenet',include_top=False,input_shape=(96,96,3))



Downloading data from https://github.com/fchollet/deep-learning-models/releases/download/v0.2/resnet50_weights_tf_dim_ordering_tf_kernels_notop.h5


In [26]:
base_model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 96, 96, 3)    0                                            
__________________________________________________________________________________________________
conv1_pad (ZeroPadding2D)       (None, 102, 102, 3)  0           input_1[0][0]                    
__________________________________________________________________________________________________
conv1 (Conv2D)                  (None, 48, 48, 64)   9472        conv1_pad[0][0]                  
__________________________________________________________________________________________________
bn_conv1 (BatchNormalization)   (None, 48, 48, 64)   256         conv1[0][0]                      
__________________________________________________________________________________________________
activation

In [27]:
inputs = Input((96,96,3))

In [28]:
for layer in base_model.layers[:20]:
    layer.trainable = False
    
for layer in base_model.layers[20:]:
    layer.trainable = True

In [29]:
# model.add(base_model)
# model.add(Flatten())
# model.add(BatchNormalization())
# model.add(Dense(256,activation='relu'))
# model.add(BatchNormalization())
# model.add(Dense(512,activation='relu'))
# model.add(BatchNormalization())
# model.add(Dense(1,activation='sigmoid'))

# model.layers[0].trainable = False

In [30]:
# define roc_callback, inspired by https://github.com/keras-team/keras/issues/6050#issuecomment-329996505
def auc_roc(y_true, y_pred):
    # any tensorflow metric
    value, update_op = tf.contrib.metrics.streaming_auc(y_pred, y_true)

    # find all variables created for this metric
    metric_vars = [i for i in tf.local_variables() if 'auc_roc' in i.name.split('/')[1]]

    # Add metric variables to GLOBAL_VARIABLES collection.
    # They will be initialized for new session.
    for v in metric_vars:
        tf.add_to_collection(tf.GraphKeys.GLOBAL_VARIABLES, v)

    # force to update metric values
    with tf.control_dependencies([update_op]):
        value = tf.identity(value)
        return value

In [31]:
# learning_rate_reduction1 = ReduceLROnPlateau(monitor='val_loss', 
#                                             patience=2, 
#                                             verbose=1, 
#                                             factor=0.2, 
#                                             min_delta=0.0001)

# learning_rate_reduction2 = ReduceLROnPlateau(monitor='val_acc', patience=2, verbose=1, factor=0.5, 
#                                             min_lr=0.000001, cooldown=3,min_delta=0.01, mode='max')

# learning_rate_reduction3 = ReduceLROnPlateau(monitor='val_auc_roc', patience=2, verbose=1, factor=0.5, 
#                                             min_lr=0.00001, cooldown=3, min_delta=0.01, mode='max')

adam = Adam(lr=1e-3, beta_1=0.9, beta_2=0.999, epsilon=1e-08) 
# adam = Adam(lr=0.0001, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.0, amsgrad=False)

In [32]:
x = base_model(inputs)
# x = base_model.output
# op1 = GlobalMaxPooling2D()(x)
op2 = GlobalAveragePooling2D()(x)
op3 = Flatten()(x)
op = Concatenate(axis=-1)([op2,op3])
op = Dense(512,activation='relu')(op)
op = Dropout(0.5)(op)
op = Dense(1, activation='sigmoid')(op)
model = Model(inputs,op)
# model = Model(base_model.input,op)
model.compile(optimizer=Adam(0.0001), loss='binary_crossentropy', metrics=['accuracy',auc_roc])
model.summary()

Instructions for updating:
Please switch to tf.metrics.auc. Note that the order of the labels and predictions arguments has been switched.
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            (None, 96, 96, 3)    0                                            
__________________________________________________________________________________________________
resnet50 (Model)                (None, 3, 3, 2048)   23587712    input_2[0][0]                    
__________________________________________________________________________________________________
global_average_pooling2d_1 (Glo (None, 2048)         0           resnet50[1][0]                   
__________________________________________________________________________________________________
flatten_1 (Flatten)             (None, 18432)        0           resn

In [33]:
filepath = 'mod11.h5'
checkpoint = ModelCheckpoint(filepath= filepath,monitor='val_auc_roc',save_best_only=True, mode='max', verbose=2)

In [34]:
# set_random_seed(42)

In [35]:
# model = Sequential()
# model.add(Conv2D(filters = 16, kernel_size = 3, padding = 'same', activation = 'relu', input_shape = (96, 96, 3)))
# model.add(Conv2D(filters = 16, kernel_size = 3, padding = 'same', activation = 'relu'))
# model.add(Conv2D(filters = 16, kernel_size = 3, padding = 'same', activation = 'relu'))
# model.add(BatchNormalization())
# model.add(Dropout(0.3))
# model.add(MaxPooling2D(pool_size = 3)) 

# model.add(Conv2D(filters = 32, kernel_size = 3, padding = 'same', activation = 'relu')) 
# model.add(Conv2D(filters = 32, kernel_size = 3, padding = 'same', activation = 'relu')) 
# model.add(Conv2D(filters = 32, kernel_size = 3, padding = 'same', activation = 'relu'))
# model.add(BatchNormalization())
# model.add(Dropout(0.5))
# model.add(MaxPooling2D(pool_size = 3)) 

# model.add(Conv2D(filters = 64, kernel_size = 3, padding = 'same', activation = 'relu'))
# model.add(Conv2D(filters = 64, kernel_size = 3, padding = 'same', activation = 'relu'))
# model.add(Conv2D(filters = 64, kernel_size = 3, padding = 'same', activation = 'relu'))
# model.add(BatchNormalization())
# model.add(Dropout(0.5))
# model.add(MaxPooling2D(pool_size = 3))

# model.add(Conv2D(filters = 128, kernel_size = 3, padding = 'same', activation = 'relu'))
# model.add(Conv2D(filters = 128, kernel_size = 3, padding = 'same', activation = 'relu'))
# model.add(Conv2D(filters = 256, kernel_size = 3, padding = 'same', activation = 'relu'))
# # model.add(BatchNormalization())
# model.add(Dropout(0.5))
# model.add(MaxPooling2D(pool_size = 3))

In [36]:
# model.add(Flatten())
# model.add(Dense(1, activation='sigmoid'))

In [37]:
# # define roc_callback, inspired by https://github.com/keras-team/keras/issues/6050#issuecomment-329996505
# def auc_roc(y_true, y_pred):
#     # any tensorflow metric
#     value, update_op = tf.contrib.metrics.streaming_auc(y_pred, y_true)

#     # find all variables created for this metric
#     metric_vars = [i for i in tf.local_variables() if 'auc_roc' in i.name.split('/')[1]]

#     # Add metric variables to GLOBAL_VARIABLES collection.
#     # They will be initialized for new session.
#     for v in metric_vars:
#         tf.add_to_collection(tf.GraphKeys.GLOBAL_VARIABLES, v)

#     # force to update metric values
#     with tf.control_dependencies([update_op]):
#         value = tf.identity(value)
#         return value

In [38]:
# my_callbacks = [EarlyStopping(monitor='auc_roc', patience=6, verbose=1, mode='max')]

In [39]:
# model.compile(optimizer= 'adam', loss='binary_crossentropy', metrics=['accuracy',auc_roc])

In [40]:
call_back_list = [checkpoint]

In [41]:
batch_size = 64
model.fit_generator(data_gen(df_gen_train,id_label_mapping,batch_size=batch_size,augment=True),
                    validation_data=data_gen(df_gen_val,id_label_mapping,batch_size=batch_size), epochs=12,
                   steps_per_epoch = len(df_gen_train)/batch_size, validation_steps = len(df_gen_val)/batch_size, callbacks=call_back_list)

Epoch 1/12

Epoch 00001: val_auc_roc improved from -inf to 0.88679, saving model to mod11.h5
Epoch 2/12

Epoch 00002: val_auc_roc improved from 0.88679 to 0.89980, saving model to mod11.h5
Epoch 3/12

In [42]:
model.save('model.h5')

In [43]:
preds = []
ids = []
for batch in get_batch(df_test_paths, batch_size=64):
    X = [preprocess_input(cv2.imread(x)) for x in batch]
    ids_batch = [get_id_from_path(x) for x in batch]
    X = np.array(X)
    preds_batch = ((model.predict(X).ravel()*model.predict(X[:, ::-1, :, :]).ravel()*model.predict(X[:, ::-1, ::-1, :]).ravel()*model.predict(X[:, :, ::-1, :]).ravel())**0.25).tolist()
    preds += preds_batch
    ids += ids_batch

In [44]:
# pred = (model.predict(val_imgs).ravel()*model.predict(val_imgs[:,::-1,:,:]).ravel()*model.predict(val_imgs[:,:,::-1,:]).ravel()*model.predict(val_imgs[:,::-1,::-1,:]).ravel())**0.25

In [45]:
# val_y = []
# for img_path in df_gen_val:
#     val_y.append(id_label_mapping[get_id_from_path(img_path)])

In [46]:
# ids = []
# pred = []
# for batch in get_batch(df_gen_val,batch_size):
#     imgs = [cv2.imread(img_path) for img_path in df_gen_val]
#     id_batch = [get_id_from_path(img_path)]
#     imgs = np.array(imgs)
#     preds_batch = ((model.predict(imgs).ravel()*model.predict(imgs[:, ::-1, :, :]).ravel()*model.predict(imgs[:, ::-1, ::-1, :]).ravel()*model.predict(imgs[:, :, ::-1, :]).ravel())**0.25).tolist()
#     pred += preds_batch
#     ids += id_batch


In [47]:
# df_train = [cv2.imread(img_path) for img_path in df_train_paths]
# df_test = [cv2.imread(img_path) for img_path in df_test_paths]

In [48]:
# df_test = np.array(df_test)

In [49]:
# df_gen_train_imgs = [cv2.imread(img_path) for img_path in df_gen_train]

In [50]:
# df_gen_val_imgs = [cv2.imread(img_path) for img_path in df_gen_val]

In [51]:
# seq = get_seq()

In [52]:
# df_gen_train_imgs = seq.augment_images(df_gen_train_imgs)

In [53]:
# df_gen_train_imgs = [preprocess_input(img) for img in df_gen_train_imgs]

In [54]:
# df_gen_train_imgs = df_gen_train_imgs[:25000]

In [55]:
# df_gen_train_imgs = np.array(df_gen_train_imgs)

In [56]:
# df_gen_val_imgs = np.array(df_gen_val_imgs)

In [57]:
# model.fit(df_gen_train_imgs,df_gen_train.label.iloc[:25000],batch_size=128,epochs=42,validation_data=(df_gen_val_imgs,df_gen_val.label))

In [58]:
# y_train = []
# for img_path in df_gen_train:
#     y_train.append(id_label_mapping[get_id_from_path(img_path)])

In [59]:
# y_test = []
# for img_path in df_gen_val:
#     y_test.append(id_label_mapping[get_id_from_path(img_path)])

In [60]:
# pred = (model.predict(df_test).ravel()*model.predict(df_test[:,::-1,:,:]).ravel()*model.predict(df_test[:,:,::-1,:]).ravel()*model.predict(df_test[:,::-1,::-1,:]).ravel())**0.25

In [61]:
# pd.DataFrame({''})

In [62]:
# test_id = []
# for img_path in df_test_paths:
#     test_id.append(get_id_from_path(img_path))

In [63]:
sub = pd.DataFrame({'id':ids,'label':preds})

In [64]:
sub.to_csv('sub.csv',index=False)

In [65]:
model.evaluate_generator(data_gen(df_gen_val,id_label_mapping,batch_size=64),len(df_gen_val)//64)

[0.263716213101558, 0.8899519790454016, 0.9338264928846615]