# Imports

In [1]:
import numpy as np
import pandas as pd
import csv
import matplotlib.pyplot as plt
from scipy.misc import imresize
import os
import h5py
from skimage.transform import resize
import cv2
import shutil

# Keras imports
from keras.preprocessing.image import ImageDataGenerator, array_to_img, img_to_array, load_img
from keras.models import Sequential, model_from_json, Model
from keras.layers import Convolution2D, MaxPooling2D, ZeroPadding2D, Activation, Dropout, Flatten, Dense, Input
from keras.layers import Input, Conv2D, UpSampling2D, BatchNormalization, Activation, add, concatenate,GlobalAveragePooling2D
from keras.callbacks import EarlyStopping
from keras.callbacks import ModelCheckpoint
from keras.preprocessing import image
from keras import backend as K
from keras import optimizers

Using TensorFlow backend.


In [42]:
# Mount Google Drive at /content/gdrive (requires the Google Colab runtime).
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [43]:
# Switch the working directory to the Drive folder; the relative paths used below are resolved from here.
cd /content/gdrive/My Drive/Colab Notebooks

/content/gdrive/My Drive/Colab Notebooks


# Copy images to folders

In [4]:
# Folder the images are copied from.
src = './images/im/'
# Destination roots for the training, validation and submission images.
train_dest, valid_dest, sub_dest = (
    './images/train',
    './images/val',
    './images/submission',
)
def _copy_labelled(y, X, src, dest, indices):
    """Copy ``<src>/<X[i]>.jpg`` into a per-class sub-folder of ``dest``.

    For every ``i`` in ``indices`` the image goes to ``<dest>/benign`` when
    ``y[i] == 0`` and to ``<dest>/malignant`` otherwise.
    """
    for i in indices:
        class_name = 'benign' if y[i] == 0 else 'malignant'
        destination = os.path.join(dest, class_name)
        # Without an existing directory shutil.copy would create a *file*
        # called "benign"/"malignant" and overwrite it on every iteration.
        os.makedirs(destination, exist_ok=True)
        # os.path.join works whether or not ``src`` ends with a slash.
        shutil.copy(os.path.join(src, str(X[i]) + '.jpg'), destination)


def copy_valid(y,X,src,dest,start=600):
    """Copy the validation images, i.e. entries ``start`` .. ``len(X) - 1``.

    Parameters
    ----------
    y : sequence of labels aligned with ``X`` (0 = benign, anything else = malignant).
    X : sequence of image ids (file names without the ``.jpg`` extension).
    src : folder containing the source images.
    dest : root folder receiving the ``benign`` / ``malignant`` sub-folders.
    start : index of the first validation entry (default 600).
    """
    _copy_labelled(y, X, src, dest, range(start, len(X)))


def copy_train(y,X,src,dest,n_train=600):
    """Copy the training images, i.e. the first ``n_train`` entries of ``X``.

    Parameters are the same as for ``copy_valid``; ``n_train`` (default 600)
    is clamped to ``len(X)`` so a shorter list does not raise IndexError.
    """
    _copy_labelled(y, X, src, dest, range(0, min(n_train, len(X))))


def copy_submission(y,X,src,dest):
    """Copy every image listed in ``X`` into ``dest`` according to ``y``.

    ``y`` must hold one label per entry of ``X``; an IndexError is raised by
    the indexing if it is shorter.
    """
    _copy_labelled(y, X, src, dest, range(0, len(X)))

In [None]:
# train.csv supplies the image ids with their 'Malignant' label;
# test.csv supplies the ids of the images to predict.
df_submission = pd.read_csv('data/test.csv')
df = pd.read_csv('data/train.csv')
X_df = df['ImageId']
y_df = df['Malignant']
X = X_df.values
y = y_df.values

# copy images to train, val and submission folders
# The submission images were previously sorted with the *training* labels `y`,
# which belong to different images (and may be shorter than the test list).
# Use a placeholder label instead, so every submission image lands in
# '<sub_dest>/benign'; the folder name carries no meaning for this split.
submission_ids = df_submission['ImageId'].values
submission_placeholder = np.zeros(len(submission_ids), dtype=int)
copy_submission(submission_placeholder, submission_ids, src, sub_dest)
copy_train(y,X,src,train_dest)
copy_valid(y,X,src,valid_dest)

# Initialization

In [0]:
# Output location for results, derived from the model name.
model_name = "melanoma_detection"
model_path = ''.join(['./models/models_trained/', model_name, '/'])

# Image folders read by the data generators and by load_data().
train_data_dir, validation_data_dir, submission_data_dir = (
    './images/train',
    './images/val',
    './images/submission',
)

# Weight file written by train_top_model() and read by load_model().
top_model_weights_path = './vgg-05-0.54.hdf5'

In [0]:
### other hyperparameters
# Training split: per-class counts and their total (354 + 246 = 600).
nb_train_samples_benign = 354
nb_train_samples_malignant = 246
nb_train_samples = 600
# Validation split: per-class counts and their total (64 + 36 = 100).
nb_validation_samples_benign = 64
nb_validation_samples_maligant = 36   # (sic) spelling kept so existing references keep working
nb_validation_samples = 100
# Default number of epochs (the training functions currently pass their own value).
nb_epoch = 50
# Input resolution expected by the networks.
img_width = img_height = 224
# Loss weight per class index: class 1 counts twice as much as class 0.
class_weights = {0: 1, 1: 2}

# Checkpoints

In [0]:
# Checkpoint callback: the file name embeds the epoch number and the validation
# Matthews correlation, while the decision to save is driven by val_loss.
checkpoint_path="./vgg-{epoch:02d}-{val_matthews_correlation:.2f}.hdf5"
checkpoint = ModelCheckpoint(
    checkpoint_path,
    monitor='val_loss',
    mode='min',
    save_best_only=True,
    verbose=1,
)
# Callback list handed to every fit_generator() call in this notebook.
save_best_only = [checkpoint]

# Data

In [0]:
# Load Data
def load_data(data_type):
    """Load every image of one split into memory.

    Parameters
    ----------
    data_type : str
        'train' reads ``train_data_dir``, 'submission' reads
        ``submission_data_dir``; any other value reads ``validation_data_dir``.

    Returns
    -------
    (_X, _y)
        ``_X`` is a float32 array of shape (n, img_height, img_width, 3) with
        the malignant images first, followed by the benign ones.
        ``_y`` is a uint8 array with 1 for malignant and 0 for benign, the
        same convention as the 'Malignant' column and the copy_* helpers.
        (The labels used to be inverted: malignant was 0 and benign was 1.)
    """
    print('Loading data: ', data_type)
    if data_type == 'train':
        data_dir = train_data_dir
        print('Loading train data... ')
    elif data_type == 'submission':
        data_dir = submission_data_dir
        print('Loading submission data... ')
    else:
        data_dir = validation_data_dir
        print('Loading validation data... ')

    # Collect (path, label) pairs, malignant first to keep the original layout.
    samples = []
    for class_name, label in (('malignant', 1), ('benign', 0)):
        class_path = os.path.join(data_dir, class_name)
        # os.listdir returns entries in arbitrary order; sort for reproducibility.
        for file_name in sorted(os.listdir(class_path)):
            samples.append((os.path.join(class_path, file_name), label))

    _X = np.empty((len(samples), img_height, img_width, 3), dtype='float32')
    _y = np.zeros((len(samples), ), dtype='uint8')

    for i, (path, label) in enumerate(samples):
        img = image.load_img(path, grayscale=False, target_size=(img_height,img_width))
        _X[i] = image.img_to_array(img)
        _y[i] = label
    return _X, _y

In [0]:
# Build matrix using name list
def build_matrix(X,folder):
    """Return the masked, resized images for the ids in ``X`` as one array.

    For each id, ``<folder>/im/<id>.jpg`` and ``<folder>/im/<id>_segmentation.jpg``
    are read and divided by 255, the image is multiplied by the mask repeated
    on three channels, and the product is resized to 224x224.
    Assumes the mask is a 2-D array with the image's height and width — TODO confirm.
    """
    def _masked_and_resized(name):
        prefix = folder + '/im/' + str(name)
        picture = plt.imread(prefix + '.jpg') / 255
        mask = plt.imread(prefix + '_segmentation.jpg') / 255
        masked = picture * np.stack([mask] * 3, axis=-1)
        return resize(masked, (224, 224), mode='reflect', anti_aliasing=True)

    return np.asarray([_masked_and_resized(X[k]) for k in range(X.shape[0])])

In [1]:
# MEAN 0 STD 1
def standardize(img, axis=None):
    """Shift ``img`` to zero mean and scale it to unit standard deviation.

    Parameters
    ----------
    img : array-like
        Data to normalise.
    axis : None, int or tuple of int, optional
        Axes over which the statistics are computed. ``None`` (default) uses
        the whole array, as before. For a batch of images shaped
        (n, h, w, c), ``axis=(1, 2, 3)`` normalises each image separately.

    Returns
    -------
    numpy.ndarray with the same shape as ``img``. Where the standard deviation
    is zero (constant input) the result is all zeros instead of NaN.
    """
    img = np.asarray(img)
    mean = np.mean(img, axis=axis, keepdims=True)
    std = np.std(img, axis=axis, keepdims=True)
    # A zero std would turn 0 / 0 into NaN; dividing the centred zeros by 1 keeps them 0.
    safe_std = np.where(std == 0, 1, std)
    return (img - mean) / safe_std

# Metrics

In [0]:
def matthews_correlation(y_true, y_pred):
    """Matthews correlation coefficient between targets and predictions.

    Both tensors are clipped to [0, 1] and rounded to hard 0/1 decisions; the
    confusion-matrix counts are summed over all elements. ``K.epsilon()`` is
    added to the denominator so an empty class does not divide by zero.
    """
    def _hard(t):
        return K.round(K.clip(t, 0, 1))

    predicted_pos = _hard(y_pred)
    actual_pos = _hard(y_true)
    predicted_neg = 1 - predicted_pos
    actual_neg = 1 - actual_pos

    tp = K.sum(actual_pos * predicted_pos)
    tn = K.sum(actual_neg * predicted_neg)
    fp = K.sum(actual_neg * predicted_pos)
    fn = K.sum(actual_pos * predicted_neg)

    covariance = tp * tn - fp * fn
    scale = K.sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn))
    return covariance / (scale + K.epsilon())
  

def precision(y_true, y_pred):
    """Fraction of predicted positives that are true positives.

    Values are clipped to [0, 1] and rounded before counting; ``K.epsilon()``
    in the denominator avoids a division by zero when nothing is predicted
    positive.
    """
    hits = K.round(K.clip(y_true * y_pred, 0, 1))
    flagged = K.round(K.clip(y_pred, 0, 1))
    return K.sum(hits) / (K.sum(flagged) + K.epsilon())

def recall(y_true, y_pred):
    """Fraction of actual positives that are predicted positive.

    Values are clipped to [0, 1] and rounded before counting; ``K.epsilon()``
    in the denominator avoids a division by zero when there are no positives.
    """
    hits = K.round(K.clip(y_true * y_pred, 0, 1))
    relevant = K.round(K.clip(y_true, 0, 1))
    return K.sum(hits) / (K.sum(relevant) + K.epsilon())

def f1(y_true, y_pred):
    """Harmonic mean of precision and recall on rounded predictions.

    Precision and recall are computed inline (same formulas as the
    ``precision`` and ``recall`` metrics), each with ``K.epsilon()`` in the
    denominator, and combined as 2 * p * r / (p + r + epsilon).
    """
    def _count(t):
        return K.sum(K.round(K.clip(t, 0, 1)))

    tp = _count(y_true * y_pred)
    p = tp / (_count(y_pred) + K.epsilon())
    r = tp / (_count(y_true) + K.epsilon())
    return 2 * (p * r) / (p + r + K.epsilon())



# DRN

In [0]:
# Residual block
def res_block(x,size):
    """Bottleneck residual block that halves the spatial resolution.

    Main path: 1x1 convolution to ``size / 4`` filters with stride 2, a 3x3
    convolution with ``size / 4`` filters, then a 1x1 convolution expanding to
    ``size`` filters; batch normalisation follows every convolution and ReLU
    follows the first two. The shortcut is a strided 1x1 projection of ``x``
    to ``size`` filters so both paths have the same shape and can be summed.

    Previously the last convolution was applied to ``x`` instead of the main
    path, so the first two convolutions were never part of the model, and the
    result was concatenated with the un-strided input.

    Parameters
    ----------
    x : Keras tensor (channels-last feature map).
    size : int, number of output filters.

    Returns
    -------
    Keras tensor with ``size`` channels and half the spatial size of ``x``.
    """
    reduce = (int)(size/4)

    # 1x1 reduction; the stride performs the down-sampling of the block.
    res_path = Conv2D(filters= reduce , kernel_size=(1, 1),strides = (2,2), padding='same')(x)
    res_path = BatchNormalization()(res_path)
    res_path = Activation(activation='relu')(res_path)

    res_path = Conv2D(filters= reduce , kernel_size=(3, 3), padding='same')(res_path)
    res_path = BatchNormalization()(res_path)
    res_path = Activation(activation='relu')(res_path)

    # 1x1 expansion, chained onto the main path (not onto the block input).
    res_path = Conv2D(filters= size , kernel_size=(1, 1), padding='same')(res_path)
    res_path = BatchNormalization()(res_path)

    # Projection shortcut: same stride and filter count as the main path.
    shortcut = Conv2D(filters= size , kernel_size=(1, 1),strides = (2,2), padding='same')(x)
    shortcut = BatchNormalization()(shortcut)

    res_path = add([shortcut, res_path])
    res_path = Activation(activation='relu')(res_path)

    return res_path

In [2]:
# Deep Residual Network training
def DRN():
    """Build, compile and train the residual network on the image folders.

    Architecture: 7x7 stem convolution (stride 2), 3x3 max-pooling, four
    ``res_block`` stages with 256, 512, 1024 and 2048 filters, global average
    pooling and a dense head ending in a 2-unit softmax. Training images are
    read from ``train_data_dir`` and validation images from
    ``validation_data_dir``; the callbacks in ``save_best_only`` and the
    weights in ``class_weights`` are applied during fitting.

    Returns
    -------
    The trained Keras ``Model``.
    """
    batch_size = 32

    img_input = Input(shape= (224, 224, 3))

    x = Conv2D(64, (7, 7), padding='same',strides = (2,2), name='conv1')(img_input)
    x = MaxPooling2D(pool_size = (3,3), strides = (2,2))(x)
    x = res_block(x,256)
    x = res_block(x,512)
    x = res_block(x,1024)
    x = res_block(x,2048)
    x = GlobalAveragePooling2D()(x)
    x = Dense(1024,activation='relu')(x) # dense layer 1
    x = Dropout(0.2)(x)
    x = Dense(512,activation='relu')(x) # dense layer 2
    x = Dense(512,activation='relu')(x) # dense layer 3
    output = Dense(units = 2, activation = 'softmax')(x)
    model = Model(inputs=img_input, outputs=output)

    model.compile(loss='binary_crossentropy',
                  optimizer=optimizers.SGD(lr=1e-4, momentum=0.9),
                  metrics=['accuracy',matthews_correlation])

    # Data augmentation for training images
    train_datagen = ImageDataGenerator(
        rotation_range=270,
        width_shift_range=0.2,
        height_shift_range=0.2,
        shear_range=0.2,
        zoom_range=0.2,
        horizontal_flip=True,samplewise_center = True, samplewise_std_normalization = True,
        fill_mode='nearest')

    # Validation images are only normalised. Random augmentation here would
    # make val_loss (monitored by the checkpoint callback) change between
    # epochs for reasons unrelated to the model.
    valid_datagen = ImageDataGenerator(
        samplewise_center = True, samplewise_std_normalization = True)

    train_generator = train_datagen.flow_from_directory(
        train_data_dir,
        target_size=(img_height,img_width),
        batch_size=batch_size,
        class_mode='categorical')

    validation_generator = valid_datagen.flow_from_directory(
        validation_data_dir,
        target_size=(img_height,img_width),
        batch_size=batch_size,
        class_mode='categorical')

    # Keras 2 counts batches, not samples: derive the step counts so that
    # each epoch covers every training / validation image once.
    steps_per_epoch = int(np.ceil(nb_train_samples / float(batch_size)))
    validation_steps = int(np.ceil(nb_validation_samples / float(batch_size)))

    model.fit_generator(
                    train_generator,
                    steps_per_epoch=steps_per_epoch,
                    epochs=30,verbose = 1,
                    validation_data=validation_generator,
                    validation_steps=validation_steps,
                    class_weight=class_weights,
                    callbacks = save_best_only)
    return model

# VGG-16

## Train top model

In [0]:
def train_top_model():
    """Train a dense classification head on top of a frozen VGG16 base.

    The ImageNet-initialised VGG16 convolutional base has every layer frozen;
    only the Flatten -> Dense(256) -> Dropout -> Dense(2, softmax) head is
    trained, for 30 epochs with Adam. Images are read from ``train_data_dir``
    and ``validation_data_dir``. When training ends the model weights are
    written to ``top_model_weights_path``. Returns nothing.
    """
    from keras.applications.vgg16 import VGG16

    batch_size = 16

    # Pre-build model
    vg = VGG16(include_top = False, weights = 'imagenet', input_shape = (224, 224, 3))
    for layer in vg.layers:
        layer.trainable = False

    model  = Sequential()
    model.add(vg)
    model.add(Flatten())
    model.add(Dense(256, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(2, activation='softmax'))

    model.compile(optimizer='Adam', loss='binary_crossentropy', metrics=['accuracy',matthews_correlation])

    # Data augmentation for training images
    train_datagen = ImageDataGenerator(
        rotation_range=270,
        width_shift_range=0.2,
        height_shift_range=0.2,
        shear_range=0.2,
        zoom_range=0.2,
        horizontal_flip=True,samplewise_center = True, samplewise_std_normalization = True,
        fill_mode='nearest')

    # Validation images are only normalised. Random augmentation here would
    # make val_loss (monitored by the checkpoint callback) change between
    # epochs for reasons unrelated to the model.
    valid_datagen = ImageDataGenerator(
        samplewise_center = True, samplewise_std_normalization = True)

    train_generator = train_datagen.flow_from_directory(
        train_data_dir,
        target_size=(img_height,img_width),
        batch_size=batch_size,
        class_mode='categorical')

    validation_generator = valid_datagen.flow_from_directory(
        validation_data_dir,
        target_size=(img_height,img_width),
        batch_size=batch_size,
        class_mode='categorical')

    # Keras 2 counts batches, not samples: derive the step counts so that
    # each epoch covers every training / validation image once.
    steps_per_epoch = int(np.ceil(nb_train_samples / float(batch_size)))
    validation_steps = int(np.ceil(nb_validation_samples / float(batch_size)))

    model.fit_generator(
                    train_generator,
                    steps_per_epoch=steps_per_epoch,
                    epochs=30,verbose = 1,
                    validation_data=validation_generator,
                    validation_steps=validation_steps,
                    class_weight=class_weights,
                    callbacks = save_best_only)

    model.save_weights(top_model_weights_path)

## Fine tune

In [0]:
def fine_tune():
    """Train the whole VGG16-based classifier with a very small learning rate.

    Every VGG16 layer is trainable; the head is the same
    Flatten -> Dense(256) -> Dropout -> Dense(2, softmax) stack used by
    ``train_top_model``. After 50 epochs the architecture is written to
    'final_model_architecture.json' and the weights to 'final_weights.h5'.

    Returns
    -------
    (model, scores)
        The trained model and the value returned by ``fit_generator``.
    """
    from keras.applications.vgg16 import VGG16

    batch_size = 32

    # Pre-build model
    vg = VGG16(include_top = False, weights = 'imagenet', input_shape = (224, 224, 3))
    for layer in vg.layers:
        layer.trainable = True
    model  = Sequential()
    model.add(vg)
    model.add(Flatten())
    model.add(Dense(256, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(2, activation='softmax'))

    # NOTE(review): the weights saved by train_top_model() are NOT loaded, so
    # the head starts from random initialisation. Re-enable the line below if
    # fine-tuning is meant to continue from the trained top model.
    #model.load_weights(top_model_weights_path)

    # compile the model with a SGD/momentum optimizer
    # and a very slow learning rate.
    model.compile(loss='binary_crossentropy',
                  optimizer=optimizers.SGD(lr=1e-6, momentum=0.9),
                  metrics=['accuracy',matthews_correlation])

    # Data augmentation for training images
    train_datagen = ImageDataGenerator(
        rotation_range=270,
        width_shift_range=0.2,
        height_shift_range=0.2,
        shear_range=0.2,
        zoom_range=0.2,
        horizontal_flip=True,samplewise_center = True, samplewise_std_normalization = True,
        fill_mode='nearest')

    # Validation images are only normalised. Random augmentation here would
    # make val_loss (monitored by the checkpoint callback) change between
    # epochs for reasons unrelated to the model.
    valid_datagen = ImageDataGenerator(
        samplewise_center = True, samplewise_std_normalization = True)

    train_generator = train_datagen.flow_from_directory(
        train_data_dir,
        target_size=(img_height,img_width),
        batch_size=batch_size,
        class_mode='categorical')

    validation_generator = valid_datagen.flow_from_directory(
        validation_data_dir,
        target_size=(img_height,img_width),
        batch_size=batch_size,
        class_mode='categorical')

    # Keras 2 counts batches, not samples: derive the step counts so that
    # each epoch covers every training / validation image once.
    steps_per_epoch = int(np.ceil(nb_train_samples / float(batch_size)))
    validation_steps = int(np.ceil(nb_validation_samples / float(batch_size)))

    # fine-tune the model
    scores = model.fit_generator(
                    train_generator,
                    steps_per_epoch=steps_per_epoch,
                    epochs=50,verbose = 1,
                    validation_data=validation_generator,
                    validation_steps=validation_steps,
                    class_weight=class_weights,
                    callbacks = save_best_only)

    # save the model
    json_string = model.to_json()

    with open('final_model_architecture.json', 'w') as f:
        f.write(json_string)

    model.save_weights('final_weights.h5')

    return model, scores

# Utils

In [0]:
# Make prediction for submission data given a model
def prediction(model):
    """Predict the submission images and save classes and scores to disk.

    Reads the image ids from 'data/test.csv', builds the masked image matrix
    with ``build_matrix``, normalises every image separately, and runs the
    model once. Class indices are written to 'y_pred.txt' and the raw model
    outputs to 'y_score.txt'.

    Parameters
    ----------
    model : a Keras model exposing ``predict`` (Sequential or functional).

    Returns
    -------
    numpy.ndarray of predicted class indices, one per submission image.
    """
    #Load data as a Numpy array
    df_submission = pd.read_csv('data/test.csv')

    X_test = build_matrix(df_submission['ImageId'].values,folder='data')

    # The training generators use samplewise_center / samplewise_std_normalization,
    # so each image must be centred and scaled on its own statistics here too.
    # The small constant keeps an all-constant (e.g. fully masked) image finite.
    sample_axes = tuple(range(1, X_test.ndim))
    X_test = (X_test - X_test.mean(axis=sample_axes, keepdims=True)) / (
        X_test.std(axis=sample_axes, keepdims=True) + 1e-6)

    # One forward pass; predict() works for every model type, unlike
    # predict_classes()/predict_proba(), which only Sequential provides.
    y_score = model.predict(X_test, batch_size=64)
    y_pred = np.argmax(y_score, axis=1)

    np.savetxt('y_pred.txt', y_pred)
    np.savetxt('y_score.txt', y_score)
    return y_pred

In [0]:
# load model from hdf5 file
def load_model():
    """Rebuild the VGG16-based classifier and load its saved weights.

    The architecture matches the one built by ``train_top_model`` and
    ``fine_tune`` (VGG16 base, Flatten, Dense(256), Dropout, 2-unit softmax);
    weights are read from ``top_model_weights_path`` and the model is compiled
    with SGD so it can be evaluated or trained further.

    Returns
    -------
    The compiled Keras ``Sequential`` model.
    """
    from keras.applications.vgg16 import VGG16

    # Pre-build model
    vg = VGG16(include_top = False, weights = 'imagenet', input_shape = (224, 224, 3))
    model  = Sequential()
    model.add(vg)
    model.add(Flatten())
    model.add(Dense(256, activation='relu'))
    model.add(Dropout(0.5))
    # Must be softmax: the saved weights were trained with a softmax output,
    # a sigmoid here would produce scores that no longer sum to one.
    model.add(Dense(2, activation='softmax'))

    model.load_weights(top_model_weights_path)
    model.compile(loss='binary_crossentropy',
                  optimizer=optimizers.SGD(lr=1e-5, momentum=0.9),
                  metrics=['accuracy',matthews_correlation])

    return model

# Plots

In [0]:
def plot(history):
    """Plot accuracy and loss curves for training and validation.

    Parameters
    ----------
    history : object with a ``history`` dict (as returned by Keras fitting)
        containing 'loss', 'val_loss' and the accuracy metric under either
        'acc'/'val_acc' or 'accuracy'/'val_accuracy'.

    Two figures are drawn: accuracy per epoch and loss per epoch.
    """
    logs = history.history

    # The accuracy metric is logged as 'acc' by some Keras versions and as
    # 'accuracy' by others; accept both instead of raising KeyError.
    acc_key = 'acc' if 'acc' in logs else 'accuracy'
    acc = logs[acc_key]
    val_acc = logs['val_' + acc_key]

    loss = logs['loss']
    val_loss = logs['val_loss']

    # Get number of epochs
    epochs = range(len(acc))

    # Plot training and validation accuracy per epoch on a fresh figure so
    # the curves never land on a figure left over from earlier plotting.
    plt.figure()
    plt.plot(epochs, acc, label = "training")
    plt.plot(epochs, val_acc, label = "validation")
    plt.xlabel('epoch')
    plt.legend(loc="upper left")
    plt.title('Training and validation accuracy')

    plt.figure()

    # Plot training and validation loss per epoch
    plt.plot(epochs, loss, label = "training")
    plt.plot(epochs, val_loss, label = "validation")
    plt.xlabel('epoch')
    plt.legend(loc="upper right")
    plt.title('Training and validation loss')

In [0]:
def make_plots(h):
    """Show one figure per recorded metric with training and validation curves.

    Parameters
    ----------
    h : object with a ``history`` dict mapping metric names to per-epoch lists.

    Figures are produced for 'loss', 'f1', 'precision' and 'recall'. A metric
    is skipped with a printed note when it (or its 'val_' counterpart) is not
    in the history; the models in this notebook are compiled without
    f1/precision/recall, so indexing them unconditionally raised KeyError.

    The ``%matplotlib inline`` magic and the function-local pyplot import were
    removed: a magic is notebook syntax rather than Python, and ``plt`` is
    already imported at the top of the notebook.
    """
    # (history key, figure title, training legend, validation legend)
    panels = [
        ('loss', 'Loss plot', 'Training loss', 'Validation loss'),
        ('f1', 'F1 plot', 'Training F1', 'Validation F1'),
        ('precision', 'Precision plot', 'Training precision', 'Validation precision'),
        ('recall', 'Recall plot', 'Training recall', 'Validation recall'),
    ]

    logs = h.history
    for key, title, train_label, val_label in panels:
        val_key = 'val_' + key
        if key not in logs or val_key not in logs:
            print('make_plots: %r / %r not in history, skipping "%s"' % (key, val_key, title))
            continue

        plt.figure()
        plt.plot(logs[key], 'r-')
        plt.plot(logs[val_key], 'b-')
        plt.title(title)
        plt.legend([train_label, val_label])
        plt.show()

# Main

In [0]:
### train top model and save weights
# Trains the dense head on top of the frozen VGG16 base; the resulting
# weights are written to top_model_weights_path.
train_top_model()

In [0]:
### Train Vgg network using top model weights
# NOTE(review): fine_tune() has its load_weights() call commented out, so the
# weights saved by train_top_model() are not actually loaded — confirm intent.
# `history` receives the value returned by fit_generator inside fine_tune().
model,history = fine_tune()

In [0]:
#Load submission data as a Numpy array
df_submission = pd.read_csv('data/test.csv')
X_test = build_matrix(df_submission['ImageId'].values,folder='data')

# Normalise every image on its own mean / std. The training generators use
# samplewise_center and samplewise_std_normalization, so one global mean / std
# over the whole test set (the previous behaviour) feeds the model inputs on a
# different scale than it was trained on. The small constant keeps an
# all-constant (e.g. fully masked) image finite.
# NOTE(review): build_matrix() also applies the segmentation mask, whereas the
# training generators read the unmasked images — confirm this is intended.
sample_axes = tuple(range(1, X_test.ndim))
sub = (X_test - X_test.mean(axis=sample_axes, keepdims=True)) / (
    X_test.std(axis=sample_axes, keepdims=True) + 1e-6)

In [0]:
# model = load_model()
# y = prediction(model)

In [0]:
### make predictions
# Model outputs for every submission image, then the index of the
# highest-scoring output per row.
y_pred = model.predict(sub)
y = y_pred.argmax(axis=1)

In [0]:
### submission for kaggle
# `y` holds one prediction per row of df_submission, in row order (both were
# built from the same ImageId column), so the column can be assigned in one
# step. The previous per-Id .loc loop was quadratic and, with a repeated
# ImageId, overwrote every matching row with the last prediction.
assert len(y) == len(df_submission), "one prediction per submission row expected"
df_submission['Malignant'] = np.asarray(y).astype(int)  # the submission requires integer labels

print(df_submission.head(3))
df_submission.to_csv('data/1106.csv', index=False, sep=',', mode='w')