# Melanoma detection using tranfer learning and image augmentation 

### Inside this project we will use the feature detection of a VGG16 Neural network trained into the IMAGENET dataset and image augmentaion process to increase the number of cases to be used for treining and testing 

##### References:

> https://www.kaggle.com/amyjang/tensorflow-transfer-learning-melanoma

> https://sol.sbc.org.br/index.php/sbcas/article/view/6272/6170

In [56]:
# Import modules that will be used into the project 

import tensorflow as tf
import os
import zipfile
import random
from shutil import copyfile
import numpy as np
from tabulate import tabulate



In [2]:
# define function to create working director for data

def create_directory(source:str, object_list:list):
    """Check if the directory desired already exist into the provided path and create it otherwise"""
    for obj in object_list:
        created_path = os.path.join(source, obj)
        if os.path.exists(created_path):
            print("Directory path "+ str(created_path)+ " already exist")
        else: 
            os.mkdir(created_path)
            print("Directory " + str(created_path) + " created ")


# define function to validate if image is not corrupted and split data into test and training sets

def split_data(source:str, training:str, testing:str, split_size:float):
    """
    Function to validate if data is not corrupted and 
    split it into training and test sets. 
    """
    data_list = os.listdir(source)
    random.seed(10)
    train_list = random.sample(data_list, int(len(data_list) * split_size), )
    for pic in data_list:
        pic_path = os.path.join(source,pic)
        if os.path.getsize(pic_path) > 0: # file not empty/corrupted
             if pic in train_list:
                 training_path = os.path.join(training,pic)
                 copyfile(pic_path,training_path)
                 training_path = '' # clear path
             else:
                 testing_path = os.path.join(testing, pic)
                 copyfile(pic_path, testing_path)
                 testing_path = '' # clear path
        pic_path = '' # clear path
    print(f"Dataset cleanse and sorting completed for {source}")     
        

In [6]:
# define source zip, folders to be used and build directories
 
local_zip = "./Base.zip" # path where your zip images are located (global or related to this file)
zip_ref = zipfile.ZipFile(local_zip, 'r') # configure as read
zip_ref.extractall('/tmp') # extract information to temporary 
zip_ref.close()

In [27]:
# create working directories
create_directory("./", ["train", "test"]) #folders for test and train data
create_directory("./train", ["Positive","Negative"]) # folders for positive and negative scenarios inside train
create_directory("./test", ["Positive","Negative"]) #folders for positive and negative scenarios inside test

Directory ./train created 
Directory ./test created 
Directory ./train/Positive created 
Directory ./train/Negative created 
Directory ./test/Positive created 
Directory ./test/Negative created 


In [28]:
# split data between test and train structures

split_data("/tmp/Base/Positivos", "./train/Positive", "./test/Positive", 0.8) # split for positive images
split_data("/tmp/Base/Negativos", "./train/Negative", "./test/Negative", 0.8) # split for negative images

Dataset cleanse and sorting completed for /tmp/Base/Positivos
Dataset cleanse and sorting completed for /tmp/Base/Negativos


In [4]:
# load VGG16 model pre-trained with IMAGENET
IMAGE_RESIZE = [256, 256] # define size of image to be fed to model

base_model = tf.keras.applications.VGG16(input_shape=(*IMAGE_RESIZE,3),
                                                        include_top=False, # do not include layer with outputs
                                                        weights='imagenet' # which database weights we want ot import
                                                        )
base_model.trainable = False # lock base model so it is not trained again

base_model.summary() # show base model structure

Model: "vgg16"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 256, 256, 3)]     0         
_________________________________________________________________
block1_conv1 (Conv2D)        (None, 256, 256, 64)      1792      
_________________________________________________________________
block1_conv2 (Conv2D)        (None, 256, 256, 64)      36928     
_________________________________________________________________
block1_pool (MaxPooling2D)   (None, 128, 128, 64)      0         
_________________________________________________________________
block2_conv1 (Conv2D)        (None, 128, 128, 128)     73856     
_________________________________________________________________
block2_conv2 (Conv2D)        (None, 128, 128, 128)     147584    
_________________________________________________________________
block2_pool (MaxPooling2D)   (None, 64, 64, 128)       0     

In [64]:
# define output bias so the imbalance between positive nad negative cases inside training data

positive = len(os.listdir("./train/Positive"))
negative = len(os.listdir("./train/Negative"))

output_bias = np.log([positive/negative]) # define a initial bias to correct the model due sample imbalance

# creating weights for the classes - 0:Negative, 1:Positive -> tensorflow will autogenerate and encode labels using  
# alphabetical order. 

train_size = negative + positive
weight_0 = (1.0/negative)*(train_size)/2.0
weight_1 = (1.0/positive)*(train_size)/2.0

weights = {0:weight_0, 1:weight_1} # dictionary to be added into model compilation

print(f'Negative weight: {weights[0]}')
print(f'Positive weight: {weights[1]}')



# Is recommended to make this computation after augmentation since the simetry of methods between positive and negative can not
# be garanteed 



Negative weight: 0.75
Positive weight: 1.5


In [65]:
# add output layers to the model and bias

model = tf.keras.models.Sequential([
    base_model, # add the pre-trained model of VGG16
    tf.keras.layers.Flatten(), # transform values from matrix to arrays
    tf.keras.layers.Dense(512, activation= 'relu'), 
    tf.keras.layers.Dropout(0.2), # deactivate random conections of neural model during training to avoid overfitting
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dropout(0.1),
    tf.keras.layers.Dense(8, activation='relu'),
    tf.keras.layers.Dense(1,activation='sigmoid', bias_initializer = tf.keras.initializers.Constant(output_bias)) # output layer using weight initializer to point class imbalance
])

METRICS =[ # add metrics to be used to eveluate training
    tf.keras.metrics.TruePositives(name='tp'),
    tf.keras.metrics.FalsePositives(name='fp'),
    tf.keras.metrics.TrueNegatives(name='tn'),
    tf.keras.metrics.FalseNegatives(name='fn'),
    tf.keras.metrics.BinaryAccuracy(name='accuracy'),
    tf.keras.metrics.Precision(name='precision'),
    tf.keras.metrics.Recall(name='recall'),
    tf.keras.metrics.AUC(name='auc'),
    tf.keras.metrics.AUC(name='prc', curve='PR')

    
]
model.compile( # generate graph to be trained
    optimizer=tf.keras.optimizers.Adam(), # define optimizer function to search for weights
    loss = tf.keras.losses.BinaryCrossentropy(), # define loss funtion to be minimized
    metrics=METRICS
)

In [71]:
# create data generators to configure data to train

train_datagen = tf.keras.preprocessing.image.ImageDataGenerator(
    rescale=1./255., # scale values to be into range 0 to 1
    rotation_range = 90, # rotate image
    horizontal_flip = True, #flip image
    vertical_flip = True, # flip image, 
    zoom_range = 0.3, #allow zoom in the image to create augmentaion 
    width_shift_range = 0.05, # allow move image in the horizontal  
    height_shift_range = 0.05, # allow move image in the vertical  
    fill_mode='constant', # fill empty space by move with cval
    cval = 0, # define constant value to be used
    preprocessing_function = tf.keras.applications.vgg16.preprocess_input # chnage image from RGB to BGR
) 

test_datagen = tf.keras.preprocessing.image.ImageDataGenerator(
    rescale=1./255., # scale values to be into range 0 to 1
    preprocessing_function = tf.keras.applications.vgg16.preprocess_input # chnage image from RGB to BGR
) 

train_generator = train_datagen.flow_from_directory(
    directory= "./train", # directory of training images
    target_size=(256,256), # all images will be rezide to 256X256
    class_mode= 'binary', # define label as binary
    shuffle=True, # shuffle order of images
    seed=10, # set seed tp allow reproduction 
    batch_size=5 # number of images shown in each step
    )

test_generator = train_datagen.flow_from_directory(
    directory= "./test", # directory of training images
    target_size=(256,256), # all images will be rezide to 256X256
    class_mode= 'binary', # define label as binary
    shuffle=True, # shuffle order of images
    seed=10, # set seed tp allow reproduction 
    batch_size=5 # number of images shown in each step
    )


Found 54 images belonging to 2 classes.
Found 14 images belonging to 2 classes.


In [72]:
checkpoint_cb = tf.keras.callbacks.ModelCheckpoint("melanoma_model.h5",
                                                    monitor= 'val_precision',
                                                    save_best_only=True) # save best model based on the validation value of the precision

history = model.fit(
      train_generator, # generator of images based on the original ones (augmentation)
      steps_per_epoch=10,  # number of image groups shown 
      epochs=70, # number of repetions of training
      validation_data=test_generator, # gererator with validation data 
      validation_steps= 2, # number of image groups shown in validation 
      shuffle=True, # shuffle information to better learning of pattern in training
      callbacks=[checkpoint_cb], # insert callback to save best model
      class_weight= weights # add class weights to aid avoid issues with class imbalance (control>>melanoma samples)
      )


Epoch 1/70
Epoch 2/70
Epoch 3/70
Epoch 4/70
Epoch 5/70
Epoch 6/70
Epoch 7/70
Epoch 8/70
Epoch 9/70
Epoch 10/70
Epoch 11/70
Epoch 12/70
Epoch 13/70
Epoch 14/70
Epoch 15/70
Epoch 16/70
Epoch 17/70
Epoch 18/70
Epoch 19/70
Epoch 20/70
Epoch 21/70
Epoch 22/70
Epoch 23/70
Epoch 24/70
Epoch 25/70
Epoch 26/70
Epoch 27/70
Epoch 28/70
Epoch 29/70
Epoch 30/70
Epoch 31/70
Epoch 32/70
Epoch 33/70
Epoch 34/70
Epoch 35/70
Epoch 36/70
Epoch 37/70
Epoch 38/70
Epoch 39/70
Epoch 40/70
Epoch 41/70
Epoch 42/70
Epoch 43/70
Epoch 44/70
Epoch 45/70
Epoch 46/70
Epoch 47/70
Epoch 48/70
Epoch 49/70
Epoch 50/70
Epoch 51/70
Epoch 52/70
Epoch 53/70
Epoch 54/70
Epoch 55/70
Epoch 56/70
Epoch 57/70
Epoch 58/70
Epoch 59/70
Epoch 60/70
Epoch 61/70
Epoch 62/70
Epoch 63/70
Epoch 64/70
Epoch 65/70
Epoch 66/70
Epoch 67/70
Epoch 68/70
Epoch 69/70
Epoch 70/70


In [73]:
model = tf.keras.models.load_model("melanoma_model.h5") # load values of best model from file saved during training
validation = model.evaluate(test_generator, return_dict=True) # evaluate model with test data to have access to values
tp = validation['tp'] # access true positive 
fp = validation['fp'] # access false positive
tn = validation['tn'] # access true negative
fn = validation['fn'] # access false negative 



In [74]:
# Calculate metrics based on model evaluation data 

# accuracy: total amount of correct prediction over the total of predictions
acc = (tp+tn)/(tp+fp+fn+tn) 

# precision: correct predicted positive observation over the total of predicted positive observations
try: # cannot devide 0 by 0
    prec = tp/(tp+fp) 
except:
    prec = 0

# recall/sensitivity: correct predicted positive observations over all the positive class(TP+FN)
try: # cannot devide 0 by 0
    rec = tp/(tp+fn) 
except:
    rec = 0

# specificity: correct predicted negative observations over all the negatives (tn+fp)
try: # cannot devide 0 by 0
    spec = tn/(tn+fp)
except:
    spec = 0

# F1 score: weighted average of precision and recall
try: # cannot devide 0 by 0
    f1 = 2*(rec*prec)/(rec+prec)
except:
    f1 = 0

# print metrics into tabular form
metric = [['Accuracy', acc],['Precision', prec],['Recall', rec],['Specificity', spec],['F1 score', f1]]
print(tabulate(metric, headers=['Metric', 'Value']))




Metric          Value
-----------  --------
Accuracy     0.714286
Precision    1
Recall       0.2
Specificity  1
F1 score     0.333333
