 # **Product-Based Capstone Project**

Team ID : C22-PS096
Team Member :
* (ML) M7134F1606 - Muhammad Fadli Ramadhan - Politeknik Negeri Sriwijaya
* (ML) M2008F0851 - Adhitya Ghiffari Pramudito - Universitas Gadjah Mada
* (MD) A2191F1821 - Ahmad Ansori Palembani - Universitas Bina Darma
* (MD) A7191F1820 - Muhammad Fharid Akbar - Universitas Bina Darma
* (CC) C2322F2819 - Muhammad Mustafa Kamal - Universitas Syiah Kuala
* (CC) C7457F3068 - Wulan Ayu Rania Sari - Universitas Nahdlatul Ulama Lampung
Final Selected Themes : Human Healthcare & Animal Welfare

Title of the Project : SkinCan : Skin Cancer Detection App

**Import Library**

In [2]:
import shutil
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd 
import os
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
import itertools

%matplotlib inline

import tensorflow as tf
from tensorflow.keras.preprocessing.image import ImageDataGenerator

**Making Directories**

In [3]:
# Root folder of the HAM10000 dataset; the metadata CSV and both image
# folders are read relative to it.
# NOTE(review): the name `dir` shadows the Python builtin of the same name.
# It is kept because other cells reference it.
dir = '../input/skin-cancer-mnist-ham10000/'
# List the contents of the dataset folder (shown as the cell output).
os.listdir(dir)

In [4]:
# Create the working directory that will hold the train/val image folders.
# exist_ok=True keeps the cell re-runnable: os.mkdir raised FileExistsError
# whenever the directory was left over from a previous run.
base_dir = 'base_dir'
os.makedirs(base_dir, exist_ok=True)

In [5]:
# Folder that receives the training images.
# exist_ok=True lets the cell be re-run without raising FileExistsError.
train_dir = os.path.join(base_dir, 'train_dir')
os.makedirs(train_dir, exist_ok=True)

# Folder that receives the validation images.
val_dir = os.path.join(base_dir, 'val_dir')
os.makedirs(val_dir, exist_ok=True)

In [6]:
# Create one sub-folder per lesion class inside train_dir. The folder names
# are the class labels that images are later sorted into.
# NOTE: `df` is a folder path here; the same name is rebound to a DataFrame
# in the data-cleaning cells.
nv, mel, bkl, bcc, akiec, vasc, df = (
    os.path.join(train_dir, class_name)
    for class_name in ('nv', 'mel', 'bkl', 'bcc', 'akiec', 'vasc', 'df')
)

# exist_ok=True keeps the cell re-runnable; os.mkdir raised FileExistsError
# when the folders were left over from a previous run.
for class_dir in (nv, mel, bkl, bcc, akiec, vasc, df):
    os.makedirs(class_dir, exist_ok=True)

In [7]:
# Create one sub-folder per lesion class inside val_dir, using the same
# class names as the training folders.
# NOTE: `df` is a folder path here; the same name is rebound to a DataFrame
# in the data-cleaning cells.
nv, mel, bkl, bcc, akiec, vasc, df = (
    os.path.join(val_dir, class_name)
    for class_name in ('nv', 'mel', 'bkl', 'bcc', 'akiec', 'vasc', 'df')
)

# exist_ok=True keeps the cell re-runnable; os.mkdir raised FileExistsError
# when the folders were left over from a previous run.
for class_dir in (nv, mel, bkl, bcc, akiec, vasc, df):
    os.makedirs(class_dir, exist_ok=True)

In [8]:
# Load the HAM10000 metadata table from the dataset folder.
metadata_path = os.path.join(dir, 'HAM10000_metadata.csv')
df_data = pd.read_csv(metadata_path)

# Preview the first rows (shown as the cell output).
df_data.head()

**Data Cleaning**

In [9]:
# Count the rows recorded for every lesion_id.
rows_per_lesion = df_data.groupby('lesion_id').count()

# Keep only the lesion_id's that have exactly one image associated with
# them, and turn lesion_id back into a regular column.
has_single_image = rows_per_lesion['image_id'] == 1
df = rows_per_lesion[has_single_image].reset_index()

df.head()

In [10]:
# Label every row of df_data according to whether its lesion has a single
# image ('no_duplicates') or several images ('has_duplicates').

def identify_duplicates(x, unique_ids=None):
    """Classify a lesion_id as 'no_duplicates' or 'has_duplicates'.

    Args:
        x: The lesion_id to classify.
        unique_ids: Optional container of the lesion_id's that have exactly
            one image. When omitted it is built from the global ``df`` on
            every call, which is slow when the function is applied to a
            whole column; pass a prebuilt set in that case.

    Returns:
        'no_duplicates' if ``x`` is in ``unique_ids``, otherwise
        'has_duplicates'.
    """
    if unique_ids is None:
        unique_ids = set(df['lesion_id'])

    return 'no_duplicates' if x in unique_ids else 'has_duplicates'


# Build the lookup set once instead of rebuilding a list for every row.
single_image_lesions = set(df['lesion_id'])

# create a new column by applying the function to the lesion_id column
# (Series.apply forwards the extra keyword argument to the function)
df_data['duplicates'] = df_data['lesion_id'].apply(
    identify_duplicates, unique_ids=single_image_lesions)

df_data.head()

In [11]:
# Count how many rows fall into each 'duplicates' category.
df_data['duplicates'].value_counts()

In [12]:
# Keep only the rows labelled 'no_duplicates', i.e. images whose lesion has
# no other image. The val set is drawn from these rows.
df = df_data[df_data['duplicates'] == 'no_duplicates']

# (rows, columns) of the filtered table
df.shape

In [13]:
# Create the val set from df: every lesion in df has a single image, so no
# other image of a val lesion can end up in the train set.
y = df['dx']

# Stratify on the 'dx' column; 17% of df becomes df_val and the other part
# of the split is discarded (assigned to `_`).
_, df_val = train_test_split(df, test_size=0.17, random_state=101, stratify=y)

df_val.shape

In [14]:
# Number of val images per 'dx' class.
df_val['dx'].value_counts()

In [15]:
# The train set is df_data excluding all rows that are in the val set.

def identify_val_rows(x, val_ids=None):
    """Tell whether an image belongs to the val set or the train set.

    Args:
        x: The image_id to classify (converted to ``str`` before lookup).
        val_ids: Optional container of the image_id's in the val set. When
            omitted it is built from the global ``df_val`` on every call,
            which is slow when the function is applied to a whole column;
            pass a prebuilt set in that case.

    Returns:
        'val' if the image is in the val set, otherwise 'train'.
    """
    if val_ids is None:
        val_ids = set(df_val['image_id'])

    return 'val' if str(x) in val_ids else 'train'


# Build the lookup set once instead of rebuilding a list for every row.
val_image_ids = set(df_val['image_id'])

# create a new column by applying the function to the image_id column
# (Series.apply forwards the extra keyword argument to the function)
df_data['train_or_val'] = df_data['image_id'].apply(
    identify_val_rows, val_ids=val_image_ids)

# keep only the train rows
df_train = df_data[df_data['train_or_val'] == 'train']


print(len(df_train))
print(len(df_val))

In [16]:
# Per-class ('dx') image counts of the train and val splits.
print("Training data\n",df_train['dx'].value_counts())
print("Validating data\n",df_val['dx'].value_counts())

**Transferring the Images into the Folders**

In [17]:
# Set the image_id as the index in df_data so that a label can be looked up
# with df_data.loc[image_id, 'dx'].
# NOTE: with inplace=True the 'image_id' column is moved into the index, so
# running this cell a second time raises a KeyError.
df_data.set_index('image_id', inplace=True)

In [18]:
# Paths of the working folders. These repeat the definitions used when the
# folders were created.
base_dir = 'base_dir'

# destination root for the training images
train_dir = os.path.join(base_dir, 'train_dir')

# destination root for the validation images
val_dir = os.path.join(base_dir, 'val_dir')

In [19]:
# Get a list of images in each of the two folders
folder_1 = os.listdir(dir+'ham10000_images_part_1')
folder_2 = os.listdir(dir+'ham10000_images_part_2')

# Get a list of train and val images
train_list = list(df_train['image_id'])
val_list = list(df_val['image_id'])

# Map every file name to the folder that holds it. The map is built once so
# each lookup is a dict access instead of a scan through both listings.
# part_2 is added last, so it wins if a name exists in both folders.
source_folder = {fname: dir+'ham10000_images_part_1' for fname in folder_1}
source_folder.update(
    {fname: dir+'ham10000_images_part_2' for fname in folder_2})


def _copy_images(image_ids, dest_root):
    """Copy each image into dest_root/<dx label>/ and return the ids not found.

    Args:
        image_ids: Iterable of image_id strings (file names without '.jpg').
        dest_root: Folder that contains one sub-folder per 'dx' label.

    Returns:
        List of image_id's that exist in neither source folder.
    """
    missing = []
    for image in image_ids:
        fname = image + '.jpg'
        folder = source_folder.get(fname)
        if folder is None:
            # Report the gap later instead of skipping it silently.
            missing.append(image)
            continue

        # df_data is indexed by image_id, so this returns the image's label
        label = df_data.loc[image, 'dx']
        # source path to image
        src = os.path.join(folder, fname)
        # destination path to image
        dst = os.path.join(dest_root, label, fname)
        # copy the image from the source to the destination
        shutil.copyfile(src, dst)
    return missing


# Transfer the train images
missing_train = _copy_images(train_list, train_dir)

# Transfer the val images
missing_val = _copy_images(val_list, val_dir)

if missing_train or missing_val:
    print('WARNING: images not found in either source folder -',
          len(missing_train), 'train,', len(missing_val), 'val')

In [20]:
# Print the number of train images found in each class folder,
# one count per line.

for class_name in ('nv', 'mel', 'bkl', 'bcc', 'akiec', 'vasc', 'df'):
    class_files = os.listdir('base_dir/train_dir/' + class_name)
    print(len(class_files))

In [21]:
# Print the number of val images found in each class folder,
# one count per line.

for class_name in ('nv', 'mel', 'bkl', 'bcc', 'akiec', 'vasc', 'df'):
    class_files = os.listdir('base_dir/val_dir/' + class_name)
    print(len(class_files))

In [22]:
# Folders read by the image generators
train_path = 'base_dir/train_dir'
valid_path = 'base_dir/val_dir'

# Split sizes, batch sizes and the square input size (in pixels)
num_train_samples = len(df_train)
num_val_samples = len(df_val)
train_batch_size = 10
val_batch_size = 10
image_size = 224

# Number of batches needed to see every sample once. np.ceil returns a
# float, so the result is converted to int: Keras step counts are integers.
train_steps = int(np.ceil(num_train_samples / train_batch_size))
val_steps = int(np.ceil(num_val_samples / val_batch_size))

In [23]:
# Apply MobileNet's own input preprocessing to every image that is loaded.
datagen = ImageDataGenerator(
    preprocessing_function= \
    tf.keras.applications.mobilenet.preprocess_input)

# Training batches, read from the class sub-folders of train_path and
# resized to image_size x image_size.
train_batches = datagen.flow_from_directory(train_path,
                                            target_size=(image_size,image_size),
                                            batch_size=train_batch_size)

# Validation batches used while fitting the model.
valid_batches = datagen.flow_from_directory(valid_path,
                                            target_size=(image_size,image_size),
                                            batch_size=val_batch_size)

# Evaluation batches: the same folder as valid_batches, one image per batch.
# Note: shuffle=False causes the test dataset to not be shuffled, so the
# prediction order can be compared with test_batches.classes.
test_batches = datagen.flow_from_directory(valid_path,
                                            target_size=(image_size,image_size),
                                            batch_size=1,
                                            shuffle=False)

**Modeling with Pre-Trained Model: MobileNet**

In [24]:
# Instantiate MobileNet with its default arguments.
# NOTE(review): presumably this loads the pre-trained ImageNet weights
# (the Keras default) — confirm for the installed version.
mobile = tf.keras.applications.mobilenet.MobileNet()

In [25]:
# Print the layer-by-layer summary of the base MobileNet model.
mobile.summary()

In [26]:
# Number of layers in the base MobileNet model (shown as the cell output).
len(mobile.layers)

In [27]:
# CREATE THE MODEL ARCHITECTURE

# Take the output of the 6th layer from the end of MobileNet, i.e. exclude
# the last 5 layers of the above model.
# NOTE(review): the original note says this is the global average pooling
# layer — confirm against mobile.summary(), since the layer list may differ
# between Keras versions.
x = mobile.layers[-6].output

# Flatten the output layer to 1 dimension
x = tf.keras.layers.Flatten()(x)
# Two new fully connected layers on top of the MobileNet features
x = tf.keras.layers.Dense(64, activation='relu')(x)
x = tf.keras.layers.Dense(256, activation='relu')(x)
# Dropout in front of the output layer
x = tf.keras.layers.Dropout(0.25)(x)
# New dense layer for predictions: 7 corresponds to the number of classes
predictions = tf.keras.layers.Dense(7, activation='softmax')(x)

# inputs=mobile.input selects the input layer, outputs=predictions refers to the
# dense layer we created above.

model = tf.keras.Model(inputs=mobile.input, outputs=predictions)

In [28]:
# Print the layer-by-layer summary of the new model.
model.summary()

In [29]:
# We need to choose how many layers we actually want to be trained.

# Here we are freezing the weights of all layers except the
# last 25 layers in the new model.
# The last 25 layers of the model will be trained.

for layer in model.layers[:-25]:
    layer.trainable = False

# Alternative (not used): model.trainable = False would freeze every layer,
# so none of the pre-trained weights would be re-trained.

In [30]:
# Adam with a small learning rate (1e-5). Categorical cross-entropy is the
# loss for the 7-way softmax output; accuracy is tracked as a metric.
model.compile(tf.keras.optimizers.Adam(learning_rate=0.00001), loss='categorical_crossentropy', 
              metrics=['accuracy'])

In [31]:
# Print the mapping from class name to the integer index the generator
# assigned to it.
print(valid_batches.class_indices)

In [32]:
# Train for 15 epochs, validating on valid_batches after each epoch.
# The return value of fit is kept in `train`; its .history dict is plotted
# in the evaluation section.
train = model.fit(train_batches, steps_per_epoch=train_steps, 
                    validation_data=valid_batches,
                    validation_steps=val_steps,
                    epochs=15, verbose=1)

**Model Evaluation**

In [33]:
# Show the names of the model's metrics (presumably in the order
# model.evaluate returns them — confirm for the installed Keras version).
model.metrics_names

In [34]:
# Evaluate on test_batches. Its batch_size is 1, so len(df_val) steps means
# one step per val image (assuming every val image was copied to valid_path).
val_loss, val_acc = model.evaluate(test_batches, steps=len(df_val))

print('val_loss:', val_loss)
print('val_acc:', val_acc)

In [35]:
# Evaluate on both iterators. test_batches and valid_batches read the same
# folder (valid_path); they differ only in batch size and shuffling.
loss, accuracy = model.evaluate(test_batches)

# valid_batches yields several images per step, so one full pass takes
# len(valid_batches) steps. Passing the number of images as the step count
# would request many more batches than one pass over the data contains.
loss_v, accuracy_v = model.evaluate(valid_batches, steps=len(valid_batches))
print("Validation: accuracy = %f  ;  loss_v = %f" % (accuracy_v, loss_v))
print("Test: accuracy = %f  ;  loss = %f" % (accuracy, loss))

In [36]:
# Plot the training curves side by side: accuracy on the left, loss on the
# right. Training values are drawn in red, validation values in blue.
plt.figure(figsize=(15,5))

panels = [('accuracy', 'Accuracy'), ('loss', 'Loss')]
for position, (metric, axis_label) in enumerate(panels, start=1):
    plt.subplot(1, 2, position)
    plt.plot(train.history[metric], 'r', label=metric)
    plt.plot(train.history['val_' + metric], 'b', label='val_' + metric)
    plt.xlabel('Epoch')
    plt.ylabel(axis_label)
    plt.legend()

plt.show()

In [37]:
# True class index of every file in test_batches (the iterator's `classes`
# attribute); used as the ground truth for the confusion matrix.
test_labels = test_batches.classes
# print(test_labels)
# Mapping from class name to class index
print(test_batches.class_indices)

In [38]:
# Predict the class scores for test_batches, one step per val image.
# NOTE: this rebinds `predictions`, the name also used for the model's
# output layer when the architecture was built.
predictions = model.predict(test_batches, steps=len(df_val), verbose=1)

In [39]:
# Compare the shapes of the predictions and of the true labels.
print(predictions.shape,"\n",test_labels.shape)

In [40]:
def plot_confusion_matrix(cm, classes,title,
                          normalize=False,cmap=plt.cm.Reds):
    """
    Print and plot a confusion matrix.

    Args:
        cm: 2-D NumPy array; rows are true labels, columns are predicted
            labels.
        classes: Sequence of class names used as tick labels on both axes.
        title: Title of the plot.
        normalize: If True, every row is divided by its sum before the
            matrix is printed and plotted. A row whose sum is zero (a class
            without samples) is shown as zeros instead of NaN.
        cmap: Matplotlib colormap used for the image.
    """
    if normalize:
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        # Divide only where the row total is non-zero; the remaining rows
        # keep the zeros from `out`, which avoids a division by zero.
        cm = np.divide(cm.astype('float'), row_totals,
                       out=np.zeros(cm.shape, dtype=float),
                       where=row_totals != 0)
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    # Write every cell's value on top of the image; cells above half of the
    # maximum get white text, the others black text.
    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.tight_layout()

In [41]:
# Class names ordered by the integer index the generator assigned to them,
# so the tick labels always match the label ids in test_labels.
class_indices = test_batches.class_indices
cm_plot_labels = sorted(class_indices, key=class_indices.get)

# labels= fixes the matrix to one row/column per class, even if a class
# appears in neither the true nor the predicted labels.
cm = confusion_matrix(test_labels, predictions.argmax(axis=1),
                      labels=list(range(len(cm_plot_labels))))

plot_confusion_matrix(cm, cm_plot_labels, title='Confusion Matrix')

In [42]:
# Take the images of the batch at index 2 of test_batches; the iterator was
# created with batch_size=1, so the batch holds a single image.
test_img = test_batches[2][0]
print("Test image shape: ",test_img.shape)
# NOTE: this rebinds `predictions`, which until now held the predictions
# for all of test_batches.
predictions= model.predict(test_img)
print("predictions: ",predictions)
def max_ypred(y_pred):
    """Return the index of the highest score in the first row of y_pred.

    Args:
        y_pred: Array-like of class scores, either a batch of shape
            (n, classes) or a single 1-D score vector. Only the first row
            of a batch is used.

    Returns:
        The index (int) of the largest score; the first one on ties.
    """
    # Use the argument rather than the global `predictions`, so the result
    # depends only on what the caller passes in.
    scores = np.atleast_2d(y_pred)[0]
    return int(np.argmax(scores))

print(' lesions type: {}'.format(max_ypred(predictions)))
# test_img went through the MobileNet preprocessing, which scales pixel
# values to [-1, 1]. Map them back to [0, 1] so imshow displays the colours
# instead of clipping the negative values. test_img[0] drops the batch axis.
plt.imshow((test_img[0] + 1) / 2)