# Prepare Project

Designed to run on Google Colab: several cells use absolute `/content/...` paths.

## Install libraries

In [None]:
# Libraries to download datasets
!pip install sla-cli
!pip install alive-progress
!pip install patool
!pip install fuzzywuzzy

In [None]:
# Print a summary of the datasets that can be fetched with sla-cli
!sla-cli ls -v all

## Import libraries

In [None]:
import os
import shutil
import random
import zipfile
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from tqdm.notebook import tqdm
from sklearn.utils import class_weight

sns.set()

In [None]:
import tensorflow as tf
from tensorflow.keras import optimizers
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping, ModelCheckpoint
from tensorflow.keras.layers import Conv2D, GlobalAveragePooling2D, MaxPooling2D, Dense, Dropout, Flatten, BatchNormalization

from tensorflow.keras.applications.nasnet import NASNetLarge
from tensorflow.keras.applications.inception_v3 import InceptionV3
from tensorflow.keras.applications.efficientnet import EfficientNetB7
from tensorflow.keras.applications.inception_resnet_v2 import InceptionResNetV2

## Helper functions

### Visualization functions

In [None]:
def show_diagnosis(dataset, id='image', random_state=None,
                   training_path='/content/data/ISIC2018_Task3_Training_Input/'):
  '''
  Plots one randomly chosen example image of each diagnosis in the training
  dataset, followed by one extra example of a randomly chosen diagnosis.

  Parameters:
    dataset: DataFrame with an image-id column plus one indicator column per
      diagnosis (a value of 1 marks the images of that diagnosis).
    id: name of the image-id column; every other column is treated as a
      diagnosis.
    random_state: seed passed to random.seed for reproducible examples.
    training_path: folder holding the '<image id>.jpg' files, with or without
      a trailing path separator.
  '''

  # Set random seed for reproducible results
  random.seed(random_state)

  # Create figure
  fig = plt.figure(figsize=(12, 7))
  plt.title('Diagnostic Examples', fontsize=13)
  plt.grid()
  plt.axis(False)

  # Diagnosis names as a plain list: random.choice expects an ordinary
  # sequence, and array-likes can trip over their ambiguous truth value
  diagnosis = dataset.drop(id, axis=1).columns.tolist()

  # Grid with room for every diagnosis plus the extra example
  # (2 rows x 4 columns for the seven ISIC 2018 classes)
  columns = 4
  rows = -(-(len(diagnosis) + 1) // columns)

  def plot_example(label, position):
    '''Draws a random image of diagnosis `label` in grid cell `position`.'''
    candidates = dataset.loc[dataset[label] == 1, id].values.tolist()
    idx = random.choice(candidates)

    # Read image (os.path.join copes with a missing trailing separator)
    img = plt.imread(os.path.join(training_path, idx + '.jpg'))

    # Create subplot without grid lines or axes ticks
    axis = fig.add_subplot(rows, columns, position)
    axis.grid(False)
    axis.set_xticks([])
    axis.set_yticks([])
    axis.set_title(label)
    axis.imshow(img)

  # One example for every diagnosis
  for index, col in enumerate(diagnosis):
    plot_example(col, index + 1)

  # Add one more example in the cell after the last diagnosis
  plot_example(random.choice(diagnosis), len(diagnosis) + 1)

  plt.show()

def plot_model_history(model_history, save=True):
  '''
  Plot the training and validation accuracy (left panel) and loss (right
  panel) per epoch.

  Parameters:
    model_history: object whose `history` dict holds the per-epoch lists
      'accuracy', 'val_accuracy', 'loss' and 'val_loss' (e.g. the value
      returned by model.fit).
    save: when true, the figure is also written to 'Training history.png'.
  '''

  history = model_history.history
  fig, axs = plt.subplots(1, 2, figsize=(16, 7))

  panels = [(axs[0], 'accuracy', 'Model Accuracy', 'Accuracy'),
            (axs[1], 'loss', 'Model Loss', 'Loss')]

  for axis, metric, title, ylabel in panels:
    train_values = history[metric]
    valid_values = history['val_' + metric]
    n_epochs = len(train_values)

    axis.plot(range(1, n_epochs + 1), train_values)
    axis.plot(range(1, len(valid_values) + 1), valid_values)
    axis.set_title(title)
    axis.set_ylabel(ylabel)
    axis.set_xlabel('Epoch')

    # Roughly ten ticks over the epochs. The step belongs inside np.arange;
    # a second positional argument of set_xticks is not a step.
    step = max(1, n_epochs // 10)
    axis.set_xticks(np.arange(1, n_epochs + 1, step))
    axis.legend(['Training', 'Validation'], loc='best')

  if save:
    fig.savefig('Training history.png')

  plt.show()

# Main Project

## Download data

ISIC 2018 Dataset (HAM10000)

In [None]:
# Prepare directory to download data
# All archives are stored under <current working directory>/data.
phases = ['Training', 'Validation', 'Test']
data_root = os.path.join(os.getcwd(), 'data')
# NOTE(review): later cells read images from data_root/ISIC2018_Task3_<phase>_Input
# rather than from these per-phase folders - confirm whether data_dir is still needed.
data_dir = {phase:os.path.join(data_root, phase) for phase in phases}

# Define download function
def download(url, destination_folder='.'):
  '''
  Downloads `url` into `destination_folder` with wget.

  wget flags: -nc skips files that already exist, -q silences the log,
  --show-progress keeps the progress bar, -P sets the target directory.
  '''
  !wget -nc -q --show-progress $url -P $destination_folder

# Download Training.zip, Validation.zip, Test.zip and ground truth labels
# Image archives exist for all three phases.
for phase in phases:
  download(f'https://isic-challenge-data.s3.amazonaws.com/2018/ISIC2018_Task3_{phase}_Input.zip', data_root)

# Ground-truth archives are fetched for Training and Validation only (phases[:2]).
for phase in phases[:2]:
  download(f'https://isic-challenge-data.s3.amazonaws.com/2018/ISIC2018_Task3_{phase}_GroundTruth.zip', data_root)

In [None]:
# Unzip Training.zip, Validation.zip, Test.zip and ground truth labels
# Image archives exist for every phase, ground-truth archives only for
# Training and Validation.
archives = [(phase, 'Input') for phase in phases] + [(phase, 'GroundTruth') for phase in phases[:2]]

for phase, kind in archives:
  archive_name = f'ISIC2018_Task3_{phase}_{kind}'

  # The paths used later in this notebook (e.g. data_root/ISIC2018_Task3_Training_Input)
  # show that each archive unpacks into a folder named after it, so that folder
  # is what marks the archive as already extracted.
  # Delete the folder to force a fresh extraction after an interrupted run.
  if os.path.exists(os.path.join(data_root, archive_name)):
    continue

  with zipfile.ZipFile(os.path.join(data_root, archive_name + '.zip'), 'r') as myzip:
    for file in tqdm(myzip.namelist(), desc=f'Extracting {archive_name}.zip'):
      myzip.extract(member=file, path=data_root)

Download additional data: BCN20000

In [None]:
# Download the BCN20000 dataset with sla-cli
!sla-cli download bcn_20000

In [None]:
# Load the BCN20000 metadata; the next cell uses its `image_name` and `dx` columns
df = pd.read_csv('/content/bcn_20000/metadata.csv')

In [None]:
# Map the BCN20000 diagnosis names onto the ISIC 2018 class labels.
# Diagnoses that are not listed map to NaN and are dropped below.
diagn_map = {'melanoma': 'MEL', 'basal cell carcinoma': 'BCC', 'seborrheic keratosis': 'BKL',
             'actinic keratosis': 'AKIEC', 'solar lentigo': 'BKL', 'dermatofibroma': 'DF',
             'vascular lesion': 'VASC'}

# Build the label table without modifying `df`, so the cell can be re-run:
# overwriting df['dx'] in place would map the already converted labels to NaN
# on a second run and drop every row.
bcn_data = (
    df.assign(dx=df['dx'].map(diagn_map))
      .dropna(subset=['dx'])
      .reset_index(drop=True)
      .loc[:, ['image_name', 'dx']]
)

In [None]:
# Merge the BCN20000 images into the ISIC 2018 training image folder
train_dir = '/content/data/ISIC2018_Task3_Training_Input'
bcn_dir = '/content/bcn_20000/images'

file_names = os.listdir(bcn_dir)
for file_name in file_names:
    source_path = os.path.join(bcn_dir, file_name)
    shutil.move(source_path, train_dir)

Download additional data: PAD-UFES-20

In [None]:
# Download the PAD-UFES-20 dataset with sla-cli.
# An error reported while cleaning up doesn't affect the images.
!sla-cli download pad_ufes_20

In [None]:
# Load the PAD-UFES-20 metadata; the next cell uses its `img_id` and `diagnostic` columns
df2 = pd.read_csv('/content/PAD_UFES_20/metadata.csv')

In [None]:
# Map the PAD-UFES-20 diagnosis codes onto the ISIC 2018 class labels.
# Codes that are not listed map to NaN and are dropped below.
diagn_map2 = {'BCC': 'BCC', 'ACK': 'AKIEC', 'SEK': 'BKL', 'MEL': 'MEL'}

# Build the label table without modifying `df2`, so the cell can be re-run:
# overwriting df2['diagnostic'] in place would, on a second run, map the
# converted 'AKIEC' and 'BKL' labels to NaN and silently drop those rows.
pad_data = (
    df2.assign(diagnostic=df2['diagnostic'].map(diagn_map2))
       .dropna(subset=['diagnostic'])
       .reset_index(drop=True)
       .loc[:, ['img_id', 'diagnostic']]
)

In [None]:
# Merge the PAD-UFES-20 images into the ISIC 2018 training image folder
train_dir = '/content/data/ISIC2018_Task3_Training_Input'
pad_dir = '/content/PAD_UFES_20/images'

file_names = os.listdir(pad_dir)
for file_name in file_names:
    source_path = os.path.join(pad_dir, file_name)
    shutil.move(source_path, train_dir)

## Load and explore data

In [None]:
# Load the ground-truth tables of the training and validation phases
train_csv = os.path.join(data_root, 'ISIC2018_Task3_Training_GroundTruth/ISIC2018_Task3_Training_GroundTruth.csv')
val_csv = os.path.join(data_root, 'ISIC2018_Task3_Validation_GroundTruth/ISIC2018_Task3_Validation_GroundTruth.csv')

train_labels = pd.read_csv(train_csv)
val_labels = pd.read_csv(val_csv)

In [None]:
# Integrate all data
def one_hot_to_labels(one_hot_labels):
  '''
  Converts an ISIC ground-truth table (an `image` column followed by one
  indicator column per diagnosis) into a table with the image file name
  (`img_id`) and the name of the highest-valued diagnosis column (`diagnosis`).
  '''
  return pd.DataFrame({
      'img_id': one_hot_labels['image'] + '.jpg',
      'diagnosis': one_hot_labels.iloc[:, 1:].idxmax(axis=1),
  })

train_data = one_hot_to_labels(train_labels)
val_data = one_hot_to_labels(val_labels)

# The additional datasets get the same two columns. `bcn_data` and `pad_data`
# are left untouched so that this cell can be re-run without a KeyError.
bcn_labels = pd.DataFrame({
    'img_id': bcn_data['image_name'] + '.jpg',
    'diagnosis': bcn_data['dx'],
})

# PAD-UFES-20 ids are used as they are (no '.jpg' is appended)
pad_labels = pad_data.rename(columns={'diagnostic': 'diagnosis'})

# Create merged dataset
train_all = pd.concat([train_data, bcn_labels, pad_labels], axis=0).reset_index(drop=True)

Check for null values.



In [None]:
# Number of missing values in each column of the training ground truth
train_labels.isnull().sum()

In [None]:
# Summary statistics of the training ground-truth table
train_labels.describe()

## Data Visualisation

Show class distribution before and after integrating additional data

In [None]:
# Diagnosis counts in the ISIC 2018 training labels only
fig, ax = plt.subplots(figsize=(10, 7))

sns.countplot(data=train_data, x='diagnosis', ax=ax)
ax.set_title('Distribution of skin diseases')

plt.show()

In [None]:
# Diagnosis counts after merging in the additional datasets
fig, ax = plt.subplots(figsize=(10, 7))

sns.countplot(data=train_all, x='diagnosis', ax=ax)
ax.set_title('Distribution of skin diseases')

plt.show()

Compare percentages

In [None]:
# Share of each diagnosis (in percent) before the additional data is merged in
initial_share = 100*train_data['diagnosis'].value_counts(normalize=True).to_frame()
print(f"Initial percentage of each diagnosis:\n{initial_share}")

# Share of each diagnosis (in percent) in the merged dataset
final_share = 100*train_all['diagnosis'].value_counts(normalize=True).to_frame()
print(f"\nFinal percentage of each diagnosis:\n{final_share}")

## Diagnosis Visualisation

In [None]:
# Plot example images of the diagnoses in the training labels (fixed seed for reproducibility)
show_diagnosis(train_labels, random_state=42)

## Image Augmentation

In [None]:
# Selects the square image size produced by the generators below
# (0 gives 224x224, any other value gives 299x299).
# Set 0 if you want to use EfficientNet or other models with input 224.
# Set 1 if you want to use InceptionV3 or other models with input 299.
model_id = 0

In [None]:
# Load data in batches with an ImageDataGenerator

# Augmenting generator; validation_split reserves 20% of the dataframe rows
# for the 'validation' subset.
datagen_train = ImageDataGenerator(
                    rotation_range=20,
                    width_shift_range=0.1,
                    height_shift_range=0.1,
                    brightness_range=None,
                    shear_range=0.2,
                    zoom_range=0.2,
                    channel_shift_range=0.0,
                    fill_mode='nearest',
                    cval=0.0,
                    horizontal_flip=True,
                    vertical_flip=True,
                    rescale=1.0/255.0,
                    preprocessing_function=None,
                    validation_split=0.2)

# Evaluation generator: rescaling only, no augmentation
datagen_test = ImageDataGenerator(
                    rescale=1.0/255.0)

# Shape of the images (lxl)
# `==` compares the value; `is` compares object identity, which is not a
# reliable test for an int literal.
if model_id == 0:
  l = 224
else:
  l = 299

batch_size = 128

# Training Generators
train_batches = datagen_train.flow_from_dataframe(dataframe=train_all, directory=os.path.join(data_root, 'ISIC2018_Task3_Training_Input'),
                                                  x_col='img_id', y_col='diagnosis', class_mode='categorical', batch_size=batch_size, shuffle=True,
                                                  target_size= (l, l), subset='training')

# NOTE(review): this subset comes from the augmenting generator, so the
# validation images are augmented as well - confirm this is intended.
valid_batches = datagen_train.flow_from_dataframe(dataframe=train_all, directory=os.path.join(data_root, 'ISIC2018_Task3_Training_Input'),
                                                  x_col='img_id', y_col='diagnosis', class_mode='categorical', batch_size=batch_size, shuffle=True,
                                                  target_size= (l, l), subset='validation')

# Evaluation Generators
# shuffle=False keeps the batches in a fixed order.
test_batches = datagen_test.flow_from_directory(data_root, batch_size=batch_size,
                                                target_size=(l, l), shuffle=False, classes = ['ISIC2018_Task3_Test_Input'])

valid_ind = datagen_test.flow_from_dataframe(dataframe=val_data, directory=os.path.join(data_root, 'ISIC2018_Task3_Validation_Input'),
                                             x_col='img_id', y_col='diagnosis', class_mode='categorical', batch_size=batch_size,
                                             shuffle=False, target_size= (l, l))


# Sanity check on one training batch. The built-in next() is used because the
# iterator's own .next() method is not available in every Keras version.
batchX, batchy = next(train_batches)
print('Batch shape=%s, min=%.3f, max=%.3f' % (batchX.shape, batchX.min(), batchX.max()))

## Model Training

Create model from scratch to serve as a baseline classifier.

In [None]:
# Baseline CNN: two convolution stages followed by a dense classifier head
model = Sequential()

# The input size follows `l`, so the model always matches the
# target_size=(l, l) of the image generators.
model.add(Conv2D(filters=64, kernel_size = (3,3), activation="relu", input_shape=(l, l, 3)))
model.add(Conv2D(filters=64, kernel_size = (3,3), activation="relu"))

model.add(MaxPooling2D(pool_size=(2,2)))
model.add(BatchNormalization())
model.add(Conv2D(filters=128, kernel_size = (3,3), activation="relu"))
model.add(Conv2D(filters=128, kernel_size = (3,3), activation="relu"))

model.add(MaxPooling2D(pool_size=(2,2)))
model.add(BatchNormalization())

model.add(Flatten())
model.add(Dense(256,activation="relu"))

# One output per diagnosis class
model.add(Dense(7 ,activation="softmax"))

optimizer=optimizers.Adam(learning_rate=0.0001)
model.compile(optimizer=optimizer, loss="categorical_crossentropy", metrics=['accuracy', 'AUC'], weighted_metrics=['accuracy'])

In [None]:
# Callback functions

# Earlystop: stop after 20 epochs without a better val_loss
earlystop_callback = EarlyStopping(monitor = 'val_loss',
                                   min_delta = 0,
                                   patience = 20,
                                   verbose = 1,
                                   restore_best_weights = True)

# Save the most accurate model's weights
# filepath names a weights file; the bare '/content/' directory is not a file name.
checkpoint_callback = ModelCheckpoint(filepath='/content/best_model.weights.h5',
                                      save_weights_only=True,
                                      monitor='val_weighted_accuracy',
                                      mode='max',
                                      save_best_only=True)

my_callbacks = [earlystop_callback, checkpoint_callback]

In [None]:
# Weighted loss
# 'balanced' weights for the diagnosis labels of the merged dataset.
# The arguments are passed by keyword: newer scikit-learn versions accept
# `classes` and `y` as keyword arguments only.
class_labels = np.unique(train_all['diagnosis'])
class_weights = class_weight.compute_class_weight(class_weight='balanced',
                                                  classes=class_labels,
                                                  y=train_all['diagnosis'])

# Key each weight by the position of its label in the sorted label array
# (assumes the generators number their classes in the same order - TODO confirm)
class_weights = dict(enumerate(class_weights))

print(class_labels)
print(class_weights)

# Hand-tuned alternative; the training cell of this notebook passes class_weights
custom_weights = {0: 1.5, 1: 1.0, 2: 1.0, 3: 10.0, 4: 1.1, 5: 0.4, 6: 10.0}

In [None]:
# Fit the baseline model on the augmented training batches
history = model.fit(
    train_batches,
    validation_data=valid_batches,
    epochs=100,
    callbacks=my_callbacks,
    shuffle=True,
    steps_per_epoch=50,
    verbose=2,
    class_weight=class_weights,
)

In [None]:
# Save the weights the model holds once training has finished
# (EarlyStopping above is configured with restore_best_weights=True)
model.save_weights('SimpleCNN.hdf5')

# Plot training history (with the default save=True the figure is also
# written to 'Training history.png')
plot_model_history(history)

## Model Evaluation

In [None]:
# Load the weights from the same relative path they were saved to
model.load_weights('SimpleCNN.hdf5')

# Evaluate the model on the validation dataset
# The generator already yields batches, so no batch_size is passed.
results = model.evaluate(valid_ind)

# results follows the compile order: loss first, then the 'accuracy' metric
print("Validation loss: {:.3f}".format(results[0]))
print("Validation accuracy: {:.3f}".format(results[1]))

In [None]:
# Get predictions on the test dataset
# The generator already yields batches, so no batch_size is passed.
predictions = model.predict(test_batches)

# Transform predictions on the appropriate format for submission:
# one probability column per diagnosis, in alphabetical label order
df_pred = pd.DataFrame(predictions)
df_pred = df_pred.rename(columns={0:'AKIEC', 1:'BCC', 2:'BKL', 3:'DF', 4:'MEL', 5:'NV', 6:'VASC'})

In [None]:
# Image ids in exactly the order in which test_batches fed the images to the
# model. os.listdir returns names in arbitrary order and also lists non-image
# files, so it cannot be relied on to line up with the prediction rows.
ids = [os.path.splitext(os.path.basename(file_name))[0]
       for file_name in test_batches.filenames]

df_pred['image'] = ids

In [None]:
# Write the predictions without the dataframe index
df_pred.to_csv('InitialResults.csv', index=False)