In [0]:
from google.colab import drive
drive.mount('/content/gdrive/')

In [0]:
from google.colab import files
files.upload()

In [0]:
!pip install -q kaggle

In [0]:
!mkdir -p ~/.kaggle

In [0]:
# Put the uploaded API token where the Kaggle CLI looks for it and make it
# readable by the current user only (the token is a credential).
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

In [0]:
cd '/content/gdrive/My Drive/Deep Learning Projects/skincancer'

In [0]:
!kaggle datasets download -d kmader/skin-cancer-mnist-ham10000

In [0]:
!unzip -q skin-cancer-mnist-ham10000.zip

In [0]:
# "-q" (quiet) was missing its dash, which made pip install an unrelated package named "q".
!pip install -q keras==2.1.5
import os
import shutil

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from PIL import Image
#from tensorflow.keras.preprocessing.image import ImageDataGenerator

In [0]:
#Importing sklearn libraries
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split

In [0]:
#First checking out the csv file and observing the distributions and the size of the data
data = pd.read_csv('/content/gdrive/My Drive/Deep Learning Projects/skincancer/HAM10000_metadata.csv')

In [0]:
data.head()

In [0]:
data.info() #Summarize column dtypes and non-null counts to spot columns with missing values

In [0]:
#Since we do not need lesion_id and image_id we drop it
#data.drop(['lesion_id','image_id'],axis=1,inplace=True)

In [0]:
data.head()

In [0]:
data.dx.value_counts()

In [0]:
data.age.value_counts()

In [0]:
#Print how often each value occurs in the descriptive columns.
#dropna=False keeps NaN as its own row; by default value_counts() drops NaN,
#which would hide exactly the missing values this cell is meant to reveal.
coloumns = data[['dx','dx_type','age','sex']]

for col in coloumns:
  print(coloumns[col].value_counts(dropna=False))

  #DX stands for diagnosis of the patients


  #NV-Melanocytic Nevi -6705
  #MEL- Melanoma -1113
  #BKL- Benign keratosis-like lesions - 1099
  #BCC- Basal Cell Carcinoma - 514
  #AKIEC - Actinic Keratoses - 327
  #VASC - Vascular Lesions - 142
  #DF - Dermatofibroma - 115

  #DX_TYPE - how the diagnosis was made
  #Histo - 5304
  #Follow_up - 3704
  #Consensus - 902
  #Confocal - 69

  #Age - 
  #45 years - 1299
  #50 years - 1187
  #55 years - 1009
  #40 years - 985
  #60 years - 803
  #70 years -756
  #35 years -753
  #65 years -731
  #75 years -618
  #30 years -464
  #80 years -404
  #85 years -290
  #25 years -247
  #20 years -169
  #5 years -86
  #15 years -77
  #10 years -41
  #0 years with 39 people data-Missing data

In [0]:
#Checking out the association between the lesion_id and the images
data_filtering = data.groupby('lesion_id').count()

In [0]:
data_filtering = data_filtering[data_filtering['image_id']==1]

In [0]:
#Move lesion_id from the index back into a regular column so it can be looked up by name
data_filtering = data_filtering.reset_index()

In [0]:
data_filtering.head()

In [0]:
#Checking the lesion_id for any dublicate images and images with only 1 image_id
def read_duplicates(x, single_image_ids=None):
  """Label a lesion id by whether the lesion has exactly one image.

  Args:
    x: the lesion id to classify.
    single_image_ids: optional collection of lesion ids that have exactly one
      image. When omitted it is built from the notebook-level
      ``data_filtering['lesion_id']`` column (the original behaviour). Passing
      a precomputed set avoids rebuilding the lookup for every row.

  Returns:
    'unduplicated' if ``x`` is in the lookup, otherwise 'duplicated'.
  """
  if single_image_ids is None:
    # A set gives constant-time membership tests instead of scanning a list.
    single_image_ids = set(data_filtering['lesion_id'])
  return 'unduplicated' if x in single_image_ids else 'duplicated'


In [0]:
#Creating a new coloumn that is the copy of lesion_id
data['duplicates'] = data['lesion_id']

In [0]:
#Flag every row by whether its lesion has exactly one image.
#The membership test is vectorised (one isin call) instead of calling a Python
#function per row, and it reads from lesion_id directly so the cell can be re-run safely.
data['duplicates'] = (
    data['lesion_id']
    .isin(data_filtering['lesion_id'])
    .map({True: 'unduplicated', False: 'duplicated'})
)

In [0]:
data.head()

In [0]:
data['duplicates'].value_counts()
#Looks like there are high number of duplicated values 

In [0]:
#Keep only the rows flagged 'unduplicated', i.e. lesions that have a single image
is_unduplicated = data['duplicates'].eq('unduplicated')
data_filtering = data.loc[is_unduplicated]

In [0]:
data_filtering.shape

In [0]:
from sklearn.model_selection import train_test_split

#Hold out 20% of the single-image lesions as the validation set.
#The split is stratified on the diagnosis so class proportions are kept,
#and seeded (random_state=0) so it is reproducible.
y = data_filtering['dx']
_, data_val = train_test_split(
    data_filtering,
    test_size=0.2,
    stratify=y,
    random_state=0,
)

In [0]:
data_val.shape

In [0]:
data_val['dx'].value_counts()

In [0]:
#Splitting training data from the validation data in the csv file
def trained(y, validation_ids=None):
  """Label an image id as belonging to the validation or the training split.

  Args:
    y: the image id to classify; it is compared as ``str(y)``.
    validation_ids: optional collection of validation image ids. When omitted
      it is built from the notebook-level ``data_val['image_id']`` column (the
      original behaviour). Passing a precomputed set avoids rebuilding the
      lookup for every row.

  Returns:
    'validation' if the id is in the lookup, otherwise 'training'.
  """
  if validation_ids is None:
    # A set gives constant-time membership tests instead of scanning a list.
    validation_ids = set(data_val['image_id'])
  return 'validation' if str(y) in validation_ids else 'training'

In [0]:
data['training_or_validation'] = data['image_id'] #New coloumn and copying training and validaiton

In [0]:
#Mark each image as 'validation' when its id is in data_val, otherwise 'training'.
#The membership test is vectorised (one isin call) instead of calling a Python
#function per row, and it reads from image_id directly so the cell can be re-run safely.
data['training_or_validation'] = (
    data['image_id']
    .astype(str)
    .isin(data_val['image_id'])
    .map({True: 'validation', False: 'training'})
)

In [0]:
#Every row that was not held out for validation is training data
is_training_row = data['training_or_validation'].eq('training')
training_data = data.loc[is_training_row]

In [0]:
print(len(training_data))

In [0]:
print(len(data_val))

In [0]:
training_data['dx'].value_counts()

In [0]:
data_val['dx'].value_counts()

In [0]:
#Index the metadata by image id so a diagnosis can be looked up with data.loc[image_id, 'dx'].
#The guard makes the cell safe to re-run: once image_id is the index the column
#is gone and a second set_index call would raise KeyError.
if data.index.name != 'image_id':
  data = data.set_index('image_id')

In [0]:
#There are two folders linked to all the images
first_folder = os.listdir('/content/gdrive/My Drive/Deep Learning Projects/skincancer/ham10000_images_part_1')
second_folder = os.listdir('/content/gdrive/My Drive/Deep Learning Projects/skincancer/ham10000_images_part_2')

In [0]:
#Plain Python lists of the image ids in each split
training_data_list = training_data['image_id'].tolist()
validation_data_list = data_val['image_id'].tolist()

In [0]:
#Copy every training image into training_directory/<dx label>/.
#copyfile writes to the same destination path each time, so re-running only overwrites.
images_root = '/content/gdrive/My Drive/Deep Learning Projects/skincancer'
training_root = os.path.join(images_root, 'skin_cancer_classified', 'training_directory')

#Map file name -> source folder once, instead of scanning both folder lists for every image.
#part_1 is applied last so it takes precedence if a name were in both folders,
#matching the order of the original checks.
source_folder_of = {name: 'ham10000_images_part_2' for name in second_folder}
source_folder_of.update({name: 'ham10000_images_part_1' for name in first_folder})

for image in training_data_list:
  image_name = image + '.jpg'
  label = data.loc[image,'dx']

  source_folder = source_folder_of.get(image_name)
  if source_folder is None:
    continue  #image is in neither folder; nothing to copy

  #The per-class folder must exist before copyfile can write into it.
  class_directory = os.path.join(training_root, label)
  os.makedirs(class_directory, exist_ok=True)

  src = os.path.join(images_root, source_folder, image_name)
  destination_file = os.path.join(class_directory, image_name)
  shutil.copyfile(src,destination_file)
    

In [0]:
#Copy every validation image into testing_directory/<dx label>/.
#copyfile writes to the same destination path each time, so re-running only overwrites.
images_root = '/content/gdrive/My Drive/Deep Learning Projects/skincancer'
testing_root = os.path.join(images_root, 'skin_cancer_classified', 'testing_directory')

#Map file name -> source folder once, instead of scanning both folder lists for every image.
#part_2 is applied last so it takes precedence if a name were in both folders,
#matching the order of the original checks.
source_folder_of = {name: 'ham10000_images_part_1' for name in first_folder}
source_folder_of.update({name: 'ham10000_images_part_2' for name in second_folder})

for image in validation_data_list:
  image_name = image+'.jpg'
  label = data.loc[image,'dx']

  source_folder = source_folder_of.get(image_name)
  if source_folder is None:
    continue  #image is in neither folder; nothing to copy

  #The per-class folder must exist before copyfile can write into it.
  class_directory = os.path.join(testing_root, label)
  os.makedirs(class_directory, exist_ok=True)

  src = os.path.join(images_root, source_folder, image_name)
  destination_file = os.path.join(class_directory, image_name)
  shutil.copyfile(src,destination_file)

In [0]:
#Data augmentation to balance the image dataset
#Repeating the numbers of images in both training and testing folder
#----------Training-----------
#nv       5822
#mel      1067
#bkl      1011
#bcc       479
#akiec     297
#vasc      129
#df        107

#-----------Testing--------------

#nv       883
#bkl       88
#mel       46
#bcc       35
#akiec     30
#vasc      13
#df         8

In [0]:
folder_list = ['mel','bkl','bcc','akiec','vasc','df'] 

In [0]:
#Oversample each class in folder_list by writing augmented copies of its images
#back into its training folder, aiming for roughly create_image files per class.
#NOTE(review): ImageDataGenerator is only imported in a later cell of this notebook;
#that import has to run before this cell on a fresh kernel.
training_root = '/content/gdrive/My Drive/Deep Learning Projects/skincancer/skin_cancer_classified/training_directory/'
augmented_directory = 'augmentation'
batch_size=50
create_image=6000

#The generator settings do not depend on the class, so build the generator once.
datagen = ImageDataGenerator(
      rotation_range=180,
      width_shift_range=0.1,
      height_shift_range=0.1,
      zoom_range=0.1,
      horizontal_flip=True,
      vertical_flip=True,
      fill_mode='nearest')

for item in folder_list:
  image_class = item
  save_path = training_root+image_class

  #flow_from_directory expects <root>/<class folder>/<images>, so the class images
  #are staged in augmentation/image_directory. Remove any staging folder left by an
  #interrupted run first; otherwise creating it again would fail.
  shutil.rmtree(augmented_directory, ignore_errors=True)
  image_directory = os.path.join(augmented_directory,'image_directory')
  os.makedirs(image_directory)

  try:
    #Copy all current images of this class into the staging folder
    image_list = os.listdir(save_path)
    for filename in image_list:
      src = os.path.join(save_path,filename)
      destination = os.path.join(image_directory,filename)
      shutil.copyfile(src,destination)

    path = augmented_directory

    #Every batch drawn from this iterator is also saved into the class's training folder
    aug_datagen = datagen.flow_from_directory(path,
                                              save_to_dir =save_path,
                                              save_format='jpg',
                                              target_size=(224,224),
                                              batch_size=batch_size)

    #Number of batches needed to go from the current file count up to create_image
    num_files = len(os.listdir(image_directory))
    num_batches = int(np.ceil((create_image-num_files)/batch_size))

    for i in range(0,num_batches):
      imgs,labels = next(aug_datagen)
  finally:
    #Always remove the staging folder, even if augmentation failed part-way
    shutil.rmtree(augmented_directory, ignore_errors=True)



In [0]:
#Print the number of files now present in each augmented class folder
for class_name in ['bcc', 'bkl', 'akiec', 'vasc', 'df', 'mel']:
  class_directory = '/content/gdrive/My Drive/Deep Learning Projects/skincancer/skin_cancer_classified/training_directory//' + class_name
  print(len(os.listdir(class_directory)))

In [7]:
#Now we almost have 5k images for each classes
#Now using pretrained models to train the skin lesion problem using training dataset

#Looking at couple of AI papers,it looks like Densenet architecture will do a better classification and it is also one of the most recent architecture in deep learning

#Densenet architecture(Particularly Densenet-201) not 121
%tensorflow_version 1.x
import tensorflow as tf
from tensorflow.keras.applications import DenseNet201
from tensorflow.keras import layers
from tensorflow.keras.models import Sequential,Model,load_model 
from tensorflow.keras.layers import Convolution2D
from tensorflow.keras.layers import MaxPooling2D
from tensorflow.keras.layers import Flatten,Dense,Activation,Dropout
from tensorflow.keras.callbacks import EarlyStopping,ReduceLROnPlateau,ModelCheckpoint,Callback
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.models import Model 

TensorFlow 1.x selected.


In [0]:
image_width,image_height=224,224 #Input size fed to the network (224x224)
training_samples = 33068 #Hard-coded sample count; not referenced by the training cells below
testing_samples = 1103 #Hard-coded sample count; not referenced by the training cells below
epochs = 40
batch_size = 10 #Taking the batch_size of 10
n_classes = 7 #Number of classes is 7. We have 7 categories of skin disease

#Row counts from the metadata split. These count the original images only,
#not the augmented files written into the training folders.
num_training_data = len(training_data)
num_data_val = len(data_val)

#Keras expects integer step counts; np.ceil returns a float, so cast explicitly.
training_steps = int(np.ceil(num_training_data/batch_size))
testing_steps = int(np.ceil(num_data_val/batch_size))

In [0]:
training_folder = '/content/gdrive/My Drive/Deep Learning Projects/skincancer/skin_cancer_classified/training_directory/'
testing_folder = '/content/gdrive/My Drive/Deep Learning Projects/skincancer/skin_cancer_classified/testing_directory/'

In [0]:
#Feed images through DenseNet's own preprocess_input so that training and
#validation batches are normalised the way the pretrained network expects.
densenet_preprocess = tf.keras.applications.densenet.preprocess_input
datagen = ImageDataGenerator(preprocessing_function=densenet_preprocess)

In [0]:
training_batches = datagen.flow_from_directory(training_folder,
                                               target_size=(image_height,image_width),
                                               batch_size = batch_size)

Found 21247 images belonging to 7 classes.


In [0]:
testing_batches = datagen.flow_from_directory(testing_folder,
                                              target_size=(image_height,image_width),
                                              batch_size = batch_size)

Found 1102 images belonging to 7 classes.


In [0]:
#When you use a pretrained network, you have to freeze some layers in the network
#Since we are using DenseNet201, let's see the properties of the network first

network = tf.keras.applications.densenet.DenseNet201() #Instantiate DenseNet201 and keep a reference to it

In [0]:
network.summary() #Print the layer-by-layer architecture of the DenseNet201 network

In [0]:
len(network.layers) #Total number of layers in the network (the recorded output is 709)

709

In [0]:
#Truncate the network: take the output of the 40th layer from the end as the new feature output.
#layers[-40] itself is kept, so the 39 layers after it are the ones left out.
engineered_network = network.layers[-40].output

In [0]:
network.summary()

In [0]:
engineered_network = Dropout(0.20)(engineered_network)

In [0]:
engineered_network = Flatten()(engineered_network)

In [0]:
predictions = Dense(7,activation='softmax')(engineered_network) #7 for the 7 classes we have

In [0]:
final_model = Model(inputs = network.input, outputs = predictions)

In [0]:
final_model.summary()

In [0]:
#With a pretrained network we choose how much of it is fine-tuned.
#Everything except the last 100 layers of the new model is frozen,
#so only those last 100 layers have their weights updated during training.
frozen_layers = final_model.layers[:-100]
for frozen_layer in frozen_layers:
  frozen_layer.trainable = False

In [0]:
from tensorflow.keras.metrics import categorical_accuracy, top_k_categorical_accuracy

def top_two_accuracy(y_true,y_pred):
  """Top-k categorical accuracy with k fixed to 2.

  A thin wrapper around top_k_categorical_accuracy so it can be passed to
  compile() as a metric function taking only (y_true, y_pred).
  """
  top_k = 2
  return top_k_categorical_accuracy(y_true, y_pred, k=top_k)

In [0]:
final_model.compile(optimizer='Adam',loss='categorical_crossentropy',metrics=[categorical_accuracy,top_two_accuracy])

In [0]:
print(training_batches.class_indices) #Checking oout the number(class indices)

In [0]:
#Class weights: every class counts once, except melanoma (index 4) which counts
#twice, so that the loss is more sensitive to misdiagnosing melanoma.
#Index order: 0 akiec, 1 bcc, 2 bkl, 3 df, 4 melanoma, 5 nv, 6 vasc
class_weights = dict.fromkeys(range(7), 1.0)
class_weights[4] = 2.0 #melanoma

In [0]:
 #ReduceLROnPlateau lowers the learning rate when the monitored metric stops improving;
#here it halves the rate (factor=0.5) after 2 epochs without improvement (patience=2), down to min_lr.
#ModelCheckpoint writes the model to filepath only when the monitored metric improves (save_best_only=True).
#mode='max' tells both callbacks that a higher val_top_two_accuracy is better.
#The code lines below now all start at column 0; previously some were indented by one space and some were not.
filepath = '/content/gdrive/My Drive/Deep Learning Projects/skincancer/skin_cancer_classified/cancermodel.h5'
checkpoint = ModelCheckpoint(filepath,monitor = 'val_top_two_accuracy',verbose=1,save_best_only=True, mode='max')

learning_rate_reduced = ReduceLROnPlateau(monitor = 'val_top_two_accuracy',factor =0.5, patience =2, verbose=1, mode='max', min_lr=0.000003)


In [0]:
#First training yielded
#Epoch 100/100
#36/36 [==============================] - 349s 10s/step - loss: 0.2844 - acc: 0.9361 - val_loss: 2.6744 - val_acc: 0.7682

#Since validation accuracy isn't very satisfying we are going to be using reduceLRonplateu method that will reduce the learning rate once the learning gets stagnant at some epochs

#Second training time was around the same accuracy
#Since training loss isn't getting any better or worse, the issue here is that the optimizer is stalling at a local minimum. 
#Decreased the minimum learning rate and unfreezing more layers.

#Third attempt was to use categorical_accuracy instead of just accuracy and changed the steps_per_epoch value (training data / batch size)
#Same with the validation steps
#Stopped at 10 epochs. Continuing ...


#End result..............................................................................................................................

#891/892 [============================>.] - ETA: 6s - loss: 0.0212 - categorical_accuracy: 0.9930 - top_two_accuracy: 0.9999 Epoch 1/10
#111/892 [==>...........................] - ETA: 38:40 - loss: 2.2958 - categorical_accuracy: 0.7967 - top_two_accuracy: 0.8711
#Epoch 00002: val_top_two_accuracy improved from 0.86751 to 0.87114, saving model to /content/gdrive/My Drive/Deep Learning Projects/skincancer/skin_cancer_classified/cancermodel.h5
#892/892 [==============================] - 6245s 7s/step - loss: 0.0212 - categorical_accuracy: 0.9930 - top_two_accuracy: 0.9999 - val_loss: 2.2958 - val_categorical_accuracy: 0.8020 - val_top_two_accuracy: 0.8711


In [0]:
callbacks_list = [checkpoint,learning_rate_reduced]

In [0]:
#Resume from the best checkpoint when one exists (retraining after an interrupted run).
#On the very first run there is no checkpoint yet, so start from the freshly compiled model.
if os.path.exists(filepath):
  saved_model = tf.keras.models.load_model(filepath,custom_objects={"top_two_accuracy": top_two_accuracy})
else:
  saved_model = final_model

In [0]:
#Train (or continue training) the model on the generator batches.
#class_weight is an argument of the training call, not something stored in the
#.h5 checkpoint, so it is passed on every run, including when resuming.
network_training = saved_model.fit_generator(training_batches,
                                             steps_per_epoch=training_steps,
                                             class_weight=class_weights,
                                             validation_data = testing_batches,
                                             validation_steps = testing_steps,
                                             epochs=10,
                                             verbose = 1,
                                             callbacks = callbacks_list)


In [0]:
saved_model = tf.keras.models.load_model('/content/gdrive/My Drive/Deep Learning Projects/skincancer/skin_cancer_classified/cancermodel.h5',custom_objects={"top_two_accuracy": top_two_accuracy})

In [0]:
from PIL import Image
import numpy as np
from skimage import transform

custom_image = '/content/gdrive/My Drive/Deep Learning Projects/skincancer/custom images from internet for testing/bcc.jpg'
def load(filename):
   """Read an image file and turn it into a single-image batch for the model.

   The image is converted to RGB, resized to 224x224 and passed through the same
   DenseNet preprocess_input function the training generator uses, so inference
   inputs are normalised like the training inputs. Previously the pixels were
   only divided by 255.

   Args:
     filename: path of the image file to read.

   Returns:
     Array of shape (1, 224, 224, 3) ready for model.predict.
   """
   # The context manager closes the file; convert('RGB') guarantees three
   # channels for grayscale, palette or RGBA inputs.
   with Image.open(filename) as image:
      pixels = np.array(image.convert('RGB')).astype('float32')
   # Keep the 0-255 range while resizing; preprocess_input does the scaling itself.
   pixels = transform.resize(pixels, (224, 224, 3), preserve_range=True)
   batch = np.expand_dims(pixels, axis=0)
   return tf.keras.applications.densenet.preprocess_input(batch)

image_load = load(custom_image)
prediction = saved_model.predict(image_load)[0] #[0] selects the score vector of the single image in the batch

max_value = prediction.max()

#One message per class index, in the order the scores are checked
class_messages = [
  'The disease is classified as Actinic Keratoses',
  'The disease is classified as Basal Cell Carcinoma',
  'The disease is classified as Benign Keratosis like lesion',
  'The disease is classified as Dermatofibroma',
  'The disease is classified as Melanoma',
  'The disease is classified as Melanocytic Nevi',
  'The disease is classified as Vascular Lesions',
]

#Report the first class whose score equals the maximum; the else branch runs
#only when no score compares equal to it.
for class_index, class_message in enumerate(class_messages):
  if prediction[class_index]==max_value:
    print(class_message)
    break
else:
  print('Disease could not be identified. Try a different picture')
  