In [None]:
''' Copyright 2019 Xilinx Inc.

Licensed under the Apache License, Version 2.0 (the "License"); you may 
not use this file except in compliance with the License. You may obtain
a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
'''

In [2]:
# Skin Cancer Dataset Preprocessing

# Import the libraries
import pandas as pd
import pandas_profiling as pp
import numpy as np
from keras.preprocessing.image import ImageDataGenerator
import os
from sklearn.model_selection import train_test_split
import shutil

#### Learn more about the skin lesion dataset:
The HAM10000 Dataset: A Large Collection of Multi-Source Dermatoscopic Images of Common Pigmented Skin Lesions
https://arxiv.org/abs/1803.10417

#### Download dataset
Downloas the following zip files from https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/DBW86T
1) HAM10000_images_part_1.zip
2) HAM10000_images_part_2.zip
3) HAM10000_metadata.tab

In [None]:
# Read the metadata
pdf = pd.read_csv('data/HAM10000_metadata.csv')

# Explore the dataset
pp.ProfileReport(pdf)

In [5]:
# Set y as the labels
y = pdf['dx']

# Split the metadata into training and validation
df_train, df_val = train_test_split(pdf, test_size=0.1, random_state=101, stratify=y)

# Print the shape of the training and validation split
print(df_train.shape)
print(df_val.shape)

# Find the number of values in the training and validation set
df_train['dx'].value_counts()
df_val['dx'].value_counts()

(9013, 7)
(1002, 7)


nv       671
mel      111
bkl      110
bcc       51
akiec     33
vasc      14
df        12
Name: dx, dtype: int64

In [4]:
# Transfer the images into folders
# Set the image id as the index
pdf.set_index('image_id', inplace=True)

# Get a list of images in each of the two folders
folder_1 = os.listdir('data/ham10000_images_part_1')
folder_2 = os.listdir('data/ham10000_images_part_2')

# Get a list of training and validation images
train_list = list(df_train['image_id'])
val_list = list(df_val['image_id'])

In [5]:
# Create new directories for the images for training and validation
base_dir = 'data/train_val'
os.mkdir(base_dir)

# Training file directory
train_dir = os.path.join(base_dir, 'train_dir')
os.mkdir(train_dir)

# Validation file directory
val_dir = os.path.join(base_dir, 'val_dir')
os.mkdir(val_dir)

# Test/quantization file directory
test_dir = os.path.join(base_dir, 'test_dir')
os.mkdir(test_dir)

# Create new folders in the training directory for each of the classes
nv = os.path.join(train_dir, 'nv')
os.mkdir(nv)
mel = os.path.join(train_dir, 'mel')
os.mkdir(mel)
bkl = os.path.join(train_dir, 'bkl')
os.mkdir(bkl)
bcc = os.path.join(train_dir, 'bcc')
os.mkdir(bcc)
akiec = os.path.join(train_dir, 'akiec')
os.mkdir(akiec)
vasc = os.path.join(train_dir, 'vasc')
os.mkdir(vasc)
df = os.path.join(train_dir, 'df')
os.mkdir(df)

# Create new folders in the validation directory for each of the classes
nv = os.path.join(val_dir, 'nv')
os.mkdir(nv)
mel = os.path.join(val_dir, 'mel')
os.mkdir(mel)
bkl = os.path.join(val_dir, 'bkl')
os.mkdir(bkl)
bcc = os.path.join(val_dir, 'bcc')
os.mkdir(bcc)
akiec = os.path.join(val_dir, 'akiec')
os.mkdir(akiec)
vasc = os.path.join(val_dir, 'vasc')
os.mkdir(vasc)
df = os.path.join(val_dir, 'df')
os.mkdir(df)

# Create new folders in the test directory for each of the classes
nv = os.path.join(test_dir, 'nv')
os.mkdir(nv)
mel = os.path.join(test_dir, 'mel')
os.mkdir(mel)
bkl = os.path.join(test_dir, 'bkl')
os.mkdir(bkl)
bcc = os.path.join(test_dir, 'bcc')
os.mkdir(bcc)
akiec = os.path.join(test_dir, 'akiec')
os.mkdir(akiec)
vasc = os.path.join(test_dir, 'vasc')
os.mkdir(vasc)
df = os.path.join(test_dir, 'df')
os.mkdir(df)

In [6]:
# Transfer the training images
for image in train_list:

    fname = image + '.jpg'
    label = pdf.loc[image, 'dx']

    if fname in folder_1:
        # source path to image
        src = os.path.join('data/HAM10000_images_part_1', fname)
        # destination path to image
        dst = os.path.join(train_dir, label, fname)
        # copy the image from the source to the destination
        shutil.copyfile(src, dst)

    if fname in folder_2:
        # source path to image
        src = os.path.join('data/HAM10000_images_part_2', fname)
        # destination path to image
        dst = os.path.join(train_dir, label, fname)
        # copy the image from the source to the destination
        shutil.copyfile(src, dst)

# Transfer the validation and test/quatizer images
testimgcount=0 
#open the testlabelfile
f = open(os.path.join(base_dir, 'testlabelfile.txt'), 'a+')
fcal = open(os.path.join(base_dir, 'calibration.txt'), 'a+')

    
for image in val_list:

    fname = image + '.jpg'
    label = pdf.loc[image, 'dx']

    if fname in folder_1:
        # source path to image
        src = os.path.join('data/HAM10000_images_part_1', fname)
        # destination path to image
        dst = os.path.join(val_dir, label, fname)
        # copy the image from the source to the destination
        shutil.copyfile(src, dst)
        #copy for testing and dnndk quantization
        if testimgcount<600:
            dst = os.path.join(test_dir,label, fname)
            shutil.copyfile(src, dst)
            #append the testlabel file
            f.write(label+'\n')
            #append in the calibration.txt file
            imgloc =  os.path.join(label,fname)
            fcal.write('{0}\n'.format(imgloc))
            # increment the file#
            testimgcount+=1

    if fname in folder_2:
        # source path to image
        src = os.path.join('data/HAM10000_images_part_2', fname)
        # destination path to image
        dst = os.path.join(val_dir, label, fname)
        # copy the image from the source to the destination
        shutil.copyfile(src, dst)
        #copy for testing and dnndk quantization
        if testimgcount<300:
            dst = os.path.join(test_dir,label, fname)
            shutil.copyfile(src, dst)
            #append the testlabel file
            f.write(label+'\n')
            #append in the calibration.txt file
            imgloc =  os.path.join(label,fname)
            fcal.write('{0}\n'.format(imgloc))
            # increment the file#
            testimgcount+=1
f.close()
fcal.close()

In [7]:
# Check how many training images are in each folder
print(len(os.listdir(base_dir +'/train_dir/nv')))
print(len(os.listdir(base_dir +'/train_dir/mel')))
print(len(os.listdir(base_dir +'/train_dir/bkl')))
print(len(os.listdir(base_dir +'/train_dir/bcc')))
print(len(os.listdir(base_dir +'/train_dir/akiec')))
print(len(os.listdir(base_dir +'/train_dir/vasc')))
print(len(os.listdir(base_dir +'/train_dir/df')))

# Check how many validation images are in each folder
print(len(os.listdir(base_dir +'/val_dir/nv')))
print(len(os.listdir(base_dir +'/val_dir/mel')))
print(len(os.listdir(base_dir +'/val_dir/bkl')))
print(len(os.listdir(base_dir +'/val_dir/bcc')))
print(len(os.listdir(base_dir +'/val_dir/akiec')))
print(len(os.listdir(base_dir +'/val_dir/vasc')))
print(len(os.listdir(base_dir +'/val_dir/df')))

# check how many test/quantization images are
print(len(os.listdir(base_dir + '/test_dir')))

6034
1002
989
463
294
128
103
671
111
110
51
33
14
12
7


In [71]:
# Create a data generator to augment the images in real time
datagen = ImageDataGenerator(
        rotation_range=180,
        width_shift_range=0.1,
        height_shift_range=0.1,
        zoom_range=0.1,
        horizontal_flip=True,
        vertical_flip=True,
        # brightness_range=(0.9,1.1),
        fill_mode='nearest')

In [80]:
# Augment the data
# Class 'nv' is not going to be augmented, it already has more than 6000 images
class_list = ['mel', 'bkl', 'bcc', 'akiec', 'vasc', 'df']

for item in class_list:

    # Create a temporary directory for the augmented images
    aug_dir = 'data/train_val/aug_dir'
    os.mkdir(aug_dir)

    # Create a directory within the base dir to store images of the same class
    img_dir = os.path.join(aug_dir, 'img_dir')
    os.mkdir(img_dir)

    # Choose a class
    img_class = item

    # List all the images in the directory
    img_list = os.listdir(base_dir +'/train_dir/' + img_class)

    # Copy images from the class train dir to the img_dir
    for fname in img_list:
        # source path to image
        src = os.path.join(base_dir +'/train_dir/' + img_class, fname)
        # destination path to image
        dst = os.path.join(img_dir, fname)
        # copy the image from the source to the destination
        shutil.copyfile(src, dst)

    # point to a dir containing the images and not to the images themselves
    path = aug_dir
    
    save_path = base_dir + '/train_dir/' + img_class

    batch_size = 50

    aug_datagen = datagen.flow_from_directory(path,
                                              save_to_dir=save_path,
                                              save_format='jpg',
                                              target_size=(224, 224),
                                              batch_size=batch_size)

    # Generate the augmented images and add them to the training folders
    num_aug_images_wanted = 6000  # total number of images we want to have in each class
    num_files = len(os.listdir(img_dir))
    num_batches = int(np.ceil((num_aug_images_wanted - num_files) / batch_size))

    # run the generator and create about 6000 augmented images
    for i in range(0, num_batches):
        imgs, labels = next(aug_datagen)

    # delete temporary directory with the raw image files
    shutil.rmtree(aug_dir)

Found 5810 images belonging to 1 classes.
Found 5984 images belonging to 1 classes.
Found 5606 images belonging to 1 classes.
Found 5930 images belonging to 1 classes.
Found 5170 images belonging to 1 classes.
Found 4170 images belonging to 1 classes.


In [81]:
# Check how many training images are in each folder
print(len(os.listdir(base_dir +'/train_dir/nv')))
print(len(os.listdir(base_dir +'/train_dir/mel')))
print(len(os.listdir(base_dir +'/train_dir/bkl')))
print(len(os.listdir(base_dir +'/train_dir/bcc')))
print(len(os.listdir(base_dir +'/train_dir/akiec')))
print(len(os.listdir(base_dir +'/train_dir/vasc')))
print(len(os.listdir(base_dir +'/train_dir/df')))

# Check how many validation images are in each folder
print(len(os.listdir(base_dir +'/val_dir/nv')))
print(len(os.listdir(base_dir +'/val_dir/mel')))
print(len(os.listdir(base_dir +'/val_dir/bkl')))
print(len(os.listdir(base_dir +'/val_dir/bcc')))
print(len(os.listdir(base_dir +'/val_dir/akiec')))
print(len(os.listdir(base_dir +'/val_dir/vasc')))
print(len(os.listdir(base_dir +'/val_dir/df')))

6034
6010
6034
6006
6030
6020
6020
671
111
110
51
33
14
12
