## Create dataset bundle

This notebook creates the dataset bundle for the skin lesion tutorial in the Knowledge Center.

Download the training data and ground truth for the segmentation task from the __[competition site](https://challenge.kitware.com/#phase/5abcb19a56357d0139260e53)__ or via the links below, then unzip both archives into the folder that `input_path` points to:

* https://storage.googleapis.com/bucket-8732/SkinLesionSegmentation/ISIC2018_Task1-2_Training_Input.zip
* https://storage.googleapis.com/bucket-8732/SkinLesionSegmentation/ISIC2018_Task1_Training_GroundTruth.zip

In [4]:
import functools
from glob import glob
import os

import pandas as pd
import sidekick
from sklearn.model_selection import train_test_split

---
**NOTE!** 

For information about how to install sidekick, see https://github.com/Peltarion/sidekick

---


In [11]:
# Folder containing the two unzipped ISIC 2018 directories, and the path of the dataset file to create
input_path, output_path = '.', './data.zip'

In [6]:
# List the training photos and the ground-truth masks found under input_path.
# glob() returns matches in arbitrary order, and the two lists are later paired
# by position, so both are sorted to keep the i-th image next to the i-th mask.
tr_images_rel_path = sorted(glob(os.path.join(input_path, 'ISIC2018_Task1-2_Training_Input', '*.jpg')))
gt_images_rel_path = sorted(glob(os.path.join(input_path, 'ISIC2018_Task1_Training_GroundTruth', '*.png')))
print('Training images: {}, Ground Truth: {}'.format(len(tr_images_rel_path), len(gt_images_rel_path)))

Training images: 2594, Ground Truth: 2594


In [3]:
# List images and masks in sorted order so the two lists line up by position.
# The patterns are anchored at input_path; searching the current working
# directory instead finds nothing when the notebook is started elsewhere and
# would replace the lists with empty ones.
tr_images_rel_path = sorted(glob(os.path.join(input_path, '*Training_Input*', '*.jpg')))
gt_images_rel_path = sorted(glob(os.path.join(input_path, '*GroundTruth', '*.png')))
print('Training images: {}, Ground Truth: {}'.format(len(tr_images_rel_path), len(gt_images_rel_path)))

Training images: 0, Ground Truth: 0


In [7]:
# One row per sample: the photo path in 'image' and the ground-truth path in 'mask',
# paired purely by their position in the two lists.
df = pd.DataFrame(dict(image=tr_images_rel_path, mask=gt_images_rel_path))

In [8]:
def create_subsets(df):
    """Split ``df`` into training and validation rows and label every row.

    The split is 80/20 with a fixed random seed, so repeated runs give the
    same partition. The size of each part is printed.

    Args:
        df: DataFrame with one row per sample.

    Returns:
        A new DataFrame holding the training rows followed by the validation
        rows, re-indexed from 0, with a 'subset' column inserted at position 2
        that contains 'T' for training rows and 'V' for validation rows.
    """
    # Drop a label column left over from an earlier run, so re-executing the
    # cell does not fail in insert() with "already exists".
    if 'subset' in df.columns:
        df = df.drop(columns='subset')

    train_data, validate_data = train_test_split(df, test_size=0.20, random_state=42)
    print('Training samples: ' + str(len(train_data)))
    print('Validation samples: ' + str(len(validate_data)))

    # insert() works in place; use explicit copies so the split results are
    # never modified through a view of the caller's frame.
    train_data = train_data.copy()
    validate_data = validate_data.copy()
    train_data.insert(loc=2, column='subset', value='T')
    validate_data.insert(loc=2, column='subset', value='V')

    # DataFrame.append was removed in pandas 2.0; concat is the supported equivalent.
    return pd.concat([train_data, validate_data], ignore_index=True)

df = create_subsets(df)

Training samples: 2075
Validation samples: 519


In [9]:
# Preview the first five rows (head() defaults to n=5)
df.head()

Unnamed: 0,image,mask,subset
0,/Users/joakim/rep/ham10k_tutorial/ISIC2018_Tas...,/Users/joakim/rep/ham10k_tutorial/ISIC2018_Tas...,T
1,/Users/joakim/rep/ham10k_tutorial/ISIC2018_Tas...,/Users/joakim/rep/ham10k_tutorial/ISIC2018_Tas...,T
2,/Users/joakim/rep/ham10k_tutorial/ISIC2018_Tas...,/Users/joakim/rep/ham10k_tutorial/ISIC2018_Tas...,T
3,/Users/joakim/rep/ham10k_tutorial/ISIC2018_Tas...,/Users/joakim/rep/ham10k_tutorial/ISIC2018_Tas...,T
4,/Users/joakim/rep/ham10k_tutorial/ISIC2018_Tas...,/Users/joakim/rep/ham10k_tutorial/ISIC2018_Tas...,T


In [10]:
# Preprocessor shared by photos and masks: mode='resize' with size=(64, 64),
# written out in PNG format.
image_processor = functools.partial(
    sidekick.process_image,
    mode='resize',
    size=(64, 64),
    file_format='png',
)

# Create the dataset at output_path; every file referenced in the path columns
# is passed through the preprocessor registered for that column.
path_columns = ['image', 'mask']
sidekick.create_dataset(
    output_path,
    df,
    path_columns=path_columns,
    preprocess={column: image_processor for column in path_columns},
)