## Load Packages

In [1]:
import glob
import numpy as np
import pandas as pd
from sklearn.utils import shuffle

## Constants

In [2]:
# Seed passed as random_state to the shuffle and sample calls in this notebook,
# so that re-running it reproduces the same row order and the same splits.
RANDOM_SEED = 43

## Load Data

### Load Metadata

In [3]:
# Read the metadata table from the working directory; each row carries an
# image_id together with its lesion_id, diagnosis (dx) and patient fields.
metadata_df = pd.read_csv("metadata.csv")
metadata_df

Unnamed: 0,lesion_id,image_id,dx,dx_type,age,sex,localization,dataset
0,HAM_0000118,ISIC_0027419,bkl,histo,80.0,male,scalp,vidir_modern
1,HAM_0000118,ISIC_0025030,bkl,histo,80.0,male,scalp,vidir_modern
2,HAM_0002730,ISIC_0026769,bkl,histo,80.0,male,scalp,vidir_modern
3,HAM_0002730,ISIC_0025661,bkl,histo,80.0,male,scalp,vidir_modern
4,HAM_0001466,ISIC_0031633,bkl,histo,75.0,male,ear,vidir_modern
...,...,...,...,...,...,...,...,...
10010,HAM_0002867,ISIC_0033084,akiec,histo,40.0,male,abdomen,vidir_modern
10011,HAM_0002867,ISIC_0033550,akiec,histo,40.0,male,abdomen,vidir_modern
10012,HAM_0002867,ISIC_0033536,akiec,histo,40.0,male,abdomen,vidir_modern
10013,HAM_0000239,ISIC_0032854,akiec,histo,80.0,male,face,vidir_modern


### Add filepath to dataframe

In [4]:
def map_id_to_path(image_id, image_dir="dataset", extension=".jpg"):
    """Build the relative file path of the image identified by ``image_id``.

    Args:
        image_id: Image identifier string, e.g. ``"ISIC_0027419"``.
        image_dir: Directory that holds the image files. Defaults to
            ``"dataset"``.
        extension: File extension including the leading dot. Defaults to
            ``".jpg"``.

    Returns:
        The path as a string, e.g. ``"dataset/ISIC_0027419.jpg"``.

    Raises:
        TypeError: If ``image_id`` is not a string (for example a missing
            value), because the path is built by string concatenation.
    """
    # Plain concatenation (rather than an f-string) so that a non-string id
    # fails loudly instead of silently producing a path such as "dataset/nan.jpg".
    return image_dir + "/" + image_id + extension

# Derive each image's file path from its image_id. The function is passed
# directly; wrapping it in a lambda that only forwards its argument adds nothing.
metadata_df["filepath"] = metadata_df["image_id"].apply(map_id_to_path)

In [5]:
# Show the frame again to check the newly added filepath column.
metadata_df

Unnamed: 0,lesion_id,image_id,dx,dx_type,age,sex,localization,dataset,filepath
0,HAM_0000118,ISIC_0027419,bkl,histo,80.0,male,scalp,vidir_modern,dataset/ISIC_0027419.jpg
1,HAM_0000118,ISIC_0025030,bkl,histo,80.0,male,scalp,vidir_modern,dataset/ISIC_0025030.jpg
2,HAM_0002730,ISIC_0026769,bkl,histo,80.0,male,scalp,vidir_modern,dataset/ISIC_0026769.jpg
3,HAM_0002730,ISIC_0025661,bkl,histo,80.0,male,scalp,vidir_modern,dataset/ISIC_0025661.jpg
4,HAM_0001466,ISIC_0031633,bkl,histo,75.0,male,ear,vidir_modern,dataset/ISIC_0031633.jpg
...,...,...,...,...,...,...,...,...,...
10010,HAM_0002867,ISIC_0033084,akiec,histo,40.0,male,abdomen,vidir_modern,dataset/ISIC_0033084.jpg
10011,HAM_0002867,ISIC_0033550,akiec,histo,40.0,male,abdomen,vidir_modern,dataset/ISIC_0033550.jpg
10012,HAM_0002867,ISIC_0033536,akiec,histo,40.0,male,abdomen,vidir_modern,dataset/ISIC_0033536.jpg
10013,HAM_0000239,ISIC_0032854,akiec,histo,80.0,male,face,vidir_modern,dataset/ISIC_0032854.jpg


In [6]:
# Persist the metadata (now including filepath) as a ';'-separated file.
# The index is written as the first column because index=False is not passed.
metadata_df.to_csv("dataset.csv", sep=';')

## Inspect the unique values of each variable

In [7]:
# Distinct values of the sex column, alphabetically, followed by their count.
sex_values = metadata_df["sex"].unique()
print(sorted(sex_values), len(sex_values))

['female', 'male', 'unknown'] 3


In [8]:
# Distinct values of the age column in order of first appearance, followed by
# their count. The values are left unsorted; the column contains NaN, which is
# included in both the listing and the count.
age_values = metadata_df["age"].unique()
print(age_values, len(age_values))

[80. 75. 60. 70. 55. 85. 65. 40. 50. 45. 35.  0. 30. nan  5. 25. 20. 10.
 15.] 19


In [9]:
# Distinct body locations, alphabetically, followed by their count.
localization_values = metadata_df["localization"].unique()
print(sorted(localization_values), len(localization_values))

['abdomen', 'acral', 'back', 'chest', 'ear', 'face', 'foot', 'genital', 'hand', 'lower extremity', 'neck', 'scalp', 'trunk', 'unknown', 'upper extremity'] 15


In [10]:
# Distinct diagnosis labels (dx), alphabetically, followed by their count.
dx_values = metadata_df["dx"].unique()
print(sorted(dx_values), len(dx_values))

['akiec', 'bcc', 'bkl', 'df', 'mel', 'nv', 'vasc'] 7


## Create the train, test, and validation datasets

In [11]:
# Reorder the rows randomly; the fixed seed makes the order repeatable.
# The original index labels are kept, as the output below shows.
metadata_df_shuffled = shuffle(metadata_df, random_state=RANDOM_SEED)
metadata_df_shuffled

Unnamed: 0,lesion_id,image_id,dx,dx_type,age,sex,localization,dataset,filepath
617,HAM_0006173,ISIC_0031961,bkl,histo,60.0,female,face,rosendahl,dataset/ISIC_0031961.jpg
4992,HAM_0007602,ISIC_0031262,nv,follow_up,50.0,male,abdomen,vidir_molemax,dataset/ISIC_0031262.jpg
6171,HAM_0006289,ISIC_0031835,nv,follow_up,70.0,male,back,vidir_molemax,dataset/ISIC_0031835.jpg
6880,HAM_0004663,ISIC_0025809,nv,histo,55.0,male,upper extremity,vidir_modern,dataset/ISIC_0025809.jpg
1422,HAM_0004102,ISIC_0029726,mel,histo,75.0,male,face,vidir_modern,dataset/ISIC_0029726.jpg
...,...,...,...,...,...,...,...,...,...
8499,HAM_0004030,ISIC_0029926,nv,histo,50.0,male,hand,rosendahl,dataset/ISIC_0029926.jpg
2064,HAM_0002719,ISIC_0029033,mel,histo,30.0,female,upper extremity,rosendahl,dataset/ISIC_0029033.jpg
7985,HAM_0004172,ISIC_0034171,nv,histo,45.0,male,back,vidir_modern,dataset/ISIC_0034171.jpg
2303,HAM_0003504,ISIC_0026823,mel,histo,55.0,male,face,rosendahl,dataset/ISIC_0026823.jpg


In [12]:
# Split at the lesion level rather than the image level. Several images share
# one lesion_id (e.g. HAM_0000118 and HAM_0002867 in the metadata table), so
# sampling rows independently would place pictures of the same lesion in both
# the training set and the evaluation sets and leak information between them.
lesion_ids = pd.Series(metadata_df_shuffled["lesion_id"].unique())

# 80 % of the lesions go to training; random_state is the seed value.
train_lesions = lesion_ids.sample(frac=0.8, random_state=RANDOM_SEED)
holdout_lesions = lesion_ids.drop(train_lesions.index)

# The remaining lesions are divided evenly between test and validation.
test_lesions = holdout_lesions.sample(frac=0.5, random_state=RANDOM_SEED)
validation_lesions = holdout_lesions.drop(test_lesions.index)

# Select every image row that belongs to the lesions of each split.
row_lesion = metadata_df_shuffled["lesion_id"]
train_df = metadata_df_shuffled[row_lesion.isin(train_lesions)]
test_and_validation = metadata_df_shuffled[row_lesion.isin(holdout_lesions)]
test_df = metadata_df_shuffled[row_lesion.isin(test_lesions)]
validation_df = metadata_df_shuffled[row_lesion.isin(validation_lesions)]

# Sanity checks: every row lands in exactly one split and no lesion is shared.
assert len(train_df) + len(test_df) + len(validation_df) == len(metadata_df_shuffled)
assert set(train_df["lesion_id"]).isdisjoint(test_and_validation["lesion_id"])
assert set(test_df["lesion_id"]).isdisjoint(validation_df["lesion_id"])

In [13]:
# Write each split to its own ';'-separated CSV file in the working directory.
split_files = {
    "train.csv": train_df,
    "test.csv": test_df,
    "validation.csv": validation_df,
}
for csv_name, split_df in split_files.items():
    split_df.to_csv(csv_name, sep=';')