## Load Packages

In [66]:
import glob
import numpy as np
import pandas as pd
from sklearn.utils import shuffle

## Load Data

### Load Metadata

In [67]:
# Read the metadata table from metadata.csv (path is relative to the working directory).
METADATA_CSV = "metadata.csv"
metadata_df = pd.read_csv(METADATA_CSV)
# Leave the frame as the cell's last expression so the notebook renders it.
metadata_df

Unnamed: 0,lesion_id,image_id,dx,dx_type,age,sex,localization,dataset
0,HAM_0000118,ISIC_0027419,bkl,histo,80.0,male,scalp,vidir_modern
1,HAM_0000118,ISIC_0025030,bkl,histo,80.0,male,scalp,vidir_modern
2,HAM_0002730,ISIC_0026769,bkl,histo,80.0,male,scalp,vidir_modern
3,HAM_0002730,ISIC_0025661,bkl,histo,80.0,male,scalp,vidir_modern
4,HAM_0001466,ISIC_0031633,bkl,histo,75.0,male,ear,vidir_modern
...,...,...,...,...,...,...,...,...
10010,HAM_0002867,ISIC_0033084,akiec,histo,40.0,male,abdomen,vidir_modern
10011,HAM_0002867,ISIC_0033550,akiec,histo,40.0,male,abdomen,vidir_modern
10012,HAM_0002867,ISIC_0033536,akiec,histo,40.0,male,abdomen,vidir_modern
10013,HAM_0000239,ISIC_0032854,akiec,histo,80.0,male,face,vidir_modern


### Add filepath to dataframe

In [68]:
def map_id_to_path(image_id, image_dir="dataset", extension=".jpg"):
    """Build the relative file path of an image from its image id.

    Parameters
    ----------
    image_id : str
        Image identifier, e.g. "ISIC_0027419".
    image_dir : str, optional
        Directory that holds the image files. Defaults to "dataset".
    extension : str, optional
        File extension including the leading dot. Defaults to ".jpg".

    Returns
    -------
    str
        ``image_dir + "/" + image_id + extension``.

    Raises
    ------
    TypeError
        If ``image_id`` is not a string. Plain concatenation is used on purpose
        so a missing id (e.g. NaN) fails loudly instead of silently producing
        a path such as "dataset/nan.jpg".
    """
    return image_dir + "/" + image_id + extension

# Derive each row's image path from its image_id and store it in a new "filepath" column.
metadata_df["filepath"] = metadata_df["image_id"].apply(map_id_to_path)

In [69]:
# Display the frame again; it now carries the "filepath" column added above.
metadata_df

Unnamed: 0,lesion_id,image_id,dx,dx_type,age,sex,localization,dataset,filepath
0,HAM_0000118,ISIC_0027419,bkl,histo,80.0,male,scalp,vidir_modern,dataset/ISIC_0027419.jpg
1,HAM_0000118,ISIC_0025030,bkl,histo,80.0,male,scalp,vidir_modern,dataset/ISIC_0025030.jpg
2,HAM_0002730,ISIC_0026769,bkl,histo,80.0,male,scalp,vidir_modern,dataset/ISIC_0026769.jpg
3,HAM_0002730,ISIC_0025661,bkl,histo,80.0,male,scalp,vidir_modern,dataset/ISIC_0025661.jpg
4,HAM_0001466,ISIC_0031633,bkl,histo,75.0,male,ear,vidir_modern,dataset/ISIC_0031633.jpg
...,...,...,...,...,...,...,...,...,...
10010,HAM_0002867,ISIC_0033084,akiec,histo,40.0,male,abdomen,vidir_modern,dataset/ISIC_0033084.jpg
10011,HAM_0002867,ISIC_0033550,akiec,histo,40.0,male,abdomen,vidir_modern,dataset/ISIC_0033550.jpg
10012,HAM_0002867,ISIC_0033536,akiec,histo,40.0,male,abdomen,vidir_modern,dataset/ISIC_0033536.jpg
10013,HAM_0000239,ISIC_0032854,akiec,histo,80.0,male,face,vidir_modern,dataset/ISIC_0032854.jpg


In [70]:
# Persist the metadata (including the file paths) as a semicolon-separated CSV.
metadata_df.to_csv(path_or_buf="dataset.csv", sep=";")

## Inspect the distinct values of each variable

In [71]:
# Distinct values of the "sex" column (sorted) and how many there are.
sex_values = metadata_df["sex"].unique()
print(sorted(sex_values), len(sex_values))

['female', 'male', 'unknown'] 3


In [72]:
# Distinct values of the "age" column in order of first appearance, and their count.
# unique() keeps NaN as a value, so missing ages are included in the count.
age_values = metadata_df["age"].unique()
print(age_values, len(age_values))

[80. 75. 60. 70. 55. 85. 65. 40. 50. 45. 35.  0. 30. nan  5. 25. 20. 10.
 15.] 19


In [73]:
# Distinct values of the "localization" column (sorted) and how many there are.
localization_values = metadata_df["localization"].unique()
print(sorted(localization_values), len(localization_values))

['abdomen', 'acral', 'back', 'chest', 'ear', 'face', 'foot', 'genital', 'hand', 'lower extremity', 'neck', 'scalp', 'trunk', 'unknown', 'upper extremity'] 15


In [74]:
# Distinct values of the "dx" column (sorted) and how many there are.
dx_values = metadata_df["dx"].unique()
print(sorted(dx_values), len(dx_values))

['akiec', 'bcc', 'bkl', 'df', 'mel', 'nv', 'vasc'] 7


## Create the train, test, and validation datasets

In [75]:
# Shuffle the rows with a fixed seed. Without random_state the row order differs
# on every run, and because DataFrame.sample picks rows by position, any seeded
# sampling done on this frame afterwards would still select different rows each time.
metadata_df_shuffled = shuffle(metadata_df, random_state=43)
# Leave the frame as the cell's last expression so the notebook renders it.
metadata_df_shuffled

Unnamed: 0,lesion_id,image_id,dx,dx_type,age,sex,localization,dataset,filepath
2419,HAM_0000845,ISIC_0025244,vasc,consensus,45.0,female,trunk,vidir_molemax,dataset/ISIC_0025244.jpg
3316,HAM_0003438,ISIC_0031151,nv,follow_up,45.0,female,trunk,vidir_molemax,dataset/ISIC_0031151.jpg
1950,HAM_0002491,ISIC_0026770,mel,histo,65.0,male,upper extremity,vienna_dias,dataset/ISIC_0026770.jpg
7143,HAM_0000407,ISIC_0032771,nv,histo,60.0,female,upper extremity,vidir_modern,dataset/ISIC_0032771.jpg
1264,HAM_0002720,ISIC_0025277,mel,histo,75.0,male,ear,vidir_modern,dataset/ISIC_0025277.jpg
...,...,...,...,...,...,...,...,...,...
7629,HAM_0005566,ISIC_0032746,nv,histo,70.0,male,chest,vidir_modern,dataset/ISIC_0032746.jpg
7559,HAM_0002253,ISIC_0033492,nv,histo,35.0,male,back,vidir_modern,dataset/ISIC_0033492.jpg
7770,HAM_0004038,ISIC_0032990,nv,histo,50.0,female,back,vidir_modern,dataset/ISIC_0032990.jpg
3053,HAM_0006498,ISIC_0028446,nv,follow_up,45.0,male,lower extremity,vidir_molemax,dataset/ISIC_0028446.jpg


In [76]:
# Split by lesion rather than by row: several images can share one lesion_id
# (e.g. HAM_0000118 and HAM_0002867 in the metadata shown earlier), and a row-level
# split would place images of the same lesion in both the training and the
# evaluation sets, leaking information into the test/validation metrics.
SPLIT_SEED = 43  # seed used for both sampling steps

# Sort the ids so the split does not depend on the row order of metadata_df_shuffled.
lesion_ids = pd.Series(sorted(metadata_df_shuffled["lesion_id"].unique()))

train_lesions = lesion_ids.sample(frac=0.8, random_state=SPLIT_SEED)  # ~80% of lesions
holdout_lesions = lesion_ids.drop(train_lesions.index)
test_lesions = holdout_lesions.sample(frac=0.5, random_state=SPLIT_SEED)  # half of the rest

in_train = metadata_df_shuffled["lesion_id"].isin(train_lesions)
in_test = metadata_df_shuffled["lesion_id"].isin(test_lesions)

train_df = metadata_df_shuffled[in_train]
test_and_validation = metadata_df_shuffled[~in_train]

test_df = metadata_df_shuffled[in_test]
validation_df = metadata_df_shuffled[~in_train & ~in_test]

# Sanity checks: every row lands in exactly one split and no lesion spans two splits.
assert len(train_df) + len(test_df) + len(validation_df) == len(metadata_df_shuffled)
assert not set(train_df["lesion_id"]) & set(test_and_validation["lesion_id"])
assert not set(test_df["lesion_id"]) & set(validation_df["lesion_id"])

In [77]:
# Write each split to its own semicolon-separated CSV file.
for csv_name, split_df in (
    ("train.csv", train_df),
    ("test.csv", test_df),
    ("validation.csv", validation_df),
):
    split_df.to_csv(csv_name, sep=';')