## Load Packages

In [78]:
import glob
import numpy as np
import pandas as pd
from sklearn.utils import shuffle

## Load Data

### Load Metadata

In [79]:
# Read the metadata table (one row per image) from the CSV in the working directory.
metadata_df = pd.read_csv("metadata.csv")
# Bare last expression: lets the notebook render the frame as a rich table.
metadata_df

Unnamed: 0,lesion_id,image_id,dx,dx_type,age,sex,localization,dataset
0,HAM_0000118,ISIC_0027419,bkl,histo,80.0,male,scalp,vidir_modern
1,HAM_0000118,ISIC_0025030,bkl,histo,80.0,male,scalp,vidir_modern
2,HAM_0002730,ISIC_0026769,bkl,histo,80.0,male,scalp,vidir_modern
3,HAM_0002730,ISIC_0025661,bkl,histo,80.0,male,scalp,vidir_modern
4,HAM_0001466,ISIC_0031633,bkl,histo,75.0,male,ear,vidir_modern
...,...,...,...,...,...,...,...,...
10010,HAM_0002867,ISIC_0033084,akiec,histo,40.0,male,abdomen,vidir_modern
10011,HAM_0002867,ISIC_0033550,akiec,histo,40.0,male,abdomen,vidir_modern
10012,HAM_0002867,ISIC_0033536,akiec,histo,40.0,male,abdomen,vidir_modern
10013,HAM_0000239,ISIC_0032854,akiec,histo,80.0,male,face,vidir_modern


### Add filepath to dataframe

In [80]:
def map_id_to_path(image_id: str, image_dir: str = "dataset") -> str:
    """Build the relative path of the JPEG file that belongs to an image id.

    Args:
        image_id: Image identifier as stored in the ``image_id`` column,
            e.g. ``"ISIC_0027419"``.
        image_dir: Directory that contains the image files. Defaults to
            ``"dataset"``, the location that was previously hard-coded, so
            existing one-argument calls return exactly the same path.

    Returns:
        The string ``"<image_dir>/<image_id>.jpg"``.

    Raises:
        TypeError: If ``image_id`` is not a string (e.g. a missing value).
    """
    # Plain concatenation (rather than an f-string) is deliberate: a non-string
    # id such as NaN fails loudly instead of silently yielding "dataset/nan.jpg".
    return image_dir + "/" + image_id + ".jpg"

# Derive every row's image path from its image_id and store it as a new
# "filepath" column. The function is passed directly; no lambda wrapper needed.
image_ids = metadata_df["image_id"]
metadata_df["filepath"] = image_ids.apply(map_id_to_path)

In [81]:
# Display the frame again to confirm the "filepath" column is present.
metadata_df

Unnamed: 0,lesion_id,image_id,dx,dx_type,age,sex,localization,dataset,filepath
0,HAM_0000118,ISIC_0027419,bkl,histo,80.0,male,scalp,vidir_modern,dataset/ISIC_0027419.jpg
1,HAM_0000118,ISIC_0025030,bkl,histo,80.0,male,scalp,vidir_modern,dataset/ISIC_0025030.jpg
2,HAM_0002730,ISIC_0026769,bkl,histo,80.0,male,scalp,vidir_modern,dataset/ISIC_0026769.jpg
3,HAM_0002730,ISIC_0025661,bkl,histo,80.0,male,scalp,vidir_modern,dataset/ISIC_0025661.jpg
4,HAM_0001466,ISIC_0031633,bkl,histo,75.0,male,ear,vidir_modern,dataset/ISIC_0031633.jpg
...,...,...,...,...,...,...,...,...,...
10010,HAM_0002867,ISIC_0033084,akiec,histo,40.0,male,abdomen,vidir_modern,dataset/ISIC_0033084.jpg
10011,HAM_0002867,ISIC_0033550,akiec,histo,40.0,male,abdomen,vidir_modern,dataset/ISIC_0033550.jpg
10012,HAM_0002867,ISIC_0033536,akiec,histo,40.0,male,abdomen,vidir_modern,dataset/ISIC_0033536.jpg
10013,HAM_0000239,ISIC_0032854,akiec,histo,80.0,male,face,vidir_modern,dataset/ISIC_0032854.jpg


In [82]:
# Persist the metadata including the "filepath" column.
# Note: the file is semicolon-separated and the row index is written as the
# first (unnamed) column, so read it back with sep=';' and index_col=0.
metadata_df.to_csv("dataset.csv", sep=';')

## Inspect the unique values of each variable

In [83]:
# Distinct values of the "sex" column (sorted) and how many there are.
sex_values = metadata_df["sex"].unique()
print(sorted(sex_values), len(sex_values))

['female', 'male', 'unknown'] 3


In [84]:
# Distinct values of the "age" column in order of first appearance, plus their
# count. Left unsorted on purpose: the column contains NaN, which is reported
# (and counted) as its own value.
age_values = metadata_df["age"].unique()
print(age_values, len(age_values))

[80. 75. 60. 70. 55. 85. 65. 40. 50. 45. 35.  0. 30. nan  5. 25. 20. 10.
 15.] 19


In [85]:
# Distinct values of the "localization" column (sorted) and how many there are.
localization_values = metadata_df["localization"].unique()
print(sorted(localization_values), len(localization_values))

['abdomen', 'acral', 'back', 'chest', 'ear', 'face', 'foot', 'genital', 'hand', 'lower extremity', 'neck', 'scalp', 'trunk', 'unknown', 'upper extremity'] 15


In [86]:
# Distinct values of the "dx" column (sorted) and how many there are.
dx_values = metadata_df["dx"].unique()
print(sorted(dx_values), len(dx_values))

['akiec', 'bcc', 'bkl', 'df', 'mel', 'nv', 'vasc'] 7


## Create the train, test, and validation datasets

In [87]:
# Shuffle the rows once before splitting.
# A fixed random_state makes the permutation reproducible across kernel
# restarts. Without it the row order differs on every run, and because the
# later `sample(..., random_state=43)` calls pick rows by position, the
# resulting splits would differ from run to run despite being seeded.
metadata_df_shuffled = shuffle(metadata_df, random_state=43)
metadata_df_shuffled

Unnamed: 0,lesion_id,image_id,dx,dx_type,age,sex,localization,dataset,filepath
9356,HAM_0002332,ISIC_0025670,nv,consensus,0.0,male,foot,vidir_modern,dataset/ISIC_0025670.jpg
4516,HAM_0001350,ISIC_0030221,nv,follow_up,45.0,female,trunk,vidir_molemax,dataset/ISIC_0030221.jpg
91,HAM_0003410,ISIC_0024635,bkl,histo,60.0,female,face,vidir_modern,dataset/ISIC_0024635.jpg
2791,HAM_0001654,ISIC_0030755,bcc,histo,85.0,male,lower extremity,rosendahl,dataset/ISIC_0030755.jpg
4097,HAM_0007158,ISIC_0027206,nv,follow_up,50.0,female,lower extremity,vidir_molemax,dataset/ISIC_0027206.jpg
...,...,...,...,...,...,...,...,...,...
1975,HAM_0000888,ISIC_0025085,mel,histo,75.0,male,upper extremity,vienna_dias,dataset/ISIC_0025085.jpg
7384,HAM_0006883,ISIC_0032619,nv,histo,40.0,female,face,vidir_modern,dataset/ISIC_0032619.jpg
3952,HAM_0002891,ISIC_0028906,nv,follow_up,65.0,female,lower extremity,vidir_molemax,dataset/ISIC_0028906.jpg
7115,HAM_0006268,ISIC_0026432,nv,histo,50.0,female,lower extremity,vidir_modern,dataset/ISIC_0026432.jpg


In [88]:
# Split at the lesion level instead of the image level.
# Several rows share one lesion_id (multiple images of the same lesion), so
# sampling rows directly can place images of a single lesion in different
# splits and leak information from training into test/validation.
SPLIT_SEED = 43                 # seed for both sampling steps
TRAIN_FRACTION = 0.8            # share of lesions used for training
TEST_FRACTION_OF_HOLDOUT = 0.5  # the remaining lesions are halved into test / validation

# Sorting the unique ids makes the assignment depend only on SPLIT_SEED, not on
# the row order produced by the preceding shuffle.
lesion_ids = (
    metadata_df_shuffled["lesion_id"]
    .drop_duplicates()
    .sort_values()
    .reset_index(drop=True)
)

train_lesions = lesion_ids.sample(frac=TRAIN_FRACTION, random_state=SPLIT_SEED)
holdout_lesions = lesion_ids.drop(train_lesions.index)
test_lesions = holdout_lesions.sample(frac=TEST_FRACTION_OF_HOLDOUT, random_state=SPLIT_SEED)
validation_lesions = holdout_lesions.drop(test_lesions.index)

# Every image follows its lesion into exactly one split.
lesion_of_row = metadata_df_shuffled["lesion_id"]
train_df = metadata_df_shuffled[lesion_of_row.isin(train_lesions)]
test_and_validation = metadata_df_shuffled[lesion_of_row.isin(holdout_lesions)]
test_df = metadata_df_shuffled[lesion_of_row.isin(test_lesions)]
validation_df = metadata_df_shuffled[lesion_of_row.isin(validation_lesions)]

# Sanity checks: the splits partition the data and share no lesion.
assert len(train_df) + len(test_df) + len(validation_df) == len(metadata_df_shuffled)
assert not set(train_lesions) & set(test_lesions)
assert not set(train_lesions) & set(validation_lesions)
assert not set(test_lesions) & set(validation_lesions)

In [89]:
# Write each split to its own semicolon-separated CSV (row index included).
for split_df, csv_name in (
    (train_df, "train.csv"),
    (test_df, "test.csv"),
    (validation_df, "validation.csv"),
):
    split_df.to_csv(csv_name, sep=';')