In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.image as mping
import glob
import os
import shutil

In [2]:
# Path constants (plain strings joined with '/'), all rooted at main_data_dir.
main_data_dir = 'D:/Data/HAM100000 - Harvard Dataset'

# Raw inputs. metadata_dir and test_label_dir are CSV file paths, not
# directories, despite the _dir suffix.
metadata_dir = '/'.join([main_data_dir, 'HAM10000_metadata.csv'])
img_data_dir = '/'.join([main_data_dir, 'img_data'])
test_data_dir = '/'.join([main_data_dir, 'test'])
test_label_dir = '/'.join([main_data_dir, 'test_label.csv'])

# Output tree produced by this notebook: per-split image folders ...
preprocessed_data_dir = '/'.join([main_data_dir, 'preprocessed_data_15split'])
train_dir = '/'.join([preprocessed_data_dir, 'train'])
val_dir = '/'.join([preprocessed_data_dir, 'val'])

# ... and the label CSVs. Note that test_label_path is stored as val_label.csv.
train_label_path = '/'.join([preprocessed_data_dir, 'train_label.csv'])
test_label_path = '/'.join([preprocessed_data_dir, 'val_label.csv'])

In [3]:
# Read the metadata CSV into a DataFrame and preview its first five rows.
data_pd = pd.read_csv(filepath_or_buffer=metadata_dir)
data_pd.head(n=5)

Unnamed: 0,lesion_id,image_id,dx,dx_type,age,sex,localization,dataset
0,HAM_0000118,ISIC_0027419,bkl,histo,80.0,male,scalp,vidir_modern
1,HAM_0000118,ISIC_0025030,bkl,histo,80.0,male,scalp,vidir_modern
2,HAM_0002730,ISIC_0026769,bkl,histo,80.0,male,scalp,vidir_modern
3,HAM_0002730,ISIC_0025661,bkl,histo,80.0,male,scalp,vidir_modern
4,HAM_0001466,ISIC_0031633,bkl,histo,75.0,male,ear,vidir_modern


In [4]:
# Discard every row that has at least one missing value, then show the
# remaining (rows, columns) shape.
data_pd = data_pd.dropna(axis=0, how='any')
data_pd.shape

(9958, 8)

In [5]:
from sklearn.model_selection import train_test_split

In [6]:
# Hold out 15% of the rows, stratified on the `dx` column so both parts keep
# the same class proportions.
# The fixed seed makes the split reproducible: without it every fresh run
# shuffles differently, so the copied image folders and the label CSVs written
# later would not match a previous run.
RANDOM_STATE = 42
# NOTE(review): the split is per image, not per lesion. The metadata contains
# several image_ids for one lesion_id (e.g. HAM_0000118), so images of the same
# lesion can land on both sides of the split. Consider a group-aware split on
# `lesion_id` if that leakage matters for evaluation.
train_df, test_df = train_test_split(
    data_pd,
    test_size=0.15,
    stratify=data_pd['dx'],
    random_state=RANDOM_STATE,
)

In [7]:
def identify_trainOrtest(x, test_ids=None):
    """Label an image id as belonging to the 'test' or the 'train' split.

    Args:
        x: Image id to classify. It is converted with ``str`` before lookup.
        test_ids: Optional collection of test image ids (a ``set`` is best).
            When omitted, the ids are read from the notebook-level ``test_df``
            on every call, as before. Pass a precomputed set when calling this
            once per row so the set is not rebuilt for each row.

    Returns:
        ``'test'`` if ``str(x)`` is one of the test ids, otherwise ``'train'``.
    """
    if test_ids is None:
        # Backward-compatible default: fall back to the global test split.
        test_ids = set(test_df['image_id'])
    return 'test' if str(x) in test_ids else 'train'

# Tag every metadata row with the split it landed in, then rebuild train_df
# from the tagged frame so it carries the extra 'train_test_split' column.
# The test-id set is built once and checked with a vectorised isin; a per-row
# apply would rebuild that set for every single row.
test_image_ids = set(test_df['image_id'])
is_test_row = data_pd['image_id'].astype(str).isin(test_image_ids)
data_pd['train_test_split'] = is_test_row.map({True: 'test', False: 'train'})
train_df = data_pd[data_pd['train_test_split'] == 'train']
train_df.head()

Unnamed: 0,lesion_id,image_id,dx,dx_type,age,sex,localization,dataset,train_test_split
0,HAM_0000118,ISIC_0027419,bkl,histo,80.0,male,scalp,vidir_modern,train
3,HAM_0002730,ISIC_0025661,bkl,histo,80.0,male,scalp,vidir_modern,train
4,HAM_0001466,ISIC_0031633,bkl,histo,75.0,male,ear,vidir_modern,train
5,HAM_0001466,ISIC_0027850,bkl,histo,75.0,male,ear,vidir_modern,train
6,HAM_0002761,ISIC_0029176,bkl,histo,60.0,male,face,vidir_modern,train


In [8]:
# Preview the first five rows of the held-out split.
test_df.head(n=5)

Unnamed: 0,lesion_id,image_id,dx,dx_type,age,sex,localization,dataset
9332,HAM_0004646,ISIC_0029140,nv,consensus,25.0,male,back,vidir_modern
1713,HAM_0002342,ISIC_0032925,mel,histo,40.0,male,back,vidir_modern
9337,HAM_0002832,ISIC_0032026,nv,consensus,5.0,male,trunk,vidir_modern
4299,HAM_0006768,ISIC_0028284,nv,follow_up,55.0,female,lower extremity,vidir_molemax
3607,HAM_0000227,ISIC_0032201,nv,follow_up,45.0,male,lower extremity,vidir_molemax


In [9]:
# Image ids of each split as plain Python lists, followed by a size report.
train_list, test_list = (list(split['image_id']) for split in (train_df, test_df))
print("Number of train: %d" % len(train_list))
print("Number of test: %d" % len(test_list))

Number of train: 8464
Number of test: 1494


In [10]:
# Index the metadata by image_id so a label can be looked up with
# data_pd.loc[image_id, 'dx'].
# The guard makes the cell safe to re-run: once image_id has become the index
# it is no longer a column, and calling set_index again would raise KeyError.
# Rebinding (instead of inplace=True) avoids mutating the frame in place.
if data_pd.index.name != 'image_id':
    data_pd = data_pd.set_index('image_id')

In [11]:
# Class labels; each one names a sub-directory of the train and val folders.
targetnames = [
    'akiec',
    'bcc',
    'bkl',
    'df',
    'mel',
    'nv',
    'vasc',
]

In [12]:
# Create one sub-directory per class under both the train and the val root.
# os.makedirs also creates missing parent directories, and exist_ok=True makes
# the cell re-runnable; plain os.mkdir fails when the parent is missing and
# raises FileExistsError on a second run.
for class_name in targetnames:
    for split_root in (train_dir, val_dir):
        os.makedirs(os.path.join(split_root, class_name), exist_ok=True)

In [13]:
# Copy every training image to train_dir/<dx label>/<image_id>.jpg.
# The label comes from data_pd, which is looked up by image_id here.
for image_id in train_list:
    jpg_name = image_id + '.jpg'
    dx_label = data_pd.loc[image_id, 'dx']
    shutil.copyfile(
        '/'.join([img_data_dir, jpg_name]),
        os.path.join(train_dir, dx_label, jpg_name),
    )

In [14]:
# Copy every held-out image to val_dir/<dx label>/<image_id>.jpg.
# The label comes from data_pd, which is looked up by image_id here.
for image_id in test_list:
    jpg_name = image_id + '.jpg'
    dx_label = data_pd.loc[image_id, 'dx']
    shutil.copyfile(
        '/'.join([img_data_dir, jpg_name]),
        os.path.join(val_dir, dx_label, jpg_name),
    )

In [15]:
# Preview the first five rows of the training split before trimming columns.
train_df.head(n=5)

Unnamed: 0,lesion_id,image_id,dx,dx_type,age,sex,localization,dataset,train_test_split
0,HAM_0000118,ISIC_0027419,bkl,histo,80.0,male,scalp,vidir_modern,train
3,HAM_0002730,ISIC_0025661,bkl,histo,80.0,male,scalp,vidir_modern,train
4,HAM_0001466,ISIC_0031633,bkl,histo,75.0,male,ear,vidir_modern,train
5,HAM_0001466,ISIC_0027850,bkl,histo,75.0,male,ear,vidir_modern,train
6,HAM_0002761,ISIC_0029176,bkl,histo,60.0,male,face,vidir_modern,train


In [16]:
# Remove the columns that are not written to the label files. The training
# frame also loses its helper 'train_test_split' tag column.
drop_train = train_df.drop(
    columns=['lesion_id', 'dx_type', 'dataset', 'train_test_split']
)
drop_test = test_df.drop(columns=['lesion_id', 'dx_type', 'dataset'])

In [17]:
# Preview the trimmed training label table.
drop_train.head(n=5)

Unnamed: 0,image_id,dx,age,sex,localization
0,ISIC_0027419,bkl,80.0,male,scalp
3,ISIC_0025661,bkl,80.0,male,scalp
4,ISIC_0031633,bkl,75.0,male,ear
5,ISIC_0027850,bkl,75.0,male,ear
6,ISIC_0029176,bkl,60.0,male,face


In [18]:
# Preview the trimmed held-out label table.
drop_test.head(n=5)

Unnamed: 0,image_id,dx,age,sex,localization
9332,ISIC_0029140,nv,25.0,male,back
1713,ISIC_0032925,mel,40.0,male,back
9337,ISIC_0032026,nv,5.0,male,trunk
4299,ISIC_0028284,nv,55.0,female,lower extremity
3607,ISIC_0032201,nv,45.0,male,lower extremity


In [19]:
# Write both label tables to CSV. The frames' row index is written as the
# first column because to_csv keeps the index by default.
drop_train.to_csv(path_or_buf=train_label_path)
drop_test.to_csv(path_or_buf=test_label_path)