In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.image as mping
import glob
import os
import shutil

In [2]:
# Filesystem locations: raw dataset inputs first, then the generated split.
main_data_dir = 'D:/Data/HAM100000 - Harvard Dataset'
metadata_dir = f'{main_data_dir}/HAM10000_metadata.csv'
img_data_dir = f'{main_data_dir}/img_data'
test_data_dir = f'{main_data_dir}/test'
test_label_dir = f'{main_data_dir}/test_label.csv'

# Output tree for the preprocessed split: one folder each for train and val.
preprocessed_data_dir = f'{main_data_dir}/preprocessed_data_15split'
train_dir = f'{preprocessed_data_dir}/train'
val_dir = f'{preprocessed_data_dir}/val'

# Label CSVs written next to the image folders.
# Note: test_label_path points at the *val* label file.
train_label_path = f'{preprocessed_data_dir}/train_label.csv'
test_label_path = f'{preprocessed_data_dir}/val_label.csv'

In [3]:
# Load the metadata CSV (path held in metadata_dir) into a DataFrame
# and preview its first rows.
data_pd = pd.read_csv(metadata_dir)
data_pd.head()

Unnamed: 0,lesion_id,image_id,dx,dx_type,age,sex,localization,dataset
0,HAM_0000118,ISIC_0027419,bkl,histo,80.0,male,scalp,vidir_modern
1,HAM_0000118,ISIC_0025030,bkl,histo,80.0,male,scalp,vidir_modern
2,HAM_0002730,ISIC_0026769,bkl,histo,80.0,male,scalp,vidir_modern
3,HAM_0002730,ISIC_0025661,bkl,histo,80.0,male,scalp,vidir_modern
4,HAM_0001466,ISIC_0031633,bkl,histo,75.0,male,ear,vidir_modern


In [4]:
# Discard every row that has at least one missing value, then report the
# remaining (rows, columns). The raw frame is overwritten by this assignment.
data_pd = data_pd.dropna()
data_pd.shape

(9958, 8)

In [6]:
from sklearn.model_selection import train_test_split

In [7]:
# Hold out 15% of the rows, stratified on the diagnosis column `dx` so both
# sides keep the same class proportions.
# random_state pins the shuffle: without it every kernel restart produces a
# different split, so the copied image folders and label CSVs would not be
# reproducible.
# NOTE(review): the metadata preview shows several image_ids sharing one
# lesion_id; a row-level split can place images of the same lesion on both
# sides. Consider a group-aware split if that matters for evaluation.
train_df, test_df = train_test_split(
    data_pd,
    test_size=0.15,
    stratify=data_pd['dx'],
    random_state=42,
)

In [8]:
# Build the held-out id lookup once. Previously the set was rebuilt from
# test_df on every single row passed through the function.
_test_image_ids = set(test_df['image_id'])


def identify_trainOrtest(x):
    """Return 'test' if image id ``x`` is in the held-out split, else 'train'.

    ``x`` is converted with ``str`` before the membership check against the
    image ids taken from ``test_df``.
    """
    return 'test' if str(x) in _test_image_ids else 'train'


# Tag every metadata row with its split, then rebuild train_df from the
# tagged frame (this replaces the train_df returned by train_test_split and
# carries the extra 'train_test_split' column).
data_pd['train_test_split'] = data_pd['image_id'].apply(identify_trainOrtest)
train_df = data_pd[data_pd['train_test_split'] == 'train']
train_df.head()

Unnamed: 0,lesion_id,image_id,dx,dx_type,age,sex,localization,dataset,train_test_split
0,HAM_0000118,ISIC_0027419,bkl,histo,80.0,male,scalp,vidir_modern,train
1,HAM_0000118,ISIC_0025030,bkl,histo,80.0,male,scalp,vidir_modern,train
2,HAM_0002730,ISIC_0026769,bkl,histo,80.0,male,scalp,vidir_modern,train
3,HAM_0002730,ISIC_0025661,bkl,histo,80.0,male,scalp,vidir_modern,train
4,HAM_0001466,ISIC_0031633,bkl,histo,75.0,male,ear,vidir_modern,train


In [9]:
# Preview the first rows of the held-out split.
test_df.head()

Unnamed: 0,lesion_id,image_id,dx,dx_type,age,sex,localization,dataset
5407,HAM_0005477,ISIC_0029552,nv,follow_up,75.0,male,lower extremity,vidir_molemax
2517,HAM_0007002,ISIC_0029515,bcc,histo,80.0,male,face,vidir_modern
9558,HAM_0004212,ISIC_0034114,nv,consensus,40.0,female,unknown,vidir_modern
6818,HAM_0002670,ISIC_0030426,nv,histo,75.0,male,back,vidir_modern
6092,HAM_0002913,ISIC_0025070,nv,follow_up,25.0,male,abdomen,vidir_molemax


In [10]:
# Materialise the image ids of each split as plain lists and report sizes.
train_list = train_df['image_id'].tolist()
test_list = test_df['image_id'].tolist()
print(f"Number of train: {len(train_list)}")
print(f"Number of test: {len(test_list)}")

Number of train: 8464
Number of test: 1494


In [11]:
# Index the metadata by image_id so a label can be fetched with
# data_pd.loc[image_id, 'dx'].
# Guarded so that re-running this cell is a no-op: once image_id is the
# index, an unconditional set_index would raise KeyError.
if data_pd.index.name != 'image_id':
    data_pd = data_pd.set_index('image_id')

In [12]:
# Class names; one subdirectory per name is created under the train and val
# folders. Presumably these are the distinct values of the `dx` column —
# verify against the metadata.
targetnames = ['akiec', 'bcc', 'bkl', 'df', 'mel', 'nv', 'vasc']

In [13]:
# Create one folder per class under both the train and val directories.
# makedirs(..., exist_ok=True) also creates missing parent directories and
# does not fail when the cell is re-run; os.mkdir raised in both situations.
for class_name in targetnames:
    for split_dir in (train_dir, val_dir):
        os.makedirs(split_dir + '/' + class_name, exist_ok=True)

In [14]:
# Copy every training image into train_dir/<dx label>/<image_id>.jpg.
for image_id in train_list:
    jpg_name = f'{image_id}.jpg'
    dx_label = data_pd.loc[image_id, 'dx']
    shutil.copyfile(
        f'{img_data_dir}/{jpg_name}',
        os.path.join(train_dir, dx_label, jpg_name),
    )

In [15]:
# Copy every held-out image into val_dir/<dx label>/<image_id>.jpg.
for image_id in test_list:
    jpg_name = f'{image_id}.jpg'
    dx_label = data_pd.loc[image_id, 'dx']
    shutil.copyfile(
        f'{img_data_dir}/{jpg_name}',
        os.path.join(val_dir, dx_label, jpg_name),
    )

In [16]:
# Preview the first rows of the training split.
train_df.head()

Unnamed: 0,lesion_id,image_id,dx,dx_type,age,sex,localization,dataset,train_test_split
0,HAM_0000118,ISIC_0027419,bkl,histo,80.0,male,scalp,vidir_modern,train
1,HAM_0000118,ISIC_0025030,bkl,histo,80.0,male,scalp,vidir_modern,train
2,HAM_0002730,ISIC_0026769,bkl,histo,80.0,male,scalp,vidir_modern,train
3,HAM_0002730,ISIC_0025661,bkl,histo,80.0,male,scalp,vidir_modern,train
4,HAM_0001466,ISIC_0031633,bkl,histo,75.0,male,ear,vidir_modern,train


In [17]:
# Remove the columns that are not written to the label files. train_df also
# carries the helper 'train_test_split' column, which test_df does not have.
drop_train = train_df.drop(
    columns=['lesion_id', 'dx_type', 'dataset', 'train_test_split']
)
drop_test = test_df.drop(columns=['lesion_id', 'dx_type', 'dataset'])

In [18]:
# Preview the trimmed training label frame.
drop_train.head()

Unnamed: 0,image_id,dx,age,sex,localization
0,ISIC_0027419,bkl,80.0,male,scalp
1,ISIC_0025030,bkl,80.0,male,scalp
2,ISIC_0026769,bkl,80.0,male,scalp
3,ISIC_0025661,bkl,80.0,male,scalp
4,ISIC_0031633,bkl,75.0,male,ear


In [19]:
# Preview the trimmed held-out label frame.
drop_test.head()

Unnamed: 0,image_id,dx,age,sex,localization
5407,ISIC_0029552,nv,75.0,male,lower extremity
2517,ISIC_0029515,bcc,80.0,male,face
9558,ISIC_0034114,nv,40.0,female,unknown
6818,ISIC_0030426,nv,75.0,male,back
6092,ISIC_0025070,nv,25.0,male,abdomen


In [20]:
# Write both label frames to CSV. The held-out frame goes to test_label_path,
# which is the val label file.
for label_frame, csv_path in (
    (drop_train, train_label_path),
    (drop_test, test_label_path),
):
    label_frame.to_csv(csv_path)