In [1]:
import pandas as pd
import numpy as np
from shutil import copy, rmtree 
import os
from sklearn.model_selection import train_test_split

In [2]:
# Read the metadata CSV into a DataFrame and preview its first five rows
# (the preview shows lesion_id, image_id and the `dx` label per row).
data_pd = pd.read_csv(filepath_or_buffer='HAM10000_metadata.csv')
data_pd.head(n=5)

Unnamed: 0,lesion_id,image_id,dx,dx_type,age,sex,localization,dataset
0,HAM_0000118,ISIC_0027419,bkl,histo,80.0,male,scalp,vidir_modern
1,HAM_0000118,ISIC_0025030,bkl,histo,80.0,male,scalp,vidir_modern
2,HAM_0002730,ISIC_0026769,bkl,histo,80.0,male,scalp,vidir_modern
3,HAM_0002730,ISIC_0025661,bkl,histo,80.0,male,scalp,vidir_modern
4,HAM_0001466,ISIC_0031633,bkl,histo,75.0,male,ear,vidir_modern


In [3]:
# Group by lesion and count non-null entries per column, so every column
# holds the number of rows (images) recorded for that lesion_id.
df_count = data_pd.groupby(by='lesion_id').count()
df_count.head(n=5)

Unnamed: 0_level_0,image_id,dx,dx_type,age,sex,localization,dataset
lesion_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
HAM_0000000,2,2,2,2,2,2,2
HAM_0000001,1,1,1,1,1,1,1
HAM_0000002,3,3,3,3,3,3,3
HAM_0000003,1,1,1,1,1,1,1
HAM_0000004,1,1,1,1,1,1,1


In [4]:
# Keep only lesions that have exactly one row, then move lesion_id from the
# index back into an ordinary column so it can be looked up by name.
df_count = df_count.loc[df_count['dx'] == 1].reset_index()

In [5]:
def duplicates(x, unique_ids=None):
    """Label a lesion id as having duplicate images or not.

    Parameters
    ----------
    x : hashable
        The lesion id to classify.
    unique_ids : container, optional
        Lesion ids known to have exactly one image. When omitted, the set is
        built from the global ``df_count['lesion_id']`` (the original
        behaviour). Pass a pre-built set, e.g. via
        ``Series.apply(duplicates, args=(ids,))``, to avoid rebuilding it on
        every call.

    Returns
    -------
    str
        ``'no'`` if ``x`` is in ``unique_ids``, otherwise ``'duplicates'``.
    """
    # `is None` (not truthiness) so an explicitly empty container is honoured.
    if unique_ids is None:
        unique_ids = set(df_count['lesion_id'])
    return 'no' if x in unique_ids else 'duplicates'

In [6]:
# Vectorised membership test: a lesion_id that appears in df_count is labelled
# 'no', every other lesion_id is labelled 'duplicates'. This yields the same
# labels as applying `duplicates` row by row, without rebuilding the id set
# once per row.
data_pd['is_duplicate'] = (
    data_pd['lesion_id']
    .isin(df_count['lesion_id'])
    .map({True: 'no', False: 'duplicates'})
)
data_pd.head()

Unnamed: 0,lesion_id,image_id,dx,dx_type,age,sex,localization,dataset,is_duplicate
0,HAM_0000118,ISIC_0027419,bkl,histo,80.0,male,scalp,vidir_modern,duplicates
1,HAM_0000118,ISIC_0025030,bkl,histo,80.0,male,scalp,vidir_modern,duplicates
2,HAM_0002730,ISIC_0026769,bkl,histo,80.0,male,scalp,vidir_modern,duplicates
3,HAM_0002730,ISIC_0025661,bkl,histo,80.0,male,scalp,vidir_modern,duplicates
4,HAM_0001466,ISIC_0031633,bkl,histo,75.0,male,ear,vidir_modern,duplicates


In [7]:
# Rebind df_count to the full metadata rows whose lesion is flagged 'no'
# (i.e. not a duplicate); this frame is what the split below samples from.
df_count = data_pd.loc[data_pd['is_duplicate'].eq('no')]

In [8]:
# Hold out 15% of the non-duplicate rows as the test set, stratified on the
# `dx` label. The fixed random_state makes the split (and therefore test_df
# and every directory built from it) reproducible across kernel restarts.
train, test_df = train_test_split(
    df_count,
    test_size=0.15,
    stratify=df_count['dx'],
    random_state=42,
)

In [9]:
def identify_trainOrtest(x, test_ids=None):
    """Return which split an image id belongs to.

    Parameters
    ----------
    x : object
        The image id; it is converted with ``str`` before the lookup.
    test_ids : container of str, optional
        Image ids assigned to the test split. When omitted, the set is built
        from the global ``test_df['image_id']`` (the original behaviour).
        Pass a pre-built set, e.g. via
        ``Series.apply(identify_trainOrtest, args=(ids,))``, to avoid
        rebuilding it on every call.

    Returns
    -------
    str
        ``'test'`` if ``str(x)`` is in ``test_ids``, otherwise ``'train'``.
    """
    # `is None` (not truthiness) so an explicitly empty container is honoured.
    if test_ids is None:
        test_ids = set(test_df['image_id'])
    return 'test' if str(x) in test_ids else 'train'

#creating train_df
# Vectorised lookup: an image whose id appears in test_df is tagged 'test',
# everything else is tagged 'train'. test_df was sampled only from
# non-duplicate rows, so all rows flagged 'duplicates' end up in train.
# The str cast mirrors the per-row helper, which compared str(image_id).
data_pd['train_test_split'] = (
    data_pd['image_id']
    .astype(str)
    .isin(test_df['image_id'])
    .map({True: 'test', False: 'train'})
)
train_df = data_pd[data_pd['train_test_split'] == 'train']
train_df.head()

Unnamed: 0,lesion_id,image_id,dx,dx_type,age,sex,localization,dataset,is_duplicate,train_test_split
0,HAM_0000118,ISIC_0027419,bkl,histo,80.0,male,scalp,vidir_modern,duplicates,train
1,HAM_0000118,ISIC_0025030,bkl,histo,80.0,male,scalp,vidir_modern,duplicates,train
2,HAM_0002730,ISIC_0026769,bkl,histo,80.0,male,scalp,vidir_modern,duplicates,train
3,HAM_0002730,ISIC_0025661,bkl,histo,80.0,male,scalp,vidir_modern,duplicates,train
4,HAM_0001466,ISIC_0031633,bkl,histo,75.0,male,ear,vidir_modern,duplicates,train


In [10]:
# Preview the first five held-out rows.
test_df.head(n=5)

Unnamed: 0,lesion_id,image_id,dx,dx_type,age,sex,localization,dataset,is_duplicate
9206,HAM_0005250,ISIC_0029792,nv,histo,40.0,female,foot,rosendahl,no
6374,HAM_0000076,ISIC_0028605,nv,follow_up,30.0,female,trunk,vidir_molemax,no
989,HAM_0000613,ISIC_0031253,bkl,consensus,60.0,male,hand,vidir_molemax,no
4557,HAM_0001261,ISIC_0025332,nv,follow_up,55.0,female,lower extremity,vidir_molemax,no
4367,HAM_0002328,ISIC_0026896,nv,follow_up,40.0,female,back,vidir_molemax,no


In [11]:
# Row counts of the two splits as a (train, test) tuple.
train_df.shape[0], test_df.shape[0]  # (9187, 828)

(9187, 828)

In [12]:
# Image id of train and test images
train_list = list(train_df['image_id'])
test_list = list(test_df['image_id'])

# len(train_list),len(test_list)#(9187, 828)

# Set the image_id as the index in data_pd so labels can be looked up with
# data_pd.loc[image_id, 'dx']. The guard makes this cell safe to re-run:
# once image_id is the index it is no longer a column, and calling
# set_index('image_id') a second time would raise KeyError.
if data_pd.index.name != 'image_id':
    data_pd.set_index('image_id', inplace=True)

In [13]:
#create store
# Output folders for the two splits, both placed in the current working directory.
train_dir, test_dir = (
    os.path.join(os.getcwd(), folder_name)
    for folder_name in ('train_dir', 'test_dir')
)

In [14]:
# Create the split root folders. exist_ok=True keeps the cell re-runnable:
# plain os.mkdir raises FileExistsError when the folder is already there
# (e.g. after Restart & Run All on a previously populated working directory).
os.makedirs(train_dir, exist_ok=True)
os.makedirs(test_dir, exist_ok=True)

In [15]:
# Image id of train and test images
# Plain Python lists of the image ids in each split (used by the copy loops).
train_list = train_df['image_id'].tolist()
test_list = test_df['image_id'].tolist()

In [16]:
# Class labels; one sub-folder per label is created under each split folder.
# NOTE(review): presumably this covers every value of the `dx` column, since
# images are copied into <split>/<dx>/ - confirm against the metadata.
targetnames = [
    'akiec',
    'bcc',
    'bkl',
    'df',
    'mel',
    'nv',
    'vasc',
]

In [17]:
# Create one sub-folder per class label under both split folders.
# os.path.join builds the path portably instead of concatenating with '/',
# and makedirs(exist_ok=True) lets the cell be re-run without raising
# FileExistsError (it also creates the split folder itself if it is missing).
for i in targetnames:
    for split_root in (train_dir, test_dir):
        os.makedirs(os.path.join(split_root, i), exist_ok=True)

In [18]:
# Copy every training image from Images/ into train_dir/<dx label>/.
for image in train_list:
    file_name = image + '.jpg'
    copy(
        os.path.join('Images', file_name),  # source image
        # destination: class sub-folder named after the image's `dx` value
        os.path.join(train_dir, data_pd.loc[image, 'dx'], file_name),
    )

In [19]:
# Copy every test image from Images/ into test_dir/<dx label>/.
for image in test_list:
    file_name = image + '.jpg'
    copy(
        os.path.join('Images', file_name),  # source image
        # destination: class sub-folder named after the image's `dx` value
        os.path.join(test_dir, data_pd.loc[image, 'dx'], file_name),
    )