In [1]:
import pandas as pd
import numpy as np
import os
import shutil
import matplotlib.pyplot as plt
import cv2

In [35]:
# Load the HAM10000 metadata table that every later cell works from.
metadata_csv = './data/HAM10000_metadata.csv'
df = pd.read_csv(metadata_csv)

In [36]:
# Peek at the first rows to check the column layout.
df.head(n=5)

Unnamed: 0,lesion_id,image_id,dx,dx_type,age,sex,localization
0,HAM_0000118,ISIC_0027419,bkl,histo,80.0,male,scalp
1,HAM_0000118,ISIC_0025030,bkl,histo,80.0,male,scalp
2,HAM_0002730,ISIC_0026769,bkl,histo,80.0,male,scalp
3,HAM_0002730,ISIC_0025661,bkl,histo,80.0,male,scalp
4,HAM_0001466,ISIC_0031633,bkl,histo,75.0,male,ear


In [37]:
# Distinct values of the dx_type column, in order of first appearance.
pd.unique(df['dx_type'])

array(['histo', 'consensus', 'confocal', 'follow_up'], dtype=object)

In [70]:
# Diagnosis labels in order of first appearance; `dx` is reused by the
# per-class loops further down the notebook.
dx = pd.unique(df['dx'])
dx

array(['bkl', 'nv', 'df', 'mel', 'vasc', 'bcc', 'akiec'], dtype=object)

In [39]:
# Distinct values of the localization column, in order of first appearance.
pd.unique(df['localization'])

array(['scalp', 'ear', 'face', 'back', 'trunk', 'chest',
       'upper extremity', 'abdomen', 'unknown', 'lower extremity',
       'genital', 'neck', 'hand', 'foot', 'acral'], dtype=object)

In [38]:
# Sort every raw image into ./classes/<dx>/ according to the metadata table.
list_img = os.listdir('./data/HAM10000_images_part_1')
list_img_2 = os.listdir('./data/HAM10000_images_part_2')

# Look the diagnosis up by image_id explicitly. `df` carries a default integer
# index, so `df.loc[<image id>, 'dx']` would raise KeyError.
dx_by_image = dict(zip(df['image_id'], df['dx']))

# shutil.copy does not create missing directories, so make one per class first.
for cls in df['dx'].unique():
    os.makedirs(f'./classes/{cls}', exist_ok=True)

image_sources = (
    ('./data/HAM10000_images_part_1', list_img),
    ('./data/HAM10000_images_part_2', list_img_2),
)
for source_dir, images in image_sources:
    for img in images:
        # splitext removes only the extension; str.rstrip('.jpg') strips any
        # trailing '.', 'j', 'p' or 'g' characters and can eat part of the id.
        image_id = os.path.splitext(img)[0]
        cls = dx_by_image[image_id]
        shutil.copy(f'{source_dir}/{img}', f'./classes/{cls}/{img}')

In [45]:
def show_image(string):
    """Draw the image whose id is ``string`` with matplotlib.

    The id is looked up (with a '.jpg' suffix) first in ``list_img`` and then
    in ``list_img_2``; the file is read with OpenCV and converted from BGR to
    RGB before being passed to ``plt.imshow``.

    Returns the message 'string not in library' when the id is in neither
    listing; otherwise returns None after drawing.
    """
    filename = string + '.jpg'
    if filename in list_img:
        path = f'data/HAM10000_images_part_1/{string}.jpg'
    elif filename in list_img_2:
        path = f'data/HAM10000_images_part_2/{string}.jpg'
    else:
        return 'string not in library'

    # OpenCV loads pixels as BGR; matplotlib expects RGB.
    bgr_pixels = cv2.imread(path, cv2.IMREAD_COLOR)
    plt.imshow(cv2.cvtColor(bgr_pixels, cv2.COLOR_BGR2RGB))

In [83]:
# Class imbalance: how many 'nv' rows exist for each row of every class.
nv_rows = int((df['dx'] == 'nv').sum())
for label in dx:
    label_rows = int((df['dx'] == label).sum())
    imbalance = np.round(nv_rows / label_rows, 2)
    print(f'{label}: {imbalance}')

bkl: 6.1
nv: 1.0
df: 58.3
mel: 6.02
vasc: 47.22
bcc: 13.04
akiec: 20.5


Train/validation/test split and per-class image directories

In [151]:
from sklearn.utils import resample


In [204]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the metadata rows as the test set, keeping the per-class
# proportions of `dx` the same in both parts (stratified split, fixed seed).
X_train, X_test = train_test_split(
    df,
    train_size=0.8,
    random_state=76,
    stratify=df['dx'],
)

In [193]:
# os.mkdir('./sampled_classes/train')
# os.mkdir('./sampled_classes/test')
# os.mkdir('./sampled_classes/validation')

In [205]:
# Balance the classes by oversampling, without leaking images between the
# training and validation sets.
#
# The rows are split into train / validation FIRST and each part is oversampled
# on its own. Oversampling with replacement before splitting would put copies
# of the same image on both sides of the split and inflate validation scores.
SAMPLES_PER_CLASS = 500   # rows per class across train + validation
TRAIN_FRACTION = 0.75     # share of the non-test rows used for training
SEED = 76                 # fixed so the sampled sets are reproducible

n_train_per_class = int(SAMPLES_PER_CLASS * TRAIN_FRACTION)   # 375
n_val_per_class = SAMPLES_PER_CLASS - n_train_per_class       # 125


def _oversample(frame, n_samples):
    """Return a shuffled frame with `n_samples` rows per class of `dx`, drawn with replacement."""
    per_class = [
        resample(frame[frame['dx'] == d], replace=True, n_samples=n_samples, random_state=SEED)
        for d in dx
    ]
    # Single concat instead of growing a frame inside the loop.
    return pd.concat(per_class).sample(frac=1, random_state=SEED)


# Rebuild the non-test pool from `df` and `X_test` instead of reading `X_train`:
# this cell overwrites `X_train`, so reading it would make a second run of the
# cell resample already-resampled rows.
train_pool = df.drop(index=X_test.index)
train_rows, val_rows = train_test_split(
    train_pool,
    train_size=TRAIN_FRACTION,
    random_state=SEED,
    stratify=train_pool['dx'],
)

X_train = _oversample(train_rows, n_train_per_class)
X_val = _oversample(val_rows, n_val_per_class)

# Kept for any later cell that still expects the combined balanced frame.
df_sampled = pd.concat([X_train, X_val])

In [195]:
def create_directory(train_test, frame):
    """Fill ``./sampled_classes/<train_test>/<class>/`` with the images listed in ``frame``.

    One folder per entry of the module-level ``dx`` is wiped and recreated.
    Every row of ``frame`` is then copied from whichever HAM10000 image folder
    lists it. The row position is appended to the target file name
    (``<image_id>_<i>.jpg``) so that rows repeating an image_id do not
    overwrite each other.

    Parameters
    ----------
    train_test : str
        Name of the split sub-directory, e.g. 'train' or 'validation'.
    frame : pandas.DataFrame
        Must provide 'image_id' and 'dx' columns.

    Returns
    -------
    str or list
        'Everything alright' when every image was found, otherwise the list of
        image ids that are in neither ``list_img`` nor ``list_img_2``.
    """
    for cls in dx:
        class_dir = f'./sampled_classes/{train_test}/{cls}'
        shutil.rmtree(class_dir, ignore_errors=True)
        # makedirs also creates a missing ./sampled_classes/<train_test> parent,
        # which plain os.mkdir would fail on.
        os.makedirs(class_dir, exist_ok=True)

    # Set membership is O(1); the directory listings are plain lists.
    part_1_files = set(list_img)
    part_2_files = set(list_img_2)

    left = []
    # `i` counts every row, including rows whose image is missing.
    for i, (img, cls) in enumerate(zip(frame['image_id'], frame['dx'])):
        filename = f'{img}.jpg'
        if filename in part_1_files:
            source_dir = './data/HAM10000_images_part_1'
        elif filename in part_2_files:
            source_dir = './data/HAM10000_images_part_2'
        else:
            left.append(img)
            continue
        shutil.copy(f'{source_dir}/{filename}', f'./sampled_classes/{train_test}/{cls}/{img}_{i}.jpg')

    return left if left else 'Everything alright'

In [206]:
# Materialise the balanced train / validation sets on disk.
train_missing = create_directory('train', X_train)
val_missing = create_directory('validation', X_val)
# create_directory('test', X_test)

# A cell only displays its last expression, so a list of missing training
# images would otherwise be dropped silently; check it explicitly.
assert train_missing == 'Everything alright', f'train images not found: {train_missing}'
val_missing

'Everything alright'

In [207]:
# Number of files written per class for the training split.
for label in dx:
    n_files = len(os.listdir(f'./sampled_classes/train/{label}'))
    print(label, n_files)

bkl 375
nv 375
df 375
mel 375
vasc 375
bcc 375
akiec 375


In [208]:
# Number of files written per class for the validation split.
for label in dx:
    n_files = len(os.listdir(f'./sampled_classes/validation/{label}'))
    print(label, n_files)

bkl 125
nv 125
df 125
mel 125
vasc 125
bcc 125
akiec 125


In [175]:
# Image ids as recorded in the metadata table.
lst_df = df['image_id'].tolist()

In [180]:
# Image ids as found on disk: file names from both image folders without their extension.
# os.path.splitext removes only the extension; str.rstrip('.jpg') strips any
# trailing '.', 'j', 'p' or 'g' characters and can eat part of the id itself.
total_list = [os.path.splitext(name)[0] for name in list_img + list_img_2]

In [182]:
# set_1: ids from the metadata table; set_2: ids from the files on disk.
set_1, set_2 = set(lst_df), set(total_list)

In [183]:
# Ids present in the metadata but without an image file.
set_1 - set_2

set()

In [184]:
# Image files that have no row in the metadata.
set_2 - set_1

set()

In [185]:
# True when metadata ids and image files match exactly (each set contains the other).
set_1.issubset(set_2) and set_2.issubset(set_1)

True