In [1]:
import os 
import pandas as pd
import shutil

# Explore data

In [2]:
df = pd.read_csv('./data/HAM10000_metadata.csv')

In [3]:
df.head()

Unnamed: 0,lesion_id,image_id,dx,dx_type,age,sex,localization
0,HAM_0000118,ISIC_0027419,bkl,histo,80.0,male,scalp
1,HAM_0000118,ISIC_0025030,bkl,histo,80.0,male,scalp
2,HAM_0002730,ISIC_0026769,bkl,histo,80.0,male,scalp
3,HAM_0002730,ISIC_0025661,bkl,histo,80.0,male,scalp
4,HAM_0001466,ISIC_0031633,bkl,histo,75.0,male,ear


In [4]:
cls = list(df['dx'].unique())
cls

['bkl', 'nv', 'df', 'mel', 'vasc', 'bcc', 'akiec']

In [5]:
for cl in cls:
    l = df[df['dx']==cl].shape[0]
    print(f'{cl}: {l}')

bkl: 1099
nv: 6705
df: 115
mel: 1113
vasc: 142
bcc: 514
akiec: 327


# Create directory of original files

In [6]:
os.makedirs('./classes', exist_ok=True)

In [7]:
for cl in cls:
    os.makedirs(f'./classes/{cl}', exist_ok=True)

In [10]:
df_2 = df.set_index('image_id')
for img in os.listdir('./data/HAM10000_images_part_1'):
    cl = df_2.loc[img.rstrip('.jpg'), 'dx']
    shutil.copy(f'./data/HAM10000_images_part_1/{img}', f'./classes/{cl}/{img}')
for img in os.listdir('./data/HAM10000_images_part_2'):
    cl = df_2.loc[img.rstrip('.jpg'), 'dx']
    shutil.copy(f'./data/HAM10000_images_part_2/{img}', f'./classes/{cl}/{img}')

# balancing the classes

In [8]:
from sklearn.utils import resample
from sklearn.model_selection import train_test_split


In [19]:
X_train, X_test = train_test_split(df, train_size=0.8, stratify=df['dx'], random_state=76)

X_val, X_test = train_test_split(X_test, train_size=0.5, random_state=76, stratify=X_test['dx'])

In [20]:
print(X_train.shape)
print(X_val.shape)
print(X_test.shape)

(8012, 7)
(1001, 7)
(1002, 7)


In [25]:

S_train = pd.DataFrame(columns=df.columns)
S_val = pd.DataFrame(columns=df.columns)
for cl in cls:
    n_samples_train = 5364
    n_samples_val = 670
    frame = X_train[X_train['dx']==cl]
    n_train = frame.shape[0]
    while n_samples_train > n_train:
        S_train = pd.concat([S_train, frame])
        n_samples_train -= n_train
    S_train = pd.concat([S_train, resample(frame, replace=False, n_samples=n_samples_train)])

    frame_val = X_val[X_val['dx']==cl]
    n_val = frame_val.shape[0]
    while n_samples_val > n_val:
        S_val = pd.concat([S_val, frame_val])
        n_samples_val -= n_val
    S_val = pd.concat([S_val, resample(frame_val, replace=False, n_samples=n_samples_val)])

In [21]:
for cl in cls:
    l = X_train[X_train['dx']==cl].shape[0]
    print(f'{cl}: {l}')

bkl: 879
nv: 5364
df: 92
mel: 890
vasc: 114
bcc: 411
akiec: 262


In [22]:
for cl in cls:
    l = X_val[X_val['dx']==cl].shape[0]
    print(f'{cl}: {l}')

bkl: 110
nv: 670
df: 12
mel: 111
vasc: 14
bcc: 51
akiec: 33


## Putting classes into directories

In [13]:
def create_directory(train_test, frame):
    left = []
    cls = df['dx'].unique()
    for cl in cls:
        shutil.rmtree(f'./sampled_classes/{train_test}/{cl}', ignore_errors=True)
        os.makedirs(f'./sampled_classes/{train_test}/{cl}', exist_ok=True)
    i = 0
    for img, cl in zip(frame['image_id'], frame['dx']):
        if img+'.jpg' in os.listdir('./data/HAM10000_images_part_1'):
            shutil.copy(f'./data/HAM10000_images_part_1/{img}.jpg', f'./sampled_classes/{train_test}/{cl}/{img}_{str(i)}.jpg')
        elif img+'.jpg' in os.listdir('./data/HAM10000_images_part_2'):
            shutil.copy(f'./data/HAM10000_images_part_2/{img}.jpg', f'./sampled_classes/{train_test}/{cl}/{img}_{str(i)}.jpg')
        else:
            left.append(img)
        i += 1
    if len(left) == 0:
        print('Everything alright')
    else:
        return left

In [26]:
shutil.rmtree('./sampled_classes')

os.makedirs('./sampled_classes/train', exist_ok=True)
os.makedirs('./sampled_classes/test', exist_ok=True)
os.makedirs('./sampled_classes/validation', exist_ok=True)

create_directory('train', S_train)
create_directory('validation', S_val)
create_directory('test', X_test)

Everything alright
Everything alright
Everything alright


# Data augmentation

In [19]:
import Augmentor

In [20]:
p = Augmentor.Pipeline('./classes/akiec')

Initialised with 327 image(s) found.
Output directory set to ./classes/akiec\output.

In [22]:
p.rotate(probability=1., max_left_rotation=180, max_right_rotation=180)

ValueError: The max_left_rotation argument must be between 0 and 25.