# Dermatologist AI

This artificial neural network classifies the following kinds of skin marks:
* **melanoma**: this term will be used for _malignant melanoma_ (https://en.wikipedia.org/wiki/Melanoma)
* **nevus**: it is a birthmark (https://en.wikipedia.org/wiki/Nevus)
* **seborrheic keratosis**: it is a benign skin tumour (https://en.wikipedia.org/wiki/Seborrheic_keratosis)

## Import the datasets

## Content of the dataset

In [27]:
# Class sub-folders looked up inside every dataset folder, and the dataset
# folders themselves (data/<dataset_name>/<class_name>/).
classes_names = ['melanoma', 'nevus', 'seborrheic_keratosis']
datasets_names = ['train', 'valid', 'test']

# files_count[class_name][dataset_name] -> number of entries matched by
# data/<dataset_name>/<class_name>/*. Driving the count from classes_names
# avoids one copy-pasted line per class.
files_count = {class_name: dict() for class_name in classes_names}

for dataset_name in datasets_names:
    for class_name in classes_names:
        # glob() already returns a list, so len() can be taken directly.
        files_count[class_name][dataset_name] = len(glob(f"data/{dataset_name}/{class_name}/*"))

# Per-class dictionaries, kept under their original names.
melanoma_files_count = files_count['melanoma']
nevus_files_count = files_count['nevus']
seborrheic_keratosis_files_count = files_count['seborrheic_keratosis']

# One row template per class; the padding lines the numbers up with the header.
row_templates = {
    'melanoma':             'Melanoma:             {}\t{}\t{}',
    'nevus':                'Nevus:                {}\t{}\t{}',
    'seborrheic_keratosis': 'Seborrheic keratosis: {}\t{}\t{}',
}

print('Number of images')
print('                      train\tvalid\ttest')
for class_name in classes_names:
    counts = files_count[class_name]
    print(row_templates[class_name].format(counts['train'], counts['valid'], counts['test']))
print('-----------------------------------------------')
# Column totals over all classes.
total_train = sum(counts['train'] for counts in files_count.values())
total_valid = sum(counts['valid'] for counts in files_count.values())
total_test = sum(counts['test'] for counts in files_count.values())
print('Total:                {}\t{}\t{}'.format(total_train, total_valid, total_test))


Number of images
                      train	valid	test
Melanoma:             374	30	117
Nevus:                1372	78	393
Seborrheic keratosis: 254	42	90
-----------------------------------------------
Total:                2000	150	600


## Load and transform the images

In [28]:
from torchvision import datasets
import torchvision.transforms as transforms

# Per-dataset transform pipelines. All three are currently the same:
# a Compose holding a single ToTensor step. Each dataset gets its own
# Compose instance so they can be changed independently later.
data_transforms = {
    split_name: transforms.Compose([transforms.ToTensor()])
    for split_name in ('train', 'valid', 'test')
}

# One ImageFolder per dataset, rooted at data/<dataset_name> and using
# the transform pipeline registered under the same name.
data = dict()
for dataset_name in datasets_names:
    dataset_root = f"data/{dataset_name}"
    dataset_transform = data_transforms[dataset_name]
    data[dataset_name] = datasets.ImageFolder(dataset_root, transform=dataset_transform)


## Create the samplers

In [36]:
from torch.utils.data.sampler import SubsetRandomSampler

# Fraction (between 0 and 1) of each dataset that the samplers draw from.
# Lowering the 'train' fraction allows training faster on CPU to check
# if the model converges fast enough.
subset_size = \
{
    'train': 1,
    'valid': 1,
    'test': 1
}

samplers = dict()

for dataset_name in datasets_names:
    fraction = subset_size[dataset_name]
    # A negative fraction would make the slice below silently drop samples
    # from the end, and a fraction above 1 would report a misleading
    # percentage, so reject both up front.
    if not 0 <= fraction <= 1:
        raise ValueError(f"subset_size[{dataset_name!r}] must be between 0 and 1, got {fraction}")

    samples_count = len(data[dataset_name])
    indices = list(range(samples_count))
    # Shuffle before slicing so that the retained subset is a random one.
    np.random.shuffle(indices)
    split = int(np.floor(fraction * samples_count))
    idx = indices[:split]
    samplers[dataset_name] = SubsetRandomSampler(idx)
    # ':g' prints 100 for a fraction of 1 and hides float noise such as
    # 7.000000000000001 for a fraction of 0.07.
    print(f"{dataset_name} on {len(idx)} samples out of {samples_count} ({fraction * 100:g}%)")


train on 2000 samples out of 2000 (100%)
valid on 150 samples out of 150 (100%)
test on 600 samples out of 600 (100%)


## Create the dataloaders

In [38]:
import torch

# The worker function needs to be in another python file to work with jupyter notebooks.
# See this thread: https://stackoverflow.com/questions/48915440/pandas-multiprocessing-cant-get-attribute-function-on-main
from worker import worker_init_fn

# Settings shared by every DataLoader created below.
num_workers = 6
classes_count = 3
batch_size = 32

# One DataLoader per dataset. Each draws its indices from the sampler
# registered under the same name and initialises its worker processes
# with worker_init_fn.
loaders = dict()
for dataset_name in datasets_names:
    loader_options = dict(
        batch_size=batch_size,
        num_workers=num_workers,
        sampler=samplers[dataset_name],
        worker_init_fn=worker_init_fn,
    )
    loaders[dataset_name] = torch.utils.data.DataLoader(data[dataset_name], **loader_options)
