In [1]:
import os
import numpy as np
from PIL import Image
from pathlib import Path

import torch
from torch.utils.data import random_split
from torchvision.datasets import ImageFolder

In [2]:
# Directory of the raw dataset; splitDataset() hands it to ImageFolder.
datasetPath = 'data/Crudo'

# Training fraction handed to splitDataset() as trainPercentage (0.8 = 80%).
trainPercentage = 0.8

# Root directory handed to saveTrainTestSplit(); saveSplit() creates
# <baseDir>/<setType>/<className>/ beneath it.
baseDir = 'data/raw'

### Misc functions

In [3]:
# Report how the classes are distributed in each split.
def validateSplit(trainSet, testSet, classNames):
    """Print the per-class sample count and percentage of both splits.

    Args:
        trainSet: Sized iterable of samples whose element at index 1 is the
            integer class label.
        testSet: Same layout as ``trainSet``.
        classNames: Sequence of class names; position ``i`` names label ``i``.

    Returns:
        None. The report is written to standard output.
    """
    numClasses = len(classNames)
    rowTemplate = '\t{0}: {1} ({2:2.2f}%)'

    def tally(subset):
        # One integer slot per class, incremented once per sample.
        histogram = np.zeros(numClasses, dtype=int)
        for entry in subset:
            histogram[entry[1]] += 1
        return histogram

    # Both splits are fully counted before any percentage is computed.
    trainCounts = tally(trainSet)
    testCounts = tally(testSet)

    # Share of each class inside its own split, expressed in percent.
    trainShare = trainCounts / len(trainSet) * 100
    testShare = testCounts / len(testSet) * 100

    print('Training set class distribution:')
    for name, count, share in zip(classNames, trainCounts, trainShare):
        print(rowTemplate.format(name, count, share))

    print('Testing set class distribution:')
    for name, count, share in zip(classNames, testCounts, testShare):
        print(rowTemplate.format(name, count, share))

# Function used to save a split based on a type (training or testing).
def saveSplit(dataset, classNames, setType, path):
    """Write every image of a split to disk, one sub-directory per class.

    Files are stored as ``<path>/<setType>/<className>/<k>.png`` where ``k``
    is a per-class counter starting at 0. ``path`` is joined onto the current
    working directory (an absolute ``path`` is used as is).

    Args:
        dataset: Sized iterable yielding ``(image, label)`` tuples. ``image``
            must expose ``save(path)`` (e.g. a PIL image) and ``label`` must
            be an index into ``classNames``.
        classNames: Sequence of class names; position ``i`` names label ``i``.
        setType: Name of the split sub-directory (e.g. 'train' or 'test').
        path: Output root directory.

    Returns:
        None. Progress is printed each time a new 25% milestone is reached.
    """
    # Get the dir for each class.
    dirs = [os.path.join(os.getcwd(), path, setType, className) for className in classNames]

    # Make the dirs.
    for _dir in dirs:
        Path(_dir).mkdir(parents=True, exist_ok=True)

    # One file-name counter per class (sized from classNames, not fixed at 4).
    classCounter = [0] * len(classNames)

    # For printing the progress.
    totalSamples = len(dataset)
    lastReported = 0

    # Each set can be iterated getting tuples of (PIL image, label).
    for samplesProcessed, (image, label) in enumerate(dataset, start=1):
        # Save the image in the corresponding dir.
        image.save(os.path.join(dirs[label], str(classCounter[label]) + '.png'))
        # Increment the counter.
        classCounter[label] += 1

        # Highest 25% milestone reached so far. Integer arithmetic keeps this
        # exact, never exceeds 100 and needs no division by a possibly-zero
        # step (totalSamples >= 1 whenever the loop body runs).
        reached = (samplesProcessed * 4 // totalSamples) * 25
        if reached > lastReported:
            print('\t' + str(reached) + '% saved.')
            lastReported = reached

# Persist both splits to disk, training first and testing second.
def saveTrainTestSplit(trainSet, testSet, classNames, path):
    """Save the training and testing splits through ``saveSplit``.

    Args:
        trainSet: Split stored under the 'train' sub-directory.
        testSet: Split stored under the 'test' sub-directory.
        classNames: Class names forwarded to ``saveSplit``.
        path: Output root forwarded to ``saveSplit``.
    """
    jobs = (
        ('Saving training set', trainSet, 'train'),
        ('Saving testing set', testSet, 'test'),
    )

    for position, (banner, subset, folder) in enumerate(jobs):
        # A blank line separates the two progress reports.
        if position > 0:
            print()
        print(banner)
        saveSplit(subset, classNames, folder, path)

### Split data
Function that loads the raw data, splits it into a training set and a testing set, and saves both, so that the same sets are used across all the experiments.

In [4]:
def splitDataset(datasetPath, trainPercentage, save):
    """Randomly split the raw dataset into training and testing sets.

    The split sizes and the per-class distribution of both sets are printed.
    When ``save`` is true the two sets are also written to disk under the
    module-level ``baseDir``.

    Args:
        datasetPath: Directory passed to ``ImageFolder`` to load the raw data.
        trainPercentage: Fraction of the samples assigned to the training
            set, between 0 and 1 (e.g. 0.8 for 80%).
        save: Whether to write the resulting splits to disk.

    Raises:
        ValueError: If ``trainPercentage`` is outside the range [0, 1].
    """
    # Reject bad fractions up front, before any data is loaded.
    if not 0 <= trainPercentage <= 1:
        raise ValueError(
            'trainPercentage must be between 0 and 1, got {0}'.format(trainPercentage)
        )

    # Get the raw dataset.
    dataset = ImageFolder(datasetPath)
    classNames = dataset.classes

    # Get the size of the training set and the testing set. The training size
    # honours the trainPercentage argument; the testing set takes the rest so
    # both sizes always add up to the dataset length.
    trainLen = int(len(dataset) * trainPercentage)
    testLen  = len(dataset) - trainLen

    print('Len of the dataset:', len(dataset))
    print('\tTraining set: {0} ({1:2.2f}%)'.format(trainLen, trainLen / len(dataset) * 100))
    print('\tTesting set: {0} ({1:2.2f}%)'.format(testLen,  testLen  / len(dataset) * 100))
    print()

    # Random split the general dataset.
    trainSet, testSet = random_split(dataset, [trainLen, testLen])

    # Validate the splits.
    validateSplit(trainSet, testSet, classNames)
    print()

    # Save the splits.
    if save:
        saveTrainTestSplit(trainSet, testSet, classNames, baseDir)

In [5]:
# Run the split with the configured dataset path and training fraction;
# True makes splitDataset() also save the resulting sets under baseDir.
splitDataset(datasetPath, trainPercentage, True)

Len of the dataset: 21165
	Training set: 16932 (80.00%)
	Testing set: 4233 (20.00%)

Training set class distribution:
	COVID: 2906 (17.16%)
	Lung_Opacity: 4795 (28.32%)
	Normal: 8158 (48.18%)
	Viral_Pneumonia: 1073 (6.34%)
Testing set class distribution:
	COVID: 710 (16.77%)
	Lung_Opacity: 1217 (28.75%)
	Normal: 2034 (48.05%)
	Viral_Pneumonia: 272 (6.43%)

Saving training set
	25% saved.
	50% saved.
	75% saved.
	100% saved.

Saving testing set
	25% saved.
	50% saved.
	75% saved.
	100% saved.
