In [1]:
import os
import cv2
import numpy as np

from PIL import Image
from pathlib import Path

import torch
from torch.utils.data import random_split
from torchvision.datasets import ImageFolder

In [2]:
# Root folder of the raw dataset.
datasetPath = 'data/crudo'

# Fraction of the samples assigned to the training set; the rest go to testing.
trainPercentage = 0.8

# Destination folders, one per variant of the dataset:
# raw copies, normalized histograms and bilateral-filtered images.
rawOutputDir = 'data/pp/raw'
histOutputDir = 'data/pp/hist'
bfOutputDir = 'data/pp/bf'

### Misc functions used to validate and save the split of the raw data

In [3]:
# Function used to validate the distribution of the split.
def validateSplit(trainSet, testSet, classNames):
    """Print how many samples of each class ended up in each split.

    Args:
        trainSet: Sized iterable with the training samples; element 1 of each
            sample is used as the integer class index.
        testSet: Same as trainSet, for the testing samples.
        classNames: Sequence of class names, indexed by class label.

    Returns:
        None. The per-class counts and percentages are printed.
    """
    classAmount = len(classNames)

    # Tally the samples per class: training set first, testing set second.
    tallies = []
    for subset in (trainSet, testSet):
        tally = np.array([0] * classAmount)
        for sample in subset:
            tally[sample[1]] += 1
        tallies.append(tally)

    # Express every tally as a percentage of the size of its own set.
    sizes = (len(trainSet), len(testSet))
    shares = [tally / size * 100 for tally, size in zip(tallies, sizes)]

    # Report both sets using the same layout.
    titles = ('Training set class distribution:', 'Testing set class distribution:')
    for title, tally, share in zip(titles, tallies, shares):
        print(title)
        for name, count, percent in zip(classNames, tally, share):
            print('\t{0}: {1} ({2:2.2f}%)'.format(name, count, percent))

# Function used to save a split based on a type (training or testing).
def saveSplit(dataset, classNames, setType, path):
    """Save every image of a split into one folder per class.

    Images are written as '<path>/<setType>/<className>/<n>.png', where n is a
    per-class counter starting at 0. Files already present in those folders
    are overwritten when the names collide and are otherwise left in place.

    Args:
        dataset: Sized iterable of (image, label) tuples. Each image must
            provide a save(path) method (e.g. a PIL image) and each label is
            the integer index of its class in classNames.
        classNames: Sequence with the name of every class.
        setType: Name of the split sub-folder (e.g. 'train' or 'test').
        path: Output root; a relative path is resolved against the current
            working directory.

    Returns:
        List with the directory of each class, in classNames order.
    """
    # Get the dir for each class.
    dirs = [os.path.join(os.getcwd(), path, setType, className) for className in classNames]

    # Make the dirs.
    for _dir in dirs:
        Path(_dir).mkdir(parents=True, exist_ok=True)

    # One file-name counter per class (sized from classNames, not hard-coded).
    classCounter = [0] * len(classNames)

    # For printing the progress: next quarter milestone still to be reported.
    totalSamples = len(dataset)
    nextPercent = 25

    # Each set can be iterated getting tuples of (PIL image, label).
    for samplesProcessed, (image, label) in enumerate(dataset, start=1):
        # Save the image in the corresponding dir.
        image.save(os.path.join(dirs[label], str(classCounter[label]) + '.png'))
        # Increment the counter.
        classCounter[label] += 1

        # Progress printing. Comparing cross-products avoids dividing by a
        # zero step on tiny sets and never reports more than 100%.
        while nextPercent <= 100 and samplesProcessed * 100 >= nextPercent * totalSamples:
            print('\t' + str(nextPercent) + '% saved.')
            nextPercent += 25

    return dirs

# Function used to save the splits.
def saveTrainTestSplit(trainSet, testSet, classNames, path):
    """Save the training and the testing split under the same output root.

    Args:
        trainSet: Samples of the training split, forwarded to saveSplit.
        testSet: Samples of the testing split, forwarded to saveSplit.
        classNames: Sequence with the name of every class.
        path: Output root shared by both splits.

    Returns:
        Tuple (train dirs, test dirs) with what saveSplit returned for each
        split.
    """
    # (banner, samples, sub-folder) of every split, in saving order.
    plan = (
        ('Saving training set', trainSet, 'train'),
        ('Saving testing set', testSet, 'test'),
    )

    savedDirs = []
    for position, (banner, subset, setType) in enumerate(plan):
        # Blank line between the progress reports of consecutive splits.
        if position > 0:
            print()
        print(banner)
        savedDirs.append(saveSplit(subset, classNames, setType, path))

    trainDirs, testDirs = savedDirs
    return trainDirs, testDirs

### Histograms and bilateral filter
Functions used to create one version of the dataset based on histograms and another one based on a bilateral filter.

In [4]:
def createSaveHistogram(imgPath, outPath):
    """Save the normalized grayscale histogram of an image as a .npy file.

    Args:
        imgPath: Path of the image to read.
        outPath: Output path. Its extension, whatever its length, is dropped
            and np.save appends '.npy' to the result.

    Raises:
        OSError: If the image cannot be read.
    """
    # Open the image. cv2.imread signals failure by returning None.
    img = cv2.imread(imgPath, cv2.IMREAD_GRAYSCALE)
    if img is None:
        raise OSError('Could not read image: ' + str(imgPath))

    # Get the histogram (256 bins over the range 0..256).
    histogram, _ = np.histogram(img.ravel(), 256, [0, 256])
    # Normalize the histogram so that its bins sum to 1.
    histogram = histogram / np.sum(histogram)

    # Delete the file extension without assuming it is 4 characters long.
    outPath = os.path.splitext(outPath)[0]

    # Save the numpy array.
    np.save(outPath, histogram)

def createSaveBilateralFilter(imgPath, outPath, diameter=9, sigmaColor=75, sigmaSpace=75):
    """Apply a bilateral filter to a grayscale image and save the result.

    Args:
        imgPath: Path of the image to read.
        outPath: Path where the filtered image is written.
        diameter: Value passed as the second argument of cv2.bilateralFilter.
        sigmaColor: Value passed as the third argument of cv2.bilateralFilter.
        sigmaSpace: Value passed as the fourth argument of cv2.bilateralFilter.

    Raises:
        OSError: If the image cannot be read or the result cannot be written.
    """
    # cv2.imread signals failure by returning None instead of raising.
    img = cv2.imread(imgPath, cv2.IMREAD_GRAYSCALE)
    if img is None:
        raise OSError('Could not read image: ' + str(imgPath))

    bilateralFilter = cv2.bilateralFilter(img, diameter, sigmaColor, sigmaSpace)

    # Save the new image. cv2.imwrite reports failure through its return
    # value, so check it instead of silently losing the output.
    if not cv2.imwrite(outPath, bilateralFilter):
        raise OSError('Could not write image: ' + str(outPath))

def makePreprocessing(inputDir, classNames, setType, path, preprocessingFunction):
    """Run a preprocessing function over every file of every class folder.

    For each class i, every regular file found in inputDir[i] is passed to
    preprocessingFunction together with an output path of the same file name
    inside '<path>/<setType>/<classNames[i]>'.

    Args:
        inputDir: Sequence with the input folder of each class, in the same
            order as classNames.
        classNames: Sequence with the name of every class.
        setType: Name of the split sub-folder (e.g. 'train' or 'test').
        path: Output root; a relative path is resolved against the current
            working directory.
        preprocessingFunction: Callable invoked as
            preprocessingFunction(inPath, outPath) for each file.
    """
    # Get the dir for each class.
    dirs = [os.path.join(os.getcwd(), path, setType, className) for className in classNames]

    # Make the dirs.
    for _dir in dirs:
        Path(_dir).mkdir(parents=True, exist_ok=True)

    # Process each folder.
    for i in range(0, len(inputDir)):

        # Get all files from directory (sorted so the processing order is stable).
        files = sorted(
            f for f in os.listdir(inputDir[i]) if os.path.isfile(os.path.join(inputDir[i], f))
        )
        print('\tClass: ' + classNames[i])

        # For progress printing: next quarter milestone still to be reported.
        totalFiles = len(files)
        nextPercent = 25
        print('\t\t', end='')

        if totalFiles == 0:
            # Nothing to process: close the progress line so that the next
            # message starts on a fresh line.
            print('no files found.')
            continue

        # Process all files in the dir.
        for samplesProcessed, fileName in enumerate(files, start=1):
            # Create the in path
            inPath = os.path.join(inputDir[i], fileName)

            # Create the output path
            outPath = os.path.join(dirs[i], fileName)

            # Do the preprocessing.
            preprocessingFunction(inPath, outPath)

            # Progress printing. Comparing cross-products avoids dividing by a
            # zero step on small folders, and every milestone is printed
            # exactly once, so '100% saved.' always ends the line.
            while nextPercent <= 100 and samplesProcessed * 100 >= nextPercent * totalFiles:
                if nextPercent == 100:
                    print(str(nextPercent) + '% saved.')
                else:
                    print(str(nextPercent) + '% saved, ', end='')
                nextPercent += 25

### Split data
Function that loads the raw data, splits it into a training set and a testing set, and saves both so that every experiment uses the same sets.

In [5]:
def createSplits(datasetPath, trainPercentage, seed=None):
    """Split the raw dataset and save its raw, histogram and filtered variants.

    The dataset is loaded with ImageFolder, randomly split into training and
    testing sets, and saved under the module-level rawOutputDir. The saved raw
    images are then turned into histograms (histOutputDir) and
    bilateral-filtered images (bfOutputDir).

    Args:
        datasetPath: Root folder of the raw dataset, passed to ImageFolder.
        trainPercentage: Fraction of the samples used for training.
        seed: Optional integer seed for the random split. With the default
            (None) the split uses torch's global random state, so it changes
            from run to run; pass a fixed value to reproduce the same split.
    """
    print('/* Making the split *\\')

    # Get the raw dataset.
    dataset = ImageFolder(datasetPath)
    classNames = dataset.classes

    # Get the size of the training set and the testing set.
    trainLen = int(len(dataset) * trainPercentage)
    testLen  = len(dataset) - trainLen

    print('Len of the dataset:', len(dataset))
    print('\tTraining set: {0} ({1:2.2f}%)'.format(trainLen, trainLen / len(dataset) * 100))
    print('\tTesting set: {0} ({1:2.2f}%)'.format(testLen,  testLen  / len(dataset) * 100))
    print()

    # Random split the general dataset, with a dedicated generator when a
    # seed is given so the result does not depend on the global random state.
    if seed is None:
        trainSet, testSet = random_split(dataset, [trainLen, testLen])
    else:
        generator = torch.Generator().manual_seed(seed)
        trainSet, testSet = random_split(dataset, [trainLen, testLen], generator=generator)

    # For the raw dataset.
    # Validate the raw splits.
    validateSplit(trainSet, testSet, classNames)
    print()

    # Save the raw splits.
    print('/* Saving the raw splits *\\')
    (rawTrainDir, rawTestDir) = saveTrainTestSplit(trainSet, testSet, classNames, rawOutputDir)
    print()

    # For the histograms.
    print('/* Making and saving the histograms *\\')
    print('Histograms for training')
    makePreprocessing(rawTrainDir, classNames, 'train', histOutputDir, createSaveHistogram)
    print('Histograms for testing')
    makePreprocessing(rawTestDir,  classNames, 'test',  histOutputDir, createSaveHistogram)
    print()

    # For the bilateral filter images.
    print('/* Making and saving the bilateral filter images *\\')
    print('BF images for training')
    makePreprocessing(rawTrainDir, classNames, 'train', bfOutputDir, createSaveBilateralFilter)
    print('BF images for testing')
    makePreprocessing(rawTestDir,  classNames, 'test',  bfOutputDir, createSaveBilateralFilter)

In [6]:
# Build and save every variant of the dataset (raw, histograms and bilateral filter).
createSplits(datasetPath=datasetPath, trainPercentage=trainPercentage)

/* Making the split *\
Len of the dataset: 21165
	Training set: 16932 (80.00%)
	Testing set: 4233 (20.00%)

Training set class distribution:
	COVID: 2917 (17.23%)
	Lung_Opacity: 4831 (28.53%)
	Normal: 8111 (47.90%)
	Viral Pneumonia: 1073 (6.34%)
Testing set class distribution:
	COVID: 699 (16.51%)
	Lung_Opacity: 1181 (27.90%)
	Normal: 2081 (49.16%)
	Viral Pneumonia: 272 (6.43%)

/* Saving the raw splits *\
Saving training set
	25% saved.
	50% saved.
	75% saved.
	100% saved.

Saving testing set
	25% saved.
	50% saved.
	75% saved.
	100% saved.

/* Making and saving the histograms *\
Histograms for training
	Class: COVID
		25% saved, 50% saved, 75% saved, 100% saved.
	Class: Lung_Opacity
		25% saved, 50% saved, 75% saved, 100% saved.
	Class: Normal
		25% saved, 50% saved, 75% saved, 100% saved.
	Class: Viral Pneumonia
		25% saved, 50% saved, 75% saved, 100% saved.
Histograms for testing
	Class: COVID
		25% saved, 50% saved, 75% saved, 100% saved.
	Class: Lung_Opacity
		25% saved, 50% save