### Required imports

In [None]:
import os
import cv2
import numpy as np

from PIL import Image
from pathlib import Path

import torch
from torch.utils.data import random_split
from torchvision.datasets import ImageFolder

### Misc functions used to validate and save the split of the raw data

In [None]:
# Helper that counts how many samples of each class a set contains.
def _countClassSamples(dataset, classAmount):
    counts = np.zeros(classAmount, dtype=int)
    # Position 1 of every sample is used as the class index.
    for sample in dataset:
        counts[sample[1]] += 1
    return counts

# Helper that converts per-class counts into percentages of the set size.
def _classPercentages(counts, setSize):
    # An empty set has no distribution: report 0% instead of dividing by
    # zero (which would print "nan%" and emit a numpy RuntimeWarning).
    if setSize == 0:
        return np.zeros(len(counts))
    return counts / setSize * 100

# Function used to validate the distribution of the split.
def validateSplit(noLabelSet, labelTrainingSet, labelTestingSet, classNames):
    """Print a table with the class distribution of the three sets.

    For every class the table shows, per set, the number of samples and the
    percentage of that set they represent. An empty set is reported as 0%
    for every class.

    Args:
        noLabelSet: Sized iterable of samples; ``sample[1]`` is the class index.
        labelTrainingSet: Same layout, labelled training samples.
        labelTestingSet: Same layout, labelled testing samples.
        classNames: Sequence of class names, indexed by class index.
    """
    classAmount = len(classNames)
    datasets = (noLabelSet, labelTrainingSet, labelTestingSet)

    # Amount of samples of each class, one array per set.
    noLabelDist, labelTrainingDist, labelTestingDist = (
        _countClassSamples(dataset, classAmount) for dataset in datasets
    )

    # Distribution vectors (percentage of each class inside its own set).
    noLabelProb       = _classPercentages(noLabelDist, len(noLabelSet))
    labelTrainingProb = _classPercentages(labelTrainingDist, len(labelTrainingSet))
    labelTestingProb  = _classPercentages(labelTestingDist, len(labelTestingSet))

    # Print the distributions.
    tableSep = "+{0}+{1}+{1}+{1}+".format('-' * 52, '-' * 14)
    tableCell = "{0:>4} ({1:2.2f}%)"
    tableLine = "| {0:>50} | {1:<12} | {2:<12} | {3:<12} |"

    print('Class distribution of each set')
    print(tableSep)
    print(tableLine.format('Class name', 'No label', 'Training', 'Testing'))
    print(tableSep)
    for i, className in enumerate(classNames):
        noLabelStr       = tableCell.format(noLabelDist[i], noLabelProb[i])
        labelTrainingStr = tableCell.format(labelTrainingDist[i], labelTrainingProb[i])
        labelTestingStr  = tableCell.format(labelTestingDist[i], labelTestingProb[i])
        print(tableLine.format(className, noLabelStr, labelTrainingStr, labelTestingStr))
    print(tableSep)

# Function used to save a split based on a type (training or testing).
def saveSplit(dataset, classNames, setType, path):
    """Save every image of a set to ``<cwd>/<path>/<setType>/<className>/``.

    Files are named ``<n>.png`` where ``n`` is a per-class counter starting
    at 0. A progress line is printed each time a new 25% milestone of the
    set has been saved.

    Args:
        dataset: Sized iterable of ``(image, label)`` tuples; ``image`` must
            provide ``save(filePath)`` (a PIL image) and ``label`` is an
            index into ``classNames``.
        classNames: Sequence of class names; one directory is made per class.
        setType: Name of the sub-directory of this set (e.g. ``'noLabel'``).
        path: Output directory, joined to the current working directory.
    """
    # Get the dir for each class.
    baseDir = os.path.join(os.getcwd(), path, setType)
    dirs = [os.path.join(baseDir, className) for className in classNames]

    # Make the dirs.
    for _dir in dirs:
        Path(_dir).mkdir(parents=True, exist_ok=True)

    # Per-class counters used to name the files.
    classCounter = [0] * len(classNames)

    # For printing the progress.
    totalSamples = len(dataset)
    lastReported = 0

    # Each set can be iterated getting tuples of (PIL image, label).
    for samplesProcessed, (image, label) in enumerate(dataset, start=1):
        # Save the image in the corresponding dir.
        image.save(os.path.join(dirs[label], str(classCounter[label]) + '.png'))
        # Increment the counter.
        classCounter[label] += 1

        # Progress printing: highest 25% milestone reached so far. Computing
        # it from the real fraction avoids a zero step on sets with fewer
        # than 4 samples and never reports more than 100%.
        milestone = (samplesProcessed * 100 // totalSamples) // 25 * 25
        if milestone > lastReported:
            print('\t' + str(milestone) + '% saved.')
            lastReported = milestone

# Function used to save the splits.
def saveSplits(noLabelSet, labelTrainingSet, labelTestingSet, classNames, path):
    """Save the three sets under ``path``, one sub-directory per set.

    The sets are written, in order, to the ``noLabel``, ``labelTrain`` and
    ``labelTest`` sub-directories by calling ``saveSplit`` for each one.

    Args:
        noLabelSet: Set saved under ``noLabel``.
        labelTrainingSet: Set saved under ``labelTrain``.
        labelTestingSet: Set saved under ``labelTest``.
        classNames: Sequence of class names, forwarded to ``saveSplit``.
        path: Output directory, forwarded to ``saveSplit``.
    """
    splits = (
        ('Saving no labels set', noLabelSet, 'noLabel'),
        ('Saving labels training set', labelTrainingSet, 'labelTrain'),
        ('Saving labels testing set', labelTestingSet, 'labelTest'),
    )

    for index, (message, dataset, setType) in enumerate(splits):
        # Blank line between consecutive sets (none after the last one).
        if index > 0:
            print()
        print(message)
        # saveSplit's result is not kept; it is called only for its side effects.
        saveSplit(dataset, classNames, setType, path)

### Split data
Function that loads the raw data, separates it into a no-label set, a labelled training set and a labelled testing set, and saves them to disk so that the same sets can be reused across all the runs.

In [None]:
def createSplits(datasetPath, noLabelPercentage, labelTrainingPercentage, labelTestingPercentage, outputPath):
    """Randomly split the raw dataset into three sets and save them.

    The dataset is read with ``ImageFolder``, split with ``random_split``,
    its class distribution is printed and the resulting sets are saved.

    Args:
        datasetPath: Directory of the raw dataset (one sub-directory per class).
        noLabelPercentage: Fraction in [0, 1] for the no label set.
        labelTrainingPercentage: Fraction in [0, 1] for the labelled training set.
        labelTestingPercentage: Fraction in [0, 1] for the labelled testing set.
        outputPath: Directory where the sets are saved.

    Raises:
        ValueError: If a fraction is outside [0, 1] or the three fractions
            do not add up to 1.
    """
    # Validate the fractions before touching the data: an inconsistent
    # request would otherwise produce a negative testing length and
    # wrongly sized sets.
    percentages = (noLabelPercentage, labelTrainingPercentage, labelTestingPercentage)
    if any(percentage < 0 or percentage > 1 for percentage in percentages):
        raise ValueError('Every split percentage must be between 0 and 1, got {0}'.format(percentages))
    if abs(sum(percentages) - 1.0) > 1e-6:
        raise ValueError('The split percentages must add up to 1, got {0}'.format(sum(percentages)))

    print('/* Making the split *\\')

    # Get the raw dataset.
    dataset = ImageFolder(datasetPath)
    classNames = dataset.classes
    datasetLen = len(dataset)

    # Get the size of the no label set, label set for training and label set for testing.
    # The testing set takes the remainder so the three sizes always add up
    # to the dataset size despite the truncation done by int().
    noLabelLen        = int(datasetLen * noLabelPercentage)
    labelTrainingLen  = int(datasetLen * labelTrainingPercentage)
    labelTestingLen   = datasetLen - noLabelLen - labelTrainingLen

    print('Len of the dataset:', datasetLen)
    print('Set distribution:')
    print('\t      No labels set: {0} ({1:2.2f}%)'.format(noLabelLen, noLabelLen / datasetLen * 100))
    print('\tLabels training set: {0} ({1:2.2f}%)'.format(labelTrainingLen, labelTrainingLen / datasetLen * 100))
    print('\t Labels testing set: {0} ({1:2.2f}%)'.format(labelTestingLen, labelTestingLen / datasetLen * 100))

    # Random split the general dataset.
    noLabelSet, labelTrainingSet, labelTestingSet = random_split(dataset, [noLabelLen, labelTrainingLen, labelTestingLen])

    # Validate the splits.
    print()
    validateSplit(noLabelSet, labelTrainingSet, labelTestingSet, classNames)
    print()

    # Save the raw splits.
    print('/* Saving the raw splits *\\')
    saveSplits(noLabelSet, labelTrainingSet, labelTestingSet, classNames, outputPath)

### First run
- Data without labels: 80%
- With labels (training): 10%
- With labels (test): 10%

In [None]:
# Input (raw dataset) and output (splits of this run) directories.
datasetPath = 'data/raw'
outputPath  = 'data/corrida1'

# Fraction of the dataset assigned to each set: 80% / 10% / 10%.
noLabelPercentage = 0.8
labelTrainingPercentage = 0.1
labelTestingPercentage = 0.1

# Build, validate and save the splits of the first run.
createSplits(
    datasetPath=datasetPath,
    noLabelPercentage=noLabelPercentage,
    labelTrainingPercentage=labelTrainingPercentage,
    labelTestingPercentage=labelTestingPercentage,
    outputPath=outputPath,
)

### Second run
- Data without labels: 50%
- With labels (training): 35%
- With labels (test): 15%

In [None]:
# Input (raw dataset) and output (splits of this run) directories.
datasetPath = 'data/raw'
outputPath  = 'data/corrida2'

# Fraction of the dataset assigned to each set: 50% / 35% / 15%.
noLabelPercentage = 0.5
labelTrainingPercentage = 0.35
labelTestingPercentage = 0.15

# Build, validate and save the splits of the second run.
createSplits(
    datasetPath=datasetPath,
    noLabelPercentage=noLabelPercentage,
    labelTrainingPercentage=labelTrainingPercentage,
    labelTestingPercentage=labelTestingPercentage,
    outputPath=outputPath,
)