In [1]:
import torch
import torchvision
import os

In [4]:
# RGB palette used by the segmentation masks, one [R, G, B] entry per class
# index (21 entries). Presumably aligned index-for-index with VOC_CLASSES.
VOC_COLORS = [
    [0, 0, 0], [128, 0, 0], [0, 128, 0],
    [128, 128, 0], [0, 0, 128], [128, 0, 128],
    [0, 128, 128], [128, 128, 128], [64, 0, 0],
    [192, 0, 0], [64, 128, 0], [192, 128, 0],
    [64, 0, 128], [192, 0, 128], [64, 128, 128],
    [192, 128, 128], [0, 64, 0], [128, 64, 0],
    [0, 192, 0], [128, 192, 0], [0, 64, 128],
]

# Human-readable class names (21 entries).
VOC_CLASSES = [
    'background',
    'aeroplane', 'bicycle', 'bird', 'boat', 'bottle',
    'bus', 'car', 'cat', 'chair', 'cow',
    'diningtable', 'dog', 'horse', 'motorbike', 'person',
    'potted plant', 'sheep', 'sofa', 'train', 'tv/monitor',
]

# Lookup table: (R, G, B) tuple -> class index (position in VOC_COLORS).
COLORMAP = {tuple(rgb): class_idx for class_idx, rgb in enumerate(VOC_COLORS)}
# One extra colour is folded into index 0.
# NOTE(review): (224, 224, 192) is presumably the VOC boundary/"void" colour;
# mapping it to 0 means those pixels are counted as background -- confirm
# that this is intended.
COLORMAP[(224, 224, 192)] = 0
COLORMAP


def label_preprocessing(label: torch.Tensor, colormap=None):
    """Convert a colour-coded segmentation mask into a map of class indices.

    Each distinct colour in the mask is looked up once and the result is
    scattered back to every pixel of that colour, instead of running a
    Python-level loop (dict lookup + scalar tensor write) for every pixel.

    Args:
        label: Tensor laid out as (channels, height, width). The channel
            values of each pixel, taken as a tuple of Python numbers, must
            be a key of ``colormap``.
        colormap: Optional mapping from colour tuple to class index.
            When omitted, the notebook-level ``COLORMAP`` is used.

    Returns:
        A CPU tensor of shape (height, width) holding the class index of
        every pixel. The dtype is torch's default floating dtype, i.e. the
        same dtype a plain ``torch.zeros(...)`` call produces.

    Raises:
        KeyError: If the mask contains a colour that is not in ``colormap``.
    """
    if colormap is None:
        colormap = COLORMAP

    # One row per pixel (row-major over height, width), one column per channel.
    pixels = label.permute(1, 2, 0).flatten(0, 1)
    if pixels.shape[0] == 0:
        # No pixels to classify: return an empty map of the right shape.
        return torch.zeros(label.shape[1:])

    # distinct_colors[k] is the k-th unique colour; inverse[p] is the index
    # into distinct_colors of the colour found at flattened pixel p.
    distinct_colors, inverse = torch.unique(pixels, dim=0, return_inverse=True)

    # A single dictionary lookup per distinct colour; an unknown colour
    # raises KeyError with the offending tuple as its key.
    class_ids = torch.tensor(
        [colormap[tuple(color)] for color in distinct_colors.tolist()],
        dtype=torch.get_default_dtype(),
    )

    # class_ids lives on the CPU, so index it with a CPU index tensor.
    return class_ids[inverse.cpu()].reshape(label.shape[1:])
    

In [2]:
# Root directory of the dataset, relative to the notebook's working directory.
dataset_path = "../../datasets/VOC2012"

# Locations inside the dataset root; code that needs them joins them onto
# dataset_path with os.path.join.
IMAGE_PATH = "JPEGImages"                          # presumably the input photos
SEGMENTATION_LABEL_PATH = "SegmentationClass"      # colour-coded mask PNGs
TRAIN_PATH = "ImageSets/Segmentation/train.txt"    # list of sample names

In [13]:
# Read the split file: one sample name per line, blank lines dropped.
with open(os.path.join(dataset_path, TRAIN_PATH)) as f:
    file_list = list(filter(None, f.read().split('\n')))


# num_classes[k] accumulates, over every mask in the split, the number of
# pixels whose class index is k (one slot for each of the 21 indices).
num_classes = [0 for _ in range(21)]

for i, file in enumerate(file_list):
    # The mask for a sample is "<name>.png" in the segmentation-label folder;
    # it is decoded as RGB so each pixel can be matched against COLORMAP.
    mask_file = os.path.join(dataset_path, SEGMENTATION_LABEL_PATH, file + '.png')
    label = torchvision.io.read_image(
        mask_file, mode=torchvision.io.image.ImageReadMode.RGB)
    processed_label = label_preprocessing(label)

    # Add this mask's per-class pixel counts to the running totals.
    for idx, running_total in enumerate(num_classes):
        pixel_count = (processed_label == idx).sum()
        num_classes[idx] = running_total + int(pixel_count)

    # Progress report after every 20th image.
    processed = i + 1
    if processed % 20 == 0:
        print(f"{processed} images processed.")


num_classes

20 images processed.
40 images processed.
60 images processed.
80 images processed.
100 images processed.
120 images processed.
140 images processed.
160 images processed.
180 images processed.
200 images processed.
220 images processed.
240 images processed.
260 images processed.
280 images processed.
300 images processed.
320 images processed.
340 images processed.
360 images processed.
380 images processed.
400 images processed.
420 images processed.
440 images processed.
460 images processed.
480 images processed.
500 images processed.
520 images processed.
540 images processed.
560 images processed.
580 images processed.
600 images processed.
620 images processed.
640 images processed.
660 images processed.
680 images processed.
700 images processed.
720 images processed.
740 images processed.
760 images processed.
780 images processed.
800 images processed.
820 images processed.
840 images processed.
860 images processed.
880 images processed.
900 images processed.
920 images pro

[196349993,
 1780580,
 758311,
 2232247,
 1514260,
 1517186,
 4375622,
 3494749,
 6752515,
 2861091,
 2060925,
 3381632,
 4344951,
 2283739,
 2888641,
 11995853,
 1670340,
 2254463,
 3612229,
 3984238,
 2349235]

In [16]:
# Share of all counted pixels that carry class index 0 (which, given how
# COLORMAP is built, also includes the extra colour folded into index 0).
background_pixels = num_classes[0]
background_pixels / sum(num_classes)

0.7481059906394354

{0, 15}