## 1. Setup

In [1]:
import sys
# Add the parent directory to the module search path.
# NOTE(review): presumably so project modules one level up can be imported
# from this notebook -- confirm against the repository layout.
sys.path.append('..')

In [2]:
import matplotlib.pyplot as plt
import numpy as np
import os
import skimage.io
import warnings

from pprint import pprint

In [3]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Load the autoreload extension and reload all modules before each cell
# execution (mode 2), so edits to imported modules are picked up.
%load_ext autoreload
%autoreload 2

# Silence every warning for the rest of the session.
warnings.filterwarnings('ignore')

## 2. Dataset Statistics

In [4]:
def relative_distribution_of_img_counts(counts):
    """Bucket per-image object counts and report each bucket's share.

    The buckets are the half-open ranges [0, 100), [100, 200), [200, 300)
    and the open-ended range [300, inf).

    Args:
        counts: array-like of per-image object counts (a NumPy array, a
            list, or anything ``np.asarray`` accepts). May be empty.

    Returns:
        dict mapping a bucket label ('0-100', '100-200', '200-300',
        '>= 300') to a string of the form 'N (P%)', where N is the number
        of images in the bucket and P is its percentage of all images with
        two decimals. For empty input every bucket is '0 (0.00%)'.
    """
    # Accept plain lists as well as arrays.
    counts = np.asarray(counts)
    total = counts.size

    def describe(n_imgs):
        # Guard the empty case: 0 / 0 would otherwise be reported as 'nan%'.
        share = 100. * (n_imgs / total) if total else 0.
        return f'{n_imgs} ({share:.2f}%)'

    d = {}
    for upper in range(100, 300 + 1, 100):
        lower = upper - 100
        in_bucket = int(np.count_nonzero((lower <= counts) & (counts < upper)))
        d[f'{lower}-{upper}'] = describe(in_bucket)

    # Everything at or above the last bucket edge goes into one bucket.
    d['>= 300'] = describe(int(np.count_nonzero(counts >= 300)))

    return d

# Print object-count statistics for each dataset. Counts are read from the
# dot-annotation images under <dataset>/<split>/gt_dots.
for dataset_name in ['vgg_cells', 'carpk', 'shanghai_tech/part_b']:
    dataset_path = f'../datasets/{dataset_name}'

    print()
    print(f'-----{dataset_name}-----')
    print(dataset_path)

    # Per-split list of object counts, one entry per annotation image.
    counts = {}
    for split_name in ['train', 'val', 'test']:
        dots_path = f'{dataset_path}/{split_name}/gt_dots'

        # A split without a gt_dots directory contributes no images.
        # isdir (rather than exists) keeps listdir from failing on a
        # regular file that happens to carry that name.
        dot_img_names = []
        if os.path.isdir(dots_path):
            dot_img_names = sorted(os.listdir(dots_path))

        # Every non-zero pixel of an annotation image counts as one object.
        counts[split_name] = [
            (skimage.io.imread(f'{dots_path}/{img_name}') > 0).sum()
            for img_name in dot_img_names
        ]

    # The validation split is reported as part of the training set.
    train_val_counts = np.array(counts['train'] + counts['val'])
    test_counts = np.array(counts['test'])

    print(f'Train size: {train_val_counts.size} images')
    print(f'Test size: {test_counts.size} images')

    # min/max/mean are undefined on an empty set; skip the dataset instead
    # of aborting the whole loop with a ValueError from NumPy.
    if train_val_counts.size == 0 or test_counts.size == 0:
        print('No annotation images found for the train and/or test set; skipping statistics.')
        continue

    counts_all = np.array(counts['train'] + counts['val'] + counts['test'])
    print('->Objects per image:')
    print(f'Min: {counts_all.min()} (train: {train_val_counts.min()}, test: {test_counts.min()})')
    print(f'Max: {counts_all.max()} (train: {train_val_counts.max()}, test: {test_counts.max()})')
    print(f'Avg: {counts_all.mean():.1f} (train: {train_val_counts.mean():.1f}, test: {test_counts.mean():.1f})')
    print(f'Total: {counts_all.sum()} (train: {train_val_counts.sum()}, test: {test_counts.sum()})')

    print('->Relative distribution of image counts within the dataset:')
    print(f'**Train set ({train_val_counts.size} images)**')
    pprint(relative_distribution_of_img_counts(train_val_counts))
    print(f'**Test set ({test_counts.size} images)**')
    pprint(relative_distribution_of_img_counts(test_counts))


-----vgg_cells-----
../datasets/vgg_cells
Train size: 100 images
Test size: 100 images
->Objects per image:
Min: 74 (train: 78, test: 74)
Max: 317 (train: 315, test: 317)
Avg: 176.0 (train: 180.4, test: 171.5)
Total: 35192 (train: 18045, test: 17147)
->Relative distribution of image counts within the dataset:
**Train set (100 images)**
{'0-100': '10 (10.00%)',
 '100-200': '53 (53.00%)',
 '200-300': '34 (34.00%)',
 '>= 300': '3 (3.00%)'}
**Test set (100 images)**
{'0-100': '14 (14.00%)',
 '100-200': '50 (50.00%)',
 '200-300': '32 (32.00%)',
 '>= 300': '4 (4.00%)'}

-----carpk-----
../datasets/carpk
Train size: 989 images
Test size: 459 images
->Objects per image:
Min: 1 (train: 1, test: 2)
Max: 188 (train: 87, test: 188)
Avg: 62.0 (train: 42.7, test: 103.5)
Total: 89772 (train: 42273, test: 47499)
->Relative distribution of image counts within the dataset:
**Train set (989 images)**
{'0-100': '989 (100.00%)',
 '100-200': '0 (0.00%)',
 '200-300': '0 (0.00%)',
 '>= 300': '0 (0.00%)'}
**T