In [1]:
def generate_pareto_distribution(largest_class_size, num_classes, imbalance_ratio):
    smallest_class_size = largest_class_size / imbalance_ratio
    
    # Calculate the ratio factor for the geometric series
    factor = (largest_class_size / smallest_class_size)**(1 / (num_classes - 1))
    
    class_instances = [round(largest_class_size / (factor**i)) for i in range(num_classes)]
    
    # Ensure the largest class has the exact number of instances
    class_instances[0] = largest_class_size
    
    return class_instances

In [4]:
import numpy as np
import os
from collections import Counter
source_dir = '/mnt/sda/julie/datasets/medmnist/npz_files/'
dic_original_distribution = {}
dic_pareto_distribution = {}
dic_class_map = {}
for npz_file in os.listdir(source_dir):
    dataset = npz_file.split('_')[0]
    print(npz_file)
    npz_arrays = np.load(source_dir+npz_file)
    print(npz_arrays.files)
    train_labels = [x[0] for x in npz_arrays['train_labels'].tolist()]
    val_labels = [x[0] for x in npz_arrays['val_labels'].tolist()]
    test_labels = [x[0] for x in npz_arrays['test_labels'].tolist()]
    distribution = Counter(train_labels).most_common()
    dic_original_distribution[dataset] = distribution
    val_dis = Counter(val_labels).most_common()
    test_dis = Counter(test_labels).most_common()
    print('train', [x[1] for x in distribution])
    print('val', [x[1] for x in val_dis])
    print('test', [x[1] for x in test_dis])
    print(generate_pareto_distribution(distribution[0][1], len(distribution), 100))
    dic_pareto_distribution[dataset] = generate_pareto_distribution(distribution[0][1], len(distribution), 100)
    

organsmnist_224.npz
['train_images', 'train_labels', 'val_images', 'val_labels', 'test_images', 'test_labels']
train [3464, 2004, 1556, 1148, 1132, 1119, 803, 741, 721, 630, 614]
val [491, 280, 275, 261, 246, 213, 188, 159, 140, 104, 95]
test [2078, 1343, 968, 811, 704, 693, 510, 445, 439, 439, 397]
[3464, 2186, 1379, 870, 549, 346, 219, 138, 87, 55, 35]
organamnist_224.npz
['train_images', 'train_labels', 'val_images', 'val_labels', 'test_images', 'test_labels']
train [6164, 3963, 3929, 3919, 3817, 3561, 3031, 1956, 1474, 1390, 1357]
val [1033, 1033, 1009, 637, 568, 529, 511, 392, 321, 233, 225]
test [3285, 2064, 1965, 1884, 1813, 1747, 1622, 1036, 793, 785, 784]
[6164, 3889, 2454, 1548, 977, 616, 389, 245, 155, 98, 62]
tissuemnist_224.npz
['train_images', 'train_labels', 'val_images', 'val_labels', 'test_images', 'test_labels']
train [53075, 39203, 24608, 15406, 11789, 7814, 7705, 5866]
val [7582, 5601, 3516, 2201, 1684, 1117, 1101, 838]
test [15165, 11201, 7031, 4402, 3369, 2233, 22

In [6]:
dic_original_distribution.keys()


dict_keys(['organsmnist', 'organamnist', 'tissuemnist', 'pathmnist', 'bloodmnist', 'dermamnist', 'organcmnist'])

In [4]:
import cv2
target_dir = '/mnt/sda/julie/datasets/medmnist/images/'
for npz_file in os.listdir(source_dir):
    dataset = npz_file.split('.')[0]
    if not os.path.exists(target_dir+dataset):
        os.mkdir(target_dir+dataset)
    npz_arrays = np.load(source_dir+npz_file)
    train_images = npz_arrays['train_images']
    train_labels = npz_arrays['train_labels']
    val_images = npz_arrays['val_images']
    val_labels = npz_arrays['val_labels']
    test_images = npz_arrays['test_images']
    test_labels = npz_arrays['test_labels']

    for idx in range(train_images.shape[0]):
        img = train_images[idx]
        label = train_labels[idx][0]

        
        img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        save_name = 'train_%s_%s.jpg' %(idx, label)

        cv2.imwrite('%s/%s' %(target_dir+dataset, save_name), img_rgb)
    for idx in range(val_images.shape[0]):
        img = val_images[idx]
        label = val_labels[idx][0]
        img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        save_name = 'val_%s_%s.jpg' %(idx, label)

        cv2.imwrite('%s/%s' %(target_dir+dataset, save_name), img_rgb)
    for idx in range(test_images.shape[0]):
        img = test_images[idx]
        label = test_labels[idx][0]
        img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        save_name = 'test_%s_%s.jpg' %(idx, label)

        cv2.imwrite('%s/%s' %(target_dir+dataset, save_name), img_rgb)


In [10]:
import os
import numpy as np
import random
random.seed(1)
source_dir = '/mnt/sda/julie/datasets/medmnist/images/'
target_dir = '/mnt/sda/julie/projects/OpenMedLongTailed/numpy/medmnist/'
for i, dataset in enumerate(os.listdir(source_dir)):
    train_samples = []
    val_samples = []
    test_samples = []
    dic = {}
    dataset_name = dataset.split('_')[0]
    print(dataset_name)
    images = os.listdir(os.path.join(source_dir, dataset))
    random.shuffle(images)
    pareto_distribution = dic_pareto_distribution[dataset_name]
    train_sample_count = [0 for i in range(len(pareto_distribution))]
    for img in images:
        label = int(img.split('.jpg')[0].split('_')[-1])
        dic[img] = label
        if img.startswith('train'):
            if train_sample_count[label]>=pareto_distribution[label]:
                continue
            else:
                train_sample_count[label]+=1
                train_samples.append(img)
        elif img.startswith('val'):
            val_samples.append(img)
        elif img.startswith('test'):
            test_samples.append(img)
    print(train_sample_count)
    np.save('%s%s_train.npy' %(target_dir, dataset_name), train_samples)
    np.save('%s%s_val.npy' %(target_dir, dataset_name), val_samples)
    np.save('%s%s_test.npy' %(target_dir, dataset_name), test_samples)
    np.save('%s%s_dic.npy' %(target_dir, dataset_name), dic)


tissuemnist


[53075, 7814, 5866, 7375, 3820, 1978, 1025, 531]
organcmnist
[1148, 619, 595, 600, 473, 299, 188, 119, 75, 47, 30]
pathmnist
[9366, 7246, 4075, 2291, 1289, 725, 407, 229, 129]
organamnist
[1956, 1390, 1357, 1474, 977, 616, 389, 245, 155, 98, 62]
bloodmnist
[852, 1207, 625, 324, 168, 87, 45, 23]
organsmnist
[1148, 630, 614, 721, 549, 346, 219, 138, 87, 55, 35]
dermamnist
[228, 359, 769, 80, 218, 101, 47]


In [13]:
import os
import numpy as np
source_dir = '/mnt/sda/julie/datasets/medmnist/images/'
target_dir = '/mnt/sda/julie/projects/OpenMedLongTailed/numpy/medmnist/'
for i, dataset in enumerate(os.listdir(source_dir)):
    labels = []
    train_samples = []
    val_samples = []
    test_samples = []
    dic = {}
    dataset_name = dataset.split('_')[0]
    print(dataset_name)
    images = os.listdir(os.path.join(source_dir, dataset))
    for img in images:
        if img.startswith('val'):
            label = int(img.split('.jpg')[0].split('_')[-1])
            labels.append(label)
    from collections import Counter
    c = Counter(labels).most_common()
    print(c)

tissuemnist
[(0, 7582), (6, 5601), (7, 3516), (3, 2201), (4, 1684), (1, 1117), (5, 1101), (2, 838)]
organcmnist
[(6, 429), (8, 352), (7, 347), (10, 205), (3, 202), (0, 191), (9, 179), (5, 157), (4, 132), (1, 102), (2, 96)]
pathmnist
[(8, 1432), (5, 1354), (3, 1156), (2, 1152), (1, 1057), (7, 1045), (0, 1041), (4, 890), (6, 877)]
organamnist
[(6, 1033), (7, 1033), (8, 1009), (5, 637), (4, 568), (9, 529), (10, 511), (3, 392), (0, 321), (1, 233), (2, 225)]
bloodmnist
[(6, 333), (1, 312), (3, 290), (7, 235), (2, 155), (5, 143), (0, 122), (4, 122)]
organsmnist
[(6, 491), (9, 280), (8, 275), (7, 261), (3, 246), (10, 213), (0, 188), (5, 159), (4, 140), (1, 104), (2, 95)]
dermamnist
[(5, 671), (4, 111), (2, 110), (1, 52), (0, 33), (6, 14), (3, 12)]


In [1]:
import numpy as np
train = np.load('/mnt/sda/julie/projects/OpenMedLongTailed/numpy/medmnist/organsmnist_train.npy')
dic = np.load('/mnt/sda/julie/projects/OpenMedLongTailed/numpy/medmnist/organsmnist_dic.npy', allow_pickle=True).item()

In [2]:
labels = []
for s in train:
    labels.append(dic[s])

In [5]:
np.unique(labels)

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10])

In [3]:
from collections import  Counter
c = Counter((labels)).most_common()

In [4]:
label_to_count = [x[1] for x in c]
c, label_to_count
per_cls_weights = 1 / np.array(label_to_count)
per_cls_weights, sum(per_cls_weights)

(array([0.00028868, 0.000499  , 0.00072516, 0.00114943, 0.00182149,
        0.00289017, 0.00456621, 0.00724638, 0.01149425, 0.01818182,
        0.02857143]),
 0.07743402756709027)

In [None]:
self.indices = list(range(len(dataset))) \
    if indices is None else indices
    
# if num_samples is not provided, 
# draw `len(indices)` samples in each iteration
self.num_samples = len(self.indices) \
    if num_samples is None else num_samples
    
# distribution of classes in the dataset 
label_to_count = [0] * len(np.unique(dataset.img_label))
for idx in self.indices:
    label = self._get_label(dataset, idx)
    label_to_count[label] += 1


# weight for each sample
weights = [per_cls_weights[self._get_label(dataset, idx)]
           for idx in self.indices]

self.per_cls_weights = per_cls_weights
self.weights = torch.DoubleTensor(weights)