In [1]:
from medmnist.dataset import MedMNIST

In [2]:
# MedMNIST subsets to combine; each name is used as a key into `medmnist.INFO`.
datasets = [
    'tissuemnist',
    'octmnist',
    'organamnist',
]

In [3]:
from medmnist import INFO

# Metadata record for every selected subset, in the same order as `datasets`.
infos = [INFO[name] for name in datasets]

In [4]:
# Peek at the metadata record of the first subset (tissuemnist).
infos[0]

{'python_class': 'TissueMNIST',
 'description': 'We use the BBBC051, available from the Broad Bioimage Benchmark Collection. The dataset contains 236,386 human kidney cortex cells, segmented from 3 reference tissue specimens and organized into 8 categories. We split the source dataset with a ratio of 7:1:2 into training, validation and test set. Each gray-scale image is 32×32×7 pixels, where 7 denotes 7 slices. We take maximum values across the slices and resize them into 28×28 gray-scale images.',
 'url': 'https://zenodo.org/records/10519652/files/tissuemnist.npz?download=1',
 'MD5': 'ebe78ee8b05294063de985d821c1c34b',
 'url_64': 'https://zenodo.org/records/10519652/files/tissuemnist_64.npz?download=1',
 'MD5_64': '123ece2eba09d0aa5d698fda57103344',
 'url_128': 'https://zenodo.org/records/10519652/files/tissuemnist_128.npz?download=1',
 'MD5_128': '61b955355d7425a89687b06cca3ce0c2',
 'url_224': 'https://zenodo.org/records/10519652/files/tissuemnist_224.npz?download=1',
 'MD5_224': 'b0

In [5]:
import medmnist

# Sanity check: every subset reports the same `n_channels` value.
channel_counts = {info['n_channels'] for info in infos}
assert len(channel_counts) <= 1

# Resolve each subset's dataset class from the class name stored in its metadata.
DataClasses = [
    getattr(medmnist.dataset, info['python_class'])
    for info in infos
]

In [6]:
# Image size passed as `size=` when the dataset classes are instantiated.
# NOTE(review): the INFO record also lists 64/128/224 variants — presumably the other valid values; confirm.
img_size: int = 28

In [7]:
from torchvision import transforms


def default_transform():
    """Build the default preprocessing pipeline.

    Returns:
        A ``transforms.Compose`` that applies ``ToTensor`` followed by
        ``Normalize`` with a one-element mean and std of 0.5.
    """
    steps = [
        transforms.ToTensor(),
        transforms.Normalize(mean=(0.5,), std=(0.5,)),
    ]
    return transforms.Compose(steps)

In [8]:
# One training-split dataset per subset, all at the same image size.
# Only `ToTensor` is applied here (no normalisation).
train_sets = [
    DataClass(split='train', transform=transforms.ToTensor(), download=True, size=img_size)
    for DataClass in DataClasses
]

Using downloaded and verified file: C:\Users\pasca\.medmnist\tissuemnist.npz
Using downloaded and verified file: C:\Users\pasca\.medmnist\octmnist.npz
Using downloaded and verified file: C:\Users\pasca\.medmnist\organamnist.npz


In [9]:
# Show the repr of each training set (size, split, task, label meanings, ...).
train_sets

[Dataset TissueMNIST of size 28 (tissuemnist)
     Number of datapoints: 165466
     Root location: C:\Users\pasca\.medmnist
     Split: train
     Task: multi-class
     Number of channels: 1
     Meaning of labels: {'0': 'Collecting Duct, Connecting Tubule', '1': 'Distal Convoluted Tubule', '2': 'Glomerular endothelial cells', '3': 'Interstitial endothelial cells', '4': 'Leukocytes', '5': 'Podocytes', '6': 'Proximal Tubule Segments', '7': 'Thick Ascending Limb'}
     Number of samples: {'train': 165466, 'val': 23640, 'test': 47280}
     Description: We use the BBBC051, available from the Broad Bioimage Benchmark Collection. The dataset contains 236,386 human kidney cortex cells, segmented from 3 reference tissue specimens and organized into 8 categories. We split the source dataset with a ratio of 7:1:2 into training, validation and test set. Each gray-scale image is 32×32×7 pixels, where 7 denotes 7 slices. We take maximum values across the slices and resize them into 28×28 gray-sca

In [10]:
# Inspect the raw (untransformed) image array stored on the first training set.
train_sets[0].imgs

array([[[ 41,  48,  36, ...,  42,  39,  35],
        [ 41,  44,  33, ...,  45,  45,  41],
        [ 43,  39,  28, ...,  41,  45,  42],
        ...,
        [ 14,  22,  60, ...,  11,  13,  14],
        [ 10,  22,  63, ...,   9,  11,  12],
        [  8,  18,  55, ...,   9,  10,  12]],

       [[  3,   6,   8, ...,   7,   6,   6],
        [  4,   6,   8, ...,   7,   6,   5],
        [  3,   5,   6, ...,   5,   4,   4],
        ...,
        [ 18,  17,  16, ...,  22,  19,  17],
        [ 18,  17,  16, ...,  22,  20,  18],
        [ 18,  17,  16, ...,  21,  20,  20]],

       [[ 62,  76,  99, ...,  18,  16,  16],
        [ 78,  84,  96, ...,  18,  15,  13],
        [110,  98,  95, ...,  20,  16,  13],
        ...,
        [ 11,  12,  11, ...,   2,   2,   2],
        [ 11,  12,  11, ...,   2,   2,   2],
        [ 11,  12,  11, ...,   2,   2,   2]],

       ...,

       [[188, 150, 103, ...,  22,  24,  25],
        [171, 112,  72, ...,  24,  18,  14],
        [152,  88,  73, ...,  21,  18,  17

In [11]:
# Number of images in the first training set; should match its reported "Number of datapoints".
len(train_sets[0].imgs)

165466

In [12]:
from torch.utils.data import ConcatDataset

# Chain the per-subset training sets into a single dataset.
train_set = ConcatDataset(train_sets)

In [13]:
# Summarise the concatenation: how many member datasets, and how many samples overall.
n_member_datasets = len(train_set.datasets)
n_total_samples = len(train_set)
print("Number of datasets in ConcatDataset:", n_member_datasets)
print("Total number of samples in ConcatDataset:", n_total_samples)


Number of datasets in ConcatDataset: 3
Total number of samples in ConcatDataset: 297504


In [14]:
# Pull the label array off the first entry of `train_sets` and print it.
first_train_set = train_sets[0]
labels = first_train_set.labels
print("Labels from the first dataset:", labels)


Labels from the first dataset: [[0]
 [0]
 [6]
 ...
 [2]
 [1]
 [6]]


In [15]:
# Print each training set's label array, numbering the sets from 1.
for position, dataset in enumerate(train_sets, start=1):
    print(f"Labels for dataset {position}:", dataset.labels)


Labels for dataset 1: [[0]
 [0]
 [6]
 ...
 [2]
 [1]
 [6]]
Labels for dataset 2: [[0]
 [3]
 [3]
 ...
 [0]
 [3]
 [0]]
Labels for dataset 3: [[6]
 [8]
 [5]
 ...
 [0]
 [8]
 [8]]


In [16]:
from vae_medmnist.data.accumulated_dataset import AccumulatedMedMNIST

# Build the project's accumulated-dataset wrapper from the list of subset names.
# NOTE(review): despite the variable name, this object exposes `setup()` and combined
# datasets rather than being a torch DataLoader — its interface is not visible here; confirm.
dataloader = AccumulatedMedMNIST(datasets)

In [17]:
# Per the log output, this loads the subsets and builds the combined
# train/validation/test ConcatDatasets, then reports channel, class and sample counts.
dataloader.setup()

INFO:vae_medmnist.data.accumulated_dataset:Setting up AccumulatedMedMNIST with ['tissuemnist', 'octmnist', 'organamnist'] datasets


Using downloaded and verified file: C:\Users\pasca\.medmnist\tissuemnist.npz
Using downloaded and verified file: C:\Users\pasca\.medmnist\tissuemnist.npz
Using downloaded and verified file: C:\Users\pasca\.medmnist\tissuemnist.npz
Using downloaded and verified file: C:\Users\pasca\.medmnist\octmnist.npz
Using downloaded and verified file: C:\Users\pasca\.medmnist\octmnist.npz
Using downloaded and verified file: C:\Users\pasca\.medmnist\octmnist.npz
Using downloaded and verified file: C:\Users\pasca\.medmnist\organamnist.npz


INFO:vae_medmnist.data.accumulated_dataset:Combined train dataset: <torch.utils.data.dataset.ConcatDataset object at 0x000001ADF4670510>
INFO:vae_medmnist.data.accumulated_dataset:Combined validation dataset: <torch.utils.data.dataset.ConcatDataset object at 0x000001ADF3F4D250>
INFO:vae_medmnist.data.accumulated_dataset:Combined test dataset: <torch.utils.data.dataset.ConcatDataset object at 0x000001ADF460A0D0>
INFO:vae_medmnist.data.accumulated_dataset:Number of channels: {1}


Using downloaded and verified file: C:\Users\pasca\.medmnist\organamnist.npz
Using downloaded and verified file: C:\Users\pasca\.medmnist\organamnist.npz


INFO:vae_medmnist.data.accumulated_dataset:Number of classes: 23
INFO:vae_medmnist.data.accumulated_dataset:Number of samples: {'train': 297504, 'val': 40963, 'test': 66058}


In [None]:
# Print the label array of every member dataset inside the combined training set.
# The setup log reports the combined train dataset as a torch `ConcatDataset`
# (assuming `combined_train` is that object). Iterating a ConcatDataset directly
# yields individual samples, which have no `.labels` attribute, so walk its
# `.datasets` list of member datasets instead.
for i, dataset in enumerate(dataloader.combined_train.datasets, start=1):
    print(f"Labels for dataset {i}:", dataset.labels)