In [3]:
# File Description:
# This Jupyter Notebook provides a utility for counting image files in multiple dataset directories.
# It defines a function `count_images_by_split` that takes a dictionary mapping dataset names to their root paths
# and counts the image files (matched by common image extensions) in each split subdirectory
# ('train', 'val' and 'test' by default) of every dataset, including nested subdirectories.
# The variable `dataset_dict` contains example dataset paths for use with this function.

In [6]:
import os

def count_images_by_split(dataset_dict, splits=('train', 'val', 'test'), image_extensions={'.jpg', '.jpeg', '.png', '.bmp', '.gif', '.tiff'}):
    """Count image files in every split directory of every dataset.

    For each ``name -> root path`` entry in ``dataset_dict`` and each split
    name in ``splits``, the directory ``<root path>/<split>`` is walked
    recursively and every file whose extension is listed in
    ``image_extensions`` is counted. Matching is case-insensitive.

    Args:
        dataset_dict: Mapping of dataset name to dataset root directory.
        splits: Split subdirectory names to look for under each root. A single
            string is treated as one split name rather than iterated
            character by character.
        image_extensions: Iterable of extensions (or one extension string)
            that identify image files. Case and a missing leading dot are
            normalized, so ``'JPG'``, ``'.JPG'`` and ``'.jpg'`` are equivalent.
            The default set is only read, never mutated.

    Returns:
        dict: ``{dataset_name: {split: image_count}}``. A split whose
        directory does not exist is reported with a count of 0.
    """
    # A bare string would otherwise be iterated / substring-matched per character.
    if isinstance(splits, str):
        splits = (splits,)
    if isinstance(image_extensions, str):
        image_extensions = (image_extensions,)

    # File extensions are lowercased before comparison, so the reference set
    # must be lowercased too; also accept extensions given without the dot.
    normalized_extensions = set()
    for ext in image_extensions:
        ext = ext.lower()
        if ext and not ext.startswith('.'):
            ext = '.' + ext
        normalized_extensions.add(ext)

    split_counts = {}
    for d_name, d_path in dataset_dict.items():
        per_split = {}
        for split in splits:
            split_dir = os.path.join(d_path, split)
            if os.path.isdir(split_dir):
                per_split[split] = sum(
                    1
                    for _root, _dirs, files in os.walk(split_dir)
                    for file_name in files
                    if os.path.splitext(file_name)[1].lower() in normalized_extensions
                )
            else:
                # Missing split directories are reported as empty, not as errors.
                per_split[split] = 0
        split_counts[d_name] = per_split
    return split_counts

# Example usage: map a short dataset name to the root directory that holds
# its split folders, then print a per-split image count for each dataset.
dataset_dict = {
    'paultimothymooney': r'C:\Users\TimePC\.cache\kagglehub\datasets\paultimothymooney\chest-xray-pneumonia\versions\2\chest_xray',
    'mimic_cxr': r'C:\Users\TimePC\.cache\kagglehub\datasets\itsanmol124\mimic-cxr\versions\1',
    'chexpert': r'C:\Users\TimePC\.cache\kagglehub\datasets\mimsadiislam\chexpert\versions\1',
    'chestx_ray14': r'C:\Users\TimePC\.cache\kagglehub\datasets\lawhan\chestx-ray14\versions\1\ChestX-ray14',
}

# Walk the datasets once, then render one text block per dataset.
dataset_split_counts = count_images_by_split(dataset_dict)
for d_name, counts in dataset_split_counts.items():
    report_lines = [f"Dataset: {d_name}"]
    for split, count in counts.items():
        report_lines.append(f"  {split}: {count} images")
    # The trailing blank line separates consecutive dataset reports.
    print("\n".join(report_lines), end="\n\n")


Dataset: paultimothymooney
  train: 5216 images
  val: 16 images
  test: 624 images

Dataset: mimic_cxr
  train: 83837 images
  val: 711 images
  test: 1455 images

Dataset: chexpert
  train: 178732 images
  val: 44682 images
  test: 234 images

Dataset: chestx_ray14
  train: 78468 images
  val: 11219 images
  test: 22433 images

