In [None]:
# Install necessary libraries
!pip install medmnist numpy pandas tqdm

# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')
# Paths
drive_output_dir = '/content/drive/MyDrive/pathmnist/'  # Change this to your desired Drive folder

Collecting medmnist
  Downloading medmnist-3.0.2-py3-none-any.whl.metadata (14 kB)
Collecting fire (from medmnist)
  Downloading fire-0.7.0.tar.gz (87 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/87.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m87.2/87.2 kB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Downloading medmnist-3.0.2-py3-none-any.whl (25 kB)
Building wheels for collected packages: fire
  Building wheel for fire (setup.py) ... [?25l[?25hdone
  Created wheel for fire: filename=fire-0.7.0-py3-none-any.whl size=114249 sha256=0324980cffca98447aea7ae795e0ffb9504f566a9a535ad8beee31190109e6ee
  Stored in directory: /root/.cache/pip/wheels/19/39/2f/2d3cadc408a8804103f1c34ddd4b9f6a93497b11fa96fe738e
Successfully built fire
Installing collected packages: fire, medmnist
Successfully installed fire-0.7.0 medmnist-3.0.2
Mounted at /content/drive

In [None]:
from medmnist import INFO, PathMNIST
import os
import numpy as np

# Download the 224x224 PathMNIST training archive into the medmnist cache folder
data_path = "/root/.medmnist"  # Default path used by medmnist
PathMNIST(root=data_path, split="train", download=True, size=224)

# MedMNIST info for PathMNIST
dataset_info = INFO['pathmnist']
label_text_values = list(dataset_info['label'].values())  # Get all values from label_text dictionary
print(label_text_values)

# Derive the archive location from data_path so the download target and the
# file that is read back cannot drift apart.
npz_path = os.path.join(data_path, "pathmnist_224.npz")

# NpzFile keeps the underlying archive open until it is closed; the context
# manager releases it as soon as the two arrays have been read.
# NOTE(review): both arrays are materialised fully in memory here, and the
# archive is large (12.6G per the download log) -- confirm the runtime has
# enough RAM.
with np.load(npz_path) as data:
    images = data["train_images"]  # Training images
    labels = data["train_labels"]  # Corresponding labels

# Manual label mapping: integer class id -> tissue name.
# The names are listed in the same order as label_text_values printed above.
label_mapping = {
    0: "adipose",
    1: "background",
    2: "debris",
    3: "lymphocytes",
    4: "mucus",
    5: "smooth muscle",
    6: "normal colon mucosa",
    7: "cancer-associated stroma",
    8: "colorectal adenocarcinoma epithelium"
}

Downloading https://zenodo.org/records/10519652/files/pathmnist_224.npz?download=1 to /root/.medmnist/pathmnist_224.npz


100%|██████████| 12.6G/12.6G [16:50<00:00, 12.5MB/s]


['adipose', 'background', 'debris', 'lymphocytes', 'mucus', 'smooth muscle', 'normal colon mucosa', 'cancer-associated stroma', 'colorectal adenocarcinoma epithelium']


In [None]:
# Import tqdm
from tqdm import tqdm
# Import the Image module from Pillow
from PIL import Image

# Create the output directory on Drive
image_dir = os.path.join(drive_output_dir, 'images')
os.makedirs(image_dir, exist_ok=True)

# Save every training image as a PNG, grouped into one folder per label caption
metadata = []
# Label folders that have already been created during this run. Only a handful
# of distinct labels exist, so remembering them avoids issuing one makedirs
# call per image against the Drive mount.
created_label_dirs = set()
for idx, (img, label) in enumerate(tqdm(zip(images, labels), desc="Processing images", total=len(labels))):

    # Convert the numpy array to a PIL image
    # (presumably an RGB uint8 array -- verify against the dataset)
    pil_img = Image.fromarray(img)

    # Each label row is indexed at position 0 to get the class id; cast the
    # NumPy scalar to a plain int before using it as a dictionary key.
    disease_name = label_mapping[int(label[0])]
    label_description = f"a histopathological image of an area with {disease_name}"

    # The folder is named after the full caption rather than the bare class
    # name, so the directory name itself can serve as the text label.
    label_dir = os.path.join(image_dir, label_description)
    if label_dir not in created_label_dirs:
        os.makedirs(label_dir, exist_ok=True)
        created_label_dirs.add(label_dir)

    # Save image as PNG under the label directory
    file_name = f'image_{idx}.png'
    image_path = os.path.join(label_dir, file_name)
    pil_img.save(image_path)

    # Keep an in-memory record of what was written (file name + caption)
    metadata.append([file_name, label_description])

print(f"Dataset prepared and saved to {drive_output_dir}")

Processing images: 100%|██████████| 89996/89996 [26:28<00:00, 56.65it/s]

Dataset prepared and saved to /content/drive/MyDrive/pathmnist/





In [1]:
# Mount Google Drive at /content/drive; the cells below read the exported
# images from the MyDrive/pathmnist folder underneath it.
# NOTE(review): the execution counter restarts at In [1] here, which suggests a
# fresh runtime that needs its own mount -- confirm.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import os
import json

# Paths
base_dir = '/content/drive/MyDrive/pathmnist/images'
output_file = '/content/drive/MyDrive/pathmnist/images/metadata.jsonl'

# Only entries with these extensions are indexed. This is the same set the
# image-counting cell uses, so the metadata and the counts stay comparable and
# stray non-image files inside a class folder never end up in the index.
IMAGE_EXTENSIONS = ('.png', '.jpg', '.jpeg')

# Prepare the data: one record per image, labelled with its folder name.
# Directory listings are sorted because os.listdir returns entries in
# arbitrary order; sorting makes the generated file reproducible.
data = []
for class_name in sorted(os.listdir(base_dir)):
    class_path = os.path.join(base_dir, class_name)
    if not os.path.isdir(class_path):
        # Loose files directly under base_dir (for example a previously
        # written metadata.jsonl) are not class folders.
        continue
    for img_name in sorted(os.listdir(class_path)):
        if not img_name.lower().endswith(IMAGE_EXTENSIONS):
            continue
        data.append({
            # Path relative to base_dir, always joined with a forward slash
            "file_name": class_name + "/" + img_name,
            "label": class_name
        })

# Write to metadata.jsonl: one JSON object per line
with open(output_file, 'w', encoding='utf-8') as f:
    for entry in data:
        f.write(json.dumps(entry) + '\n')

print(f"Metadata file created at {output_file}")

Metadata file created at /content/drive/MyDrive/pathmnist/images/metadata.jsonl


In [4]:
import os

# Root folder that holds one sub-folder per label
base_dir = '/content/drive/MyDrive/pathmnist/images'

# Running total across all sub-folders
total_image_count = 0

for folder_name in os.listdir(base_dir):
    folder_path = os.path.join(base_dir, folder_name)

    # Entries that are not directories are ignored
    if not os.path.isdir(folder_path):
        continue

    # Tally the regular files in this folder that carry an image extension
    n_images = 0
    for entry_name in os.listdir(folder_path):
        is_regular_file = os.path.isfile(os.path.join(folder_path, entry_name))
        if is_regular_file and entry_name.lower().endswith(('.png', '.jpg', '.jpeg')):
            n_images += 1

    print(f"Sub-folder: {folder_name}, Image Count: {n_images}")

    total_image_count += n_images

# Report the grand total
print(f"\nTotal Image Count: {total_image_count}")

Sub-folder: a histopathological image of an area with adipose, Image Count: 6317
Sub-folder: a histopathological image of an area with mucus, Image Count: 5459
Sub-folder: a histopathological image of an area with cancer-associated stroma, Image Count: 6311
Sub-folder: a histopathological image of an area with smooth muscle, Image Count: 8265
Sub-folder: a histopathological image of an area with colorectal adenocarcinoma epithelium, Image Count: 8666
Sub-folder: a histopathological image of an area with lymphocytes, Image Count: 7011
Sub-folder: a histopathological image of an area with debris, Image Count: 7070
Sub-folder: a histopathological image of an area with background, Image Count: 6417
Sub-folder: a histopathological image of an area with normal colon mucosa, Image Count: 5324

Total Image Count: 60840


In [5]:
import json
from collections import defaultdict

# Location of the JSON-lines metadata file to summarise
metadata_file = '/content/drive/MyDrive/pathmnist/images/metadata.jsonl'

# Per-label tally; labels not seen yet start at zero
label_counts = defaultdict(int)

# Each line is parsed as one JSON object and its 'label' value is counted
with open(metadata_file, 'r') as f:
    for raw_line in f:
        label_counts[json.loads(raw_line)['label']] += 1

# Every record increments exactly one tally, so the overall total is their sum
total_count = sum(label_counts.values())

# Report the count for each label
for label, count in label_counts.items():
    print(f"Label: {label}, Count: {count}")

# Report the overall total
print(f"\nTotal Count: {total_count}")

Label: a histopathological image of an area with adipose, Count: 6317
Label: a histopathological image of an area with mucus, Count: 5459
Label: a histopathological image of an area with cancer-associated stroma, Count: 6311
Label: a histopathological image of an area with smooth muscle, Count: 8265
Label: a histopathological image of an area with colorectal adenocarcinoma epithelium, Count: 8666
Label: a histopathological image of an area with lymphocytes, Count: 7011
Label: a histopathological image of an area with debris, Count: 7070
Label: a histopathological image of an area with background, Count: 6417
Label: a histopathological image of an area with normal colon mucosa, Count: 5324

Total Count: 60840
