#### Partition the MedNIST dataset into subsets for all participants

In [63]:
import os
import uuid
from tqdm import tqdm
import shutil
import json
import pandas as pd

def download_mednist_dataset(
    url="https://media.githubusercontent.com/media/shubham3121/datasets/main/MedNIST/MedNIST.pkl",
    path="./MedNIST.pkl",
):
    """Download the MedNIST pickle unless it is already present on disk.

    The file is first written to ``<path>.part`` and only renamed to ``path``
    once the transfer has finished, so an interrupted or failed download can
    never be mistaken for a complete dataset on the next run.

    Args:
        url: Location of the MedNIST pickle. Defaults to the copy hosted on
            GitHub.
        path: Destination file. Defaults to ``./MedNIST.pkl``.

    Raises:
        urllib.error.URLError: If the URL cannot be fetched (this includes
            ``HTTPError`` for 4xx/5xx responses).
        urllib.error.ContentTooShortError: If fewer bytes arrive than the
            server announced.
    """
    # Function-scope import so this cell does not depend on edits to the
    # notebook's import cell.
    import urllib.request

    if os.path.exists(path):
        print("MedNIST is already downloaded")
        return

    partial_path = f"{path}.part"
    try:
        urllib.request.urlretrieve(url, partial_path)
        os.replace(partial_path, path)
    finally:
        # After a successful rename the partial file no longer exists; this
        # only removes leftovers from a failed or interrupted transfer.
        if os.path.exists(partial_path):
            os.remove(partial_path)

    print("MedNIST is successfully downloaded.")

Create a folder to store the data subsets.

In [64]:
data_subset_folder = "MedNIST/subsets"

# Always start from an empty directory so files left by an earlier run are
# never mixed with the subsets generated below.
folder_already_present = os.path.exists(data_subset_folder)
if folder_already_present:
    print("Data subset directory already Exists. Clearing existing one.")
    shutil.rmtree(data_subset_folder)

# makedirs also creates the parent "MedNIST" directory when it is missing.
os.makedirs(data_subset_folder)
print("Data subset directory created.")

Data subset directory already Exists. Clearing existing one.
Data subset directory created.


In [65]:
# Path of the downloaded MedNIST pickle.
FILE_PATH = "./MedNIST.pkl"
# Number of participants the dataset is split across (one subset each).
TOTAL_PARTICIPANTS = 10

In [None]:
# Download the full MedNIST dataset pickle into the current working directory.
# The helper skips the download when ./MedNIST.pkl is already present.

download_mednist_dataset()

In [68]:
# NOTE: unpickling can execute arbitrary code -- only load a MedNIST.pkl that
# came from a trusted source.
# Rows are ordered by patient_id and the index is renumbered from 0, so
# positional slicing follows patient_id order.
df = pd.read_pickle(FILE_PATH).sort_values("patient_id", ignore_index=True)

#### Partition the dataset into subsets

In [69]:
# Maps participant number -> filename of that participant's subset pickle.
data_subsets_map = {}

# Number of random hex characters in each subset filename. Kept independent of
# TOTAL_PARTICIPANTS so that a small participant count does not shrink the id
# and make filename collisions likely.
SUBSET_ID_LENGTH = 10

# Split the rows as evenly as possible: every participant gets `batch_size`
# rows and the first `leftover_rows` participants get one extra, so no row is
# dropped when the row count is not a multiple of TOTAL_PARTICIPANTS. When it
# divides evenly the slices are the same equal-sized contiguous ranges as before.
batch_size, leftover_rows = divmod(df.shape[0], TOTAL_PARTICIPANTS)

# NOTE(review): slices are cut by row position, so rows sharing a patient_id
# may end up in two adjacent subsets -- confirm that this is acceptable.
start_idx = 0
for participation_number in tqdm(range(1, TOTAL_PARTICIPANTS + 1)):
    extra_row = 1 if participation_number <= leftover_rows else 0
    end_idx = start_idx + batch_size + extra_row

    # Slice by position and renumber from 0. reset_index returns a new frame,
    # which avoids mutating a slice of `df` in place.
    subset = df.iloc[start_idx:end_idx].reset_index(drop=True)

    subset_filename = f"MedNIST-{uuid.uuid4().hex[:SUBSET_ID_LENGTH]}.pkl"
    subset_path = f"{data_subset_folder}/{subset_filename}"
    subset.to_pickle(subset_path)
    data_subsets_map[participation_number] = subset_filename

    # The next participant starts where this one ended.
    start_idx = end_idx

print("Data subsets Created Successfully !!!")

# json.dump writes the integer participant numbers as string keys.
with open("dataset.json", "w") as fp:
    json.dump(data_subsets_map, fp)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 10.36it/s]

Data subsets Created Successfully !!!





In [70]:
# Sanity check: list the subset files written to the subset folder.
os.listdir(data_subset_folder)

['MedNIST-437467c744.pkl',
 'MedNIST-b35c610d54.pkl',
 'MedNIST-b48a3173fe.pkl',
 'MedNIST-516a3f6746.pkl',
 'MedNIST-8ca1b11846.pkl',
 'MedNIST-ecccc00ee3.pkl',
 'MedNIST-aa8208fc64.pkl',
 'MedNIST-842492e114.pkl',
 'MedNIST-48975f1701.pkl',
 'MedNIST-c865822efe.pkl']

Upload `dataset.json` and each of the data subsets from `MedNIST/subsets` to the `MedNIST` folder of `https://github.com/OpenMined/datasets`.