In [13]:
import os
import json
import random
import csv

WSI_id_name = {
    "FFHO026": "transfered_Xenium_Prime_Human_Ovary_FF_he_image.ome.tif",
    "FFMB028": "transfered_Xenium_Prime_Mouse_Brain_Coronal_FF_he_image.ome.tif",
    "WMPW025": "transfered_Xenium_Prime_Mouse_Pup_FFPE_he_image.ome.tif",
    "FFMC016": "transfered_Xenium_V1_mouse_Colon_FF_he_image.ome.tif",
    "WMPP017": "transfered_Xenium_V1_mouse_pup_he_image.ome.tif",
    "PDHP030": "transfered_Xenium_Prime_Human_Prostate_FFPE_he_image.ome.tif",
    "PDHS029": "transfered_Xenium_Prime_Human_Skin_FFPE_he_image.ome.tif",
    "HHDX011": "transfered_Xenium_V1_hHeart_nondiseased_section_FFPE_he_image.ome.tif",
    "HPXM007": "transfered_Xenium_V1_human_Pancreas_FFPE_he_image.ome.tif",
    "HSDX009": "transfered_Xenium_V1_hSkin_nondiseased_section_1_FFPE_he_image.ome.tif",
}

def greedy_sample_patches(json_files, patches_per_cluster=5, total_patches_per_wsi=100, output_csv="greedy_sampled_patches.csv"):
    all_samples = []

    for json_path in json_files:
        wsi_id = os.path.splitext(os.path.basename(json_path))[0].replace("cluster_to_patch_ids_", "")

        if wsi_id not in WSI_id_name:
            print(f"[SKIPPED] Unknown WSI ID: {wsi_id}")
            continue

        wsi_name = os.path.splitext(WSI_id_name[wsi_id])[0]

        with open(json_path, 'r') as f:
            cluster_map = json.load(f)

        sorted_clusters = sorted(cluster_map.items(), key=lambda item: len(item[1]), reverse=True)
        patches_collected = 0
        sampled_for_this_wsi = []
        used_patches = set()

        # First pass: up to 5 patches per cluster
        for cluster_id, patch_list in sorted_clusters:
            if patches_collected >= total_patches_per_wsi:
                break

            unused = [p for p in patch_list if p not in used_patches]
            if not unused:
                continue

            to_sample = min(patches_per_cluster, len(unused), total_patches_per_wsi - patches_collected)
            selected = random.sample(unused, to_sample)

            for patch in selected:
                sampled_for_this_wsi.append({
                    "wsi_name": wsi_name,
                    "patch_filename": patch
                })
                used_patches.add(patch)

            patches_collected += to_sample

        # Second pass: fill remaining up to 100
        if patches_collected < total_patches_per_wsi:
            for cluster_id, patch_list in sorted_clusters:
                if patches_collected >= total_patches_per_wsi:
                    break

                unused = [p for p in patch_list if p not in used_patches]
                while unused and patches_collected < total_patches_per_wsi:
                    patch = unused.pop()
                    sampled_for_this_wsi.append({
                        "wsi_name": wsi_name,
                        "patch_filename": patch
                    })
                    used_patches.add(patch)
                    patches_collected += 1

        if patches_collected < total_patches_per_wsi:
            print(f"[WARNING] {wsi_id}: Only collected {patches_collected} patches (needed 100)")

        all_samples.extend(sampled_for_this_wsi)

    # Save to CSV
    os.makedirs(os.path.dirname(output_csv) or ".", exist_ok=True)
    with open(output_csv, 'w', newline='') as csvfile:
        fieldnames = ['wsi_name', 'patch_filename']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(all_samples)

    print(f"[DONE] Saved {len(all_samples)} patch samples across all WSIs to {output_csv}")
    return all_samples


In [14]:
json_files = [
    "cluster_to_patch_ids_FFHO026.json",
    "cluster_to_patch_ids_FFMB028.json",
    "cluster_to_patch_ids_FFMC016.json",
    "cluster_to_patch_ids_HHDX011.json",
    "cluster_to_patch_ids_HPXM007.json",
    "cluster_to_patch_ids_HSDX009.json",
    "cluster_to_patch_ids_PDHP030.json",
    "cluster_to_patch_ids_PDHS029.json",
    "cluster_to_patch_ids_WMPP017.json",
    "cluster_to_patch_ids_WMPW025.json",
]

greedy_sample_patches(json_files, output_csv="sampled_patches.csv")

[DONE] Saved 800 patch samples across all WSIs to sampled_patches.csv


[{'wsi_name': 'transfered_Xenium_Prime_Human_Ovary_FF_he_image.ome',
  'patch_filename': 'patch_3389.tif'},
 {'wsi_name': 'transfered_Xenium_Prime_Human_Ovary_FF_he_image.ome',
  'patch_filename': 'patch_4521.tif'},
 {'wsi_name': 'transfered_Xenium_Prime_Human_Ovary_FF_he_image.ome',
  'patch_filename': 'patch_3437.tif'},
 {'wsi_name': 'transfered_Xenium_Prime_Human_Ovary_FF_he_image.ome',
  'patch_filename': 'patch_3438.tif'},
 {'wsi_name': 'transfered_Xenium_Prime_Human_Ovary_FF_he_image.ome',
  'patch_filename': 'patch_2610.tif'},
 {'wsi_name': 'transfered_Xenium_Prime_Human_Ovary_FF_he_image.ome',
  'patch_filename': 'patch_6294.tif'},
 {'wsi_name': 'transfered_Xenium_Prime_Human_Ovary_FF_he_image.ome',
  'patch_filename': 'patch_6853.tif'},
 {'wsi_name': 'transfered_Xenium_Prime_Human_Ovary_FF_he_image.ome',
  'patch_filename': 'patch_5170.tif'},
 {'wsi_name': 'transfered_Xenium_Prime_Human_Ovary_FF_he_image.ome',
  'patch_filename': 'patch_8092.tif'},
 {'wsi_name': 'transfered_Xe