In [1]:
# Cell 1: Imports and Setup

import os
import sys
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter
import matplotlib.pyplot as plt
import cv2
from PIL import Image
import scipy.io as sio
import xml.etree.ElementTree as ET
from glob import glob
import json

# Base path
BASE_PATH = Path('Histopathology_Datasets_Official')

# Our reference classes from PanNuke
PANNUKE_CLASSES = {
    0: 'Neoplastic',
    1: 'Inflammatory', 
    2: 'Connective_Soft_tissue',
    3: 'Dead',
    4: 'Epithelial'
}

print("Available datasets:")
for d in sorted(BASE_PATH.iterdir()):
    if d.is_dir():
        print(f"  - {d.name}")

Available datasets:
  - BreakHis
  - CoNIC
  - CoNSeP
  - LC25000
  - Lizard
  - MoNuSAC
  - MoNuSeg
  - PanNuke


---
## 1. CoNSeP Dataset Analysis

**Source:** HoVer-Net paper (Colorectal tissue)

**Classes:**
- 1: Other
- 2: Inflammatory
- 3: Healthy Epithelial
- 4: Dysplastic/Malignant Epithelial (Neoplastic)
- 5: Fibroblast (Connective)
- 6: Muscle
- 7: Endothelial

In [2]:
# Cell 2: Analyze CoNSeP Dataset

print("=" * 60)
print("CoNSeP DATASET ANALYSIS")
print("=" * 60)

consep_path = BASE_PATH / 'CoNSeP'

# CoNSeP class mapping
CONSEP_CLASSES = {
    0: 'Background',
    1: 'Other',
    2: 'Inflammatory',
    3: 'Healthy Epithelial',
    4: 'Dysplastic/Malignant Epithelial',
    5: 'Fibroblast',
    6: 'Muscle',
    7: 'Endothelial'
}

# Mapping to PanNuke classes
CONSEP_TO_PANNUKE = {
    2: 'Inflammatory',           # Inflammatory → Inflammatory ✓
    3: 'Epithelial',             # Healthy Epithelial → Epithelial ✓
    4: 'Neoplastic',             # Dysplastic/Malignant → Neoplastic ✓
    5: 'Connective_Soft_tissue', # Fibroblast → Connective ✓
}

print("\nCoNSeP Classes:")
for k, v in CONSEP_CLASSES.items():
    pannuke_map = CONSEP_TO_PANNUKE.get(k, 'N/A')
    print(f"  {k}: {v} → PanNuke: {pannuke_map}")

# Count files
train_images = list((consep_path / 'Train' / 'Images').glob('*.png'))
test_images = list((consep_path / 'Test' / 'Images').glob('*.png'))

print(f"\nDataset Size:")
print(f"  Train images: {len(train_images)}")
print(f"  Test images: {len(test_images)}")
print(f"  Total: {len(train_images) + len(test_images)}")

# Load sample to check format
train_labels = list((consep_path / 'Train' / 'Labels').glob('*.mat'))
if train_labels:
    sample_label = sio.loadmat(str(train_labels[0]))
    print(f"\nLabel file keys: {list(sample_label.keys())}")
    if 'type_map' in sample_label:
        type_map = sample_label['type_map']
        print(f"  Type map shape: {type_map.shape}")
        unique_classes = np.unique(type_map)
        print(f"  Unique classes in sample: {unique_classes}")

print("\n✓ CoNSeP is SUITABLE for zero-shot evaluation!")
print("  Matching classes: Inflammatory, Epithelial, Neoplastic, Connective")

CoNSeP DATASET ANALYSIS

CoNSeP Classes:
  0: Background → PanNuke: N/A
  1: Other → PanNuke: N/A
  2: Inflammatory → PanNuke: Inflammatory
  3: Healthy Epithelial → PanNuke: Epithelial
  4: Dysplastic/Malignant Epithelial → PanNuke: Neoplastic
  5: Fibroblast → PanNuke: Connective_Soft_tissue
  6: Muscle → PanNuke: N/A
  7: Endothelial → PanNuke: N/A

Dataset Size:
  Train images: 27
  Test images: 14
  Total: 41

Label file keys: ['__header__', '__version__', '__globals__', 'inst_map', 'type_map', 'inst_type', 'inst_centroid']
  Type map shape: (1000, 1000)
  Unique classes in sample: [0. 2. 3. 5.]

✓ CoNSeP is SUITABLE for zero-shot evaluation!
  Matching classes: Inflammatory, Epithelial, Neoplastic, Connective


---
## 2. MoNuSAC Dataset Analysis

**Source:** Multi-organ nuclei segmentation and classification challenge

**Classes (from XML annotations):**
- Epithelial
- Lymphocyte (Inflammatory)
- Macrophage (Inflammatory)
- Neutrophil (Inflammatory)

In [3]:
# Cell 3: Analyze MoNuSAC Dataset

print("=" * 60)
print("MoNuSAC DATASET ANALYSIS")
print("=" * 60)

monusac_path = BASE_PATH / 'MoNuSAC'

# MoNuSAC class mapping
MONUSAC_CLASSES = ['Epithelial', 'Lymphocyte', 'Macrophage', 'Neutrophil']

# Mapping to PanNuke classes
MONUSAC_TO_PANNUKE = {
    'Epithelial': 'Epithelial',       # Epithelial → Epithelial ✓
    'Lymphocyte': 'Inflammatory',     # Lymphocyte → Inflammatory ✓
    'Macrophage': 'Inflammatory',     # Macrophage → Inflammatory ✓
    'Neutrophil': 'Inflammatory',     # Neutrophil → Inflammatory ✓
}

print("\nMoNuSAC Classes:")
for cls in MONUSAC_CLASSES:
    pannuke_map = MONUSAC_TO_PANNUKE.get(cls, 'N/A')
    print(f"  {cls} → PanNuke: {pannuke_map}")

# Count samples
train_folders = list((monusac_path / 'Train').iterdir())
test_folders = list((monusac_path / 'Test').iterdir())

print(f"\nDataset Size:")
print(f"  Train patients: {len(train_folders)}")
print(f"  Test patients: {len(test_folders)}")

# Count total images
train_images = list((monusac_path / 'Train').rglob('*.tif'))
test_images = list((monusac_path / 'Test').rglob('*.tif'))
print(f"  Train images: {len(train_images)}")
print(f"  Test images: {len(test_images)}")

# Parse sample XML to verify classes
xml_files = list((monusac_path / 'Train').rglob('*.xml'))
if xml_files:
    tree = ET.parse(str(xml_files[0]))
    root = tree.getroot()
    classes_found = set()
    for annotation in root.findall('.//Annotation'):
        for attr in annotation.findall('.//Attribute'):
            name = attr.get('Name')
            if name and name not in ['Region', 'Length', 'Area', 'Text', 'Description', '']:
                classes_found.add(name)
    print(f"\nClasses found in XML: {classes_found}")

print("\n✓ MoNuSAC is SUITABLE for zero-shot evaluation!")
print("  Matching classes: Epithelial, Inflammatory (Lymphocyte+Macrophage+Neutrophil)")

MoNuSAC DATASET ANALYSIS

MoNuSAC Classes:
  Epithelial → PanNuke: Epithelial
  Lymphocyte → PanNuke: Inflammatory
  Macrophage → PanNuke: Inflammatory
  Neutrophil → PanNuke: Inflammatory

Dataset Size:
  Train patients: 46
  Test patients: 25
  Train images: 209
  Test images: 85

Classes found in XML: {'Macrophage', 'Neutrophil', 'Lymphocyte', 'Epithelial'}

✓ MoNuSAC is SUITABLE for zero-shot evaluation!
  Matching classes: Epithelial, Inflammatory (Lymphocyte+Macrophage+Neutrophil)


---
## 3. MoNuSeg Dataset Analysis

**Source:** Multi-organ nuclei segmentation dataset

**Note:** MoNuSeg provides **instance segmentation only** (no class labels). All nuclei are treated as one class.

In [4]:
# Cell 4: Analyze MoNuSeg Dataset

print("=" * 60)
print("MoNuSeg DATASET ANALYSIS")
print("=" * 60)

monuseg_path = BASE_PATH / 'MoNuSeg'

print("\nMoNuSeg provides INSTANCE SEGMENTATION only (no class labels)")
print("All nuclei are treated as a single 'nucleus' class.")

# Count files
train_images = list((monuseg_path / 'train' / 'images').glob('*'))
train_masks = list((monuseg_path / 'train' / 'masks').glob('*'))
test_images = list((monuseg_path / 'test' / 'images').glob('*'))
test_masks = list((monuseg_path / 'test' / 'masks').glob('*'))

print(f"\nDataset Size:")
print(f"  Train: {len(train_images)} images, {len(train_masks)} masks")
print(f"  Test: {len(test_images)} images, {len(test_masks)} masks")

# Check mask format
if train_masks:
    sample_mask = cv2.imread(str(train_masks[0]), cv2.IMREAD_GRAYSCALE)
    if sample_mask is not None:
        print(f"\nMask format:")
        print(f"  Shape: {sample_mask.shape}")
        print(f"  Unique values: {len(np.unique(sample_mask))} (instance IDs)")

print("\n⚠️ MoNuSeg is LIMITED for zero-shot evaluation")
print("  No class labels - can only evaluate binary segmentation (nucleus vs background)")
print("  Could be used with text prompt: 'Segment all nuclei'")

MoNuSeg DATASET ANALYSIS

MoNuSeg provides INSTANCE SEGMENTATION only (no class labels)
All nuclei are treated as a single 'nucleus' class.

Dataset Size:
  Train: 24 images, 24 masks
  Test: 58 images, 58 masks

Mask format:
  Shape: (256, 256)
  Unique values: 256 (instance IDs)

⚠️ MoNuSeg is LIMITED for zero-shot evaluation
  No class labels - can only evaluate binary segmentation (nucleus vs background)
  Could be used with text prompt: 'Segment all nuclei'


---
## 4. Lizard Dataset Analysis

**Source:** Large-scale colon dataset

**Classes:**
- 1: Neutrophil (Inflammatory)
- 2: Epithelial
- 3: Lymphocyte (Inflammatory)
- 4: Plasma (Inflammatory)
- 5: Eosinophil (Inflammatory)
- 6: Connective tissue

In [5]:
# Cell 5: Analyze Lizard Dataset

print("=" * 60)
print("LIZARD DATASET ANALYSIS")
print("=" * 60)

lizard_path = BASE_PATH / 'Lizard'

# Lizard class mapping (from README)
LIZARD_CLASSES = {
    1: 'Neutrophil',
    2: 'Epithelial',
    3: 'Lymphocyte',
    4: 'Plasma',
    5: 'Eosinophil',
    6: 'Connective tissue'
}

# Mapping to PanNuke classes
LIZARD_TO_PANNUKE = {
    1: 'Inflammatory',            # Neutrophil → Inflammatory ✓
    2: 'Epithelial',              # Epithelial → Epithelial ✓
    3: 'Inflammatory',            # Lymphocyte → Inflammatory ✓
    4: 'Inflammatory',            # Plasma → Inflammatory ✓
    5: 'Inflammatory',            # Eosinophil → Inflammatory ✓
    6: 'Connective_Soft_tissue',  # Connective → Connective ✓
}

print("\nLizard Classes:")
for k, v in LIZARD_CLASSES.items():
    pannuke_map = LIZARD_TO_PANNUKE.get(k, 'N/A')
    print(f"  {k}: {v} → PanNuke: {pannuke_map}")

# Count images
images1 = list((lizard_path / 'lizard_images1').rglob('*.png'))
images2 = list((lizard_path / 'lizard_images2').rglob('*.png'))
labels = list((lizard_path / 'lizard_labels' / 'Lizard_Labels' / 'Labels').glob('*.mat'))

print(f"\nDataset Size:")
print(f"  Images Part 1: {len(images1)}")
print(f"  Images Part 2: {len(images2)}")
print(f"  Total images: {len(images1) + len(images2)}")
print(f"  Labels: {len(labels)}")

# Load sample label
if labels:
    sample_label = sio.loadmat(str(labels[0]))
    print(f"\nLabel file keys: {[k for k in sample_label.keys() if not k.startswith('_')]}")

print("\n✓ Lizard is SUITABLE for zero-shot evaluation!")
print("  Matching classes: Inflammatory (4 types), Epithelial, Connective")

LIZARD DATASET ANALYSIS

Lizard Classes:
  1: Neutrophil → PanNuke: Inflammatory
  2: Epithelial → PanNuke: Epithelial
  3: Lymphocyte → PanNuke: Inflammatory
  4: Plasma → PanNuke: Inflammatory
  5: Eosinophil → PanNuke: Inflammatory
  6: Connective tissue → PanNuke: Connective_Soft_tissue

Dataset Size:
  Images Part 1: 80
  Images Part 2: 158
  Total images: 238
  Labels: 238

Label file keys: ['inst_map', 'id', 'class', 'bbox', 'centroid']

✓ Lizard is SUITABLE for zero-shot evaluation!
  Matching classes: Inflammatory (4 types), Epithelial, Connective


---
## 5. CoNIC Dataset Analysis

**Source:** CoNIC Challenge (derived from Lizard)

**Classes:**
- 1: Neutrophil
- 2: Epithelial
- 3: Lymphocyte
- 4: Plasma
- 5: Eosinophil
- 6: Connective

In [6]:
# Cell 6: Analyze CoNIC Dataset

print("=" * 60)
print("CoNIC DATASET ANALYSIS")
print("=" * 60)

conic_path = BASE_PATH / 'CoNIC'

# CoNIC class mapping (same as Lizard)
CONIC_CLASSES = {
    0: 'Background',
    1: 'Neutrophil',
    2: 'Epithelial',
    3: 'Lymphocyte',
    4: 'Plasma',
    5: 'Eosinophil',
    6: 'Connective'
}

CONIC_TO_PANNUKE = {
    1: 'Inflammatory',            # Neutrophil → Inflammatory ✓
    2: 'Epithelial',              # Epithelial → Epithelial ✓
    3: 'Inflammatory',            # Lymphocyte → Inflammatory ✓
    4: 'Inflammatory',            # Plasma → Inflammatory ✓
    5: 'Inflammatory',            # Eosinophil → Inflammatory ✓
    6: 'Connective_Soft_tissue',  # Connective → Connective ✓
}

print("\nCoNIC Classes:")
for k, v in CONIC_CLASSES.items():
    if k > 0:
        pannuke_map = CONIC_TO_PANNUKE.get(k, 'N/A')
        print(f"  {k}: {v} → PanNuke: {pannuke_map}")

# Load numpy arrays
images = np.load(conic_path / 'images.npy')
labels = np.load(conic_path / 'labels.npy')

print(f"\nDataset Size:")
print(f"  Images shape: {images.shape}")
print(f"  Labels shape: {labels.shape}")
print(f"  Labels channels: [instance_map, class_map]")

# Analyze class distribution
class_map = labels[:, :, :, 1]  # Second channel is class map
unique_classes, counts = np.unique(class_map, return_counts=True)
print(f"\nClass distribution (pixels):")
for cls, cnt in zip(unique_classes, counts):
    cls_name = CONIC_CLASSES.get(cls, 'Unknown')
    pct = cnt / class_map.size * 100
    print(f"  {cls} ({cls_name}): {cnt:,} ({pct:.2f}%)")

print("\n✓ CoNIC is HIGHLY SUITABLE for zero-shot evaluation!")
print("  - Ready-to-use numpy format (256x256 patches)")
print("  - Matching classes: Inflammatory (4 types), Epithelial, Connective")

CoNIC DATASET ANALYSIS

CoNIC Classes:
  1: Neutrophil → PanNuke: Inflammatory
  2: Epithelial → PanNuke: Epithelial
  3: Lymphocyte → PanNuke: Inflammatory
  4: Plasma → PanNuke: Inflammatory
  5: Eosinophil → PanNuke: Inflammatory
  6: Connective → PanNuke: Connective_Soft_tissue

Dataset Size:
  Images shape: (4981, 256, 256, 3)
  Labels shape: (4981, 256, 256, 2)
  Labels channels: [instance_map, class_map]

Class distribution (pixels):
  0 (Background): 274,123,017 (83.97%)
  1 (Neutrophil): 410,511 (0.13%)
  2 (Epithelial): 33,649,770 (10.31%)
  3 (Lymphocyte): 6,054,345 (1.85%)
  4 (Plasma): 1,799,732 (0.55%)
  5 (Eosinophil): 338,595 (0.10%)
  6 (Connective): 10,058,846 (3.08%)

✓ CoNIC is HIGHLY SUITABLE for zero-shot evaluation!
  - Ready-to-use numpy format (256x256 patches)
  - Matching classes: Inflammatory (4 types), Epithelial, Connective


---
## 6. BreakHis Dataset Analysis

**Source:** Breast cancer histopathology images

**Note:** BreakHis is a **classification dataset** (Benign vs Malignant), NOT segmentation. No pixel-level masks available.

In [7]:
# Cell 7: Analyze BreakHis Dataset

print("=" * 60)
print("BreakHis DATASET ANALYSIS")
print("=" * 60)

breakhis_path = BASE_PATH / 'BreakHis' / 'BreaKHis_v1' / 'BreaKHis_v1' / 'histology_slides' / 'breast'

print("\n⚠️ BreakHis is a CLASSIFICATION dataset (not segmentation)")
print("   - Labels: Benign vs Malignant (image-level)")
print("   - No pixel-level segmentation masks")

# Count images per category
benign_path = breakhis_path / 'benign' / 'SOB'
malignant_path = breakhis_path / 'malignant' / 'SOB'

benign_subtypes = list(benign_path.iterdir()) if benign_path.exists() else []
malignant_subtypes = list(malignant_path.iterdir()) if malignant_path.exists() else []

print(f"\nBenign subtypes: {[s.name for s in benign_subtypes]}")
print(f"Malignant subtypes: {[s.name for s in malignant_subtypes]}")

# Count total images
benign_images = list(benign_path.rglob('*.png')) if benign_path.exists() else []
malignant_images = list(malignant_path.rglob('*.png')) if malignant_path.exists() else []

print(f"\nDataset Size:")
print(f"  Benign images: {len(benign_images)}")
print(f"  Malignant images: {len(malignant_images)}")
print(f"  Total: {len(benign_images) + len(malignant_images)}")

print("\n❌ BreakHis is NOT SUITABLE for zero-shot segmentation")
print("   Reason: No pixel-level masks, only image-level classification labels")

BreakHis DATASET ANALYSIS

⚠️ BreakHis is a CLASSIFICATION dataset (not segmentation)
   - Labels: Benign vs Malignant (image-level)
   - No pixel-level segmentation masks

Benign subtypes: ['adenosis', 'phyllodes_tumor', 'tubular_adenoma', 'fibroadenoma']
Malignant subtypes: ['mucinous_carcinoma', 'lobular_carcinoma', 'ductal_carcinoma', 'papillary_carcinoma']

Dataset Size:
  Benign images: 2480
  Malignant images: 5429
  Total: 7909

❌ BreakHis is NOT SUITABLE for zero-shot segmentation
   Reason: No pixel-level masks, only image-level classification labels


---
## 7. LC25000 Dataset Analysis

**Source:** Lung and Colon Cancer Histopathological Images

**Note:** LC25000 is a **classification dataset** (tissue type), NOT segmentation.

In [8]:
# Cell 8: Analyze LC25000 Dataset

print("=" * 60)
print("LC25000 DATASET ANALYSIS")
print("=" * 60)

lc25000_path = BASE_PATH / 'LC25000' / 'lung_colon_image_set'

print("\n⚠️ LC25000 is a CLASSIFICATION dataset (not segmentation)")
print("   - Labels: Tissue type (image-level)")
print("   - No pixel-level segmentation masks")

# Categories
categories = {
    'colon_aca': 'Colon Adenocarcinoma',
    'colon_n': 'Colon Normal (Benign)',
    'lung_aca': 'Lung Adenocarcinoma',
    'lung_n': 'Lung Normal (Benign)',
    'lung_scc': 'Lung Squamous Cell Carcinoma'
}

print(f"\nCategories:")
for cat, desc in categories.items():
    print(f"  {cat}: {desc}")

# Count images
train_path = lc25000_path / 'Train and Validation Set'
test_path = lc25000_path / 'Test Set'

print(f"\nDataset Size:")
for cat in categories.keys():
    train_imgs = list((train_path / cat).glob('*.jpeg')) if (train_path / cat).exists() else []
    test_imgs = list((test_path / cat).glob('*.jpeg')) if (test_path / cat).exists() else []
    print(f"  {cat}: Train={len(train_imgs)}, Test={len(test_imgs)}")

print("\n❌ LC25000 is NOT SUITABLE for zero-shot segmentation")
print("   Reason: No pixel-level masks, only image-level classification labels")

LC25000 DATASET ANALYSIS

⚠️ LC25000 is a CLASSIFICATION dataset (not segmentation)
   - Labels: Tissue type (image-level)
   - No pixel-level segmentation masks

Categories:
  colon_aca: Colon Adenocarcinoma
  colon_n: Colon Normal (Benign)
  lung_aca: Lung Adenocarcinoma
  lung_n: Lung Normal (Benign)
  lung_scc: Lung Squamous Cell Carcinoma

Dataset Size:
  colon_aca: Train=4500, Test=500
  colon_n: Train=4500, Test=500
  lung_aca: Train=4500, Test=500
  lung_n: Train=4500, Test=500
  lung_scc: Train=4501, Test=499

❌ LC25000 is NOT SUITABLE for zero-shot segmentation
   Reason: No pixel-level masks, only image-level classification labels


---
## 8. PanNuke Dataset (Reference)

This is our training dataset. We already have preprocessed data.

In [9]:
# Cell 9: PanNuke Reference

print("=" * 60)
print("PanNuke DATASET (TRAINING REFERENCE)")
print("=" * 60)

print("\nPanNuke Classes (Our Reference):")
for k, v in PANNUKE_CLASSES.items():
    print(f"  {k}: {v}")

print("\nPanNuke is our TRAINING dataset - already preprocessed.")
print("See: PanNuke_Preprocess/ directory")

PanNuke DATASET (TRAINING REFERENCE)

PanNuke Classes (Our Reference):
  0: Neoplastic
  1: Inflammatory
  2: Connective_Soft_tissue
  3: Dead
  4: Epithelial

PanNuke is our TRAINING dataset - already preprocessed.
See: PanNuke_Preprocess/ directory


---
## Summary: Dataset Suitability for Zero-Shot Segmentation

In [10]:
# Cell 10: Summary Table

print("=" * 80)
print("ZERO-SHOT SEGMENTATION DATASET SUITABILITY SUMMARY")
print("=" * 80)

summary_data = [
    {
        'Dataset': 'CoNSeP',
        'Type': 'Segmentation',
        'Suitable': '✅ Yes',
        'Matching Classes': 'Inflammatory, Epithelial, Neoplastic, Connective',
        'Format': '.mat files',
        'Size': '41 images',
        'Priority': '⭐⭐⭐ High'
    },
    {
        'Dataset': 'MoNuSAC',
        'Type': 'Segmentation',
        'Suitable': '✅ Yes',
        'Matching Classes': 'Epithelial, Inflammatory (3 types)',
        'Format': 'XML + TIF',
        'Size': '~300 images',
        'Priority': '⭐⭐⭐ High'
    },
    {
        'Dataset': 'CoNIC',
        'Type': 'Segmentation',
        'Suitable': '✅ Yes',
        'Matching Classes': 'Epithelial, Inflammatory (4 types), Connective',
        'Format': 'NumPy arrays',
        'Size': '4,981 patches',
        'Priority': '⭐⭐⭐ High (Easy format!)'
    },
    {
        'Dataset': 'Lizard',
        'Type': 'Segmentation',
        'Suitable': '✅ Yes',
        'Matching Classes': 'Epithelial, Inflammatory (4 types), Connective',
        'Format': '.mat files',
        'Size': '~500 images',
        'Priority': '⭐⭐ Medium'
    },
    {
        'Dataset': 'MoNuSeg',
        'Type': 'Instance Seg',
        'Suitable': '⚠️ Limited',
        'Matching Classes': 'Binary only (nucleus vs bg)',
        'Format': 'PNG masks',
        'Size': '~44 images',
        'Priority': '⭐ Low'
    },
    {
        'Dataset': 'BreakHis',
        'Type': 'Classification',
        'Suitable': '❌ No',
        'Matching Classes': 'N/A (no masks)',
        'Format': 'PNG images',
        'Size': '~7,900 images',
        'Priority': '❌ Not applicable'
    },
    {
        'Dataset': 'LC25000',
        'Type': 'Classification',
        'Suitable': '❌ No',
        'Matching Classes': 'N/A (no masks)',
        'Format': 'JPEG images',
        'Size': '25,000 images',
        'Priority': '❌ Not applicable'
    },
]

summary_df = pd.DataFrame(summary_data)
print(summary_df.to_string(index=False))

print("\n" + "=" * 80)
print("RECOMMENDED ZERO-SHOT EVALUATION PLAN")
print("=" * 80)
print("""
1. CoNIC (Highest Priority - Easy NumPy format)
   - 4,981 patches ready to use
   - Classes: Epithelial, Inflammatory, Connective
   - Text prompts: "Segment epithelial cells", "Segment inflammatory cells", etc.

2. CoNSeP (High Priority - HoVer-Net benchmark)
   - 41 images (14 test)
   - Classes: Inflammatory, Epithelial, Neoplastic, Connective
   - Good for comparison with HoVer-Net results

3. MoNuSAC (High Priority - Multi-organ)
   - ~300 images across multiple organs
   - Classes: Epithelial, Lymphocyte, Macrophage, Neutrophil
   - Tests generalization across organs

4. Lizard (Medium Priority - Large scale)
   - ~500 images
   - Same classes as CoNIC
   - Requires more preprocessing
""")

ZERO-SHOT SEGMENTATION DATASET SUITABILITY SUMMARY
 Dataset           Type   Suitable                                 Matching Classes       Format          Size                Priority
  CoNSeP   Segmentation      ✅ Yes Inflammatory, Epithelial, Neoplastic, Connective   .mat files     41 images                ⭐⭐⭐ High
 MoNuSAC   Segmentation      ✅ Yes               Epithelial, Inflammatory (3 types)    XML + TIF   ~300 images                ⭐⭐⭐ High
   CoNIC   Segmentation      ✅ Yes   Epithelial, Inflammatory (4 types), Connective NumPy arrays 4,981 patches ⭐⭐⭐ High (Easy format!)
  Lizard   Segmentation      ✅ Yes   Epithelial, Inflammatory (4 types), Connective   .mat files   ~500 images               ⭐⭐ Medium
 MoNuSeg   Instance Seg ⚠️ Limited                      Binary only (nucleus vs bg)    PNG masks    ~44 images                   ⭐ Low
BreakHis Classification       ❌ No                                   N/A (no masks)   PNG images ~7,900 images        ❌ Not applicable
 LC2

---
## Class Mapping Reference Table

In [11]:
# Cell 11: Complete Class Mapping Table

print("=" * 80)
print("COMPLETE CLASS MAPPING: External Datasets → PanNuke Classes")
print("=" * 80)

mapping_data = [
    # CoNSeP
    {'Source': 'CoNSeP', 'Original Class': 'Inflammatory (2)', 'PanNuke Class': 'Inflammatory', 'PanNuke ID': 1},
    {'Source': 'CoNSeP', 'Original Class': 'Healthy Epithelial (3)', 'PanNuke Class': 'Epithelial', 'PanNuke ID': 4},
    {'Source': 'CoNSeP', 'Original Class': 'Dysplastic/Malignant (4)', 'PanNuke Class': 'Neoplastic', 'PanNuke ID': 0},
    {'Source': 'CoNSeP', 'Original Class': 'Fibroblast (5)', 'PanNuke Class': 'Connective', 'PanNuke ID': 2},
    # MoNuSAC
    {'Source': 'MoNuSAC', 'Original Class': 'Epithelial', 'PanNuke Class': 'Epithelial', 'PanNuke ID': 4},
    {'Source': 'MoNuSAC', 'Original Class': 'Lymphocyte', 'PanNuke Class': 'Inflammatory', 'PanNuke ID': 1},
    {'Source': 'MoNuSAC', 'Original Class': 'Macrophage', 'PanNuke Class': 'Inflammatory', 'PanNuke ID': 1},
    {'Source': 'MoNuSAC', 'Original Class': 'Neutrophil', 'PanNuke Class': 'Inflammatory', 'PanNuke ID': 1},
    # CoNIC/Lizard
    {'Source': 'CoNIC/Lizard', 'Original Class': 'Neutrophil (1)', 'PanNuke Class': 'Inflammatory', 'PanNuke ID': 1},
    {'Source': 'CoNIC/Lizard', 'Original Class': 'Epithelial (2)', 'PanNuke Class': 'Epithelial', 'PanNuke ID': 4},
    {'Source': 'CoNIC/Lizard', 'Original Class': 'Lymphocyte (3)', 'PanNuke Class': 'Inflammatory', 'PanNuke ID': 1},
    {'Source': 'CoNIC/Lizard', 'Original Class': 'Plasma (4)', 'PanNuke Class': 'Inflammatory', 'PanNuke ID': 1},
    {'Source': 'CoNIC/Lizard', 'Original Class': 'Eosinophil (5)', 'PanNuke Class': 'Inflammatory', 'PanNuke ID': 1},
    {'Source': 'CoNIC/Lizard', 'Original Class': 'Connective (6)', 'PanNuke Class': 'Connective', 'PanNuke ID': 2},
]

mapping_df = pd.DataFrame(mapping_data)
print(mapping_df.to_string(index=False))

print("\n" + "=" * 80)
print("CLASSES AVAILABLE FOR ZERO-SHOT EVALUATION")
print("=" * 80)
print("""
PanNuke Class          | Available in External Datasets?
-----------------------|--------------------------------
0: Neoplastic          | ✅ CoNSeP (Dysplastic/Malignant)
1: Inflammatory        | ✅ All datasets (various subtypes)
2: Connective          | ✅ CoNSeP, CoNIC, Lizard
3: Dead                | ❌ Not available in external datasets
4: Epithelial          | ✅ All datasets

Note: 'Dead' class is unique to PanNuke and not present in other datasets.
""")

COMPLETE CLASS MAPPING: External Datasets → PanNuke Classes
      Source           Original Class PanNuke Class  PanNuke ID
      CoNSeP         Inflammatory (2)  Inflammatory           1
      CoNSeP   Healthy Epithelial (3)    Epithelial           4
      CoNSeP Dysplastic/Malignant (4)    Neoplastic           0
      CoNSeP           Fibroblast (5)    Connective           2
     MoNuSAC               Epithelial    Epithelial           4
     MoNuSAC               Lymphocyte  Inflammatory           1
     MoNuSAC               Macrophage  Inflammatory           1
     MoNuSAC               Neutrophil  Inflammatory           1
CoNIC/Lizard           Neutrophil (1)  Inflammatory           1
CoNIC/Lizard           Epithelial (2)    Epithelial           4
CoNIC/Lizard           Lymphocyte (3)  Inflammatory           1
CoNIC/Lizard               Plasma (4)  Inflammatory           1
CoNIC/Lizard           Eosinophil (5)  Inflammatory           1
CoNIC/Lizard           Connective (6)    Con

In [12]:
# Cell 12: Save Analysis Summary

# Save summary to JSON for reference
analysis_summary = {
    'pannuke_classes': PANNUKE_CLASSES,
    'suitable_datasets': ['CoNIC', 'CoNSeP', 'MoNuSAC', 'Lizard'],
    'unsuitable_datasets': ['BreakHis', 'LC25000'],
    'limited_datasets': ['MoNuSeg'],
    'class_mappings': {
        'CoNSeP': CONSEP_TO_PANNUKE,
        'CoNIC': CONIC_TO_PANNUKE,
        'Lizard': LIZARD_TO_PANNUKE,
        'MoNuSAC': MONUSAC_TO_PANNUKE,
    },
    'evaluation_priority': [
        {'dataset': 'CoNIC', 'priority': 1, 'reason': 'Easy NumPy format, 4981 patches'},
        {'dataset': 'CoNSeP', 'priority': 2, 'reason': 'HoVer-Net benchmark, has Neoplastic class'},
        {'dataset': 'MoNuSAC', 'priority': 3, 'reason': 'Multi-organ, tests generalization'},
        {'dataset': 'Lizard', 'priority': 4, 'reason': 'Large scale, requires preprocessing'},
    ]
}

with open('zero_shot_analysis_summary.json', 'w') as f:
    json.dump(analysis_summary, f, indent=2)

print("✓ Analysis summary saved to: zero_shot_analysis_summary.json")
print("\n" + "=" * 80)
print("NEXT STEPS")
print("=" * 80)
print("""
1. Create preprocessing notebooks for each suitable dataset
2. Create zero_shot_evaluation.ipynb to:
   - Load pretrained CIPS-Net model (best fold)
   - Generate text prompts for each class
   - Run inference on external datasets
   - Compute metrics (Dice, IoU, PQ) per class
3. Compare results across datasets
""")

✓ Analysis summary saved to: zero_shot_analysis_summary.json

NEXT STEPS

1. Create preprocessing notebooks for each suitable dataset
2. Create zero_shot_evaluation.ipynb to:
   - Load pretrained CIPS-Net model (best fold)
   - Generate text prompts for each class
   - Run inference on external datasets
   - Compute metrics (Dice, IoU, PQ) per class
3. Compare results across datasets

