In [None]:
import scanpy as sc
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from anndata import AnnData
import MENDER
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn import svm
from scipy import stats
import re
import os
import glob
import tifffile
import pandas as pd
from skimage.measure import regionprops_table
import numpy as np
import plotly.graph_objects as go

# Load data

In [2]:
# Directory containing your segmented mask TIFF files
mask_dir = r'TNBC_shareCellData'
# Output CSV file path
output_csv = "cell_centroids.csv"

records = []

# Loop through each TIFF mask
for mask_path in glob.glob(os.path.join(mask_dir, "*.tiff")):
    # Load the mask (each cell labeled by a unique integer)
    mask = tifffile.imread(mask_path)

    print("Shape:", mask.shape)          # e.g. (1024, 1024)
    print("Data type:", mask.dtype)      # e.g. uint16
    labels = np.unique(mask)
    print(f"Found {len(labels)-1} objects (ignoring background=0)")
    print("Some label values:", labels[:10], "…", labels[-10:])
    # Extract properties: label ID and centroids
    props = regionprops_table(
        mask,
        properties=('label', 'centroid')
    )
    
    # Convert to DataFrame
    df = pd.DataFrame(props)
    # Keep track of source image
    
    
    nums = re.search(r"\d+", os.path.basename(mask_path))
    df['image'] = int(nums.group()) if nums else None
    records.append(df)

# Concatenate all image DataFrames
all_cells = pd.concat(records, ignore_index=True)

# Rename columns for clarity
all_cells = all_cells.rename(
    columns={
        'label': 'CellID',
        'centroid-0': 'CentroidY',
        'centroid-1': 'CentroidX'
    }
)

# Save to CSV
all_cells.to_csv(output_csv, index=False)
print(f"Saved {len(all_cells)} cell centroids to {output_csv}")


Shape: (2048, 2048)
Data type: uint16
Found 5213 objects (ignoring background=0)
Some label values: [0 1 2 3 4 5 6 7 8 9] … [5204 5205 5206 5207 5208 5209 5210 5211 5212 5213]
Shape: (2048, 2048)
Data type: uint16
Found 6140 objects (ignoring background=0)
Some label values: [0 1 2 3 4 5 6 7 8 9] … [6131 6132 6133 6134 6135 6136 6137 6138 6139 6140]
Shape: (2048, 2048)
Data type: uint16
Found 8772 objects (ignoring background=0)
Some label values: [0 1 2 3 4 5 6 7 8 9] … [8763 8764 8765 8766 8767 8768 8769 8770 8771 8772]
Shape: (2048, 2048)
Data type: uint16
Found 9738 objects (ignoring background=0)
Some label values: [0 1 2 3 4 5 6 7 8 9] … [9729 9730 9731 9732 9733 9734 9735 9736 9737 9738]
Shape: (2048, 2048)
Data type: uint16
Found 7751 objects (ignoring background=0)
Some label values: [0 1 2 3 4 5 6 7 8 9] … [7742 7743 7744 7745 7746 7747 7748 7749 7750 7751]
Shape: (2048, 2048)
Data type: uint16
Found 3560 objects (ignoring background=0)
Some label values: [0 1 2 3 4 5 6 7 8 9

In [21]:
df = pd.read_csv("TNBC_shareCellData/cellData.csv", sep=",")
df_p = pd.read_csv("TNBC_shareCellData/patient_class.csv", header=None)
df_p.rename(columns={0:"SampleID", 1:"subtype"}, inplace=True)
subtype_map = {
    0: "Mixed",
    1: "Compartimentalized",
    2: "Cold"
}

df_p["subtype"] = df_p["subtype"].map(subtype_map).astype('category')




bad_Values = [42,43,44]
df = df[~df["SampleID"].isin(bad_Values)]


meta_cols = ['SampleID', 'cellLabelInImage', 'cellSize', 'tumorYN', 
             'tumorCluster', 'Group', 'immuneCluster', 'immuneGroup']

expr_cols = [c for c in df.columns if c not in meta_cols]

adata = AnnData(
    X = df[expr_cols].values,
    obs = df[meta_cols],
    var=pd.DataFrame(index=expr_cols)
)

adata.obs["UniqueID"] = adata.obs['SampleID'].astype(str) + "_" + adata.obs['cellLabelInImage'].astype(str)
all_cells["UniqueID"] = all_cells["image"].astype(str) + "_" + all_cells["CellID"].astype(str)
all_cells = all_cells.set_index('UniqueID', drop = False)
adata.obs = adata.obs.set_index('UniqueID', drop=False)

all_cells_aligned = all_cells.reindex(adata.obs_names)

adata.obsm["spatial"] = all_cells_aligned[["CentroidX", "CentroidY"]].to_numpy()

# X = adata.X.toarray() if hasattr(adata.X, "toarray") else adata.X
# min_val = X.min()
# print("Minimum before shift:", min_val)

# 2. If it’s < 0, shift everything up so the minimum is 0
# if min_val < 0:
#     shift = -min_val
#     print(f"Shifting all values by +{shift:.3g} to eliminate negatives")
#     X += shift
#     adata.X = X  # put it back into your AnnData

# sc.pp.normalize_total(adata)         # e.g. counts per cell
# sc.pp.log1p(adata)
# sc.pp.scale(adata)

# sc.tl.pca(adata, svd_solver='arpack', n_comps=50)
# sc.pp.neighbors(adata)               # builds graph on PCA
# sc.tl.umap(adata)
code2label_group = {
    1: "Unidentified",
    2: "Immune",
    3: "EndoThelial",
    4: "Mesenchymal-like",
    5: "Tumor",
    6: "Keratin-positive tumor"
}

code2label_immunegroup = {
    1: "Tregs",
    2: "CD4 T",
    3: "CD8 T",
    4: "CD3 T",
    5: "NK",
    6: "B",
    7: "Neutrophils",
    8: "Macrophages",
    9: "DC",
    10: "DC/Mono",
    11: "Mono/Neu",
    12: "Other immune"
}
adata.obs["Group"] = adata.obs["Group"].map(code2label_group).astype('category')
adata.obs["immuneGroup"] = adata.obs["immuneGroup"].map(code2label_immunegroup)
adata.obs = pd.merge(
    adata.obs,
    df_p,
    on="SampleID",
    how="left"
)

# Copy for later use
adata_raw = adata.copy()

adata.obs



Unnamed: 0,SampleID,cellLabelInImage,cellSize,tumorYN,tumorCluster,Group,immuneCluster,immuneGroup,UniqueID,subtype
0,1,2,146,1,0,Keratin-positive tumor,0,,1_2,Mixed
1,1,3,102,0,0,Immune,46,CD3 T,1_3,Mixed
2,1,4,43,1,0,Keratin-positive tumor,0,,1_4,Mixed
3,1,5,211,1,0,Keratin-positive tumor,0,,1_5,Mixed
4,1,6,177,0,0,Immune,75,B,1_6,Mixed
...,...,...,...,...,...,...,...,...,...,...
197673,41,5093,90,1,0,Keratin-positive tumor,0,,41_5093,Compartimentalized
197674,41,5094,132,1,21,Tumor,0,,41_5094,Compartimentalized
197675,41,5095,123,0,0,Immune,31,Macrophages,41_5095,Compartimentalized
197676,41,5096,99,0,0,Immune,56,Other immune,41_5096,Compartimentalized


In [4]:
y = adata.obs.groupby(['SampleID'])['subtype'].first().reset_index(drop=True)
y

0                  Mixed
1                  Mixed
2     Compartimentalized
3     Compartimentalized
4     Compartimentalized
5     Compartimentalized
6                  Mixed
7                  Mixed
8     Compartimentalized
9     Compartimentalized
10                 Mixed
11                 Mixed
12                 Mixed
13                 Mixed
14                  Cold
15    Compartimentalized
16                 Mixed
17                 Mixed
18                  Cold
19                 Mixed
20                 Mixed
21                  Cold
22                 Mixed
23                  Cold
24                  Cold
25                  Cold
26                 Mixed
27    Compartimentalized
28                 Mixed
29                 Mixed
30    Compartimentalized
31                 Mixed
32    Compartimentalized
33    Compartimentalized
34    Compartimentalized
35    Compartimentalized
36                 Mixed
37                 Mixed
38    Compartimentalized
39    Compartimentalized


# Coarse representation

In [5]:
coarse = adata.obs.groupby(["SampleID", "Group"]).size().unstack(fill_value=0).reset_index(drop=True)
# Normalize the counts to percentages
coarse_norm = coarse.div(coarse.sum(axis=1), axis=0)
coarse_norm

Group,EndoThelial,Immune,Keratin-positive tumor,Mesenchymal-like,Tumor,Unidentified
0,0.006774,0.495065,0.481517,0.008322,0.000194,0.008129
1,0.033025,0.31572,0.60568,0.042933,0.00033,0.002312
2,0.017736,0.503405,0.356136,0.11639,0.003642,0.002692
3,0.019871,0.626374,0.252898,0.032214,0.064579,0.004064
4,0.036256,0.516093,0.393082,0.043655,0.002775,0.008139
5,0.018339,0.386796,0.557686,0.021674,0.002334,0.013171
6,0.001173,0.163636,0.748094,0.033724,0.023754,0.029619
7,0.000319,0.17889,0.47162,0.117666,0.182717,0.048788
8,0.012217,0.532986,0.430363,0.008796,0.008308,0.00733
9,0.008734,0.625764,0.327511,0.032533,0.000655,0.004803


In [6]:
knn = KNeighborsClassifier()
coarse_knn = cross_val_score(knn, coarse_norm, y, scoring='accuracy', cv=5)
print(coarse_knn)
coarse_knn.mean(), coarse_knn.std()

[0.75  0.5   1.    0.875 0.625]


(0.75, 0.1767766952966369)

In [7]:
svc = svm.SVC()
coarse_svc = cross_val_score(svc, coarse_norm, y, scoring='accuracy', cv=5)
print(coarse_svc)
coarse_svc.mean(), coarse_svc.std()

[0.875 0.625 0.875 0.875 0.625]


(0.775, 0.1224744871391589)

# Fine representation

In [None]:
data_fine = adata.obs.copy()
data_fine.loc[data_fine["immuneGroup"].isna(), "immuneGroup"] = data_fine.loc[data_fine["immuneGroup"].isna(), "Group"]

In [10]:
fine = data_fine.groupby(["SampleID", "immuneGroup"]).size().unstack(fill_value=0).reset_index(drop=True)
# Normalize the counts to percentages
fine_norm = fine.div(fine.sum(axis=1), axis=0)
fine

immuneGroup,B,CD3 T,CD4 T,CD8 T,DC,DC/Mono,EndoThelial,Keratin-positive tumor,Macrophages,Mesenchymal-like,Mono/Neu,NK,Neutrophils,Other immune,Tregs,Tumor,Unidentified
0,1147,304,243,173,2,176,35,2488,281,43,3,6,10,213,0,1,42
1,2,11,21,245,37,9,100,1834,511,130,56,11,10,43,0,1,7
2,427,223,491,496,36,434,112,2249,726,735,21,12,19,273,21,23,17
3,329,369,733,304,2,458,132,1680,880,214,52,6,129,717,182,429,27
4,3,121,255,531,3,333,196,2125,1064,236,183,1,73,175,48,15,44
5,33,133,121,206,0,198,110,3345,710,130,467,0,13,436,3,14,79
6,0,1,1,4,0,2,4,2551,245,115,7,0,288,10,0,81,101
7,8,37,4,50,0,18,1,1479,269,369,53,0,8,114,0,573,153
8,60,229,340,391,73,212,75,2642,1236,54,163,6,119,384,59,51,45
9,18,75,165,410,2,99,40,1500,1304,149,414,26,77,233,43,3,22


In [11]:
knn = KNeighborsClassifier()
fine_knn = cross_val_score(knn, fine_norm, y, scoring='accuracy', cv=5)
print(fine_knn)
fine_knn.mean(), fine_knn.std()

[0.625 0.625 0.875 0.875 0.625]


(0.725, 0.1224744871391589)

In [12]:
svc = svm.SVC()
fine_svc = cross_val_score(svc, fine_norm, y, scoring='accuracy', cv=5)
print(fine_svc)
fine_svc.mean(), fine_svc.std()

[0.875 0.5   0.875 0.875 0.625]


(0.75, 0.15811388300841897)

# MENDER representation

In [None]:
batch_obs = 'subtype'
scale = 6
radius = 15

# I'm not sure why they did so many copies but I took it from this code 
# https://mender-tutorial.readthedocs.io/en/latest/MERSCOPE.html
adata = adata_raw.copy()

# adata.obs['SampleID'] = adata.obs['SampleID'].astype('category')

# Only do it for these 18 samples to make it run more quickly. 6 mixed, comp and cold
# sample_ids = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 15, 19, 22, 24, 25, 26]
# adata = adata[adata.obs['SampleID'].isin(sample_ids)].copy()


# TODO: I think the grouping here should be based on fine groups
# main body of MENDER
msm = MENDER.MENDER(
    adata,
    batch_obs = batch_obs,
    # determine which cell state to use
    # In our case the cell type does not need to be estimated since we have a ground truth stored in 
    # the dataset under all_group_name
    ct_obs= 'Group',
    random_seed=42,
    verbose=True
)


# set the MENDER parameters


msm.prepare()
msm.set_MENDER_para(
    # default of n_scales is 6
    n_scales=scale,

    # for single cell data, nn_mode is set to 'radius'
    nn_mode='radius',

    # default of n_scales is 15 um (see the manuscript for why).
    # MENDER also provide a function 'estimate_radius' for estimating the radius
    nn_para=radius,
)


# construct the context representation
msm.run_representation_mp(
    8
    # the number of processings
)

# set the spatial clustering parameter
# positive values for the expected number of domains
# negative values for the clustering resolution
msm.run_clustering_normal(-0.5)
# msm.run_clustering_normal(9)
msm.adata_MENDER.obs




default number of process is 200


In [None]:
mender = msm.adata_MENDER.obs.groupby(['SampleID', 'MENDER']).size().unstack(fill_value=0).reset_index(drop=True)
mender_norm = mender.div(mender.sum(axis=1), axis=0)
mender_norm

MENDER,0,1,2,3,4,5
0,0.000194,0.461777,0.038707,0.366751,0.131411,0.001161
1,0.00033,0.514861,0.023118,0.321334,0.139036,0.001321
2,0.0,0.151861,0.437055,0.134125,0.26445,0.01251
3,0.0,0.144061,0.449947,0.128556,0.26479,0.012645
4,0.000185,0.137625,0.45246,0.141324,0.255087,0.013319
5,0.0,0.158886,0.434645,0.129543,0.266589,0.010337
6,0.000293,0.55132,0.018182,0.293842,0.134604,0.00176
7,0.000319,0.541773,0.01977,0.300064,0.13648,0.001594
8,0.0,0.145626,0.456426,0.1295,0.258022,0.010425
9,0.0,0.139301,0.44738,0.144323,0.25524,0.013755


In [None]:
knn = KNeighborsClassifier()
mender_knn = cross_val_score(knn, mender_norm, y, scoring='accuracy', cv=5)
print(mender_knn)
mender_knn.mean(), mender_knn.std()

ValueError: Found input variables with inconsistent numbers of samples: [18, 40]

In [None]:
svc = svm.SVC()
mender_svc = cross_val_score(svc, mender_norm, y, scoring='accuracy', cv=5)
print(mender_svc)
mender_svc.mean(), mender_svc.std()

In [None]:
# Prepare data for barplot
bar_means = [mender_knn.mean(), fine_knn.mean(), coarse_knn.mean()]
bar_stds = [mender_knn.std(), fine_knn.std(), coarse_knn.std()]
bar_names = ['MENDER', 'Fine', 'Coarse']

fig = go.Figure(
    data=[
        go.Bar(
            x=bar_names,
            y=bar_means,
            error_y=dict(type='data', array=bar_stds, visible=True),
            marker_color=['#636EFA', '#EF553B', '#00CC96']
        )
    ]
)
fig.update_layout(
    title='KNN Accuracy Comparison',
    yaxis_title='Accuracy (mean ± std)',
    xaxis_title='Representation',
    yaxis=dict(range=[0, 1]),
    template='plotly_white'
)
fig.show()