In [1]:
import scanpy as sc
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from anndata import AnnData
import MENDER
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score, RepeatedStratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn import svm
from scipy import stats
import re
import os
import glob
import tifffile
import pandas as pd
from skimage.measure import regionprops_table
import numpy as np
import plotly.graph_objects as go
import scipy.stats as st
from scipy import stats

# Load data

In [2]:
# Directory containing your segmented mask TIFF files
mask_dir = r'TNBC_shareCellData'
# Output CSV file path
output_csv = "cell_centroids.csv"

records = []

# Loop through each TIFF mask
for mask_path in glob.glob(os.path.join(mask_dir, "*.tiff")):
    # Load the mask (each cell labeled by a unique integer)
    mask = tifffile.imread(mask_path)

    if ('p30_labeledcellData' in mask_path):
        print('bruh')
    print("Shape:", mask.shape)          # e.g. (1024, 1024)
    print("Data type:", mask.dtype)      # e.g. uint16
    labels = np.unique(mask)
    print(f"Found {len(labels)-1} objects (ignoring background=0)")
    print("Some label values:", labels[:10], "…", labels[-10:])
    # Extract properties: label ID and centroids
    props = regionprops_table(
        mask,
        properties=('label', 'centroid')
    )
    
    # Convert to DataFrame
    df = pd.DataFrame(props)
    # Keep track of source image
    
    
    nums = re.search(r"\d+", os.path.basename(mask_path))
    df['image'] = int(nums.group()) if nums else None
    records.append(df)

# Concatenate all image DataFrames
all_cells = pd.concat(records, ignore_index=True)

# Rename columns for clarity
all_cells = all_cells.rename(
    columns={
        'label': 'CellID',
        'centroid-0': 'CentroidY',
        'centroid-1': 'CentroidX'
    }
)

# Save to CSV
all_cells.to_csv(output_csv, index=False)
print(f"Saved {len(all_cells)} cell centroids to {output_csv}")


Shape: (2048, 2048)
Data type: uint16
Found 3343 objects (ignoring background=0)
Some label values: [0 1 2 3 4 5 6 7 8 9] … [3334 3335 3336 3337 3338 3339 3340 3341 3342 3343]
Shape: (2048, 2048)
Data type: uint16
Found 9623 objects (ignoring background=0)
Some label values: [0 1 2 3 4 5 6 7 8 9] … [9614 9615 9616 9617 9618 9619 9620 9621 9622 9623]
Shape: (2048, 2048)
Data type: uint16
Found 3755 objects (ignoring background=0)
Some label values: [0 1 2 3 4 5 6 7 8 9] … [3746 3747 3748 3749 3750 3751 3752 3753 3754 3755]
Shape: (2048, 2048)
Data type: uint16
Found 7836 objects (ignoring background=0)
Some label values: [0 1 2 3 4 5 6 7 8 9] … [7827 7828 7829 7830 7831 7832 7833 7834 7835 7836]
Shape: (2048, 2048)
Data type: uint16
Found 7751 objects (ignoring background=0)
Some label values: [0 1 2 3 4 5 6 7 8 9] … [7742 7743 7744 7745 7746 7747 7748 7749 7750 7751]
Shape: (2048, 2048)
Data type: uint16
Found 2866 objects (ignoring background=0)
Some label values: [0 1 2 3 4 5 6 7 8 9

In [3]:
print(all_cells.columns)

Index(['CellID', 'CentroidY', 'CentroidX', 'image'], dtype='object')


Perform Random subsampling to decreace dataset size

In [3]:
# Read in the full cell‐level table
df = pd.read_csv("TNBC_shareCellData/cellData.csv", sep=",")
print(f"Number of unique SampleIDs: {df['SampleID'].nunique()}")
print("\nUnique SampleIDs:")
print(df['SampleID'].unique())
print(f"Count for SampleID 33: {len(df[df['SampleID'] == 33])}")


# Choose how many cells to keep per patient:
# The patient with the fewest cells in the original dataset had 2217
# The mean is 5923.55
# The most cells is 9738
# n = 2217
n = 2020

# 3. Option A: if you only want patients who have ≥ n cells, and you want exactly n per patient:
#    (patients with fewer than n cells are dropped entirely)
df_filtered = df.groupby("SampleID").filter(lambda sub: len(sub) >= n)
print(df['SampleID'].value_counts())
df_subsample = (
    df_filtered
    .groupby("SampleID", group_keys=False)
    .sample(n=n, random_state=42)
)


# Write out a new CSV (e.g. “cellData_100.csv”)
out_name = f"TNBC_shareCellData/cellData_{n}.csv"
df_subsample.to_csv(out_name, sep=",", index=False)
print(f"Wrote subsampled table with {n} cells per patient to:\n  {out_name}")
print(df_subsample['SampleID'].unique())
print(f"Count for SampleID 33: {len(df_subsample[df_subsample['SampleID'] == 33])}")

Number of unique SampleIDs: 43

Unique SampleIDs:
[ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24
 25 26 27 28 29 31 32 33 34 35 36 37 38 39 40 41 42 43 44]
Count for SampleID 33: 2046
SampleID
16    8212
35    7716
13    7665
17    7071
12    6995
4     6643
3     6315
37    6280
14    6270
9     6139
28    6061
6     5998
18    5539
21    5423
5     5406
1     5167
32    5158
26    5119
11    5112
20    5103
29    4819
24    4613
10    4580
41    4532
23    4490
19    4400
27    4332
38    4330
40    4285
39    4030
31    3415
7     3410
15    3315
8     3136
22    3072
2     3028
36    2939
34    2856
25    2658
33    2046
43    1381
42    1380
44    1217
Name: count, dtype: int64
Wrote subsampled table with 2020 cells per patient to:
  TNBC_shareCellData/cellData_2020.csv
[ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24
 25 26 27 28 29 31 32 33 34 35 36 37 38 39 40 41]
Count for SampleID 33: 2020


In [4]:
print(df_subsample['SampleID'].unique())
print(f"Count for SampleID 33: {len(df_subsample[df_subsample['SampleID'] == 33])}")


[ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24
 25 26 27 28 29 31 32 33 34 35 36 37 38 39 40 41]
Count for SampleID 33: 2020


In [5]:
# df = pd.read_csv(f"TNBC_shareCellData/cellData_{n}.csv", sep=",")
df = pd.read_csv(f"TNBC_shareCellData/cellData.csv", sep=",") # TODO comment out if not doing the whole thing
df_p = pd.read_csv("TNBC_shareCellData/patient_class.csv", header=None)
df_p.rename(columns={0:"SampleID", 1:"subtype"}, inplace=True)
subtype_map = {
    0: "Mixed",
    1: "Compartimentalized",
    2: "Cold"
}

df_p["subtype"] = df_p["subtype"].map(subtype_map).astype('category')



# TO DO: LOOK INTO THIS FILTERING SINCE SAMPLEID 30 DOESN"T ACTUALLY EXIST SO MAYBE IT IS MORE COMPLICATED THAN JUST REMOVING A FEW
bad_Values = [42,43,44]

df = df[~df["SampleID"].isin(bad_Values)]


meta_cols = ['SampleID', 'cellLabelInImage', 'cellSize', 'tumorYN', 
             'tumorCluster', 'Group', 'immuneCluster', 'immuneGroup']

expr_cols = [c for c in df.columns if c not in meta_cols]

adata = AnnData(
    X = df[expr_cols].values,
    obs = df[meta_cols],
    var=pd.DataFrame(index=expr_cols)
)

adata.obs["UniqueID"] = adata.obs['SampleID'].astype(str) + "_" + adata.obs['cellLabelInImage'].astype(str)
all_cells["UniqueID"] = all_cells["image"].astype(str) + "_" + all_cells["CellID"].astype(str)
all_cells = all_cells.set_index('UniqueID', drop = False)
adata.obs = adata.obs.set_index('UniqueID', drop=False)

all_cells_aligned = all_cells.reindex(adata.obs_names)

adata.obsm["spatial"] = all_cells_aligned[["CentroidX", "CentroidY"]].to_numpy()

# X = adata.X.toarray() if hasattr(adata.X, "toarray") else adata.X
# min_val = X.min()
# print("Minimum before shift:", min_val)

# 2. If it’s < 0, shift everything up so the minimum is 0
# if min_val < 0:
#     shift = -min_val
#     print(f"Shifting all values by +{shift:.3g} to eliminate negatives")
#     X += shift
#     adata.X = X  # put it back into your AnnData

# sc.pp.normalize_total(adata)         # e.g. counts per cell
# sc.pp.log1p(adata)
# sc.pp.scale(adata)

# sc.tl.pca(adata, svd_solver='arpack', n_comps=50)
# sc.pp.neighbors(adata)               # builds graph on PCA
# sc.tl.umap(adata)
code2label_group = {
    1: "Unidentified",
    2: "Immune",
    3: "EndoThelial",
    4: "Mesenchymal-like",
    5: "Tumor",
    6: "Keratin-positive tumor"
}

code2label_immunegroup = {
    1: "Tregs",
    2: "CD4 T",
    3: "CD8 T",
    4: "CD3 T",
    5: "NK",
    6: "B",
    7: "Neutrophils",
    8: "Macrophages",
    9: "DC",
    10: "DC/Mono",
    11: "Mono/Neu",
    12: "Other immune"
}
adata.obs["Group"] = adata.obs["Group"].map(code2label_group).astype('category')
adata.obs["immuneGroup"] = adata.obs["immuneGroup"].map(code2label_immunegroup)
adata.obs = pd.merge(
    adata.obs,
    df_p,
    on="SampleID",
    how="left"
)

# Copy for later use
adata_raw = adata.copy()

adata.obs



Unnamed: 0,SampleID,cellLabelInImage,cellSize,tumorYN,tumorCluster,Group,immuneCluster,immuneGroup,UniqueID,subtype
0,1,2,146,1,0,Keratin-positive tumor,0,,1_2,Mixed
1,1,3,102,0,0,Immune,46,CD3 T,1_3,Mixed
2,1,4,43,1,0,Keratin-positive tumor,0,,1_4,Mixed
3,1,5,211,1,0,Keratin-positive tumor,0,,1_5,Mixed
4,1,6,177,0,0,Immune,75,B,1_6,Mixed
...,...,...,...,...,...,...,...,...,...,...
197673,41,5093,90,1,0,Keratin-positive tumor,0,,41_5093,Compartimentalized
197674,41,5094,132,1,21,Tumor,0,,41_5094,Compartimentalized
197675,41,5095,123,0,0,Immune,31,Macrophages,41_5095,Compartimentalized
197676,41,5096,99,0,0,Immune,56,Other immune,41_5096,Compartimentalized


# Coarse representation

In [11]:
# Compute “coarse” counts per SampleID × Group, but KEEP SampleID as the index
coarse = (
    adata.obs
         .groupby(["SampleID", "Group"])
         .size()
         .unstack(fill_value=0)
)
# Now coarse.index is SampleID
print(coarse)

# Normalize each row to percentages
coarse_norm = coarse.div(coarse.sum(axis=1), axis=0)
# coarse_norm.index is still SampleID

# Build y so that it lines up with coarse_norm.index 
y = adata.obs.groupby("SampleID")["subtype"].first().loc[coarse_norm.index]
print(coarse_norm.shape)
print(y.shape)
# Run KNN with cross‐validation
knn = KNeighborsClassifier()
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=10, random_state=42)
coarse_knn = cross_val_score(knn, coarse_norm, y, scoring="accuracy", cv=cv)

print(coarse_knn)
print(coarse_knn.mean(), coarse_knn.std())

# UNCOMMENT TO SHOW COARSE REPRESENTATION
# coarse_norm

Group     EndoThelial  Immune  Keratin-positive tumor  Mesenchymal-like   
SampleID                                                                  
1                  35    2558                    2488                43  \
2                 100     956                    1834               130   
3                 112    3179                    2249               735   
4                 132    4161                    1680               214   
5                 196    2790                    2125               236   
6                 110    2320                    3345               130   
7                   4     558                    2551               115   
8                   1     561                    1479               369   
9                  75    3272                    2642                54   
10                 40    2866                    1500               149   
11                 18    1617                    3193                29   
12                  6    

In [30]:
svc = svm.SVC()
coarse_svc = cross_val_score(svc, coarse_norm, y, scoring='accuracy', cv=cv)
print(coarse_svc)
coarse_svc.mean(), coarse_svc.std()

[0.625 0.625 0.875 0.875 0.75  0.625 0.875 0.875 0.75  0.625 0.5   0.75
 0.75  0.875 0.625 0.875 0.625 0.75  0.75  0.75  0.875 0.75  0.75  0.75
 0.625 0.75  0.875 0.75  0.625 0.75  0.75  0.875 0.875 0.5   0.5   0.75
 0.75  0.5   0.75  0.625 0.875 0.875 0.75  0.625 0.625 0.75  0.875 0.625
 0.875 0.625]


(0.7325, 0.1145916663636584)

# Fine representation

In [20]:
data_fine = adata.obs.copy()
data_fine.loc[data_fine["immuneGroup"].isna(), "immuneGroup"] = data_fine.loc[data_fine["immuneGroup"].isna(), "Group"]
data_fine

Unnamed: 0,SampleID,cellLabelInImage,cellSize,tumorYN,tumorCluster,Group,immuneCluster,immuneGroup,UniqueID,subtype
0,1,2,146,1,0,Keratin-positive tumor,0,Keratin-positive tumor,1_2,Mixed
1,1,3,102,0,0,Immune,46,CD3 T,1_3,Mixed
2,1,4,43,1,0,Keratin-positive tumor,0,Keratin-positive tumor,1_4,Mixed
3,1,5,211,1,0,Keratin-positive tumor,0,Keratin-positive tumor,1_5,Mixed
4,1,6,177,0,0,Immune,75,B,1_6,Mixed
...,...,...,...,...,...,...,...,...,...,...
197673,41,5093,90,1,0,Keratin-positive tumor,0,Keratin-positive tumor,41_5093,Compartimentalized
197674,41,5094,132,1,21,Tumor,0,Tumor,41_5094,Compartimentalized
197675,41,5095,123,0,0,Immune,31,Macrophages,41_5095,Compartimentalized
197676,41,5096,99,0,0,Immune,56,Other immune,41_5096,Compartimentalized


In [21]:
fine = data_fine.groupby(["SampleID", "immuneGroup"]).size().unstack(fill_value=0).reset_index(drop=True)
# Normalize the counts to percentages
fine_norm = fine.div(fine.sum(axis=1), axis=0)
# UNCOMMENT TO SHOW FINE REPRESENTAITON
# fine

In [22]:
knn = KNeighborsClassifier()
fine_knn = cross_val_score(knn, fine_norm, y, scoring='accuracy', cv=cv)
print(fine_knn)
fine_knn.mean(), fine_knn.std()

[0.5   0.625 0.75  0.875 0.75  0.625 0.75  0.875 0.625 0.625 0.75  0.75
 0.625 0.875 0.625 0.75  0.75  0.75  0.75  0.75  0.625 0.75  0.75  0.75
 0.625 0.625 0.875 0.75  0.625 0.75  0.75  0.75  0.75  0.5   0.625 0.75
 0.875 0.625 0.75  0.5   0.625 0.625 0.75  0.625 0.75  0.75  0.875 0.5
 0.875 0.625]


(0.7075, 0.10188841936157415)

In [23]:
svc = svm.SVC()
fine_svc = cross_val_score(svc, fine_norm, y, scoring='accuracy', cv=cv)
print(fine_svc)
fine_svc.mean(), fine_svc.std()

[0.625 0.625 0.875 0.875 0.75  0.75  0.875 0.875 0.75  0.625 0.5   0.75
 0.75  0.875 0.75  0.875 0.75  0.75  0.625 0.75  0.75  0.75  0.625 0.875
 0.625 0.75  0.875 0.75  0.75  0.75  0.75  0.875 0.875 0.75  0.5   0.75
 0.75  0.625 0.75  0.625 0.75  0.875 0.625 0.625 0.75  0.75  0.875 0.625
 0.75  0.75 ]


(0.7425, 0.09813893213195261)

# MENDER representation

In [13]:
# batch_obs = 'subtype'
batch_obs = 'SampleID'
scale = 6
radius = 15

# I'm not sure why they did so many copies but I took it from this code 
# https://mender-tutorial.readthedocs.io/en/latest/MERSCOPE.html
adata = adata_raw.copy()

# adata.obs['SampleID'] = adata.obs['SampleID'].astype('category')

# Only do it for these 18 samples to make it run more quickly. 6 mixed, comp and cold
# sample_ids = [5, 13, 1, 2, 3, 4]
# sample_ids = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 15, 19, 22, 24, 25, 26]
sample_ids = [
    1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21,  22, 23, 24, 25, 26,
    27, 28, 29, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41,
]

adata = adata[adata.obs['SampleID'].isin(sample_ids)].copy()

adata.obs['SampleID'] = adata.obs['SampleID'].astype('category')


# TODO: I think the grouping here should be based on fine groups
# main body of MENDER
msm = MENDER.MENDER(
    adata,
    batch_obs = batch_obs,
    # determine which cell state to use
    # In our case the cell type does not need to be estimated since we have a ground truth stored in 
    # the dataset under all_group_name
    ct_obs= 'Group',
    random_seed=42,
    verbose=True
)


# set the MENDER parameters


msm.prepare()
msm.set_MENDER_para(
    # default of n_scales is 6
    n_scales=scale,

    # for single cell data, nn_mode is set to 'radius'
    nn_mode='radius',

    # default of n_scales is 15 um (see the manuscript for why).
    # MENDER also provide a function 'estimate_radius' for estimating the radius
    nn_para=radius,
)


# construct the context representation
msm.run_representation_mp(
    500
    # the number of processings
)

# set the spatial clustering parameter
# positive values for the expected number of domains
# negative values for the clustering resolution
cluster_parameter = 13
# msm.run_clustering_normal(-0.5)
# msm.run_clustering_normal(2)
msm.run_clustering_normal(cluster_parameter)
msm.adata_MENDER.obs


default number of process is 200




total batch: 40, running batch 1
scale 0




total batch: 40, running batch 2
scale 0




total batch: 40, running batch 3
scale 0




scale 0, median #cells per radius (r=15): 1.0
scale 1
total batch: 40, running batch 4
scale 0
scale 0, median #cells per radius (r=15): 1.0
scale 1




total batch: 40, running batch 5
scale 0




scale 1, median #cells per radius (r=15): 2.0
scale 2
total batch: 40, running batch 6
scale 0




total batch: 40, running batch 7
scale 0
scale 0, median #cells per radius (r=15): 1.0
scale 1




scale 1, median #cells per radius (r=15): 3.0
scale 2
total batch: 40, running batch 8
scale 0
scale 2, median #cells per radius (r=15): 3.0
scale 3




scale 0, median #cells per radius (r=15): 1.0
scale 1
scale 0, median #cells per radius (r=15): 1.0
scale 1
total batch: 40, running batch 9
scale 0
scale 0, median #cells per radius (r=15): 1.0
scale 1
total batch: 40, running batch 10
scale 0




scale 0, median #cells per radius (r=15): 1.0
scale 1
scale 0, median #cells per radius (r=15): 1.0
scale 1
scale 3, median #cells per radius (r=15): 3.0
scale 4




total batch: 40, running batch 11
scale 0
scale 2, median #cells per radius (r=15): 5.0
scale 3
scale 1, median #cells per radius (r=15): 



2.0total batch: 40, running batch 12




scale 2

scale 0
scale 1, median #cells per radius (r=15): 4.0
scale 2
scale 1, median #cells per radius (r=15): 3.0
scale 2
scale 1, median #cells per radius (r=15): 2.0
scale 2
scale 4, median #cells per radius (r=15): 5.0
scale 5
total batch: 40, running batch 13




scale 0
scale 0, median #cells per radius (r=15): 1.0
scale 1
scale 0, median #cells per radius (r=15): 1.0
scale 1
scale 1, median #cells per radius (r=15): 4.0
scale 2
total batch: 40, running batch 14
scale 0




scale 1, median #cells per radius (r=15): 4.0
scale 2scale 2, median #cells per radius (r=15):
total batch: 40, running batch 15 
3.0scale 0

scale 3




scale 2, median #cells per radius (r=15): scale 0, median #cells per radius (r=15):3.0 
scale 31.0

scale 1
scale 5, median #cells per radius (r=15): 6.0




scale 3, median #cells per radius (r=15): 6.0
scale 4
total batch: 40, running batch 16
scale 0
scale 2, median #cells per radius (r=15): 5.0
scale 3
scale 2, median #cells per radius (r=15): 6.0
scale 3




scale 0, median #cells per radius (r=15): 1.0scale 1, median #cells per radius (r=15): 
3.0scale 1

scale 2
scale 3, median #cells per radius (r=15): 4.0
scale 4
scale 0, median #cells per radius (r=15): 1.0
scale 1
total batch: 40, running batch 17
scale 3, median #cells per radius (r=15):scale 0 4.0

scale 4
scale 0, median #cells per radius (r=15): 1.0
scale 1
scale 2, median #cells per radius (r=15): 6.0
scale 3total batch: 40, running batch 18

scale 0




scale 1, median #cells per radius (r=15): 3.0
scale 2
scale 0, median #cells per radius (r=15): 1.0
scale 1
scale 1, median #cells per radius (r=15): 4.0
scale 2




scale 2, median #cells per radius (r=15): 5.0
scale 3
total batch: 40, running batch 19
scale 0
scale 4, median #cells per radius (r=15): 8.0
scale 5
scale 4, median #cells per radius (r=15): 5.0
scale 5
scale 4, median #cells per radius (r=15): 5.0
scale 5
scale 1, median #cells per radius (r=15):



 2.0
scale 2




scale 3, median #cells per radius (r=15): 7.0
scale 4
total batch: 40, running batch 20
scale 0
scale 2, median #cells per radius (r=15): 4.0
scale 3
scale 3, median #cells per radius (r=15): 8.0
scale 4




scale 1, median #cells per radius (r=15): total batch: 40, running batch 214.0

scale 0scale 2

scale 5, median #cells per radius (r=15): 7.0




scale 1, median #cells per radius (r=15): 4.0scale 2, median #cells per radius (r=15):
 4.0
scale 2scale 3

scale 2, median #cells per radius (r=15): 3.0
scale 3
scale 0, median #cells per radius (r=15): 1.0
scale 1
scale 5, median #cells per radius (r=15): 6.0
total batch: 40, running batch 22
scale 0
scale 3, median #cells per radius (r=15): 8.0
scale 4
scale 0, median #cells per radius (r=15): 1.0
scale 1
scale 0, median #cells per radius (r=15):1.0 
scale 1
scale 5, median #cells per radius (r=15): 9.0
scale 0, median #cells per radius (r=15): 1.0
scale 1scale 0, median #cells per radius (r=15):
 1.0
scale 1
scale 3, median #cells per radius (r=15):



 



7.0
scale 4
scale 4, median #cells per radius (r=15):scale 3, median #cells per radius (r=15):  6.08.0
scale 4

scale 5
total batch: 40, running batch 23

scale 0scale 2, median #cells per radius (r=15): 6.0
scale 3




scale 4, median #cells per radius (r=15):scale 1, median #cells per radius (r=15):  10.05.0
scale 5

scale 2
scale 3, median #cells per radius (r=15): 4.0
scale 4
scale 0, median #cells per radius (r=15): 1.0
scale 1
total batch: 40, running batch 24
scale 0
scale 3, median #cells per radius (r=15): 6.0
scale 4
scale 0, median #cells per radius (r=15): 1.0
scale 1




scale 4, median #cells per radius (r=15): 7.0
scale 5
scale 1, median #cells per radius (r=15): 3.0
scale 2
total batch: 40, running batch 25
scale 0
scale 2, median #cells per radius (r=15): 5.0
scale 3




scale 4, median #cells per radius (r=15): 10.0
scale 5
scale 1, median #cells per radius (r=15): 3.0
scale 2
total batch: 40, running batch 26
scale 0
scale 1, median #cells per radius (r=15):scale 1, median #cells per radius (r=15):scale 4, median #cells per radius (r=15):  2.05.0
4.0scale 2
 

scale 5scale 2

scale 2, median #cells per radius (r=15): scale 0, median #cells per radius (r=15):6.0
scale 3 
1.0
scale 1
total batch: 40, running batch 27
scale 0scale 4, median #cells per radius (r=15):scale 1, median #cells per radius (r=15):
  9.03.0
scale 5

scale 2




1.0scale 0, median #cells per radius (r=15): 
scale 1
scale 1, median #cells per radius (r=15): 6.0
scale 2
scale 0, median #cells per radius (r=15): 1.0




scale 1
scale 3, median #cells per radius (r=15): 8.0
scale 4
scale 2, median #cells per radius (r=15): 5.0
scale 3
scale 2, median #cells per radius (r=15): 3.0total batch: 40, running batch 28

scale 3
scale 0
scale 2, median #cells per radius (r=15):10.0 7.0scale 5, median #cells per radius (r=15):
 scale 3

scale 1, median #cells per radius (r=15):scale 4, median #cells per radius (r=15):scale 1, median #cells per radius (r=15):   8.04.0
scale 5, median #cells per radius (r=15):scale 5
scale 2
2.0scale 5, median #cells per radius (r=15):

 6.0
scale 2



 9.0

scale 2, median #cells per radius (r=15): 5.0
scale 3
scale 5, median #cells per radius (r=15): 12.0
scale 0, median #cells per radius (r=15): 1.0
scale 1
total batch: 40, running batch 29
scale 0
scale 2, median #cells per radius (r=15): 3.0
scale 3
scale 5, median #cells per radius (r=15): 13.0





scale 2, median #cells per radius (r=15): 5.0scale 3scale 2, median #cells per radius (r=15): 
5.0
scale 3
scale 0, median #cells per radius (r=15): 1.0
scale 1
scale 1, median #cells per radius (r=15): 3.0
scale 2scale 4, median #cells per radius (r=15):
 10.0
scale 5
total batch: 40, running batch 31
scale 0
scale 3, median #cells per radius (r=15): 4.0
scale 5, median #cells per radius (r=15): scale 411.0

scale 3, median #cells per radius (r=15): 8.0
scale 4
total batch: 40, running batch 32
scale 0




scale 3, median #cells per radius (r=15): 6.0
scale 4
scale 3, median #cells per radius (r=15): 4.0scale 0, median #cells per radius (r=15): 
1.0scale 4

scale 1
scale 1, median #cells per radius (r=15): 3.0
scale 2
scale 3, median #cells per radius (r=15): 8.0
scale 4




scale 2, median #cells per radius (r=15):



 
3.09.0
scale 3scale 1, median #cells per radius (r=15):
 scale 2
total batch: 40, running batch 33
scale 0




scale 4, median #cells per radius (r=15): 5.0
scale 5scale 3, median #cells per radius (r=15):
 6.0
scale 4
scale 2, median #cells per radius (r=15): 4.0
scale 3
scale 2, median #cells per radius (r=15): 6.0
scale 3total batch: 40, running batch 34
scale 0

scale 0, median #cells per radius (r=15):scale 4, median #cells per radius (r=15):  5.01.0

scale 0, median #cells per radius (r=15):scale 1scale 5
 
1.0
scale 1




scale 4, median #cells per radius (r=15): 8.0

scale 5




scale 0, median #cells per radius (r=15): 1.0
scale 4, median #cells per radius (r=15):scale 1 10.0
scale 5
scale 5, median #cells per radius (r=15):total batch: 40, running batch 35 
scale 010.0

scale 0, median #cells per radius (r=15): 1.0
scale 1
scale 5, median #cells per radius (r=15): 12.0
scale 2, median #cells per radius (r=15): 4.0









scale 3total batch: 40, running batch 36scale 1, median #cells per radius (r=15):
 scale 0
6.0
scale 2scale 5, median #cells per radius (r=15):
 6.0
scale 1, median #cells per radius (r=15): scale 1, median #cells per radius (r=15):2.0scale 2, median #cells per radius (r=15):
 4.0scale 2
 
4.0
scale 2
scale 3
scale 5, median #cells per radius (r=15): 9.0




scale 3, median #cells per radius (r=15): 9.0
scale 4
scale 3, median #cells per radius (r=15): total batch: 40, running batch 377.0

scale 4scale 0

scale 3, median #cells per radius (r=15): 7.0
scale 4
scale 1, median #cells per radius (r=15): 2.0
scale 2
scale 2, median #cells per radius (r=15): scale 1, median #cells per radius (r=15):2.0 
scale 32.0





scale 2
scale 4, median #cells per radius (r=15):



 11.0
scale 5
scale 4, median #cells per radius (r=15): 8.0scale 5, median #cells per radius (r=15):
 scale 56.0

scale 0, median #cells per radius (r=15): 1.0
scale 1
scale 3, median #cells per radius (r=15): 12.0
scale 4
scale 3, median #cells per radius (r=15):total batch: 40, running batch 38scale 0, median #cells per radius (r=15): 5.0
 scale 4
1.0scale 0


scale 1
scale 3, median #cells per radius (r=15): 3.0
scale 4
scale 1, median #cells per radius (r=15): 3.0
scale 2
scale 2, median #cells per radius (r=15): 6.0
scale 3
scale 3, median #cells per radius (r=15): 9.0
scale 4




scale 5, median #cells per radius (r=15): 12.0
scale 3, median #cells per radius (r=15): 5.0
scale 4
total batch: 40, running batch 39
scale 0
scale 2, median #cells per radius (r=15): 3.0
scale 3
scale 4, median #cells per radius (r=15): 9.0
scale 2, median #cells per radius (r=15): scale 5
3.0
scale 3
scale 4, median #cells per radius (r=15): 4.0
scale 5




total batch: 40, running batch 40
scale 0
scale 3, median #cells per radius (r=15): 6.0
scale 4
scale 0, median #cells per radius (r=15): 1.0
scale 1
scale 1, median #cells per radius (r=15): 2.0
scale 2
scale 5, median #cells per radius (r=15): 4.0
total batch: 40, running batch 41
scale 0
scale 3, median #cells per radius (r=15): 4.0
scale 4




scale 3, median #cells per radius (r=15):



 4.0




scale 4
scale 4, median #cells per radius (r=15): scale 0, median #cells per radius (r=15): 9.01.0

scale 1scale 5

scale 2, median #cells per radius (r=15): 4.0
scale 3
scale 0, median #cells per radius (r=15): 1.0
scale 1
scale 0, median #cells per radius (r=15): 1.0
scale 1scale 2, median #cells per radius (r=15):
 3.0
scale 3
scale 1, median #cells per radius (r=15): 4.0
scale 2
scale 5, median #cells per radius (r=15): 10.0
scale 2, median #cells per radius (r=15): 8.0
scale 3
scale 4, median #cells per radius (r=15): 7.0
scale 4, median #cells per radius (r=15):scale 5
 15.0scale 3, median #cells per radius (r=15):
 scale 58.0

scale 4
scale 4, median #cells per radius (r=15): 7.0
scale 5
scale 4, median #cells per radius (r=15): 5.0
scale 5
scale 4, median #cells per radius (r=15): 7.0
scale 5
scale 1, median #cells per radius (r=15): 3.0
scale 2scale 4, median #cells per radius (r=15):
 11.0
scale 5
scale 3, median #cells per radius (r=15): 4.0
scale 5, median #cells per radiu


See the tutorial for concat at: https://anndata.readthedocs.io/en/latest/concatenation.html
  from .autonotebook import tqdm as notebook_tqdm


searching resolution to k=13
Res =  0.1 Num of clusters =  6
Res =  0.15000000000000002 Num of clusters =  9
Res changed to 0.15000000000000002
Res =  0.2 Num of clusters =  9
Res changed to 0.2
Res =  0.25 Num of clusters =  15
Res changed to 0.2
Res =  0.225 Num of clusters =  11
Res changed to 0.225
Res =  0.25 Num of clusters =  15
Res changed to 0.225
Res =  0.23750000000000002 Num of clusters =  14
Res changed to 0.225
Res =  0.23125 Num of clusters =  11
Res changed to 0.23125
Res =  0.23750000000000002 Num of clusters =  14
Res changed to 0.23125
Res =  0.234375 Num of clusters =  13
recommended res =  0.234375


Unnamed: 0,SampleID,cellLabelInImage,cellSize,tumorYN,tumorCluster,Group,immuneCluster,immuneGroup,UniqueID,subtype,batch,leiden,MENDER_leiden_k13,MENDER
0-0,1,2,146,1,0,Keratin-positive tumor,0,,1_2,Mixed,0,8,8,8
1-0,1,3,102,0,0,Immune,46,CD3 T,1_3,Mixed,0,1,1,1
2-0,1,4,43,1,0,Keratin-positive tumor,0,,1_4,Mixed,0,4,4,4
3-0,1,5,211,1,0,Keratin-positive tumor,0,,1_5,Mixed,0,0,0,0
4-0,1,6,177,0,0,Immune,75,B,1_6,Mixed,0,2,2,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
197673-39,41,5093,90,1,0,Keratin-positive tumor,0,,41_5093,Compartimentalized,39,0,0,0
197674-39,41,5094,132,1,21,Tumor,0,,41_5094,Compartimentalized,39,2,2,2
197675-39,41,5095,123,0,0,Immune,31,Macrophages,41_5095,Compartimentalized,39,2,2,2
197676-39,41,5096,99,0,0,Immune,56,Other immune,41_5096,Compartimentalized,39,7,7,7


In [7]:

# Compute the raw counts per SampleID × MENDER, then normalize per sample
mender = (
    msm.adata_MENDER.obs
       .groupby(['SampleID', 'MENDER'])
       .size()
       .unstack(fill_value=0)
)
mender_norm = mender.div(mender.sum(axis=1), axis=0).reset_index()  # SampleID becomes a column

# Extract y_raw (subtype) for each SampleID, in the same sample_ids order
y_raw = adata_raw.obs.groupby('SampleID')['subtype'].first()
y_raw = y_raw.loc[sample_ids]  # reorder so it matches sample_ids

# Turn y_raw into a DataFrame so we can merge on SampleID
y_df = y_raw.reset_index().rename(columns={'subtype': 'subtype'})  # columns: ['SampleID','subtype']

# Merge the normalized MENDER fractions with the subtype column
merged = mender_norm.merge(y_df, on='SampleID', how='left')

file_path = f"mender_representation_total_{n}_samples_{cluster_parameter}_cluster_parameter.csv"
# Write the final table to CSV (includes SampleID, MENDER‐normalized columns, and subtype)
merged.to_csv(file_path, index=False)


NameError: name 'msm' is not defined

Read saved mender represation and run classification

In [12]:
# cluster_parameter = 13 # Todo uncomment when running from here

# file_path = f"mender_representation_total_{n}_samples_{cluster_parameter}_cluster_parameter.csv" # Todo uncomment when running from here
# Read the merged CSV back into a DataFrame
df = pd.read_csv(file_path)

# Put SampleID back into the index (optional, but often convenient)
df = df.set_index("SampleID")

# Extract y_raw as a Series of subtype values
y_raw = df["subtype"]

# Extract mender_norm by dropping the “subtype” column
mender_norm = df.drop(columns=["subtype"])

In [13]:
# Define your classifier
knn = KNeighborsClassifier()

# Compute cross‐val scores over all 10×5=50 folds at once
mender_knn = cross_val_score(knn, mender_norm, y_raw, scoring="accuracy", cv=cv)

print(mender_knn)
mender_knn.mean(), mender_knn.std()

[0.625 0.875 0.625 0.75  0.75  0.75  0.625 0.75  0.75  0.625 0.875 0.875
 0.75  0.75  0.625 0.75  0.75  0.625 0.5   0.625 0.75  0.625 0.625 1.
 0.625 0.875 0.75  0.875 0.75  0.75  0.75  0.875 0.875 0.75  0.625 0.75
 0.75  0.75  0.75  0.625 0.75  1.    0.5   0.875 0.75  0.75  0.625 0.875
 0.875 0.75 ]


(0.7425, 0.11014195385955344)

In [24]:
svc = svm.SVC()
# Define your classifier

# Use RepeatedStratifiedKFold to get 10 different random 5‐fold splits
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=10, random_state=42)

# Compute cross‐val scores over all 10×5=50 folds at once
mender_svc = cross_val_score(svc, mender_norm, y_raw, scoring="accuracy", cv=cv)

print(mender_svc)
mender_svc.mean(), mender_svc.std()

[0.75  0.75  0.875 0.875 0.875 1.    1.    0.875 0.75  0.75  0.875 0.875
 0.875 0.875 0.75  0.875 0.875 0.875 0.5   0.875 0.75  0.875 0.75  0.875
 0.75  0.875 1.    0.875 0.75  0.875 1.    0.875 0.875 0.75  0.75  0.875
 0.875 0.75  0.875 0.75  0.875 0.875 0.75  0.875 0.75  0.875 0.875 0.625
 1.    0.875]


(0.8375, 0.09437293044088436)

# T-test

## KNN

In [25]:
# By default, ttest_ind returns a two-sided p-value
t_stat, p_two_sided = stats.ttest_ind(mender_knn, fine_knn, equal_var=False)

# Convert to one-sided p-value
if t_stat > 0:
    p_one_sided = p_two_sided / 2
else:
    p_one_sided = 1 - p_two_sided / 2

print(f"t-statistic: {t_stat:.4f}")
print(f"one-sided p-value: {p_one_sided:.4f}")
p_knn_mender_fine = p_one_sided

t-statistic: 1.6329
one-sided p-value: 0.0529


In [26]:
# By default, ttest_ind returns a two-sided p-value
t_stat, p_two_sided = stats.ttest_ind(mender_knn, coarse_knn, equal_var=False)

# Convert to one-sided p-value
if t_stat > 0:
    p_one_sided = p_two_sided / 2
else:
    p_one_sided = 1 - p_two_sided / 2

print(f"t-statistic: {t_stat:.4f}")
print(f"one-sided p-value: {p_one_sided:.4f}")
p_knn_mender_coarse = p_one_sided

t-statistic: 2.7475
one-sided p-value: 0.0036


## SVM

In [27]:
# By default, ttest_ind returns a two-sided p-value
t_stat, p_two_sided = stats.ttest_ind(mender_svc, fine_svc, equal_var=False)

# Convert to one-sided p-value
if t_stat > 0:
    p_one_sided = p_two_sided / 2
else:
    p_one_sided = 1 - p_two_sided / 2

print(f"t-statistic: {t_stat:.4f}")
print(f"one-sided p-value: {p_one_sided:.4f}")
p_svc_mender_fine = p_one_sided

t-statistic: 4.8842
one-sided p-value: 0.0000


In [31]:
# By default, ttest_ind returns a two-sided p-value
t_stat, p_two_sided = stats.ttest_ind(mender_svc, coarse_svc, equal_var=False)

# Convert to one-sided p-value
if t_stat > 0:
    p_one_sided = p_two_sided / 2
else:
    p_one_sided = 1 - p_two_sided / 2

print(f"t-statistic: {t_stat:.4f}")
print(f"one-sided p-value: {p_one_sided:.4f}")
p_svc_mender_coarse = p_one_sided

t-statistic: 4.9512
one-sided p-value: 0.0000


# Visualize results

In [32]:
# Calculate 95% confidence intervals for each group
def mean_ci(data, confidence=0.95):
    n = len(data)
    m = np.mean(data)
    se = st.sem(data)
    h = se * st.t.ppf((1 + confidence) / 2., n-1)
    return m, h

means = []
cis = []
for arr in [mender_knn, fine_knn, coarse_knn]:
    m, h = mean_ci(arr)
    means.append(m)
    cis.append(h)

bar_names = ['MENDER', 'Fine', 'Coarse']

fig = go.Figure(
    data=[
        go.Bar(
            x=bar_names,
            y=means,
            error_y=dict(type='data', array=cis, visible=True, color='black', thickness=2, width=8),
            marker_color=['#636EFA', '#EF553B', '#00CC96'],
            showlegend=False
        )
    ]
)

# Add individual data points
all_knn = [mender_knn, fine_knn, coarse_knn]
for i, arr in enumerate(all_knn):
    fig.add_trace(
        go.Scatter(
            x=[bar_names[i]] * len(arr),
            y=arr,
            mode='markers',
            marker=dict(color='black', size=8),
            name='Data points',
            showlegend=False
        )
    )

# Add p-value annotations above the Fine and Coarse bars
fig.add_annotation(
    x='Fine',
    y=means[1] + 0.05,
    text=f"p = {p_knn_mender_fine:.2f}",
    showarrow=False,
    font=dict(size=14),
    xanchor='left'
)
fig.add_annotation(
    x='Coarse',
    y=means[2] + 0.05,
    text=f"p = {p_knn_mender_coarse:.2f}",
    showarrow=False,
    font=dict(size=14),
    xanchor='left'
)

fig.update_layout(
    title={'text': 'KNN Accuracy Comparison (with 95% CI)', 'x': 0.5, 'xanchor': 'center'},
    yaxis_title='Accuracy (mean ± 95% CI)',
    xaxis_title='Representation',
    yaxis=dict(range=[0, 1]),
    template='plotly_white',
    width=500,
    height=450
)
fig.show()

In [33]:
# Calculate 95% confidence intervals for each group
def mean_ci(data, confidence=0.95):
    n = len(data)
    m = np.mean(data)
    se = st.sem(data)
    h = se * st.t.ppf((1 + confidence) / 2., n-1)
    return m, h

means = []
cis = []
for arr in [mender_svc, fine_svc, coarse_svc]:
    m, h = mean_ci(arr)
    means.append(m)
    cis.append(h)

bar_names = ['MENDER', 'Fine', 'Coarse']

fig = go.Figure(
    data=[
        go.Bar(
            x=bar_names,
            y=means,
            error_y=dict(type='data', array=cis, visible=True, color='black', thickness=2, width=8),
            marker_color=['#636EFA', '#EF553B', '#00CC96'],
            showlegend=False
        )
    ]
)

# Add individual data points
all_svc = [mender_svc, fine_svc, coarse_svc]
for i, arr in enumerate(all_svc):
    fig.add_trace(
        go.Scatter(
            x=[bar_names[i]] * len(arr),
            y=arr,
            mode='markers',
            marker=dict(color='black', size=8),
            name='Data points',
            showlegend=False
        )
    )

# Add p-value annotations above the Fine and Coarse bars
fig.add_annotation(
    x='Fine',
    y=means[1] + 0.05,
    text=f"p = {p_svc_mender_fine:.2f}",
    showarrow=False,
    font=dict(size=14),
    xanchor='left'
)
fig.add_annotation(
    x='Coarse',
    y=means[2] + 0.05,
    text=f"p = {p_svc_mender_coarse:.2f}",
    showarrow=False,
    font=dict(size=14),
    xanchor='left'
)

fig.update_layout(
    title={'text': 'SVM Accuracy Comparison (with 95% CI)', 'x': 0.5, 'xanchor': 'center'},
    yaxis_title='Accuracy (mean ± 95% CI)',
    xaxis_title='Representation',
    yaxis=dict(range=[0, 1]),
    template='plotly_white',
    width=500,
    height=450
)
fig.show()