In [19]:
import scanpy as sc
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from anndata import AnnData
import MENDER
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn import svm
from scipy import stats
import re
import os
import glob
import tifffile
import pandas as pd
from skimage.measure import regionprops_table
import numpy as np
import plotly.graph_objects as go
import scipy.stats as st
from scipy import stats

# Load data

In [20]:
# Directory containing your segmented mask TIFF files
mask_dir = r'TNBC_shareCellData'
# Output CSV file path
output_csv = "cell_centroids.csv"

records = []

# Loop through each TIFF mask
for mask_path in glob.glob(os.path.join(mask_dir, "*.tiff")):
    # Load the mask (each cell labeled by a unique integer)
    mask = tifffile.imread(mask_path)

    print("Shape:", mask.shape)          # e.g. (1024, 1024)
    print("Data type:", mask.dtype)      # e.g. uint16
    labels = np.unique(mask)
    print(f"Found {len(labels)-1} objects (ignoring background=0)")
    print("Some label values:", labels[:10], "…", labels[-10:])
    # Extract properties: label ID and centroids
    props = regionprops_table(
        mask,
        properties=('label', 'centroid')
    )
    
    # Convert to DataFrame
    df = pd.DataFrame(props)
    # Keep track of source image
    
    
    nums = re.search(r"\d+", os.path.basename(mask_path))
    df['image'] = int(nums.group()) if nums else None
    records.append(df)

# Concatenate all image DataFrames
all_cells = pd.concat(records, ignore_index=True)

# Rename columns for clarity
all_cells = all_cells.rename(
    columns={
        'label': 'CellID',
        'centroid-0': 'CentroidY',
        'centroid-1': 'CentroidX'
    }
)

# Save to CSV
all_cells.to_csv(output_csv, index=False)
print(f"Saved {len(all_cells)} cell centroids to {output_csv}")


Shape: (2048, 2048)
Data type: uint16
Found 5213 objects (ignoring background=0)
Some label values: [0 1 2 3 4 5 6 7 8 9] … [5204 5205 5206 5207 5208 5209 5210 5211 5212 5213]
Shape: (2048, 2048)
Data type: uint16
Found 6140 objects (ignoring background=0)
Some label values: [0 1 2 3 4 5 6 7 8 9] … [6131 6132 6133 6134 6135 6136 6137 6138 6139 6140]
Shape: (2048, 2048)
Data type: uint16
Found 8772 objects (ignoring background=0)
Some label values: [0 1 2 3 4 5 6 7 8 9] … [8763 8764 8765 8766 8767 8768 8769 8770 8771 8772]
Shape: (2048, 2048)
Data type: uint16
Found 9738 objects (ignoring background=0)
Some label values: [0 1 2 3 4 5 6 7 8 9] … [9729 9730 9731 9732 9733 9734 9735 9736 9737 9738]
Shape: (2048, 2048)
Data type: uint16
Found 7751 objects (ignoring background=0)
Some label values: [0 1 2 3 4 5 6 7 8 9] … [7742 7743 7744 7745 7746 7747 7748 7749 7750 7751]
Shape: (2048, 2048)
Data type: uint16
Found 3560 objects (ignoring background=0)
Some label values: [0 1 2 3 4 5 6 7 8 9

Perform Random subsampling to decreace dataset size

In [21]:
# Read in the full cell‐level table
df = pd.read_csv("TNBC_shareCellData/cellData.csv", sep=",")

# Choose how many cells to keep per patient:
# The patient with the fewest cells in the original dataset had 2217
# The mean is 5923.55
# The most cells is 9738
n = 2217

# 3. Option A: if you only want patients who have ≥ n cells, and you want exactly n per patient:
#    (patients with fewer than n cells are dropped entirely)
df_filtered = df.groupby("SampleID").filter(lambda sub: len(sub) >= n)
df_subsample = (
    df_filtered
    .groupby("SampleID", group_keys=False)
    .sample(n=n, random_state=42)
)


# Write out a new CSV (e.g. “cellData_100.csv”)
out_name = f"TNBC_shareCellData/cellData_{n}.csv"
df_subsample.to_csv(out_name, sep=",", index=False)
print(f"Wrote subsampled table with {n} cells per patient to:\n  {out_name}")

Wrote subsampled table with 2217 cells per patient to:
  TNBC_shareCellData/cellData_2217.csv


In [22]:
df = pd.read_csv("TNBC_shareCellData/cellData_600.csv", sep=",")
df_p = pd.read_csv("TNBC_shareCellData/patient_class.csv", header=None)
df_p.rename(columns={0:"SampleID", 1:"subtype"}, inplace=True)
subtype_map = {
    0: "Mixed",
    1: "Compartimentalized",
    2: "Cold"
}

df_p["subtype"] = df_p["subtype"].map(subtype_map).astype('category')



# TO DO: LOOK INTO THIS FILTERING SINCE SAMPLEID 30 DOESN"T ACTUALLY EXIST SO MAYBE IT IS MORE COMPLICATED THAN JUST REMOVING A FEW
bad_Values = [42,43,44]

df = df[~df["SampleID"].isin(bad_Values)]


meta_cols = ['SampleID', 'cellLabelInImage', 'cellSize', 'tumorYN', 
             'tumorCluster', 'Group', 'immuneCluster', 'immuneGroup']

expr_cols = [c for c in df.columns if c not in meta_cols]

adata = AnnData(
    X = df[expr_cols].values,
    obs = df[meta_cols],
    var=pd.DataFrame(index=expr_cols)
)

adata.obs["UniqueID"] = adata.obs['SampleID'].astype(str) + "_" + adata.obs['cellLabelInImage'].astype(str)
all_cells["UniqueID"] = all_cells["image"].astype(str) + "_" + all_cells["CellID"].astype(str)
all_cells = all_cells.set_index('UniqueID', drop = False)
adata.obs = adata.obs.set_index('UniqueID', drop=False)

all_cells_aligned = all_cells.reindex(adata.obs_names)

adata.obsm["spatial"] = all_cells_aligned[["CentroidX", "CentroidY"]].to_numpy()

# X = adata.X.toarray() if hasattr(adata.X, "toarray") else adata.X
# min_val = X.min()
# print("Minimum before shift:", min_val)

# 2. If it’s < 0, shift everything up so the minimum is 0
# if min_val < 0:
#     shift = -min_val
#     print(f"Shifting all values by +{shift:.3g} to eliminate negatives")
#     X += shift
#     adata.X = X  # put it back into your AnnData

# sc.pp.normalize_total(adata)         # e.g. counts per cell
# sc.pp.log1p(adata)
# sc.pp.scale(adata)

# sc.tl.pca(adata, svd_solver='arpack', n_comps=50)
# sc.pp.neighbors(adata)               # builds graph on PCA
# sc.tl.umap(adata)
code2label_group = {
    1: "Unidentified",
    2: "Immune",
    3: "EndoThelial",
    4: "Mesenchymal-like",
    5: "Tumor",
    6: "Keratin-positive tumor"
}

code2label_immunegroup = {
    1: "Tregs",
    2: "CD4 T",
    3: "CD8 T",
    4: "CD3 T",
    5: "NK",
    6: "B",
    7: "Neutrophils",
    8: "Macrophages",
    9: "DC",
    10: "DC/Mono",
    11: "Mono/Neu",
    12: "Other immune"
}
adata.obs["Group"] = adata.obs["Group"].map(code2label_group).astype('category')
adata.obs["immuneGroup"] = adata.obs["immuneGroup"].map(code2label_immunegroup)
adata.obs = pd.merge(
    adata.obs,
    df_p,
    on="SampleID",
    how="left"
)

# Copy for later use
adata_raw = adata.copy()

adata.obs



Unnamed: 0,SampleID,cellLabelInImage,cellSize,tumorYN,tumorCluster,Group,immuneCluster,immuneGroup,UniqueID,subtype
0,1,1839,230,0,0,Immune,85,CD4 T,1_1839,Mixed
1,1,3381,361,0,0,Immune,84,B,1_3381,Mixed
2,1,3934,121,0,0,Immune,85,B,1_3934,Mixed
3,1,3648,187,0,0,Immune,75,B,1_3648,Mixed
4,1,1639,567,1,0,Keratin-positive tumor,0,,1_1639,Mixed
...,...,...,...,...,...,...,...,...,...,...
23995,41,2848,416,0,0,Immune,56,Other immune,41_2848,Compartimentalized
23996,41,722,738,1,0,Keratin-positive tumor,0,,41_722,Compartimentalized
23997,41,2184,639,0,0,Immune,46,CD3 T,41_2184,Compartimentalized
23998,41,631,258,0,0,Immune,65,B,41_631,Compartimentalized


In [23]:
# print(adata.var.columns)
counts = df.groupby("SampleID").size().reset_index(name="n_cells")

print(counts.head(45))


    SampleID  n_cells
0          1      600
1          2      600
2          3      600
3          4      600
4          5      600
5          6      600
6          7      600
7          8      600
8          9      600
9         10      600
10        11      600
11        12      600
12        13      600
13        14      600
14        15      600
15        16      600
16        17      600
17        18      600
18        19      600
19        20      600
20        21      600
21        22      600
22        23      600
23        24      600
24        25      600
25        26      600
26        27      600
27        28      600
28        29      600
29        31      600
30        32      600
31        33      600
32        34      600
33        35      600
34        36      600
35        37      600
36        38      600
37        39      600
38        40      600
39        41      600


In [24]:
y = adata.obs.groupby(['SampleID'])['subtype'].first().reset_index(drop=True)
y

0                  Mixed
1                  Mixed
2     Compartimentalized
3     Compartimentalized
4     Compartimentalized
5     Compartimentalized
6                  Mixed
7                  Mixed
8     Compartimentalized
9     Compartimentalized
10                 Mixed
11                 Mixed
12                 Mixed
13                 Mixed
14                  Cold
15    Compartimentalized
16                 Mixed
17                 Mixed
18                  Cold
19                 Mixed
20                 Mixed
21                  Cold
22                 Mixed
23                  Cold
24                  Cold
25                  Cold
26                 Mixed
27    Compartimentalized
28                 Mixed
29                 Mixed
30    Compartimentalized
31                 Mixed
32    Compartimentalized
33    Compartimentalized
34    Compartimentalized
35    Compartimentalized
36                 Mixed
37                 Mixed
38    Compartimentalized
39    Compartimentalized


# Coarse representation

In [25]:
coarse = adata.obs.groupby(["SampleID", "Group"]).size().unstack(fill_value=0).reset_index(drop=True)
# Normalize the counts to percentages
coarse_norm = coarse.div(coarse.sum(axis=1), axis=0)
coarse_norm

Group,EndoThelial,Immune,Keratin-positive tumor,Mesenchymal-like,Tumor,Unidentified
0,0.003333,0.465,0.513333,0.011667,0.0,0.006667
1,0.028333,0.341667,0.585,0.043333,0.0,0.001667
2,0.028333,0.526667,0.316667,0.118333,0.006667,0.003333
3,0.013333,0.628333,0.265,0.03,0.056667,0.006667
4,0.035,0.538333,0.356667,0.056667,0.003333,0.01
5,0.015,0.358333,0.585,0.023333,0.001667,0.016667
6,0.001667,0.173333,0.74,0.036667,0.02,0.028333
7,0.0,0.176667,0.478333,0.103333,0.188333,0.053333
8,0.018333,0.493333,0.456667,0.01,0.006667,0.015
9,0.006667,0.648333,0.301667,0.035,0.0,0.008333


In [26]:
knn = KNeighborsClassifier()
coarse_knn = cross_val_score(knn, coarse_norm, y, scoring='accuracy', cv=5)
print(coarse_knn)
coarse_knn.mean(), coarse_knn.std()

[0.875 0.5   1.    0.875 0.625]


(0.775, 0.18371173070873836)

In [27]:
svc = svm.SVC()
coarse_svc = cross_val_score(svc, coarse_norm, y, scoring='accuracy', cv=5)
print(coarse_svc)
coarse_svc.mean(), coarse_svc.std()

[0.875 0.5   0.875 0.75  0.625]


(0.725, 0.1457737973711325)

# Fine representation

In [28]:
data_fine = adata.obs.copy()
data_fine.loc[data_fine["immuneGroup"].isna(), "immuneGroup"] = data_fine.loc[data_fine["immuneGroup"].isna(), "Group"]

In [29]:
fine = data_fine.groupby(["SampleID", "immuneGroup"]).size().unstack(fill_value=0).reset_index(drop=True)
# Normalize the counts to percentages
fine_norm = fine.div(fine.sum(axis=1), axis=0)
fine

immuneGroup,B,CD3 T,CD4 T,CD8 T,DC,DC/Mono,EndoThelial,Keratin-positive tumor,Macrophages,Mesenchymal-like,Mono/Neu,NK,Neutrophils,Other immune,Tregs,Tumor,Unidentified
0,121,30,39,18,0,17,2,308,34,7,0,1,0,19,0,0,4
1,1,2,6,48,9,1,17,351,106,26,15,6,1,10,0,0,1
2,41,25,45,58,6,40,17,190,76,71,2,1,1,17,4,4,2
3,40,28,60,27,0,39,8,159,90,18,5,0,13,61,14,34,4
4,0,14,27,63,0,40,21,214,127,34,21,0,11,17,3,2,6
5,4,8,12,19,0,19,9,351,62,14,53,0,0,38,0,1,10
6,0,1,1,1,0,0,1,444,46,22,0,0,54,1,0,12,17
7,3,7,1,10,0,4,0,287,57,62,6,0,1,17,0,113,32
8,2,20,33,31,8,20,11,274,118,6,22,1,7,29,5,4,9
9,3,7,21,53,0,15,4,181,176,21,65,5,13,27,4,0,5


In [30]:
knn = KNeighborsClassifier()
fine_knn = cross_val_score(knn, fine_norm, y, scoring='accuracy', cv=5)
print(fine_knn)
fine_knn.mean(), fine_knn.std()

[0.75  0.5   0.75  0.875 0.625]


(0.7, 0.1274754878398196)

In [31]:
svc = svm.SVC()
fine_svc = cross_val_score(svc, fine_norm, y, scoring='accuracy', cv=5)
print(fine_svc)
fine_svc.mean(), fine_svc.std()

[0.875 0.5   0.875 0.75  0.625]


(0.725, 0.1457737973711325)

# MENDER representation

In [32]:
# batch_obs = 'subtype'
batch_obs = 'SampleID'
scale = 3
radius = 15

# I'm not sure why they did so many copies but I took it from this code 
# https://mender-tutorial.readthedocs.io/en/latest/MERSCOPE.html
adata = adata_raw.copy()

# adata.obs['SampleID'] = adata.obs['SampleID'].astype('category')

# Only do it for these 18 samples to make it run more quickly. 6 mixed, comp and cold
# sample_ids = [5, 13, 1, 2, 3, 4]
# sample_ids = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 15, 19, 22, 24, 25, 26]
sample_ids = [
    1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21,  22, 23, 24, 25, 26,
    27, 28, 29, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41,
]

adata = adata[adata.obs['SampleID'].isin(sample_ids)].copy()

adata.obs['SampleID'] = adata.obs['SampleID'].astype('category')


# TODO: I think the grouping here should be based on fine groups
# main body of MENDER
msm = MENDER.MENDER(
    adata,
    batch_obs = batch_obs,
    # determine which cell state to use
    # In our case the cell type does not need to be estimated since we have a ground truth stored in 
    # the dataset under all_group_name
    ct_obs= 'Group',
    random_seed=42,
    verbose=True
)


# set the MENDER parameters


msm.prepare()
msm.set_MENDER_para(
    # default of n_scales is 6
    n_scales=scale,

    # for single cell data, nn_mode is set to 'radius'
    nn_mode='radius',

    # default of n_scales is 15 um (see the manuscript for why).
    # MENDER also provide a function 'estimate_radius' for estimating the radius
    nn_para=radius,
)


# construct the context representation
msm.run_representation_mp(
    8
    # the number of processings
)

# set the spatial clustering parameter
# positive values for the expected number of domains
# negative values for the clustering resolution
msm.run_clustering_normal(-0.5)
# msm.run_clustering_normal(2)
# msm.run_clustering_normal(9)
msm.adata_MENDER.obs


default number of process is 200



See the tutorial for concat at: https://anndata.readthedocs.io/en/latest/concatenation.html


Unnamed: 0,SampleID,cellLabelInImage,cellSize,tumorYN,tumorCluster,Group,immuneCluster,immuneGroup,UniqueID,subtype,batch,MENDER_leiden_res0.5,MENDER
0-0,1,1839,230,0,0,Immune,85,CD4 T,1_1839,Mixed,0,22,22
1-0,1,3381,361,0,0,Immune,84,B,1_3381,Mixed,0,33,33
2-0,1,3934,121,0,0,Immune,85,B,1_3934,Mixed,0,22,22
3-0,1,3648,187,0,0,Immune,75,B,1_3648,Mixed,0,32,32
4-0,1,1639,567,1,0,Keratin-positive tumor,0,,1_1639,Mixed,0,25,25
...,...,...,...,...,...,...,...,...,...,...,...,...,...
23995-39,41,2848,416,0,0,Immune,56,Other immune,41_2848,Compartimentalized,39,25,25
23996-39,41,722,738,1,0,Keratin-positive tumor,0,,41_722,Compartimentalized,39,6,6
23997-39,41,2184,639,0,0,Immune,46,CD3 T,41_2184,Compartimentalized,39,5,5
23998-39,41,631,258,0,0,Immune,65,B,41_631,Compartimentalized,39,22,22


In [33]:
mender = msm.adata_MENDER.obs.groupby(['SampleID', 'MENDER']).size().unstack(fill_value=0).reset_index(drop=True)
mender_norm = mender.div(mender.sum(axis=1), axis=0)
mender_norm

MENDER,0,1,2,3,4,5,6,7,8,9,...,84,85,86,87,88,89,90,91,92,93
0,0.238333,0.146667,0.083333,0.046667,0.068333,0.035,0.006667,0.026667,0.015,0.008333,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.238333,0.105,0.11,0.078333,0.048333,0.045,0.018333,0.008333,0.013333,0.015,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001667
2,0.181667,0.168333,0.036667,0.03,0.08,0.08,0.013333,0.035,0.005,0.038333,...,0.0,0.0,0.0,0.0,0.005,0.001667,0.0,0.0,0.0,0.0
3,0.125,0.215,0.051667,0.041667,0.133333,0.098333,0.008333,0.048333,0.006667,0.018333,...,0.0,0.0,0.0,0.0,0.0,0.0,0.001667,0.001667,0.0,0.0
4,0.175,0.22,0.063333,0.036667,0.096667,0.048333,0.025,0.025,0.006667,0.025,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001667,0.0,0.0
5,0.26,0.148333,0.111667,0.065,0.045,0.036667,0.04,0.021667,0.015,0.008333,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001667,0.0
6,0.231667,0.065,0.143333,0.115,0.02,0.01,0.056667,0.005,0.026667,0.015,...,0.0,0.0,0.003333,0.0,0.0,0.0,0.0,0.0,0.0,0.001667
7,0.205,0.061667,0.076667,0.035,0.013333,0.015,0.02,0.001667,0.021667,0.031667,...,0.0,0.0,0.0,0.0,0.0,0.0,0.001667,0.0,0.0,0.0
8,0.203333,0.186667,0.065,0.048333,0.07,0.063333,0.031667,0.031667,0.013333,0.008333,...,0.0,0.0,0.0,0.0,0.0,0.0,0.001667,0.0,0.0,0.0
9,0.128333,0.206667,0.041667,0.035,0.103333,0.101667,0.001667,0.05,0.005,0.008333,...,0.0,0.0,0.0,0.001667,0.001667,0.0,0.0,0.0,0.0,0.003333


In [16]:
# Only do it for these 18 samples to make it run more quickly. 6 mixed, comp and cold
# sample_ids = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 15, 19, 22, 24, 25, 26]
y_raw = adata_raw.obs.groupby(['SampleID'])['subtype'].first()
y_raw = y_raw[sample_ids]

In [17]:
knn = KNeighborsClassifier()
mender_knn = cross_val_score(knn, mender_norm, y_raw, scoring='accuracy', cv=5)
print(mender_knn)
mender_knn.mean(), mender_knn.std()

[0.875 0.625 0.875 0.75  0.625]


(0.75, 0.11180339887498948)

In [18]:
svc = svm.SVC()
mender_svc = cross_val_score(svc, mender_norm, y_raw, scoring='accuracy', cv=5)
print(mender_svc)
mender_svc.mean(), mender_svc.std()

[0.875 0.625 0.875 0.875 0.625]


(0.775, 0.1224744871391589)

# T-test

## KNN

In [None]:
# By default, ttest_ind returns a two-sided p-value
t_stat, p_two_sided = stats.ttest_ind(mender_knn, fine_knn, equal_var=False)

# Convert to one-sided p-value
if t_stat > 0:
    p_one_sided = p_two_sided / 2
else:
    p_one_sided = 1 - p_two_sided / 2

print(f"t-statistic: {t_stat:.4f}")
print(f"one-sided p-value: {p_one_sided:.4f}")
p_knn_mender_fine = p_one_sided

t-statistic: 4.4907
one-sided p-value: 0.0054


  t_stat, p_two_sided = stats.ttest_ind(mender_knn, fine_knn, equal_var=False)


In [None]:
# By default, ttest_ind returns a two-sided p-value
t_stat, p_two_sided = stats.ttest_ind(mender_knn, coarse_knn, equal_var=False)

# Convert to one-sided p-value
if t_stat > 0:
    p_one_sided = p_two_sided / 2
else:
    p_one_sided = 1 - p_two_sided / 2

print(f"t-statistic: {t_stat:.4f}")
print(f"one-sided p-value: {p_one_sided:.4f}")
p_knn_mender_coarse = p_one_sided

t-statistic: 2.8284
one-sided p-value: 0.0237


  t_stat, p_two_sided = stats.ttest_ind(mender_knn, coarse_knn, equal_var=False)


## SVM

In [None]:
# By default, ttest_ind returns a two-sided p-value
t_stat, p_two_sided = stats.ttest_ind(mender_svc, fine_svc, equal_var=False)

# Convert to one-sided p-value
if t_stat > 0:
    p_one_sided = p_two_sided / 2
else:
    p_one_sided = 1 - p_two_sided / 2

print(f"t-statistic: {t_stat:.4f}")
print(f"one-sided p-value: {p_one_sided:.4f}")
p_svc_mender_fine = p_one_sided

t-statistic: 3.1623
one-sided p-value: 0.0171


  t_stat, p_two_sided = stats.ttest_ind(mender_svc, fine_svc, equal_var=False)


In [None]:
# By default, ttest_ind returns a two-sided p-value
t_stat, p_two_sided = stats.ttest_ind(mender_svc, coarse_svc, equal_var=False)

# Convert to one-sided p-value
if t_stat > 0:
    p_one_sided = p_two_sided / 2
else:
    p_one_sided = 1 - p_two_sided / 2

print(f"t-statistic: {t_stat:.4f}")
print(f"one-sided p-value: {p_one_sided:.4f}")
p_svc_mender_coarse = p_one_sided

t-statistic: 3.6742
one-sided p-value: 0.0107


  t_stat, p_two_sided = stats.ttest_ind(mender_svc, coarse_svc, equal_var=False)


# Visualize results

In [None]:
# Calculate 95% confidence intervals for each group
def mean_ci(data, confidence=0.95):
    n = len(data)
    m = np.mean(data)
    se = st.sem(data)
    h = se * st.t.ppf((1 + confidence) / 2., n-1)
    return m, h

means = []
cis = []
for arr in [mender_knn, fine_knn, coarse_knn]:
    m, h = mean_ci(arr)
    means.append(m)
    cis.append(h)

bar_names = ['MENDER', 'Fine', 'Coarse']

fig = go.Figure(
    data=[
        go.Bar(
            x=bar_names,
            y=means,
            error_y=dict(type='data', array=cis, visible=True, color='black', thickness=2, width=8),
            marker_color=['#636EFA', '#EF553B', '#00CC96'],
            showlegend=False
        )
    ]
)

# Add individual data points
all_knn = [mender_knn, fine_knn, coarse_knn]
for i, arr in enumerate(all_knn):
    fig.add_trace(
        go.Scatter(
            x=[bar_names[i]] * len(arr),
            y=arr,
            mode='markers',
            marker=dict(color='black', size=8),
            name='Data points',
            showlegend=False
        )
    )

# Add p-value annotations above the Fine and Coarse bars
fig.add_annotation(
    x='Fine',
    y=means[1] + 0.05,
    text=f"p = {p_knn_mender_fine:.2f}",
    showarrow=False,
    font=dict(size=14),
    xanchor='left'
)
fig.add_annotation(
    x='Coarse',
    y=means[2] + 0.05,
    text=f"p = {p_knn_mender_coarse:.2f}",
    showarrow=False,
    font=dict(size=14),
    xanchor='left'
)

fig.update_layout(
    title={'text': 'KNN Accuracy Comparison (with 95% CI)', 'x': 0.5, 'xanchor': 'center'},
    yaxis_title='Accuracy (mean ± 95% CI)',
    xaxis_title='Representation',
    yaxis=dict(range=[0, 1]),
    template='plotly_white',
    width=500,
    height=450
)
fig.show()