In [1]:
import warnings
warnings.filterwarnings("ignore")
import MENDER
import scanpy as sc
import pandas as pd
import numpy as np
from sklearn.metrics import *
import time

import pysodb
from sklearn import svm

In [2]:
# Open a pysodb client and load experiment 'cortex' from dataset
# 'codeluppi2018spatial' (arguments are dataset first, then experiment).
DATASET_NAME, EXPERIMENT_NAME = 'codeluppi2018spatial', 'cortex'
sodb = pysodb.SODB()
adata_raw = sodb.load_experiment(DATASET_NAME, EXPERIMENT_NAME)

load experiment[cortex] in dataset[codeluppi2018spatial]


In [3]:
# Keep only valid cells: anything whose 'Region' annotation is 'Excluded' is dropped.
is_valid_cell = adata_raw.obs['Region'] != 'Excluded'
adata_raw = adata_raw[is_valid_cell]

In [4]:
# Name of the obs column that holds the region annotation
# (presumably 'gt' stands for ground truth -- confirm).
gt_obs = 'Region'

In [5]:
# MENDER spatial-domain pipeline (adapted from the MENDER tutorial).
# Steps: Leiden cell states -> multi-scale context representation -> clustering.

# ---- input parameters of MENDER ----
# number of scales, passed as n_scales below
scale = 6

# radius is set to 150 not 15, because the unit of the spatial coordinates is
# 0.1 um for this data, as suggested by MENDER's estimate_radius
radius = 150

# Expected number of domains = number of distinct annotated regions that actually
# occur in the data. nunique() counts observed values only, so the result does not
# rely on unused categories (e.g. the filtered-out 'Excluded') having been pruned.
n_domains = adata_raw.obs[gt_obs].nunique()
print(n_domains)

# record running time (covers preprocessing, representation and clustering)
time_st = time.time()

# work on a copy so the preprocessing below leaves adata_raw untouched
adata = adata_raw.copy()


######### determine cell state using standard Leiden [start]  #########
# this step can be optionally skipped if reliable cell type annotation is available
sc.pp.highly_variable_genes(adata, flavor="seurat_v3", n_top_genes=4000)
sc.pp.normalize_total(adata, inplace=True)
sc.pp.log1p(adata)

sc.pp.pca(adata)
sc.pp.neighbors(adata)
sc.tl.leiden(adata, resolution=2, key_added='ct', random_state=666)
adata.obs['ct'] = adata.obs['ct'].astype('category')
######### determine cell state using standard Leiden [end]  #########


# main body of MENDER
msm = MENDER.MENDER_single(
    adata,
    # ct_obs selects the obs column used as the cell state. It is not passed here,
    # so MENDER's default is used.
    # NOTE(review): presumably that default is 'ct', i.e. the Leiden labels
    # computed above -- confirm against the MENDER_single signature.
    # ct_obs='ct',

    random_seed=666
)


# set the MENDER parameters
msm.set_MENDER_para(
    # default of n_scales is 6
    n_scales=scale,

    # for single cell data, nn_mode is set to 'radius'
    nn_mode='radius',

    # neighbourhood radius; the default radius is 15 um (see the manuscript for why).
    # MENDER also provides a function 'estimate_radius' for estimating the radius
    nn_para=radius,

)
# construct the context representation
msm.run_representation(
    # the number of processes can be given here (e.g. 8); left at the default
)

# set the spatial clustering parameter
# positive values for the expected number of domains
# negative values for the clustering resolution
msm.run_clustering_normal(n_domains)

time_ed = time.time()
time_cost = time_ed - time_st

11
scale 0, median #cells per radius (r=150): 1.0
scale 1, median #cells per radius (r=150): 2.0
scale 2, median #cells per radius (r=150): 3.0
scale 3, median #cells per radius (r=150): 4.0
scale 4, median #cells per radius (r=150): 5.0
scale 5, median #cells per radius (r=150): 6.0
searching resolution to k=11
Res =  0.1 Num of clusters =  5
Res =  0.15000000000000002 Num of clusters =  7
Res changed to 0.15000000000000002
Res =  0.2 Num of clusters =  7
Res changed to 0.2
Res =  0.25 Num of clusters =  8
Res changed to 0.25
Res =  0.3 Num of clusters =  8
Res changed to 0.3
Res =  0.35 Num of clusters =  8
Res changed to 0.35
Res =  0.39999999999999997 Num of clusters =  8
Res changed to 0.39999999999999997
Res =  0.44999999999999996 Num of clusters =  9
Res changed to 0.44999999999999996
Res =  0.49999999999999994 Num of clusters =  10
Res changed to 0.49999999999999994
Res =  0.5499999999999999 Num of clusters =  11
recommended res =  0.5499999999999999


In [6]:
# Optional (disabled): export the representation stored in msm.adata.obsm['whole'] to CSV.
# X = msm.adata.obsm['whole']
# pd.DataFrame(msm.adata.obsm['whole']).to_csv("osmfish_whole2.csv")

In [7]:
# Classification target: the per-cell 'ClusterName' annotation, as a categorical.
cluster_names = adata_raw.obs['ClusterName']
y = cluster_names.astype('category')


In [8]:
# Sanity check: dimensions of the representation stored in obsm['whole'].
mender_representation = msm.adata.obsm['whole']
mender_representation.shape

(4839, 150)

In [9]:
from sklearn.cross_decomposition import CCA

# CCA between the MENDER representation (matrix1) and the expression matrix
# (matrix2). The two sets of canonical scores are joined column-wise into a
# single feature matrix.
# NOTE(review): 33 components presumably matches the number of genes in
# adata_raw.X -- confirm.
# NOTE(review): CCA is fitted on all cells before cross-validation, so held-out
# folds influence the projection -- confirm this is acceptable for the comparison.
matrix1 = msm.adata.obsm['whole']
matrix2 = adata_raw.X

cca = CCA(n_components=33)
# fit_transform fits the model and returns the (matrix1, matrix2) scores in one call
matrix1_c, matrix2_c = cca.fit_transform(matrix1, matrix2)
cca_concat = np.concatenate((matrix1_c, matrix2_c), axis=1)

In [10]:
# Feature matrices compared by the classifiers below.
X_mender = msm.adata.obsm['whole']  # MENDER representation
X_gene = adata_raw.X                # expression matrix of the filtered cells
X_cca = cca_concat                  # concatenated CCA scores

In [11]:
from sklearn.model_selection import cross_val_score, cross_val_predict, StratifiedKFold
# Shared 5-fold stratified splitter. Shuffling uses a fixed seed, so every
# cross_val_score call that receives this object evaluates on the same folds.
cv = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=42,
)

In [12]:
from sklearn.model_selection import cross_val_score, cross_val_predict, StratifiedKFold

# Linear SVM: per-fold accuracy for each representation, evaluated and printed
# in the order MENDER, gene expression, CCA.
lsvc = svm.LinearSVC(random_state=42)

linear_svc_results = []
for features in (X_mender, X_gene, X_cca):
    fold_accuracies = cross_val_score(lsvc, features, y, scoring='accuracy', cv=cv)
    print(fold_accuracies)
    linear_svc_results.append(fold_accuracies)

scores_linear_svc_mender, scores_linear_svc_gene, scores_linear_svc_cca = linear_svc_results

[0.57231405 0.57954545 0.54958678 0.55475207 0.56256463]
[0.75309917 0.76239669 0.74793388 0.77169421 0.8024819 ]
[0.82128099 0.81714876 0.80785124 0.77789256 0.84074457]


In [13]:
# RBF-kernel SVM: per-fold accuracy for each representation, evaluated and
# printed in the order MENDER, gene expression, CCA.
rbf_svc = svm.SVC(kernel='rbf', random_state=42)

rbf_results = []
for features in (X_mender, X_gene, X_cca):
    fold_accuracies = cross_val_score(rbf_svc, features, y, scoring='accuracy', cv=cv)
    print(fold_accuracies)
    rbf_results.append(fold_accuracies)

scores_rbf_mender, scores_rbf_gene, scores_rbf_cca = rbf_results

[0.52582645 0.54235537 0.53512397 0.51652893 0.53257497]
[0.72107438 0.73966942 0.71900826 0.69628099 0.71975181]
[0.77066116 0.76446281 0.76652893 0.75103306 0.78386763]


In [14]:
from sklearn.ensemble import RandomForestClassifier

# Random forest (unbounded depth): per-fold accuracy for each representation,
# evaluated and printed in the order MENDER, gene expression, CCA.
random_forest = RandomForestClassifier(max_depth=None, random_state=42)

random_forest_results = []
for features in (X_mender, X_gene, X_cca):
    fold_accuracies = cross_val_score(random_forest, features, y, scoring='accuracy', cv=cv)
    print(fold_accuracies)
    random_forest_results.append(fold_accuracies)

scores_r_forest_mender, scores_r_forest_gene, scores_r_forest_cca = random_forest_results

[0.65392562 0.66942149 0.63429752 0.6322314  0.65460186]
[0.79132231 0.81301653 0.80681818 0.79752066 0.81178904]
[0.76549587 0.79752066 0.77995868 0.76756198 0.79007239]


In [15]:
# Fold accuracies grouped by classifier. Each value lists the representations
# in the fixed order [Gene, CCA, MENDER]; consumers index the list positionally.
scores = {}
scores["Linear SVM"] = [scores_linear_svc_gene, scores_linear_svc_cca, scores_linear_svc_mender]
scores["RBF SVM"] = [scores_rbf_gene, scores_rbf_cca, scores_rbf_mender]
scores["Random Forest"] = [scores_r_forest_gene, scores_r_forest_cca, scores_r_forest_mender]

In [16]:
from scipy.stats import ttest_ind


def one_sided_t_test(first, second):
    """Return the one-sided p-value for the alternative mean(first) > mean(second).

    Runs scipy's two-sided independent-samples t-test without assuming equal
    variances (Welch) and converts the two-sided p-value to the "greater" tail.
    The t-statistic and the one-sided p-value are printed as a side effect.

    Args:
        first: sample expected to have the larger mean.
        second: sample to compare against.

    Returns:
        The one-sided p-value.

    NOTE(review): the samples are treated as independent. The notebook passes
    cross-validation fold scores produced with a shared splitter, so a paired
    test may be more appropriate -- confirm.
    """
    statistic, two_sided_p = ttest_ind(first, second, equal_var=False)

    # Half of the two-sided p-value lies in each tail: take that half when the
    # statistic points in the tested direction, otherwise its complement.
    tail_mass = two_sided_p / 2
    p_one_sided = tail_mass if statistic > 0 else 1 - tail_mass

    print(f"t-statistic: {statistic:.4f}")
    print(f"one-sided p-value: {p_one_sided:.4f}")
    return p_one_sided

In [17]:
# from other code
import plotly.graph_objects as go
import scipy.stats as st
from scipy import stats

# One bar chart per classifier: mean cross-validated accuracy of each
# representation with a 95% confidence interval, the individual fold scores,
# and one-sided p-values comparing CCA against the other two representations.


def mean_ci(data, confidence=0.95):
    """Return (mean, half_width) of a two-sided Student-t confidence interval.

    Args:
        data: 1-D sequence of observations (here: per-fold accuracies).
        confidence: confidence level of the interval, 0.95 by default.

    Returns:
        Tuple (m, h); the interval is m - h .. m + h.
    """
    n = len(data)
    m = np.mean(data)
    se = st.sem(data)
    h = se * st.t.ppf((1 + confidence) / 2., n-1)
    return m, h


# Bar order matches the order of the score lists stored in `scores`.
bar_names = ['Gene', 'CCA', 'MENDER']
bar_colors = ['#636EFA', '#EF553B', '#00CC96']

for clf_name, score in scores.items():
    # score holds the fold accuracies in the order [Gene, CCA, MENDER]
    all_data_points = [score[0], score[1], score[2]]

    # Mean and 95% CI half-width for each representation
    means = []
    cis = []
    for arr in all_data_points:
        m, h = mean_ci(arr)
        means.append(m)
        cis.append(h)

    fig = go.Figure(
        data=[
            go.Bar(
                x=bar_names,
                y=means,
                error_y=dict(type='data', array=cis, visible=True, color='black', thickness=2, width=8),
                marker_color=bar_colors,
                showlegend=False
            )
        ]
    )

    # Add individual data points
    for i, arr in enumerate(all_data_points):
        fig.add_trace(
            go.Scatter(
                x=[bar_names[i]] * len(arr),
                y=arr,
                mode='markers',
                marker=dict(color='black', size=8),
                name='Data points',
                showlegend=False
            )
        )

    # p-value that CCA scores higher than Gene, shown next to the Gene bar.
    # The label is offset from the Gene mean (means[0]) so that it sits just above
    # the bar it annotates, consistent with the MENDER label below.
    fig.add_annotation(
        x='Gene',
        y=means[0] + 0.05,
        text=f"p = {one_sided_t_test(score[1], score[0]):.4f}",
        showarrow=False,
        font=dict(size=14),
        xanchor='left'
    )
    # p-value that CCA scores higher than MENDER, shown next to the MENDER bar.
    fig.add_annotation(
        x='MENDER',
        y=means[2] + 0.05,
        text=f"p = {one_sided_t_test(score[1], score[2]):.4f}",
        showarrow=False,
        font=dict(size=14),
        xanchor='left'
    )

    fig.update_layout(
        title={'text': f'{clf_name} Accuracy Comparison (with 95% CI)', 'x': 0.5, 'xanchor': 'center'},
        yaxis_title='Accuracy (mean ± 95% CI)',
        xaxis_title='Representation',
        yaxis=dict(range=[0, 1]),
        template='plotly_white',
        width=500,
        height=450
    )
    fig.show()

t-statistic: 3.2259
one-sided p-value: 0.0061
t-statistic: 21.3698
one-sided p-value: 0.0000


t-statistic: 5.5471
one-sided p-value: 0.0003
t-statistic: 34.5027
one-sided p-value: 0.0000


t-statistic: -3.1948
one-sided p-value: 0.9924
t-statistic: 14.0552
one-sided p-value: 0.0000
