## Xenium CellCharter niche derivation

In [1]:
import sys 
import os
from datetime import datetime
# Date stamp (YYYY-MM-DD) taken at the moment this cell runs
today = f"{datetime.now():%Y-%m-%d}"
import anndata as ad
import hdf5plugin
import squidpy as sq
import pandas as pd
import scanpy as sc
import numpy as np
import matplotlib.pyplot as plt

  from pkg_resources import DistributionNotFound, get_distribution


In [2]:
import scvi
import cellcharter as cc
from lightning.pytorch import seed_everything

# Fix the random seed once and hand the same value to Lightning and scvi-tools
SEED = 12345
seed_everything(SEED)
scvi.settings.seed = SEED

Seed set to 12345
Seed set to 12345


In [3]:
# Add repo path to sys path (allows to access scripts and metadata from repo)
repo_path = '/nfs/team205/vk8/projects/thymus_ageing_atlas/Spatial_analyses/'
sys.path.insert(1, repo_path) 
sys.path.insert(2, '/nfs/team205/vk8/projects/thymus_ageing_atlas/General_analysis/scripts')

%reload_ext autoreload

In [4]:
from IPython.core.interactiveshell import InteractiveShell
# Set on the class, so it applies to the running shell for the rest of the session
InteractiveShell.ast_node_interactivity = "all" # show the output of every expression in a cell, not just the last one
# Use the full option names: abbreviated keys only work through pandas' pattern
# matching and can become ambiguous if similarly named options are added.
pd.set_option('display.max_columns', None)  # display all the columns in pandas
pd.set_option('display.max_rows', 100)  # show at most 100 rows

In [32]:
import matplotlib
import matplotlib.pyplot as plt
from matplotlib import font_manager, rcParams

# Font type 42 for PDF output
rcParams['pdf.fonttype'] = 42

# Scanpy figure defaults: 80 dpi on screen, 300 dpi when saved, PDF as the save format
sc.settings.set_figure_params(
    dpi=80,
    color_map='RdPu',
    vector_friendly=True,
    format='pdf',
    dpi_save=300,
)

# Register the Arial font file with matplotlib's font manager
font_manager.fontManager.addfont("/nfs/team205/ny1/ThymusSpatialAtlas/software/Arial.ttf")
#plt.style.use(f'/nfs/team205/vk8/projects/thymus_ageing_atlas/General_analysis/scripts/plotting/thyAgeing.mplstyle')

In [6]:
# Define plot and path dirs
# Project sub-directories, all located directly under repo_path
plot_path, data_path, model_path, results_path = (
    os.path.join(repo_path, subdir)
    for subdir in ('plots', 'data', 'models', 'results')
)
# Data shared with the general (non-spatial) analysis
general_data_path = '/nfs/team205/vk8/projects/thymus_ageing_atlas/General_analysis/data'

## Load adata 

In [7]:
# NOTE(review): the file name ends in .zarr but it is opened with the h5ad reader —
# confirm the on-disk format matches the reader.
xenium_adata_file = f'{data_path}/xenium/adata_xenium_2025-01-14.zarr'
xenium_adata = ad.read_h5ad(xenium_adata_file)

In [8]:
# Keep only cells with at least 100 counts (xenium_adata is filtered in place)
sc.pp.filter_cells(xenium_adata, min_counts=100)

# Read the per-cell annotation table and copy its columns into obs, aligned on
# the cell names; cells missing from the table end up with NaN.
knn_pred_file = f'{data_path}/xenium/ThyAge_xenium_scarches_annos_2025-04-09.csv'
knn_pred_df = pd.read_csv(knn_pred_file, index_col=0)
knn_cols = knn_pred_df.columns.to_list()
xenium_adata.obs[knn_cols] = knn_pred_df.reindex(index=xenium_adata.obs_names)

In [9]:
# The copy of the raw counts into a layer is disabled, so after this cell X holds
# only the normalised, log-transformed values.
#xenium_adata.layers["counts"] = xenium_adata.X.copy()

# Scale every cell to a total of 10,000 counts, then apply log(1 + x); order matters.
sc.pp.normalize_total(xenium_adata, target_sum=1e4)
sc.pp.log1p(xenium_adata)

## Load scVI dimensionality reduction 

In [10]:
import pickle

# NOTE(review): pickle.load can execute code stored in the file — only load trusted files.
embed_file = f'{data_path}/xenium/ThyAge_scell2xenium_integ_2025-04-09_embed.pickle'
with open(embed_file, 'rb') as handle:
    scell2xen_embed = pickle.load(handle)

# Copy the stored embeddings and pairwise matrices onto the AnnData object.
# Assumes the rows are in the same order as xenium_adata.obs — TODO confirm.
for embed_key in ('X_scVI', 'X_umap'):
    xenium_adata.obsm[embed_key] = scell2xen_embed[embed_key].copy()
xenium_adata.obsp = scell2xen_embed['obsp'].copy()

  scell2xen_embed = pickle.load(file)


In [None]:
# UMAP coloured by 'knn_pred-taa_l3' (presumably one of the annotation columns added from the CSV — verify)
sc.pl.umap(xenium_adata, color = 'knn_pred-taa_l3')

## CellCharter's spatial clustering
It's now time to compute the spatial clusters.<br>
CellCharter encodes all cells of each sample as a network. Every cell is a node and two cells are connected by an edge if they are physically close to each other in the tissue.
We can obtain this network using squidpy's `gr.spatial_neighbors` function. <br> <br>
As shown in the [CODEX notebook](https://cellcharter.readthedocs.io/en/latest/notebooks/codex_mouse_spleen.html), the Delaunay triangulation has the drawback of generating connections between distant cells.<br>
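We remove these long links. Here this is done with the `percentile=99` argument of `gr.spatial_neighbors`, which discards edges longer than the 99th percentile of edge lengths; CellCharter's `gr.remove_long_links` function is an alternative way to do this.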


In [None]:
# Build the spatial graph from the coordinates in obsm['spatial'], one graph per 'Sample_hr' value
sq.gr.spatial_neighbors(
    xenium_adata,
    spatial_key='spatial',
    library_key='Sample_hr',
    coord_type='generic',
    delaunay=True,
    percentile=99,  # distance threshold: edges above the 99th percentile are dropped
)

The next step is the neighborhood aggregation, that incorporates the features of a cell's environment (i.e. neighbors) into the features of the cell itself. <br>
It consists of concatenating the features of every cell with the features aggregated from its neighbors at increasing layers from the considered cell, up to a certain layer `n_layers`.<br>
Aggregation functions are used to obtain a single feature vector from the vectors of multiple neighbors, with the default being the `mean` function.

In this case we use 3 layers of neighbors, so we obtain, for each cell, a feature vector of length 40. That is the cell's reduced vector size from scVI (10) plus the 3 aggregated vectors of length 10 each, from the 3 layers of neighbors.

In [None]:
# Concatenate each cell's scVI vector with aggregates from 3 layers of neighbours;
# the result is written to obsm['X_cellcharter'].
cc.gr.aggregate_neighbors(
    xenium_adata,
    n_layers=3,
    use_rep='X_scVI',
    out_key='X_cellcharter',
    sample_key='Sample_hr',
)

Now we can cluster cells based on these features that merge the information of each cell with its neighbors. <br>
If you already have a specific number of clusters in mind, you can use the `tl.Cluster` class as shown in the [CODEX notebook](https://cellcharter.readthedocs.io/en/latest/notebooks/codex_mouse_spleen.html#cellcharter-s-spatial-clustering). <br><br>
However, if you want to use CellCharter's stability analysis to suggest the best candidates for the number of clusters, you can use the `tl.ClusterAutoK` class. <br>
It is going to repeat `max_runs` times the clustering for each value K in the range provided in the `n_clusters` parameter and it will compute the stability between adjacent Ks, as described in [CellCharter's paper](https://doi.org/10.1101/2023.01.10.523386). If the stability curve between two runs doesn't change by more than the `convergence_tol` parameter, the process will complete early without running the full `max_runs` times. <br>

Of course, this will increase the computational time, but the scalability of CellCharter's clustering step makes this analysis tractable even for large-scale datasets. <br>

In [None]:
# Stability analysis over the cluster-number range 2-10, with at most 10 repeated
# runs and early stopping once the stability curve changes by less than 0.001.
autok = cc.tl.ClusterAutoK(n_clusters=(2, 10), max_runs=10, convergence_tol=0.001)

In [None]:
# Fit the repeated clusterings on the neighbourhood-aggregated features in obsm['X_cellcharter']
autok.fit(xenium_adata, use_rep='X_cellcharter')

The best candidates for the number of clusters are the ones associated with peaks of stability. In this case, 8 is a good candidate.

In [None]:
# Plot the stability of the fitted model for each number of clusters
cc.pl.autok_stability(autok)

If we omit the `k` parameter in `autok.predict`, CellCharter will automatically cluster cells with the number of clusters associated with the highest stability. <br>
We can still choose the number of clusters by setting the `k` parameter to the desired value.

In [None]:
# No k given: the number of clusters with the highest stability is used
xenium_adata.obs['cluster_cellcharter'] = autok.predict(
    xenium_adata,
    use_rep='X_cellcharter',
)

In [None]:
# Fixed choice of 6 clusters
xenium_adata.obs['cluster_cellcharter6'] = autok.predict(
    xenium_adata,
    use_rep='X_cellcharter',
    k=6,
)

In [None]:
# Fixed choice of 10 clusters
xenium_adata.obs['cluster_cellcharter10'] = autok.predict(
    xenium_adata,
    use_rep='X_cellcharter',
    k=10,
)

In [None]:
# Write the three cluster assignments (indexed by cell name) to a CSV file
cluster_cols = ['cluster_cellcharter', 'cluster_cellcharter6', 'cluster_cellcharter10']
cluster_csv = f'{data_path}/xenium/ThyAge_xenium_adata_paed+adult_CellCharter_clusters.csv'
xenium_adata.obs[cluster_cols].to_csv(cluster_csv)

In [None]:

# Bundle the CellCharter outputs (graphs, aggregated features, fitted model) for reuse.
# NOTE(review): obsp is stored as a plain dict of its matrices. The AnnData-bound
# mapping appears to keep a reference to its parent AnnData, so pickling it directly
# would serialise the whole object and tie the file to AnnData's internal classes.
cellcharter_objs = {
    'obsp': dict(xenium_adata.obsp),
    'X_cellcharter': xenium_adata.obsm['X_cellcharter'],
    'autok': autok,
}

# pickle is already imported by the cell that loads the scVI embedding
cellcharter_pickle = f'{data_path}/xenium/ThyAge_xenium_adata_paed+adult_CellCharter_v1.pickle'
with open(cellcharter_pickle, 'wb') as file:
    pickle.dump(cellcharter_objs, file, protocol=pickle.HIGHEST_PROTOCOL)