https://github.com/ventolab/CellphoneDB/blob/master/notebooks/0_prepare_your_data_from_anndata.ipynb

## Import modules

In [1]:
import numpy as np
import pandas as pd
import scanpy as sc
import anndata
import os
import sys
from scipy import sparse
import pickle
import matplotlib.pyplot as plt

# Limit scanpy logging to level 1 on the scale listed in the trailing comment.
sc.settings.verbosity = 1  # verbosity: errors (0), warnings (1), info (2), hints (3)

In [2]:
import importlib.util
import sys
# Load the helper script directly from its file path and bind it to `utils`.
# NOTE(review): the module is registered in sys.modules under the literal
# placeholder name "module.name"; `utils` is not referenced in the cells
# visible here — confirm it is still needed.
spec = importlib.util.spec_from_file_location("module.name", "/nfs/team205/kk18/function/python/utils.py")
utils = importlib.util.module_from_spec(spec)
# Registered before execution so the module can be found by name while its body runs.
sys.modules["module.name"] = utils
# Actually executes the file's top-level code inside the new module object.
spec.loader.exec_module(utils)

## Set paths and parameters

In [3]:
# Sample to process; the ID is interpolated into both the input .h5ad path
# and the output directory below.
sample_id = 'HEA_FOET14880396'
# Directory that receives the files written in the "Save" section.
cellphone_input_dir = f'/lustre/scratch126/cellgen/team205/kk18/VisiumHD/objects/cellphonedb/{sample_id}/inputs'
cellphone_input_dir

'/lustre/scratch126/cellgen/team205/kk18/VisiumHD/objects/cellphonedb/HEA_FOET14880396/inputs'

In [4]:
# Name of the adata.obs column holding the cell-type label; it is renamed,
# used for filtering and exported to the metadata table further down.
celltype_col = 'fine_grain_4Jan2025'

In [5]:
# adata.obs column compared against `niche_of_interest` when subsetting cells.
niche_label_col = 'tt_prediction'
# Value of `niche_label_col` to keep; also embedded in the output file names.
niche_of_interest = 'SAnode'

## Read in anndata

In [11]:
# read in the per-sample AnnData object
adata = sc.read_h5ad(f'/lustre/scratch126/cellgen/team205/kk18/VisiumHD/objects/{sample_id}_b2c_cells_filtered_celltype-selected_lognorm.h5ad')
# rename sinus-horn cells
# The label column is rebuilt explicitly instead of calling
# `adata.obs.replace(..., inplace=True)`: pandas >= 2.2 deprecates `replace`
# rewriting the categories of a categorical column, and only this single
# column needs to change. Category order, the ordered flag and missing
# values are carried over; the column ends up categorical.
sinus_horn_rename = {'SinoatrialNodePacemakerCellsHorn': 'SinusHornPacemakerCells'}
celltype_labels = adata.obs[celltype_col].astype('category')
adata.obs[celltype_col] = pd.Categorical(
    # Relabel on plain objects so no category bookkeeping is involved.
    celltype_labels.astype(object).replace(sinus_horn_rename),
    # dict.fromkeys de-duplicates while keeping the original category order
    # (relevant if the new name already existed as a category).
    categories=list(dict.fromkeys(
        sinus_horn_rename.get(cat, cat) for cat in celltype_labels.cat.categories
    )),
    ordered=celltype_labels.cat.ordered,
)
adata

AnnData object with n_obs × n_vars = 92164 × 18047
    obs: 'object_id', 'bin_count', 'array_row', 'array_col', 'labels_joint_source', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_50_genes', 'pct_counts_in_top_100_genes', 'pct_counts_in_top_200_genes', 'pct_counts_in_top_500_genes', 'total_counts_mt', 'log1p_total_counts_mt', 'pct_counts_mt', 'celltypist_coarse', 'conf_score_coarse', 'celltypist_coarse_fin', 'celltypist_coarse2midmod', 'conf_score_coarse2midmod', 'celltypist_mid_fin', 'celltypist_midmod2fine', 'conf_score_midmod2fine', 'fine_grain_4Jan2025', 'HistologicalAnnotation', 'cluster_cellcharter_k-8', 'cluster_cellcharter_k-10', 'cluster_cellcharter_k-12', 'cluster_cellcharter_k-14', 'cluster_cellcharter_k-16', 'cluster_cellcharter_k-18', 'cluster_cellcharter_k-20', 'cluster_cellcharter_k-25', 'cluster_cellcharter_k-30', 'cluster_cellcharter_k-35', 'cluster_cellcharter_k-40', 'cluster_cellcharter_k-45', 'cluster_cellc

In [12]:
# Peek at the first stored values of the expression matrix; non-integer values
# are expected for the "lognorm" object loaded above.
# NOTE(review): `.data` assumes adata.X is a sparse matrix — confirm.
adata.X.data[:5]

array([1.27187559, 1.95641568, 1.75023687, 1.96760567, 2.7810601 ])

In [13]:
# update cell type name: swap the full labels for their finalised short names
df = pd.read_csv('/nfs/team205/heart/anndata_objects/Foetal/finegrain_name_mapping.csv')
# Rows lacking either name are ignored so that an incomplete mapping table
# leaves the affected labels untouched instead of turning them into NaN.
celltype_mapping = (
    df.dropna(subset=['Full_name', 'Short_name_finalised'])
    .set_index('Full_name')['Short_name_finalised']
    .to_dict()
)
# The label column is rebuilt explicitly instead of calling
# `adata.obs.replace(..., inplace=True)`: pandas >= 2.2 deprecates `replace`
# rewriting the categories of a categorical column. Labels absent from the
# mapping keep their current name; category order, the ordered flag and
# missing values are carried over.
celltype_labels = adata.obs[celltype_col].astype('category')
adata.obs[celltype_col] = pd.Categorical(
    celltype_labels.astype(object).replace(celltype_mapping),
    # dict.fromkeys de-duplicates (several full names may share a short name)
    # while keeping the original category order.
    categories=list(dict.fromkeys(
        celltype_mapping.get(cat, cat) for cat in celltype_labels.cat.categories
    )),
    ordered=celltype_labels.cat.ordered,
)

## Subset cellular niche of interest

In [14]:
# Keep only the cells whose niche label equals the niche of interest.
in_niche = adata.obs[niche_label_col] == niche_of_interest
adata_sub = adata[in_niche]
adata_sub

View of AnnData object with n_obs × n_vars = 1313 × 18047
    obs: 'object_id', 'bin_count', 'array_row', 'array_col', 'labels_joint_source', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_50_genes', 'pct_counts_in_top_100_genes', 'pct_counts_in_top_200_genes', 'pct_counts_in_top_500_genes', 'total_counts_mt', 'log1p_total_counts_mt', 'pct_counts_mt', 'celltypist_coarse', 'conf_score_coarse', 'celltypist_coarse_fin', 'celltypist_coarse2midmod', 'conf_score_coarse2midmod', 'celltypist_mid_fin', 'celltypist_midmod2fine', 'conf_score_midmod2fine', 'fine_grain_4Jan2025', 'HistologicalAnnotation', 'cluster_cellcharter_k-8', 'cluster_cellcharter_k-10', 'cluster_cellcharter_k-12', 'cluster_cellcharter_k-14', 'cluster_cellcharter_k-16', 'cluster_cellcharter_k-18', 'cluster_cellcharter_k-20', 'cluster_cellcharter_k-25', 'cluster_cellcharter_k-30', 'cluster_cellcharter_k-35', 'cluster_cellcharter_k-40', 'cluster_cellcharter_k-45', 'cluste

In [15]:
# Drop cell types that are too sparsely represented within the niche.
n_cell_thresh = 10  # strict threshold: a type is kept only with MORE than this many cells
val_counts = adata_sub.obs[celltype_col].value_counts()
celltype_sel = val_counts[val_counts > n_cell_thresh].index.tolist()
is_selected_type = adata_sub.obs[celltype_col].isin(celltype_sel)
adata_sub = adata_sub[is_selected_type]
# Summarise the remaining labels (counts and frequencies per category).
adata_sub.obs[celltype_col].values.describe()

Unnamed: 0_level_0,counts,freqs
categories,Unnamed: 1_level_1,Unnamed: 2_level_1
aCMR,21,0.016292
AVNPC,36,0.027929
CorCapEC,19,0.01474
CorPeri,26,0.020171
FB,61,0.047324
GVVenEC,28,0.021722
MacCX3,18,0.013964
ParaN,23,0.017843
SANPCHd,631,0.489527
SHPC,222,0.172227


## Save CellphoneDB input files

In [16]:
# Create the output directory first: it is sample-specific, so on a fresh
# sample both writes below would otherwise fail with a missing-directory error.
os.makedirs(cellphone_input_dir, exist_ok=True)

# Metadata table: one row per cell with its ID and cell-type label.
# Only the label column is copied rather than the whole obs frame.
meta = adata_sub.obs[[celltype_col]].copy()
meta.insert(0, 'cell_id', meta.index)
# Tab-separated; the index is dropped because it is already in 'cell_id'.
meta.to_csv(f'{cellphone_input_dir}/meta_{niche_of_interest}.txt', index=False, sep = "\t")

# Expression data for the same cells, written alongside the metadata table.
adata_sub.write(f'{cellphone_input_dir}/log_norm_counts_{niche_of_interest}.h5ad')

In [17]:
# Echo the full path of the metadata table written in the previous cell.
f'{cellphone_input_dir}/meta_{niche_of_interest}.txt'

'/lustre/scratch126/cellgen/team205/kk18/VisiumHD/objects/cellphonedb/HEA_FOET14880396/inputs/meta_SAnode.txt'

In [18]:
# Echo the full path of the .h5ad file written in the "Save" cell.
f'{cellphone_input_dir}/log_norm_counts_{niche_of_interest}.h5ad'

'/lustre/scratch126/cellgen/team205/kk18/VisiumHD/objects/cellphonedb/HEA_FOET14880396/inputs/log_norm_counts_SAnode.h5ad'

In [None]:
# List the output directory via an IPython shell escape; the Python variable
# in braces is interpolated into the command.
!ls -lh {cellphone_input_dir}