### Notebook to format objects cells for `CellChat` CCI inference.

- **Developed by**: Carlos Talavera-López Ph.D
- **Faculty of Medicine - Julius-Maximilian-Universität Würzburg**
- v240711

### Import required modules

In [1]:
import anndata
import numpy as np
import scipy as sp
import scanpy as sc
import pandas as pd

### Setting up working environment

In [2]:
sc.settings.verbosity = 3
sc.logging.print_versions()
sc.settings.set_figure_params(dpi = 200, color_map = 'RdPu', dpi_save = 300, vector_friendly = True, format = 'svg')

-----
anndata     0.10.8
scanpy      1.10.2
-----
PIL                 10.3.0
appnope             0.1.4
asttokens           NA
comm                0.2.2
cycler              0.12.1
cython_runtime      NA
dateutil            2.9.0.post0
debugpy             1.8.1
decorator           5.1.1
executing           2.0.1
h5py                3.11.0
ipykernel           6.29.5
ipywidgets          8.1.3
jedi                0.19.1
joblib              1.4.2
kiwisolver          1.4.5
legacy_api_wrap     NA
llvmlite            0.42.0
matplotlib          3.8.4
mpl_toolkits        NA
natsort             8.4.0
numba               0.59.1
numpy               1.26.4
packaging           24.0
pandas              2.2.2
parso               0.8.4
platformdirs        4.2.1
prompt_toolkit      3.0.43
psutil              5.9.8
pure_eval           0.2.2
pydev_ipython       NA
pydevconsole        NA
pydevd              2.9.5
pydevd_file_utils   NA
pydevd_plugins      NA
pydevd_tracing      NA
pygments            2.18.0


### Read in individual compartments

In [3]:
adata_cmc = sc.read_h5ad('/Users/cartalop/github/hofmann_macro/data/heart_mm_nuclei-23-0092_CMC_states_ctl240131.raw.h5ad')
adata_cmc.obs['cell_states'] = adata_cmc.obs['C_scANVI'].copy()
adata_cmc

AnnData object with n_obs × n_vars = 8257 × 32285
    obs: 'cell_source', 'cell_type', 'donor', 'n_counts', 'n_genes', 'percent_mito', 'percent_ribo', 'region', 'sample', 'scrublet_score', 'cell_states', 'seed_labels', 'genotype', 'batch', 'doublet_scores', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'pct_counts_ribo', 'percent_mt2', 'percent_chrY', 'XIST-counts', 'S_score', 'G2M_score', '_scvi_batch', '_scvi_labels', 'C_scANVI'
    var: 'gene_ids', 'feature_types', 'genome', 'mt', 'ribo', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts'
    obsm: 'X_scANVI', 'X_scVI', 'X_umap'

In [4]:
adata_lymphoid = sc.read_h5ad('/Users/cartalop/github/hofmann_macro/data/heart_mm_nuclei-23-0092_scANVI-Lymphoid_states_ctl240527.raw.h5ad')
adata_lymphoid.obs['cell_states'] = adata_lymphoid.obs['cell_type'].copy()
adata_lymphoid

AnnData object with n_obs × n_vars = 65544 × 16060
    obs: 'orig.ident', 'Age_group', 'BMI', 'COVID_severity', 'COVID_status', 'Ethnicity', 'Group', 'Sex', 'annotation_broad', 'annotation_detailed', 'sample_id', 'seed_labels', 'donor', 'cell_states', 'sample', 'cell_source', 'genotype', 'nCount_HTO', 'nFeature_HTO', 'HTO_classification', 'Library', 'CD45_Annotation', 'cell_type', 'n_counts', 'n_genes', 'percent_mito', 'percent_ribo', 'region', 'scrublet_score', 'batch', 'doublet_scores', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'pct_counts_ribo', 'percent_mt2', 'percent_chrY', 'XIST-counts', 'S_score', 'G2M_score', '_scvi_batch', '_scvi_labels', 'MPC_Annotation', 'compartment'
    var: 'gene_ids-DMD_immune-DMD', 'feature_types-DMD_immune-DMD', 'genome-DMD_immune-DMD', 'mt-DMD_immune-DMD', 'ribo-DMD_immune-DMD', 'n_cells_by_counts-DMD_immune-DMD', 'mean_counts-DMD_immune-DMD', 'pct_dropout_by_counts-DMD_immune-DMD', 'total_counts-DMD

In [5]:
adata_myeloid = sc.read_h5ad('/Users/cartalop/github/hofmann_macro/data/heart_mm_nuclei-23-0092_scANVI-Myeloid_states_ctl240502.raw.h5ad')
adata_myeloid.obs['cell_states'] = adata_myeloid.obs['cell_type'].copy()
adata_myeloid

AnnData object with n_obs × n_vars = 41716 × 16060
    obs: 'orig.ident', 'Age_group', 'BMI', 'COVID_severity', 'COVID_status', 'Ethnicity', 'Group', 'Sex', 'annotation_broad', 'annotation_detailed', 'sample_id', 'seed_labels', 'donor', 'cell_states', 'sample', 'cell_source', 'genotype', 'nCount_HTO', 'nFeature_HTO', 'HTO_classification', 'Library', 'CD45_Annotation', 'cell_type', 'n_counts', 'n_genes', 'percent_mito', 'percent_ribo', 'region', 'scrublet_score', 'batch', 'doublet_scores', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'pct_counts_ribo', 'percent_mt2', 'percent_chrY', 'XIST-counts', 'S_score', 'G2M_score', '_scvi_batch', '_scvi_labels', 'MPC_Annotation', 'compartment'
    var: 'gene_ids-DMD_immune-DMD', 'feature_types-DMD_immune-DMD', 'genome-DMD_immune-DMD', 'mt-DMD_immune-DMD', 'ribo-DMD_immune-DMD', 'n_cells_by_counts-DMD_immune-DMD', 'mean_counts-DMD_immune-DMD', 'pct_dropout_by_counts-DMD_immune-DMD', 'total_counts-DMD

### Merge all objects into one

In [6]:
adata_new = adata_cmc.concatenate(adata_lymphoid, adata_myeloid, 
                                  batch_key = 'object',
                                  batch_categories = ['CMC', 'Lymphoid', 'Myeloid'], 
                                  join = 'inner')
adata_new

  adata_new = adata_cmc.concatenate(adata_lymphoid, adata_myeloid,


AnnData object with n_obs × n_vars = 115517 × 16060
    obs: 'cell_source', 'cell_type', 'donor', 'n_counts', 'n_genes', 'percent_mito', 'percent_ribo', 'region', 'sample', 'scrublet_score', 'cell_states', 'seed_labels', 'genotype', 'batch', 'doublet_scores', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'pct_counts_ribo', 'percent_mt2', 'percent_chrY', 'XIST-counts', 'S_score', 'G2M_score', '_scvi_batch', '_scvi_labels', 'C_scANVI', 'orig.ident', 'Age_group', 'BMI', 'COVID_severity', 'COVID_status', 'Ethnicity', 'Group', 'Sex', 'annotation_broad', 'annotation_detailed', 'sample_id', 'nCount_HTO', 'nFeature_HTO', 'HTO_classification', 'Library', 'CD45_Annotation', 'MPC_Annotation', 'compartment', 'object'
    var: 'gene_ids-CMC', 'feature_types-CMC', 'genome-CMC', 'mt-CMC', 'ribo-CMC', 'n_cells_by_counts-CMC', 'mean_counts-CMC', 'pct_dropout_by_counts-CMC', 'total_counts-CMC', 'gene_ids-DMD_immune-DMD-Lymphoid', 'feature_types-DMD_immune-

In [7]:
adata_new.obs['samples'] = adata_new.obs['sample'].copy()
adata_new.obs['samples'].cat.categories


Index(['A9_2', 'A10_2', 'A11_2', 'A12_2', 'B1_2', 'B2_2', 'Hashtag1-TotalA',
       'Hashtag2-TotalA', 'Hashtag3-TotalA', 'Hashtag4-TotalA',
       'Hashtag5-TotalA', 'Hashtag6-TotalA', 'Hashtag7-TotalA',
       'Hashtag8-TotalA', 'Hashtag9-TotalA', 'Hashtag10-TotalA',
       'Hashtag11-TotalA', 'Hashtag12-TotalA', 'Hashtag13-TotalA',
       'Hashtag14-TotalA', 'Hashtag15-TotalA'],
      dtype='object')

### Clean `adata.obs` for clarity

In [8]:
variables_to_keep = [
    'cell_source', 'cell_type', 'donor', 'cell_states', 
    'genotype', 'compartment', 'object', 'samples'
]

adata_new.obs = adata_new.obs[variables_to_keep]

In [9]:
variables_to_keep = ['gene_ids-CMC']

adata_new.var = adata_new.var[variables_to_keep]

In [10]:
del(adata_new.obsm)

In [11]:
adata_new

AnnData object with n_obs × n_vars = 115517 × 16060
    obs: 'cell_source', 'cell_type', 'donor', 'cell_states', 'genotype', 'compartment', 'object', 'samples'
    var: 'gene_ids-CMC'

### Make a subset of all populations

- Removing stuff like `nan`or doublets.

In [12]:
adata_new.obs['cell_states'].cat.categories

Index(['B_cells', 'B_mem', 'B_naive', 'CD4Tctl', 'CD4Th', 'CD4Tnaive',
       'CD8Tcm', 'CD8Tctl', 'CD8Tem', 'CD8Temra', 'CD8Tnaive', 'Ccr2+MHCII+MØ',
       'DC', 'DC2', 'ILC', 'Isg15+MØ', 'Ly6ChiMo', 'Ly6CloMo', 'MAIT',
       'MHCII+MØtr', 'Mast', 'MØinf', 'NK', 'NØ', 'Plasma_cells',
       'Spp1+Gpnmb+MØ', 'T', 'TLF+MØ', 'Treg', 'gdT', 'vCM1', 'vCM2', 'vCM3',
       'vCM4'],
      dtype='object')

In [13]:
adata_new.obs_names

Index(['ACACTGATCATTATCC-1-A9_2-CMC', 'TCTTCCTGTCATAACC-1-A9_2-CMC',
       'ATCTCTAGTTTCAGAC-1-A9_2-CMC', 'TGAATGCAGCTCCATA-1-A9_2-CMC',
       'GCAGCTGCACAAGTGG-1-A9_2-CMC', 'GAGACTTCAGTTAGAA-1-A9_2-CMC',
       'CCGATCTTCGCACGGT-1-A9_2-CMC', 'TAACTTCTCACAGAGG-1-A9_2-CMC',
       'CAACGATGTCTACGTA-1-A9_2-CMC', 'ATCGATGGTACCTAGT-1-A9_2-CMC',
       ...
       'TTTGACTCAGGCAATG-1_2-MDX_MPC-1-Myeloid',
       'TTTGATCCATAACAGA-1_2-MDX_MPC-1-Myeloid',
       'TTTGATCGTCGTATTG-1_2-MDX_MPC-1-Myeloid',
       'TTTGATCTCAAGTAAG-1_2-MDX_MPC-1-Myeloid',
       'TTTGGAGAGACTAAGT-1_2-MDX_MPC-1-Myeloid',
       'TTTGGAGAGCGACTGA-1_2-MDX_MPC-1-Myeloid',
       'TTTGGAGAGGATAATC-1_2-MDX_MPC-1-Myeloid',
       'TTTGGAGTCTCGGCTT-1_2-MDX_MPC-1-Myeloid',
       'TTTGTTGAGATTCGAA-1_2-MDX_MPC-1-Myeloid',
       'TTTGTTGGTGATATAG-1_2-MDX_MPC-1-Myeloid'],
      dtype='object', length=115517)

In [14]:
adata_new.var_names

Index(['Xkr4', 'Rp1', 'Sox17', 'Mrpl15', 'Lypla1', 'Tcea1', 'Rgs20', 'Atp6v1h',
       'Oprk1', 'Npbwr1',
       ...
       'Mid1', 'Asmt', 'Kdm5d', 'Uty', 'Ddx3y', 'Usp9y', 'Sry', 'Vamp7',
       'Spry3', 'Tmlhe'],
      dtype='object', length=16060)

### Subset per condition

In [15]:
adata_new.obs['genotype'].cat.categories

Index(['Mdx', 'MdxSCID', 'WT'], dtype='object')

In [16]:
adata_new_WT = adata_new[adata_new.obs['genotype'].isin(['WT'])]
adata_new_WT

View of AnnData object with n_obs × n_vars = 69819 × 16060
    obs: 'cell_source', 'cell_type', 'donor', 'cell_states', 'genotype', 'compartment', 'object', 'samples'
    var: 'gene_ids-CMC'

### Export CTRL

In [17]:
sc.pp.normalize_per_cell(adata_new_WT, counts_per_cell_after = 1e6)
sc.pp.log1p(adata_new_WT)
adata_new_WT.X = sp.sparse.csc_matrix(adata_new_WT.X)

normalizing by total count per cell


  adata.obs[key_n_counts] = counts_per_cell


    finished (0:00:00): normalized adata.X and added    'n_counts', counts per cell before normalization (adata.obs)


In [18]:
adata_new_WT.write('/Users/cartalop/github/hofmann_macro/data/DMD-WT_CMC-Immune_ctl240711.log.h5ad')

### Export Conditions

In [19]:
adata_new.obs['genotype'].cat.categories

Index(['Mdx', 'MdxSCID', 'WT'], dtype='object')

In [22]:
adata_new_COND = adata_new[adata_new.obs['genotype'].isin(['MdxSCID'])]
sc.pp.normalize_per_cell(adata_new_COND, counts_per_cell_after = 1e6)
sc.pp.log1p(adata_new_COND)
adata_new_COND.X = sp.sparse.csc_matrix(adata_new_COND.X)
adata_new_COND

normalizing by total count per cell


  adata.obs[key_n_counts] = counts_per_cell


    finished (0:00:00): normalized adata.X and added    'n_counts', counts per cell before normalization (adata.obs)


AnnData object with n_obs × n_vars = 29502 × 16060
    obs: 'cell_source', 'cell_type', 'donor', 'cell_states', 'genotype', 'compartment', 'object', 'samples', 'n_counts'
    var: 'gene_ids-CMC'
    uns: 'log1p'

In [23]:
adata_new_COND.write('/Users/cartalop/github/hofmann_macro/data/DMD-MdxSCID_CMC-Immune_ctl240711.log.h5ad')