### Notebook to format objects cells for `CellChat` CCI inference.

- **Developed by**: Carlos Talavera-López Ph.D
- **Institute of Computational Biology - Computational Health Centre - Helmholtz Munich**
- v230626

### Import required modules

In [1]:
import anndata
import numpy as np
import scipy as sp
import scanpy as sc
import pandas as pd

### Setting up working environment

In [2]:
sc.settings.verbosity = 3
sc.logging.print_versions()
sc.settings.set_figure_params(dpi = 200, color_map = 'RdPu', dpi_save = 300, vector_friendly = True, format = 'svg')

-----
anndata     0.9.2
scanpy      1.9.3
-----
PIL                 10.0.0
appnope             0.1.3
asttokens           NA
backcall            0.2.0
colorama            0.4.6
comm                0.1.3
cycler              0.10.0
cython_runtime      NA
dateutil            2.8.2
debugpy             1.6.7
decorator           5.1.1
executing           1.2.0
h5py                3.9.0
importlib_resources NA
ipykernel           6.25.0
ipywidgets          8.0.7
jedi                0.18.2
joblib              1.3.1
kiwisolver          1.4.4
llvmlite            0.40.1
matplotlib          3.7.2
mpl_toolkits        NA
natsort             8.4.0
numba               0.57.1
numpy               1.24.4
packaging           23.1
pandas              2.0.3
parso               0.8.3
pexpect             4.8.0
pickleshare         0.7.5
pkg_resources       NA
platformdirs        3.9.1
prompt_toolkit      3.0.39
psutil              5.9.5
ptyprocess          0.7.0
pure_eval           0.2.2
pydev_ipython       NA
p

### Read heart global object 

In [3]:
adata = sc.read_h5ad('../../../data/Marburg_cell_states_locked_scANVI_ctl230901.raw.h5ad')
adata

AnnData object with n_obs × n_vars = 97573 × 27208
    obs: 'sex', 'age', 'ethnicity', 'PaCO2', 'donor', 'infection', 'disease', 'SMK', 'illumina_stimunr', 'bd_rhapsody', 'n_genes', 'doublet_scores', 'predicted_doublets', 'batch', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'pct_counts_ribo', 'percent_mt2', 'n_counts', 'percent_chrY', 'XIST-counts', 'S_score', 'G2M_score', 'condition', 'sample_group', 'IAV_score', 'group', 'Viral_score', 'cell_type', 'cell_states', 'leiden', 'cell_compartment', 'seed_labels', '_scvi_batch', '_scvi_labels', 'C_scANVI'
    var: 'mt', 'ribo'
    obsm: 'X_scANVI', 'X_scVI', 'X_umap'

In [4]:
adata_new = anndata.AnnData(X = adata.X, obs = adata.obs, var = adata.var)
adata_new

AnnData object with n_obs × n_vars = 97573 × 27208
    obs: 'sex', 'age', 'ethnicity', 'PaCO2', 'donor', 'infection', 'disease', 'SMK', 'illumina_stimunr', 'bd_rhapsody', 'n_genes', 'doublet_scores', 'predicted_doublets', 'batch', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'pct_counts_ribo', 'percent_mt2', 'n_counts', 'percent_chrY', 'XIST-counts', 'S_score', 'G2M_score', 'condition', 'sample_group', 'IAV_score', 'group', 'Viral_score', 'cell_type', 'cell_states', 'leiden', 'cell_compartment', 'seed_labels', '_scvi_batch', '_scvi_labels', 'C_scANVI'
    var: 'mt', 'ribo'

### Make a subset from the populations of interest

- Removing stuff like `nan`or doublets.

In [5]:
adata_new.obs['cell_states'].cat.categories

Index(['APOD+Ciliated', 'IGFBP6+Basal', 'SERPINE1+Basal', 'SERPINE2+Basal',
       'CCDC3+Basal1', 'DHRS9+Club', 'FB-like_Basal', 'IGFBP+Basal',
       'ImmuneClub', 'Ionocyte', 'KRT14+AQP1+Secretory', 'KRT14+Goblet',
       'KRT17+Goblet', 'MHCII+Club', 'MKI67+pBasal', 'MUC5B+Goblet',
       'NOTCH+Basal2', 'NOTCH3+SupraB', 'OASiav_Ciliated', 'OMG+Ciliated',
       'RARRES1+lip_Goblet', 'S100A2+Basal', 'SCGB1+KRT5-FOXA1+iav_Club',
       'SCGB1A1+Deutero', 'SCGB1A1+Goblet', 'KRT16+SupraB', 'TCN1+Club',
       'TNC+Basal', 'iavAPC_Epi', 'iav-lip_Club', 'iav_Goblet', 'ifn_Basal',
       'ifn_Goblet', 'mixed_Goblet1', 'mixed_Goblet2', 'p53_Ciliated'],
      dtype='object')

In [6]:
adata_subset = adata_new[adata_new.obs['cell_states'].isin(['IGFBP6+Basal', 'SERPINE1+Basal', 'SERPINE2+Basal',
       'CCDC3+Basal1', 'FB-like_Basal', 'IGFBP+Basal', 'ImmuneClub', 'Ionocyte', 'KRT14+AQP1+Secretory', 
       'MKI67+pBasal', 'NOTCH+Basal2', 'NOTCH3+SupraB', 'S100A2+Basal', 'TNC+Basal', 'ifn_Basal'])]
adata_subset

View of AnnData object with n_obs × n_vars = 56762 × 27208
    obs: 'sex', 'age', 'ethnicity', 'PaCO2', 'donor', 'infection', 'disease', 'SMK', 'illumina_stimunr', 'bd_rhapsody', 'n_genes', 'doublet_scores', 'predicted_doublets', 'batch', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'pct_counts_ribo', 'percent_mt2', 'n_counts', 'percent_chrY', 'XIST-counts', 'S_score', 'G2M_score', 'condition', 'sample_group', 'IAV_score', 'group', 'Viral_score', 'cell_type', 'cell_states', 'leiden', 'cell_compartment', 'seed_labels', '_scvi_batch', '_scvi_labels', 'C_scANVI'
    var: 'mt', 'ribo'

In [7]:
adata_subset.var_names

Index(['A1BG', 'A1BG-AS1', 'A1CF', 'A2M', 'A2M-AS1', 'A2ML1', 'A2ML1-AS1',
       'A3GALT2', 'A4GALT', 'A4GNT',
       ...
       'ZW10', 'ZWILCH', 'ZWINT', 'ZXDA', 'ZXDB', 'ZXDC', 'ZYG11A', 'ZYG11B',
       'ZYX', 'ZZEF1'],
      dtype='object', length=27208)

### Subset per condition

In [8]:
adata_subset.obs['group'].cat.categories

Index(['healthy_ctrl', 'healthy_iav', 'copd_ctrl', 'copd_iav'], dtype='object')

In [9]:
adata_new_CTRL = adata_subset[adata_subset.obs['group'].isin(['healthy_ctrl'])]
adata_new_CTRL

View of AnnData object with n_obs × n_vars = 13283 × 27208
    obs: 'sex', 'age', 'ethnicity', 'PaCO2', 'donor', 'infection', 'disease', 'SMK', 'illumina_stimunr', 'bd_rhapsody', 'n_genes', 'doublet_scores', 'predicted_doublets', 'batch', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'pct_counts_ribo', 'percent_mt2', 'n_counts', 'percent_chrY', 'XIST-counts', 'S_score', 'G2M_score', 'condition', 'sample_group', 'IAV_score', 'group', 'Viral_score', 'cell_type', 'cell_states', 'leiden', 'cell_compartment', 'seed_labels', '_scvi_batch', '_scvi_labels', 'C_scANVI'
    var: 'mt', 'ribo'

### Export CTRL

In [11]:
sc.pp.normalize_per_cell(adata_new_CTRL, counts_per_cell_after = 1e6)
sc.pp.log1p(adata_new_CTRL)
adata_new_CTRL.X = sp.sparse.csc_matrix(adata_new_CTRL.X)

normalizing by total count per cell


  adata.obs[key_n_counts] = counts_per_cell


    finished (0:00:00): normalized adata.X and added    'n_counts', counts per cell before normalization (adata.obs)


In [12]:
adata_new_CTRL.write('../../../data/Basal_Healthy-CTRL_anotated.h5ad')

### Export Conditions

In [10]:
adata_subset.obs['group'].cat.categories

Index(['healthy_ctrl', 'healthy_iav', 'copd_ctrl', 'copd_iav'], dtype='object')

In [17]:
adata_new_COND = adata_subset[adata_subset.obs['group'].isin(['copd_iav'])]
sc.pp.normalize_per_cell(adata_new_COND, counts_per_cell_after = 1e6)
sc.pp.log1p(adata_new_COND)
adata_new_COND.X = sp.sparse.csc_matrix(adata_new_COND.X)
adata_new_COND

normalizing by total count per cell


  adata.obs[key_n_counts] = counts_per_cell


    finished (0:00:00): normalized adata.X and added    'n_counts', counts per cell before normalization (adata.obs)


AnnData object with n_obs × n_vars = 14425 × 27208
    obs: 'sex', 'age', 'ethnicity', 'PaCO2', 'donor', 'infection', 'disease', 'SMK', 'illumina_stimunr', 'bd_rhapsody', 'n_genes', 'doublet_scores', 'predicted_doublets', 'batch', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'pct_counts_ribo', 'percent_mt2', 'n_counts', 'percent_chrY', 'XIST-counts', 'S_score', 'G2M_score', 'condition', 'sample_group', 'IAV_score', 'group', 'Viral_score', 'cell_type', 'cell_states', 'leiden', 'cell_compartment', 'seed_labels', '_scvi_batch', '_scvi_labels', 'C_scANVI'
    var: 'mt', 'ribo'
    uns: 'log1p'

In [18]:
adata_new_COND.write('../../../data/Basal_COPD-IAV_anotated.h5ad')