## Notebook to format the healthy epithelial cell object for CellChat cell-cell interaction (CCI) inference
**Developed by:** Anna Maguza  
**Institute of Computational Biology - Computational Health Centre - Helmholtz Munich**  
**Date:** 27th of June 2023

### Import required modules

In [2]:
import anndata
import numpy as np
import scipy as sp
import scanpy as sc
import pandas as pd

### Setting up working environment

In [3]:
# Set scanpy's logging verbosity and print the versions of the loaded packages.
sc.settings.verbosity = 3
sc.logging.print_versions()

# Figure defaults applied to every scanpy plot produced in this notebook.
figure_params = dict(
    dpi = 200,
    color_map = 'RdPu',
    dpi_save = 300,
    vector_friendly = True,
    format = 'svg',
)
sc.settings.set_figure_params(**figure_params)

-----
anndata     0.8.0
scanpy      1.9.3
-----
PIL                         9.4.0
appnope                     0.1.2
asttokens                   NA
backcall                    0.2.0
beta_ufunc                  NA
binom_ufunc                 NA
cffi                        1.15.1
colorama                    0.4.6
comm                        0.1.2
cycler                      0.10.0
cython_runtime              NA
dateutil                    2.8.2
debugpy                     1.5.1
decorator                   5.1.1
defusedxml                  0.7.1
entrypoints                 0.4
executing                   0.8.3
h5py                        3.8.0
hypergeom_ufunc             NA
importlib_resources         NA
ipykernel                   6.19.2
ipython_genutils            0.2.0
jedi                        0.18.1
joblib                      1.2.0
jupyter_server              1.23.6
kiwisolver                  1.4.4
llvmlite                    0.39.1
matplotlib                  3.7.1
mpl_toolkits  

### Read data

In [7]:
# Load the input AnnData object from disk and display its summary.
input_h5ad_path = '/Users/anna.maguza/Desktop/Data/Processed_datasets/Cancer_dataset_integration/input_files/all_cells/Healthy_epithelial_cells_all_genes.h5ad'
adata = sc.read_h5ad(input_h5ad_path)
adata

AnnData object with n_obs × n_vars = 156195 × 26442
    obs: 'Sample_ID', 'Cell Type', 'Study_name', 'Donor_ID', 'Diagnosis', 'Age', 'Region code', 'Fraction', 'Sex', 'Library_Preparation_Protocol', 'batch', 'Age_group', 'Location', 'Cell States', 'Cell States GCA', 'Chem', 'Layer', 'Cell States Kong', 'dataset', 'n_genes_by_counts', 'total_counts', 'total_counts_mito', 'pct_counts_mito', 'total_counts_ribo', 'pct_counts_ribo', 'Cell_ID', '_scvi_batch', '_scvi_labels', 'Unified Cell States', 'doublet_scores', 'predicted_doublets', 'doublet_info'
    var: 'feature_types-0-0-0', 'gene_name-1-0-0', 'gene_id-0-0', 'GENE-1-0'

In [8]:
# Replace the spaces in the 'Unified Cell States' column name of adata.obs with underscores.
obs_column_renames = {'Unified Cell States': 'Unified_Cell_States'}
adata.obs.rename(columns = obs_column_renames, inplace = True)

In [9]:
# Build a fresh AnnData object that carries over only X, obs and var from `adata`.
# NOTE(review): X, obs and var are passed without an explicit copy — confirm whether
# later in-place steps on `adata_new` can also affect `adata`.
adata_new = anndata.AnnData(
    X = adata.X,
    obs = adata.obs,
    var = adata.var,
)
adata_new

AnnData object with n_obs × n_vars = 156195 × 26442
    obs: 'Sample_ID', 'Cell Type', 'Study_name', 'Donor_ID', 'Diagnosis', 'Age', 'Region code', 'Fraction', 'Sex', 'Library_Preparation_Protocol', 'batch', 'Age_group', 'Location', 'Cell States', 'Cell States GCA', 'Chem', 'Layer', 'Cell States Kong', 'dataset', 'n_genes_by_counts', 'total_counts', 'total_counts_mito', 'pct_counts_mito', 'total_counts_ribo', 'pct_counts_ribo', 'Cell_ID', '_scvi_batch', '_scvi_labels', 'Unified_Cell_States', 'doublet_scores', 'predicted_doublets', 'doublet_info'
    var: 'feature_types-0-0-0', 'gene_name-1-0-0', 'gene_id-0-0', 'GENE-1-0'

### Make a subset of all populations

* Removing unwanted populations such as `nan` or doublets

In [10]:
# List the category labels of the `Unified_Cell_States` column in adata_new.obs.
adata_new.obs['Unified_Cell_States'].cat.categories

Index(['Colonocyte', 'Enterochromaffin cells', 'Enterocyte',
       'Enteroendocrine cells', 'Epithelial cells', 'Goblet cells', 'L cells',
       'Microfold cell', 'Paneth cells', 'Stem cells', 'TA', 'Tuft cells'],
      dtype='object')

In [11]:
# Inspect the observation (cell) identifiers of the new object.
adata_new.obs_names

Index(['AAAGCAATCCGTTGTC-1-Human_colon_16S8000511',
       'AACACGTTCCTGCAGG-1-Human_colon_16S8000511',
       'AACCATGGTCGGCATC-1-Human_colon_16S8000511',
       'AACCGCGTCAACACCA-1-Human_colon_16S8000511',
       'AACGTTGTCGGGAGTA-1-Human_colon_16S8000511',
       'AACTCAGAGAGTCTGG-1-Human_colon_16S8000511',
       'AACTTTCCACAACTGT-1-Human_colon_16S8000511',
       'AACTTTCTCTTGTACT-1-Human_colon_16S8000511',
       'AAGCCGCCAATCAGAA-1-Human_colon_16S8000511',
       'AAGGTTCCACAGACTT-1-Human_colon_16S8000511',
       ...
       'N110.LPA.TACAGTGTCCAGGGCT', 'N110.LPA.TACGGTATCTACTATC',
       'N110.LPA.TACTTGTGTGTGCCTG', 'N110.LPA.TCATTACCAGGTTTCA',
       'N110.LPA.TCGCGAGCACAGGTTT', 'N110.LPA.TCGGGACGTCAACTGT',
       'N110.LPA.TGAGCATTCCAGTAGT', 'N110.LPA.TGGCCAGAGAGGACGG',
       'N110.LPA.TTCTTAGCAGTCCTTC', 'N110.LPA.TTTATGCAGACTACAA'],
      dtype='object', name='cell_id', length=156195)

In [12]:
# Inspect the variable (gene) identifiers of the new object.
adata_new.var_names

Index(['A1BG', 'A1BG-AS1', 'A1CF', 'A2M', 'A2M-AS1', 'A2ML1', 'A2ML1-AS1',
       'A2ML1-AS2', 'A4GALT', 'A4GNT',
       ...
       'ZW10', 'ZWILCH', 'ZWINT', 'ZXDA', 'ZXDB', 'ZXDC', 'ZYG11A', 'ZYG11B',
       'ZYX', 'ZZEF1'],
      dtype='object', name='gene_name', length=26442)

### Normalise and export the dataset

In [13]:
# Prepare log-transformed, library-size-normalised values (1e6 counts per cell) for export.
#
# `sc.pp.normalize_per_cell` is deprecated in scanpy in favour of `sc.pp.normalize_total`.
# The deprecated call also dropped cells with fewer than one count and stored the
# pre-normalisation totals in `adata_new.obs['n_counts']`; `sc.pp.filter_cells`
# keeps both of those effects, now as an explicit step.
sc.pp.filter_cells(adata_new, min_counts = 1)
sc.pp.normalize_total(adata_new, target_sum = 1e6)
sc.pp.log1p(adata_new)

# Store the expression matrix in compressed sparse column (CSC) format before writing.
adata_new.X = sp.sparse.csc_matrix(adata_new.X)

# NOTE: this cell is not idempotent — re-running it normalises and log-transforms
# `adata_new` a second time. Re-create `adata_new` before re-running.

normalizing by total count per cell
    finished (0:00:04): normalized adata.X and added    'n_counts', counts per cell before normalization (adata.obs)


In [14]:
# Save the prepared object to disk as the input file for the CellChat analysis.
cellchat_h5ad_path = '/Users/anna.maguza/Desktop/Data/Processed_datasets/Cell_cell_interaction/Healthy_epithelial/CellChat/Healthy_epithelial_prepared_for_cellchat.h5ad'
adata_new.write(cellchat_h5ad_path)