## Notebook to format cell objects for CellChat cell-cell interaction (CCI) inference.
**Developed by:** Anna Maguza  
**Institute of Computational Biology - Computational Health Centre - Helmholtz Munich**  
**Date:** 27th of June 2023

### Import required modules

In [1]:
import anndata
import numpy as np
import scipy as sp
import scanpy as sc
import pandas as pd

### Setting up working environment

In [2]:
# Scanpy logging verbosity (level 3) and a printout of the package versions in use
sc.settings.verbosity = 3
sc.logging.print_versions()

# Global figure defaults applied to every plot produced in this notebook
sc.settings.set_figure_params(
    dpi=200,
    dpi_save=300,
    color_map='RdPu',
    vector_friendly=True,
    format='svg',
)

-----
anndata     0.8.0
scanpy      1.9.3
-----
PIL                         9.4.0
appnope                     0.1.2
asttokens                   NA
backcall                    0.2.0
beta_ufunc                  NA
binom_ufunc                 NA
cffi                        1.15.1
colorama                    0.4.6
comm                        0.1.2
cycler                      0.10.0
cython_runtime              NA
dateutil                    2.8.2
debugpy                     1.5.1
decorator                   5.1.1
defusedxml                  0.7.1
entrypoints                 0.4
executing                   0.8.3
h5py                        3.8.0
hypergeom_ufunc             NA
importlib_resources         NA
ipykernel                   6.19.2
ipython_genutils            0.2.0
jedi                        0.18.1
joblib                      1.2.0
jupyter_server              1.23.6
kiwisolver                  1.4.4
llvmlite                    0.39.1
matplotlib                  3.7.1
mpl_toolkits  

In [3]:
def X_is_raw(adata):
    """Return True when every entry of ``adata.X`` is integer-valued (raw counts).

    The check is element-wise. Comparing column sums with their integer cast
    would accept a matrix whose non-integer entries happen to add up to a whole
    number (for example two cells with 0.5 each in the same gene).

    Parameters
    ----------
    adata
        Any object exposing a ``.X`` attribute that is a dense array-like or a
        SciPy sparse matrix.

    Returns
    -------
    bool
        True if all stored values are whole numbers, False otherwise
        (NaN entries count as non-integer).
    """
    from scipy.sparse import issparse

    X = adata.X
    if issparse(X):
        # Only explicitly stored entries need checking; implicit zeros are integers.
        # CSR/CSC/COO expose them as a flat ndarray; other formats are converted first.
        values = X.data if X.format in ("csr", "csc", "coo") else X.tocoo().data
    else:
        values = np.asarray(X)
    return bool(np.all(np.mod(values, 1) == 0))

### Read data

In [4]:
# Location of the dataset carrying the scBalance-predicted labels.
# Named `input_path` so that the Python builtin `input()` is not shadowed.
# NOTE(review): absolute, machine-specific path - consider deriving it from a configurable data directory.
input_path = '/Users/anna.maguza/Desktop/Data/Processed_datasets/Cancer_dataset_integration/Labels_transfer/scBalance/Joanito_predicted_labels_with_scBalance_7000.h5ad'
adata = sc.read_h5ad(input_path)
# Check whether .X of the freshly loaded object holds raw (integer) counts
X_is_raw(adata)

False

In [5]:
# Extract the raw counts: replace the working object with the AnnData rebuilt from `.raw`.
# The guard keeps this cell re-runnable: the object returned by `to_adata()` has no `.raw`
# of its own, so an unguarded second execution would fail on `None.to_adata()`.
if adata.raw is not None:
    adata = adata.raw.to_adata()
adata

AnnData object with n_obs × n_vars = 35714 × 33287
    obs: 'nFeature_RNA', 'pct_counts_mito', 'Sample_ID', 'Donor_ID', 'sample.origin', 'dataset_x', 'Cell Type', 'iCMS', 'msi', 'dataset_y', 'Sex', 'Tumor Stage', 'MSS/MSI', 'Location', 'Side', 'Group Stage', 'Stage TNM', 'iCMS.transcriptomic', 'iCMS.inferCNV', 'KRAS', 'BRAF', 'TP53', 'APC', 'PIK3CA', 'LymphNode', 'Normal', 'Tumor', 'CMS', 'Age_group', 'Study_name', 'Diagnosis', 'n_genes_by_counts', 'total_counts', 'Library_Preparation_Protocol', 'doublet_scores', 'predicted_doublets', 'doublet_info', 'Predicted Label'
    var: 'feature_types', 'genome'
    uns: 'log1p'

In [6]:
# Re-run the raw-count check on the current `adata`
X_is_raw(adata)

True

In [7]:
# Rename the adata.obs column 'Predicted Label' to 'Unified_Cell_States'.
# Assignment is used instead of `inplace=True` so the renamed frame is set explicitly on the object.
adata.obs = adata.obs.rename(columns = {'Predicted Label': 'Unified_Cell_States'})

In [8]:
# Number of observations per label in the 'Unified_Cell_States' column
adata.obs['Unified_Cell_States'].value_counts()

TA                        22094
Paneth cells               7622
Colonocyte                 2136
Enterocyte                 1661
Tuft cells                 1400
Stem cells                  555
Epithelial cells            103
Goblet cells                 94
Enteroendocrine cells        31
Microfold cell                9
Enterochromaffin cells        6
L cells                       3
Name: Unified_Cell_States, dtype: int64

In [9]:
# Build a fresh object from X, obs and var only; nothing else from `adata` is passed on.
# Copies are taken so that later in-place processing of `adata_new` cannot modify
# the matrix or annotation frames still referenced by `adata`.
adata_new = anndata.AnnData(X = adata.X.copy(), obs = adata.obs.copy(), var = adata.var.copy())
adata_new

AnnData object with n_obs × n_vars = 35714 × 33287
    obs: 'nFeature_RNA', 'pct_counts_mito', 'Sample_ID', 'Donor_ID', 'sample.origin', 'dataset_x', 'Cell Type', 'iCMS', 'msi', 'dataset_y', 'Sex', 'Tumor Stage', 'MSS/MSI', 'Location', 'Side', 'Group Stage', 'Stage TNM', 'iCMS.transcriptomic', 'iCMS.inferCNV', 'KRAS', 'BRAF', 'TP53', 'APC', 'PIK3CA', 'LymphNode', 'Normal', 'Tumor', 'CMS', 'Age_group', 'Study_name', 'Diagnosis', 'n_genes_by_counts', 'total_counts', 'Library_Preparation_Protocol', 'doublet_scores', 'predicted_doublets', 'doublet_info', 'Unified_Cell_States'
    var: 'feature_types', 'genome'

### Make a subset of all populations

* Removing stuff like NaN or doublets

In [10]:
# List the category levels of the 'Unified_Cell_States' column (accessed through the pandas `.cat` accessor)
adata_new.obs['Unified_Cell_States'].cat.categories

Index(['Colonocyte', 'Enterochromaffin cells', 'Enterocyte',
       'Enteroendocrine cells', 'Epithelial cells', 'Goblet cells', 'L cells',
       'Microfold cell', 'Paneth cells', 'Stem cells', 'TA', 'Tuft cells'],
      dtype='object')

In [11]:
# Display the observation index of the new object
adata_new.obs_names

Index(['CRC16_MUX8563_AAACGGGGTCGATTGT-1', 'CRC16_MUX8563_AAAGATGCAGAAGCAC-1',
       'CRC16_MUX8563_AAAGCAATCTAACGGT-1', 'CRC16_MUX8563_ACAGCCGGTCTCTTAT-1',
       'CRC16_MUX8563_ACAGCTATCCGTCATC-1', 'CRC16_MUX8563_ACATACGGTTACGTCA-1',
       'CRC16_MUX8563_ACATGGTGTCCATGAT-1', 'CRC16_MUX8563_ACCGTAAAGCCCAATT-1',
       'CRC16_MUX8563_ACGAGGACATCTGGTA-1', 'CRC16_MUX8563_ACGCCGAGTCTGCAAT-1',
       ...
       'KUL5_EXT127_CATCAGAAGTACGATA-1', 'KUL5_EXT127_CCAATCCTCGGATGTT-1',
       'KUL5_EXT127_CCGGTAGAGCGTGAAC-1', 'KUL5_EXT127_CGCTTCATCCAGTAGT-1',
       'KUL5_EXT127_CTACGTCCAGAGTGTG-1', 'KUL5_EXT127_GACTGCGAGTAGCGGT-1',
       'KUL5_EXT127_GTGCATAGTTTGACAC-1', 'KUL5_EXT127_TATCAGGGTGTGAAAT-1',
       'KUL5_EXT127_TCACAAGAGATCCCGC-1', 'KUL5_EXT127_TGGTTCCAGAGATGAG-1'],
      dtype='object', name='cell.ID', length=35714)

In [12]:
# Display the variable index of the new object
adata_new.var_names

Index(['MIR1302-2HG', 'FAM138A', 'OR4F5', 'AL627309.1', 'AL627309.3',
       'AL627309.2', 'AL627309.4', 'AL732372.1', 'OR4F29', 'AC114498.1',
       ...
       'AC007325.2', 'BX072566.1', 'AL354822.1', 'AC023491.2', 'AC004556.1',
       'AC233755.2', 'AC233755.1', 'AC240274.1', 'AC213203.1', 'FAM231C'],
      dtype='object', length=33287)

### Export the dataset

In [13]:
# `sc.pp.normalize_per_cell` is deprecated; its documented replacement is `normalize_total`,
# which no longer filters cells. The two calls below reproduce the old behaviour:
# 1) drop cells with fewer than 1 count (this also stores per-cell totals in obs['n_counts']),
# 2) scale every remaining cell to a total of 1e6 counts.
sc.pp.filter_cells(adata_new, min_counts = 1)
sc.pp.normalize_total(adata_new, target_sum = 1e6)
# Natural-log transform: log(1 + x)
sc.pp.log1p(adata_new)
# Store the matrix in compressed sparse column format before export
adata_new.X = sp.sparse.csc_matrix(adata_new.X)

normalizing by total count per cell
    finished (0:00:00): normalized adata.X and added    'n_counts', counts per cell before normalization (adata.obs)


In [14]:
# Write the prepared object to an .h5ad file.
# NOTE(review): absolute, machine-specific path - consider deriving it from a configurable output directory.
adata_new.write('/Users/anna.maguza/Desktop/Data/Processed_datasets/Cell_cell_interaction/Cancer_epithelial/CellChat/Cancer_epithelial_prepared_for_cellchat.h5ad')