### Notebook to format AnnData objects for CellChat CCI inference
- **Developed by:** Anna Maguza
- **Place:** Würzburg Institute for Systems Immunology
- **Date:** 16th November 2023


### Import required modules

In [1]:
import anndata
import numpy as np
import scipy as sp
import scanpy as sc
import pandas as pd

### Setting up working environment

In [2]:
# Set scanpy's logging verbosity to 3 and print the versions of the loaded packages.
sc.settings.verbosity = 3
sc.logging.print_versions()

# Figure defaults used for the rest of the notebook.
sc.settings.set_figure_params(
    dpi = 200,
    dpi_save = 300,
    color_map = 'RdPu',
    vector_friendly = True,
    format = 'svg',
)


-----
anndata     0.9.2
scanpy      1.9.5
-----
PIL                         10.0.1
anyio                       NA
arrow                       1.3.0
asttokens                   NA
attr                        23.1.0
attrs                       23.1.0
babel                       2.13.0
backcall                    0.2.0
certifi                     2023.07.22
cffi                        1.16.0
charset_normalizer          3.3.0
colorama                    0.4.6
comm                        0.1.4
cycler                      0.10.0
cython_runtime              NA
dateutil                    2.8.2
debugpy                     1.8.0
decorator                   5.1.1
defusedxml                  0.7.1
executing                   2.0.0
fastjsonschema              NA
fqdn                        NA
h5py                        3.9.0
idna                        3.4
igraph                      0.11.2
ipykernel                   6.25.2
ipywidgets                  8.1.1
isoduration                 NA
jedi   

In [3]:
def X_is_raw(adata):
    """Return True when every value stored in ``adata.X`` is a whole number.

    The check looks at the individual matrix entries rather than at per-gene
    column sums: fractional values can add up to a whole number
    (e.g. 0.5 + 0.5), which a sum-based check would wrongly accept as raw
    counts.

    Parameters
    ----------
    adata
        Object exposing a dense or scipy-sparse matrix as ``adata.X``.

    Returns
    -------
    bool
        True if all entries are integer-valued, False otherwise
        (NaN entries count as non-integer).
    """
    X = adata.X
    # For sparse matrices only the explicitly stored entries need checking;
    # the implicit zeros are integers by definition.
    values = X.data if sp.sparse.issparse(X) else np.asarray(X)
    return bool(np.all(np.mod(values, 1) == 0))

### Read data

In [4]:
# Absolute path to the input `.h5ad` file.
# NOTE(review): this is a machine-specific path; consider deriving it from a configurable data directory.
input_path = '/home/amaguza/data/Processed_data/Gut_data/Stem_cells/Fetal_healthy_stem_cells_leiden.h5ad'
# Load the file into an AnnData object.
adata = sc.read_h5ad(input_path)

In [5]:
# Extract the raw counts.
# Guarded so the cell can be re-run safely: the object returned by
# `.raw.to_adata()` carries no `.raw` of its own, so an unguarded second
# execution would fail with an AttributeError on `None`.
if adata.raw is not None:
    adata = adata.raw.to_adata()
adata

AnnData object with n_obs × n_vars = 7817 × 19868
    obs: 'Sample_ID', 'Cell Type', 'Study_name', 'Donor_ID', 'Diagnosis', 'Age', 'Region code', 'Fraction', 'Sex', 'Library_Preparation_Protocol', 'batch', 'Age_group', 'Location', 'Cell States', 'Cell States GCA', 'Chem', 'Layer', 'Cell States Kong', 'dataset', 'n_genes_by_counts', 'total_counts', 'total_counts_mito', 'pct_counts_mito', 'total_counts_ribo', 'pct_counts_ribo', 'Cell_ID', '_scvi_batch', '_scvi_labels', 'n_genes', 'n_counts', 'leiden', 'cluster'
    var: 'feature_types-0-0-0', 'gene_name-1-0-0', 'gene_id-0-0', 'GENE-1-0', 'n_counts', 'n_cells'
    uns: 'Age_colors', 'Age_group_colors', 'Donor_ID_colors', 'Library_Preparation_Protocol_colors', 'Sex_colors', '_scvi_manager_uuid', '_scvi_uuid', 'hvg', 'leiden', 'leiden_colors', 'neighbors', 'rank_genes_groups', 'umap'
    obsm: 'X_scVI', 'X_umap', '_scvi_extra_categorical_covs', '_scvi_extra_continuous_covs'
    obsp: 'connectivities', 'distances'

In [6]:
# Sanity check on the matrix taken from `.raw`, using the `X_is_raw` helper defined above.
X_is_raw(adata)

True

In [7]:
# Build a fresh AnnData from the matrix and the cell/gene annotation tables only;
# nothing else from `adata` is passed to the constructor.
adata_new = anndata.AnnData(
    X = adata.X,
    obs = adata.obs,
    var = adata.var,
)
adata_new

AnnData object with n_obs × n_vars = 7817 × 19868
    obs: 'Sample_ID', 'Cell Type', 'Study_name', 'Donor_ID', 'Diagnosis', 'Age', 'Region code', 'Fraction', 'Sex', 'Library_Preparation_Protocol', 'batch', 'Age_group', 'Location', 'Cell States', 'Cell States GCA', 'Chem', 'Layer', 'Cell States Kong', 'dataset', 'n_genes_by_counts', 'total_counts', 'total_counts_mito', 'pct_counts_mito', 'total_counts_ribo', 'pct_counts_ribo', 'Cell_ID', '_scvi_batch', '_scvi_labels', 'n_genes', 'n_counts', 'leiden', 'cluster'
    var: 'feature_types-0-0-0', 'gene_name-1-0-0', 'gene_id-0-0', 'GENE-1-0', 'n_counts', 'n_cells'

### Make a subset of all populations

* Removing entries such as `nan` or doublets

In [8]:
# List the category labels of the categorical `cluster` column.
adata_new.obs['cluster'].cat.categories

Index(['ASS1+_SLC40A1+_SC', 'RPS10+_RPS17+_SC', 'FXYD3+_CKB+_SC'], dtype='object')

In [9]:
# Inspect the observation (cell) names.
adata_new.obs_names

Index(['AAACCTGCATCTACGA-1-Human_colon_16S8159182',
       'AAACGGGTCGGCGCAT-1-Human_colon_16S8159182',
       'AAACGGGTCTTACCGC-1-Human_colon_16S8159182',
       'AAAGATGAGATATACG-1-Human_colon_16S8159182',
       'AAAGATGGTCTCCATC-1-Human_colon_16S8159182',
       'AAAGATGTCACCTTAT-1-Human_colon_16S8159182',
       'AAAGCAAGTGACCAAG-1-Human_colon_16S8159182',
       'AAAGCAATCCCTAACC-1-Human_colon_16S8159182',
       'AAAGCAATCGACGGAA-1-Human_colon_16S8159182',
       'AAAGTAGCAAGCCATT-1-Human_colon_16S8159182',
       ...
       'CCAATCCCACCACCAG-1-4918STDY7718977',
       'CCAGCGACATCTGGTA-1-4918STDY7718977',
       'CCGTACTCACGTAAGG-1-4918STDY7718977',
       'CCTTCGATCCACGTGG-1-4918STDY7718977',
       'CGTAGCGTCGCCAAAT-1-4918STDY7718977',
       'CTAGCCTAGTGAACGC-1-4918STDY7718977',
       'CTCGTACGTCAAACTC-1-4918STDY7718977',
       'CTGTGCTAGGATGTAT-1-4918STDY7718977',
       'GATCGTAAGTAGGTGC-1-4918STDY7718977',
       'TCAGCAAAGTGCGATG-1-4918STDY7718977'],
      dtype='objec

In [10]:
# Inspect the variable (gene) names.
adata_new.var_names

Index(['A1BG', 'A1CF', 'A2M', 'A2M-AS1', 'A4GALT', 'AAAS', 'AACS', 'AADAC',
       'AADAT', 'AAED1',
       ...
       'ZW10', 'ZWILCH', 'ZWINT', 'ZXDA', 'ZXDB', 'ZXDC', 'ZYG11A', 'ZYG11B',
       'ZYX', 'ZZEF1'],
      dtype='object', name='gene_name', length=19868)

### Export the dataset

In [11]:
# Scale every cell to a total of 1e6 counts, then apply log1p.
# NOTE(review): both calls modify `adata_new` in place, so running this cell
# twice transforms the data twice. `sc.pp.normalize_per_cell` is also
# deprecated in scanpy in favour of `sc.pp.normalize_total`; the two differ in
# cell filtering and in writing `n_counts`, so confirm before switching.
sc.pp.normalize_per_cell(
    adata_new,
    counts_per_cell_after = 1e6,
)
sc.pp.log1p(adata_new)

# Store the transformed matrix as a scipy CSC sparse matrix.
adata_new.X = sp.sparse.csc_matrix(adata_new.X)

normalizing by total count per cell
    finished (0:00:00): normalized adata.X and added    'n_counts', counts per cell before normalization (adata.obs)


  if not is_categorical_dtype(df_full[k]):


In [12]:
# Destination of the prepared object.
output_path = '/home/amaguza/data/Processed_data/Gut_data/Stem_cells/FetalSC_cellchat/FetalSC_prepared_fpr_CellChat.h5ad'
adata_new.write(output_path)