### Notebook for identifying fate probabilities and driver genes of fetal stem cells with CellRank
- **Developed by:** Anna Maguza
- **Place:** Wuerzburg Institute for Systems Immunology
- **Date:** 13th November 2023

### Import packages

In [1]:
import numpy as np
import pandas as pd

import cellrank as cr
import scanpy as sc
from cellrank.kernels import RealTimeKernel

from moscot.problems.time import TemporalProblem


In [2]:
import matplotlib.pyplot as plt

### Set up the notebook environment

In [3]:
%matplotlib inline

In [4]:
# Logging verbosity for scanpy and CellRank
sc.settings.verbosity = 3
cr.settings.verbosity = 2

# Record the package versions used for this run
sc.logging.print_versions()

# Default figure style: no axes frame, 100 dpi
sc.settings.set_figure_params(frameon=False, dpi=100)

-----
anndata     0.10.3
scanpy      1.9.5
-----
PIL                         10.1.0
absl                        NA
anyio                       NA
arrow                       1.3.0
asttokens                   NA
attr                        23.1.0
attrs                       23.1.0
babel                       2.13.1
brotli                      1.1.0
cellrank                    2.0.0
certifi                     2023.07.22
cffi                        1.16.0
charset_normalizer          3.3.2
chex                        0.1.83
cloudpickle                 3.0.0
colorama                    0.4.6
comm                        0.1.4
cycler                      0.12.1
cython_runtime              NA
dateutil                    2.8.2
debugpy                     1.8.0
decorator                   5.1.1
defusedxml                  0.7.1
docrep                      0.3.2
executing                   2.0.1
fastjsonschema              NA
flax                        0.6.1
fqdn                        NA
gmpy2

In [5]:
import warnings

# Ignore all UserWarning messages emitted after this cell has run.
warnings.simplefilter("ignore", category=UserWarning)

### Data Upload

In [6]:
# Load the AnnData file holding the fetal stem cells.
# The path lives in `input_path` rather than `input` so the Python builtin
# `input()` is not shadowed for the rest of the session.
input_path = 'FetalSC_data/Fetal_healthy_stem_cells_CellRank_experimental_time.h5ad'
adata = sc.read_h5ad(input_path)

### Initialize an estimator

In [7]:
# Rebuild the real-time kernel from what is stored in `adata` under the key "T_fwd".
vk = RealTimeKernel.from_adata(adata, key="T_fwd")

In [8]:
# Create a GPCCA estimator from the kernel `vk` and print its summary
# (no initial or terminal states are set at this point).
g = cr.estimators.GPCCA(vk)
print(g)

GPCCA[kernel=RealTimeKernel[n=7817], initial_states=None, terminal_states=None]


In [9]:
# Fit the estimator, using the `cluster` annotation as the cluster key.
# `n_states=[4, 12]` passes a range instead of a fixed number; the saved log
# shows the minChi criterion being evaluated on that interval and 6 macrostates computed.
g.fit(cluster_key="cluster", n_states=[4, 12])

Computing Schur decomposition
Adding `adata.uns['eigendecomposition_fwd']`
       `.schur_vectors`
       `.schur_matrix`
       `.eigendecomposition`
    Finish (0:00:00)
Calculating minChi criterion in interval `[4, 12]`
Computing `6` macrostates
Adding `.macrostates`
       `.macrostates_memberships`
       `.coarse_T`
       `.coarse_initial_distribution
       `.coarse_stationary_distribution`
       `.schur_vectors`
       `.schur_matrix`
       `.eigendecomposition`
    Finish (0:00:00)


GPCCA[kernel=RealTimeKernel[n=7817], initial_states=None, terminal_states=None]

In [10]:
# Pick terminal states among the macrostates with the method's default arguments;
# the saved output reports three terminal states.
g.predict_terminal_states()

Adding `adata.obs['term_states_fwd']`
       `adata.obs['term_states_fwd_probs']`
       `.terminal_states`
       `.terminal_states_probabilities`
       `.terminal_states_memberships
    Finish`


GPCCA[kernel=RealTimeKernel[n=7817], initial_states=None, terminal_states=['ASS1+_SLC40A1+_SC_1', 'ASS1+_SLC40A1+_SC_3', 'RPS10+_RPS17+_SC_2']]

In [11]:
# Pick initial states; the saved output reports a single initial state.
# NOTE(review): `allow_overlap=True` presumably permits overlap between initial and
# terminal states — confirm against the CellRank documentation.
g.predict_initial_states(allow_overlap=True)

Adding `adata.obs['init_states_fwd']`
       `adata.obs['init_states_fwd_probs']`
       `.initial_states`
       `.initial_states_probabilities`
       `.initial_states_memberships
    Finish`


GPCCA[kernel=RealTimeKernel[n=7817], initial_states=['ASS1+_SLC40A1+_SC_4'], terminal_states=['ASS1+_SLC40A1+_SC_1', 'ASS1+_SLC40A1+_SC_3', 'RPS10+_RPS17+_SC_2']]

In [12]:
# Display the estimator summary (kernel, initial states, terminal states)
g

GPCCA[kernel=RealTimeKernel[n=7817], initial_states=['ASS1+_SLC40A1+_SC_4'], terminal_states=['ASS1+_SLC40A1+_SC_1', 'ASS1+_SLC40A1+_SC_3', 'RPS10+_RPS17+_SC_2']]

### Compute fate probabilities

In [13]:
# Compute fate probabilities towards the terminal states selected above
# NOTE(review): the saved output of this cell ends with a PETSc "Broken Pipe" /
# MPI_Abort message printed after the computation reports "Finish" — worth checking
# the solver setup before relying on a re-run.
g.compute_fate_probabilities()

Computing fate probabilities


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:00<00:00, 35.29/s]

Adding `adata.obsm['lineages_fwd']`
       `.fate_probabilities`
    Finish (0:00:00)



[0]PETSC ERROR: ------------------------------------------------------------------------
[0]PETSC ERROR: Caught signal number 13 Broken Pipe: Likely while reading or writing to a socket
[0]PETSC ERROR: Try option -start_in_debugger or -on_error_attach_debugger
[0]PETSC ERROR: or see https://petsc.org/release/faq/#valgrind and https://petsc.org/release/faq/
[0]PETSC ERROR: configure using --with-debugging=yes, recompile, link, and run 
[0]PETSC ERROR: to get more information on the crash.
Abort(59) on node 0 (rank 0 in comm 0): application called MPI_Abort(MPI_COMM_WORLD, 59) - process 0


In [None]:
# Plot the fate probabilities of all terminal states together in one panel.
g.plot_fate_probabilities(same_plot=True)

In [15]:
# Inspect the AnnData object, including the annotations the estimator has added to `obs`.
adata

AnnData object with n_obs × n_vars = 7817 × 19868
    obs: 'Sample_ID', 'Cell Type', 'Study_name', 'Donor_ID', 'Diagnosis', 'Age', 'Region code', 'Fraction', 'Sex', 'Library_Preparation_Protocol', 'batch', 'Age_group', 'Location', 'Cell States', 'Cell States GCA', 'Chem', 'Layer', 'Cell States Kong', 'dataset', 'n_genes_by_counts', 'total_counts', 'total_counts_mito', 'pct_counts_mito', 'total_counts_ribo', 'pct_counts_ribo', 'Cell_ID', '_scvi_batch', '_scvi_labels', 'n_genes', 'n_counts', 'leiden', 'cluster', 'Development_Week', 'Development_Day', 'proliferation', 'apoptosis', 'macrostates_fwd', 'term_states_fwd', 'term_states_fwd_probs', 'init_states_fwd', 'init_states_fwd_probs', 'clusters_gradients'
    var: 'feature_types-0-0-0', 'gene_name-1-0-0', 'gene_id-0-0', 'GENE-1-0', 'n_counts', 'n_cells'
    uns: 'Age_colors', 'Age_group_colors', 'Donor_ID_colors', 'Library_Preparation_Protocol_colors', 'Sample_ID_colors', 'Sex_colors', 'T_fwd_params', '_scvi_manager_uuid', '_scvi_uuid', 

In [16]:
# Number of cells assigned to each terminal state.
adata.obs['term_states_fwd'].value_counts()

term_states_fwd
ASS1+_SLC40A1+_SC_1    30
ASS1+_SLC40A1+_SC_3    30
RPS10+_RPS17+_SC_2     30
Name: count, dtype: int64

In [20]:
# Copy the fate probability towards each terminal state into its own `obs` column
# so it can be used as a colour key in embedding plots.
fate_obs_columns = {
    "fate_probabilities_ASS1_SLC40A1_1": "ASS1+_SLC40A1+_SC_1",
    "fate_probabilities_ASS1_SLC40A1_3": "ASS1+_SLC40A1+_SC_3",
    "fate_probabilities_RPS10_RPS17_2": "RPS10+_RPS17+_SC_2",
}
for obs_column, lineage_name in fate_obs_columns.items():
    adata.obs[obs_column] = g.fate_probabilities[lineage_name].X.flatten()

In [None]:
# UMAP coloured by the fate probability towards each of the three terminal states.
fate_probability_columns = [
    "fate_probabilities_ASS1_SLC40A1_1",
    "fate_probabilities_ASS1_SLC40A1_3",
    "fate_probabilities_RPS10_RPS17_2",
]
sc.pl.embedding(
    adata,
    basis="umap",
    color=fate_probability_columns,
    color_map="magma_r",
    s=50,
    ncols=3,
    vmax="p96",  # cap the colour scale at the 96th percentile
)

In [None]:
# Circular projection plot coloured by the `cluster` annotation, legend on the right.
cr.pl.circular_projection(adata, keys=["cluster"], legend_loc="right")

# Uncover driver genes

### Correlate fate probabilities with gene expression
If a gene is systematically more highly (or more lowly) expressed in cells that are more (or less) likely to differentiate towards a given terminal state, then we call this gene a putative driver gene.

### Stem cells ASS1 SLC40A1_1

In [25]:
# Display the estimator summary (kernel, initial states, terminal states)
g

GPCCA[kernel=RealTimeKernel[n=7817], initial_states=['ASS1+_SLC40A1+_SC_4'], terminal_states=['ASS1+_SLC40A1+_SC_1', 'ASS1+_SLC40A1+_SC_3', 'RPS10+_RPS17+_SC_2']]

In [26]:
# Number of cells per category of `clusters_gradients`.
# NOTE(review): no cell visible here creates this column — presumably it comes from
# the loaded file; confirm its origin.
adata.obs['clusters_gradients'].value_counts()

clusters_gradients
ASS1+_SLC40A1+_SC_1    7349
RPS10+_RPS17+_SC_2      438
ASS1+_SLC40A1+_SC_3      30
Name: count, dtype: int64

In [29]:
# Number of cells assigned to each macrostate.
adata.obs['macrostates_fwd'].value_counts()

macrostates_fwd
ASS1+_SLC40A1+_SC_1    30
RPS10+_RPS17+_SC_1     30
ASS1+_SLC40A1+_SC_2    30
ASS1+_SLC40A1+_SC_3    30
ASS1+_SLC40A1+_SC_4    30
RPS10+_RPS17+_SC_2     30
Name: count, dtype: int64

In [30]:
# Restrict the driver analysis to cells of the three macrostates that were not
# selected as terminal states.
driver_clusters = ["RPS10+_RPS17+_SC_1", "ASS1+_SLC40A1+_SC_2", "ASS1+_SLC40A1+_SC_4"]

# Putative drivers of the ASS1+_SLC40A1+_SC_1 lineage.
# NOTE(review): the ten top hits in the saved output are all mitochondrial (MT-*)
# genes — check whether this reflects cell quality rather than lineage biology.
ASS1_SLC40A1_1_df = g.compute_lineage_drivers(
    lineages=["ASS1+_SLC40A1+_SC_1"],
    cluster_key="macrostates_fwd",
    clusters=driver_clusters,
)

ASS1_SLC40A1_1_df.head(10)


Adding `adata.varm['terminal_lineage_drivers']`
       `.lineage_drivers`
    Finish (0:00:00)


Unnamed: 0_level_0,ASS1+_SLC40A1+_SC_1_corr,ASS1+_SLC40A1+_SC_1_pval,ASS1+_SLC40A1+_SC_1_qval,ASS1+_SLC40A1+_SC_1_ci_low,ASS1+_SLC40A1+_SC_1_ci_high
gene_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
MT-ND5,0.911091,1.969001e-46,2.786727e-42,0.86772,0.940693
MT-CO2,0.892919,6.425274e-41,3.03123e-37,0.84142,0.928345
MT-CO1,0.866685,8.157761e-35,1.649383e-31,0.803876,0.910379
MT-CYB,0.864009,2.781182e-34,4.920259e-31,0.800074,0.908537
MT-CO3,0.852878,3.296628e-32,3.888098e-29,0.784315,0.900856
MT-ATP6,0.852676,3.577832e-32,3.895158e-29,0.784031,0.900717
MT-ND3,0.852416,3.976236e-32,4.019691e-29,0.783664,0.900537
MT-ND1,0.823539,1.2620930000000001e-27,7.442668e-25,0.743198,0.880468
MT-ND4L,0.769449,2.012378e-21,5.933581e-19,0.668953,0.84232
MT-ND4,0.724257,1.2423270000000002e-17,2.919169e-15,0.608422,0.809877


In [None]:
# UMAP of the ASS1+_SLC40A1+_SC_1 fate probability followed by the genes in the
# first 20 rows of its driver table.
ASS1_SLC40A1_1_top_genes = ASS1_SLC40A1_1_df.index[:20].tolist()
sc.pl.embedding(
    adata,
    basis="umap",
    color=["fate_probabilities_ASS1_SLC40A1_1", *ASS1_SLC40A1_1_top_genes],
    color_map="magma_r",
    s=50,
    ncols=5,
    vmax="p96",  # cap the colour scale at the 96th percentile
)


### Stem cells ASS1 SLC40A1_3

In [36]:
# Restrict the driver analysis to cells of the three macrostates that were not
# selected as terminal states.
driver_clusters = ["RPS10+_RPS17+_SC_1", "ASS1+_SLC40A1+_SC_2", "ASS1+_SLC40A1+_SC_4"]

# Putative drivers of the ASS1+_SLC40A1+_SC_3 lineage.
# NOTE(review): in the saved output, rows 3-10 share exactly the same correlation
# and p-value — this may indicate genes detected in the same very small set of
# cells; verify before interpreting them as drivers.
ASS1_SLC40A1_3_df = g.compute_lineage_drivers(
    lineages=["ASS1+_SLC40A1+_SC_3"],
    cluster_key="macrostates_fwd",
    clusters=driver_clusters,
)

ASS1_SLC40A1_3_df.head(10)


Adding `adata.varm['terminal_lineage_drivers']`
       `.lineage_drivers`
    Finish (0:00:00)


Unnamed: 0_level_0,ASS1+_SLC40A1+_SC_3_corr,ASS1+_SLC40A1+_SC_3_pval,ASS1+_SLC40A1+_SC_3_qval,ASS1+_SLC40A1+_SC_3_ci_low,ASS1+_SLC40A1+_SC_3_ci_high
gene_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
RAB23,0.860392,1.387597e-33,1.9638670000000002e-29,0.794943,0.906044
SH3BP5L,0.759671,1.6198169999999998e-20,1.7188010000000003e-17,0.655742,0.835345
MAP3K5,0.758496,2.064552e-20,1.7188010000000003e-17,0.654158,0.834505
NANOG,0.758496,2.064552e-20,1.7188010000000003e-17,0.654158,0.834505
TPBG,0.758496,2.064552e-20,1.7188010000000003e-17,0.654158,0.834505
KCNS3,0.758496,2.064552e-20,1.7188010000000003e-17,0.654158,0.834505
SNHG22,0.758496,2.064552e-20,1.7188010000000003e-17,0.654158,0.834505
C16orf86,0.758496,2.064552e-20,1.7188010000000003e-17,0.654158,0.834505
RNF166,0.758496,2.064552e-20,1.7188010000000003e-17,0.654158,0.834505
AL355922.2,0.758496,2.064552e-20,1.7188010000000003e-17,0.654158,0.834505


In [None]:
# UMAP of the ASS1+_SLC40A1+_SC_3 fate probability followed by the genes in the
# first 20 rows of its driver table.
ASS1_SLC40A1_3_top_genes = ASS1_SLC40A1_3_df.index[:20].tolist()
sc.pl.embedding(
    adata,
    basis="umap",
    color=["fate_probabilities_ASS1_SLC40A1_3", *ASS1_SLC40A1_3_top_genes],
    color_map="magma_r",
    s=50,
    ncols=5,
    vmax="p96",  # cap the colour scale at the 96th percentile
)

### Stem cells RPS10 RPS17_2

In [39]:
# Restrict the driver analysis to cells of the three macrostates that were not
# selected as terminal states.
driver_clusters = ["RPS10+_RPS17+_SC_1", "ASS1+_SLC40A1+_SC_2", "ASS1+_SLC40A1+_SC_4"]

# Putative drivers of the RPS10+_RPS17+_SC_2 lineage.
# NOTE(review): the ten top hits in the saved output are all ribosomal protein
# (RPS*/RPL*) genes — consider whether this is a technical rather than biological signal.
RPS10_RPS17_2_df = g.compute_lineage_drivers(
    lineages=["RPS10+_RPS17+_SC_2"],
    cluster_key="macrostates_fwd",
    clusters=driver_clusters,
)

RPS10_RPS17_2_df.head(10)

Adding `adata.varm['terminal_lineage_drivers']`
       `.lineage_drivers`
    Finish (0:00:00)


Unnamed: 0_level_0,RPS10+_RPS17+_SC_2_corr,RPS10+_RPS17+_SC_2_pval,RPS10+_RPS17+_SC_2_qval,RPS10+_RPS17+_SC_2_ci_low,RPS10+_RPS17+_SC_2_ci_high
gene_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
RPS6,0.906815,5.198548e-45,3.678753e-41,0.86151,0.937795
RPLP1,0.888282,1.051759e-39,3.721388e-36,0.834747,0.925181
RPS24,0.887407,1.752253e-39,4.2630129999999996e-36,0.83349,0.924584
RPL10,0.887354,1.807255e-39,4.2630129999999996e-36,0.833413,0.924547
RPS27A,0.859903,1.717139e-33,2.7002969999999998e-30,0.79425,0.905707
RPL10A,0.858495,3.152881e-33,4.4622729999999995e-30,0.792257,0.904736
RPS3A,0.857942,3.9947590000000004e-33,5.139802e-30,0.791474,0.904354
RPS13,0.847495,2.798851e-31,2.640809e-28,0.776726,0.897132
RPS14,0.845366,6.343070000000001e-31,5.610842e-28,0.77373,0.895657
RPS8,0.836484,1.646682e-29,1.370911e-26,0.761265,0.88949


In [None]:
# UMAP of the RPS10+_RPS17+_SC_2 fate probability followed by the genes in the
# first 20 rows of its driver table.
RPS10_RPS17_2_top_genes = RPS10_RPS17_2_df.index[:20].tolist()
sc.pl.embedding(
    adata,
    basis="umap",
    color=["fate_probabilities_RPS10_RPS17_2", *RPS10_RPS17_2_top_genes],
    color_map="magma_r",
    s=50,
    ncols=5,
    vmax="p96",  # cap the colour scale at the 96th percentile
)


In [41]:
# Inspect the final AnnData object before saving.
# NOTE(review): the saved output lists an obs column 'fate_probabilities_RPS10+_RPS17+_2'
# that no visible cell creates — likely left over from an earlier run; consider
# dropping it before writing the file.
adata

AnnData object with n_obs × n_vars = 7817 × 19868
    obs: 'Sample_ID', 'Cell Type', 'Study_name', 'Donor_ID', 'Diagnosis', 'Age', 'Region code', 'Fraction', 'Sex', 'Library_Preparation_Protocol', 'batch', 'Age_group', 'Location', 'Cell States', 'Cell States GCA', 'Chem', 'Layer', 'Cell States Kong', 'dataset', 'n_genes_by_counts', 'total_counts', 'total_counts_mito', 'pct_counts_mito', 'total_counts_ribo', 'pct_counts_ribo', 'Cell_ID', '_scvi_batch', '_scvi_labels', 'n_genes', 'n_counts', 'leiden', 'cluster', 'Development_Week', 'Development_Day', 'proliferation', 'apoptosis', 'macrostates_fwd', 'term_states_fwd', 'term_states_fwd_probs', 'init_states_fwd', 'init_states_fwd_probs', 'clusters_gradients', 'fate_probabilities_ASS1_SLC40A1_1', 'fate_probabilities_ASS1_SLC40A1_3', 'fate_probabilities_RPS10+_RPS17+_2', 'fate_probabilities_RPS10_RPS17_2'
    var: 'feature_types-0-0-0', 'gene_name-1-0-0', 'gene_id-0-0', 'GENE-1-0', 'n_counts', 'n_cells'
    uns: 'Age_colors', 'Age_group_color

In [None]:
# Save adata
# NOTE(review): this is the same path the data-upload cell reads from, so the input
# file is overwritten in place and a re-run would start from already-annotated data —
# consider writing to a separate output file.
output = 'FetalSC_data/Fetal_healthy_stem_cells_CellRank_experimental_time.h5ad'
adata.write(output)