# Notebook for cell-cell interaction analysis between fetal stem cells and myeloid cells using the NicheNet package
- **Developed by:** Anna Maguza
- **Place:** Wuerzburg Institute for Systems Immunology
- **Date:** 15th January 2024

### Import required modules

In [1]:
# Setting up R dependencies
import anndata2ri
import rpy2
from rpy2.robjects import r
import random

# Activate anndata2ri's converter for rpy2 before the R magics are loaded.
# NOTE(review): the cell output echoes this call, which looks like a warning being
# raised here — confirm whether `activate()` is deprecated in the installed anndata2ri (1.3.1).
anndata2ri.activate()

# Load the rpy2 IPython extension; it provides the `%%R` cell magic used further down.
%load_ext rpy2.ipython

  anndata2ri.activate()


In [2]:
!cd/home/amaguza/R_projects/NicheNet

/bin/bash: line 1: cd/home/amaguza/R_projects/NicheNet: No such file or directory


In [3]:
import scanpy as sc
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import anndata as ad
import decoupler as dc

In [4]:
%%R
# Attach reticulate and the tidyverse helpers used by the R cells, hiding their startup banners.
suppressPackageStartupMessages(
    invisible(lapply(
        c("reticulate", "ggplot2", "tidyr", "dplyr", "purrr", "tibble"),
        library,
        character.only = TRUE
    ))
)


    an issue that caused a segfault when used with rpy2:
    https://github.com/rstudio/reticulate/pull/1188
    Make sure that you use a version of that package that includes
    the fix.
    

In (function (package, help, pos = 2, lib.loc = NULL, character.only = FALSE,  :
  library ‘/usr/share/R/library’ contains no packages


### Set up working environment

In [5]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [6]:
# Raise scanpy's logging verbosity and record the package versions used for this run.
sc.settings.verbosity = 3
sc.logging.print_versions()

# One shared figure style for every plot produced in this notebook.
sc.settings.set_figure_params(
    dpi=180,
    dpi_save=300,
    color_map="magma_r",
    format="svg",
    vector_friendly=True,
)

-----
anndata     0.10.4
scanpy      1.9.6
-----
PIL                         10.2.0
anndata2ri                  1.3.1
anyio                       NA
arrow                       1.3.0
asttokens                   NA
attr                        23.2.0
attrs                       23.2.0
babel                       2.14.0
backcall                    0.2.0
brotli                      1.1.0
certifi                     2023.11.17
cffi                        1.16.0
charset_normalizer          3.3.2
colorama                    0.4.6
comm                        0.2.1
cycler                      0.12.1
cython_runtime              NA
dateutil                    2.8.2
debugpy                     1.8.0
decorator                   5.1.1
decoupler                   1.5.0
defusedxml                  0.7.1
exceptiongroup              1.2.0
executing                   2.0.1
fastjsonschema              NA
fqdn                        NA
get_annotations             NA
h5py                        3.10.0
idna 

In [7]:
def X_is_raw(adata):
    """Report whether ``adata.X`` looks like untransformed integer counts.

    The per-column sums of ``adata.X`` are compared with the same sums cast to
    ``int``; the two agree only when every column total is a whole number.
    Note that the test is on column totals, not on individual entries.

    Returns
    -------
    bool
        ``True`` when all column sums are integral, ``False`` otherwise.
    """
    column_totals = adata.X.sum(axis=0)
    return np.array_equal(column_totals.astype(int), column_totals)

### Data Upload

In [8]:
# Relative path to the input .h5ad (fetal stem cells + myeloid cells, per the file name);
# it is resolved against the kernel's current working directory.
input_path = 'FetalSC_data/FetalSC_and_other_fetal_cells/FetalSC_and_myeloid_cells_raw.h5ad'
adata = sc.read_h5ad(input_path)

In [9]:
# Extract the raw counts: build an AnnData from the `.raw` slot and rebind `adata` to it.
# NOTE(review): this overwrites the loaded object, so the cell is not idempotent —
# run the notebook top to bottom rather than re-running this cell in isolation.
adata = adata.raw.to_adata()
adata

AnnData object with n_obs × n_vars = 14997 × 26442
    obs: 'Sample_ID', 'Cell Type', 'Study_name', 'Donor_ID', 'Diagnosis', 'Age', 'Region code', 'Fraction', 'Sex', 'Library_Preparation_Protocol', 'batch', 'Age_group', 'Location', 'Cell States', 'Cell States GCA', 'Chem', 'Layer', 'Cell States Kong', 'dataset', 'n_genes_by_counts', 'total_counts', 'total_counts_mito', 'pct_counts_mito', 'total_counts_ribo', 'pct_counts_ribo', 'Cell_ID', '_scvi_batch', '_scvi_labels'
    var: 'feature_types-0-0-0', 'gene_name-1-0-0', 'gene_id-0-0', 'GENE-1-0'

In [10]:
# Sanity check: the column sums of X must be whole numbers if X holds raw counts.
X_is_raw(adata)

True

In [11]:
# Keep the raw counts in `adata.raw` before X is normalised further down.
adata.raw = adata

In [12]:
# Also keep a copy of the raw counts as a layer; the pseudo-bulk step reads `layer="counts"`.
adata.layers['counts'] = adata.X.copy()

In [13]:
# Scale every cell to a total of 1e6 counts; very highly expressed genes are left out of
# the normalisation-factor computation (they are listed in the cell output).
sc.pp.normalize_total(adata, target_sum = 1e6, exclude_highly_expressed = True)
# Log-transform the normalised values; `adata.X` is modified, the raw counts stay in the `counts` layer.
sc.pp.log1p(adata)

normalizing counts per cell The following highly-expressed genes are not considered during normalization factor computation:
['ACTB', 'CD74', 'CLC', 'CST3', 'CXCL8', 'DEFA5', 'DEFA6', 'FOS', 'FTH1', 'FTL', 'HBA2', 'HBG2', 'HSP90AA1', 'HSPA1A', 'HSPA6', 'HSPB1', 'LYZ', 'MALAT1', 'MT-ATP6', 'MT-ATP8', 'MT-CO1', 'MT-CO2', 'MT-CO3', 'MT-CYB', 'MT-ND3', 'MT-ND4', 'MT-ND4L', 'MTRNR2L12', 'NEAT1', 'PF4', 'PPBP', 'PRG2', 'S100A12', 'S100A8', 'S100A9', 'SPP1', 'SST', 'TMSB4X', 'TPSAB1']
    finished (0:00:00)


+ Load NicheNet Prior-Knowledge

In [14]:
%%R
# Load NicheNet (nichenetr is only available on GitHub).
# Install it on first use, then ALWAYS attach it: the previous `if (!require(...)) install`
# pattern left the package un-attached right after a fresh install, because `require()`
# had already returned FALSE and nothing loaded the package afterwards.
suppressPackageStartupMessages({
    if (!requireNamespace("nichenetr", quietly = TRUE)) {
        remotes::install_github("saeyslab/nichenetr", upgrade = "never")
    }
    library(nichenetr)
})

In [15]:
%%R
# Increase timeout threshold (R's `timeout` option is in seconds) so the downloads below can finish
options(timeout=600)

# Load PK: the NicheNet prior-knowledge objects are read directly from Zenodo record 7074291,
# so this cell needs network access and re-downloads both files on every run.
#   ligand_target_matrix - ligand-target matrix object from the record
#   lr_network           - ligand-receptor network; its `from` / `to` columns are used below
ligand_target_matrix <- readRDS(url("https://zenodo.org/record/7074291/files/ligand_target_matrix_nsga2r_final.rds"))
lr_network <- readRDS(url("https://zenodo.org/record/7074291/files/lr_network_human_21122021.rds"))

+ Step 1. Define cell types of interest to be considered as senders/sources and receivers/targets of CCC interactions

In [16]:
# Number of cells per annotated state; used to pick the sender and receiver populations below.
adata.obs['Cell States'].value_counts()

Cell States
MTRNR2L12+ASS1+_SC               3979
RPS10+_RPS17+_SC                 3544
Macrophages                      3105
LYVE1+ Macrophage                1757
cDC2                             1232
Monocytes                         439
FXYD3+_CKB+_SC                    294
cDC1                              210
CLC+ Mast cell                    121
MPO+ mono-neutrophil              114
pDC                               102
Megakaryocyte                      56
Mast cell                          31
Lymphoid DC                         9
MMP9+ Inflammatory macrophage       4
Name: count, dtype: int64

In [17]:
# Stem-cell states are treated as ligand senders ...
sender_celltypes = [
    "MTRNR2L12+ASS1+_SC",
    "FXYD3+_CKB+_SC",
    "RPS10+_RPS17+_SC",
]
# ... and the two macrophage populations as receivers.
receiver_celltypes = [
    "Macrophages",
    "LYVE1+ Macrophage",
]

+ Step 2. Define a set of ligands that can potentially affect receiver cell types

In [18]:
# Helper function to obtain sufficiently expressed genes
from functools import reduce


def get_expressed_genes(adata, cell_type, expr_prop, groupby="Cell States"):
    """Return the genes detected in at least ``expr_prop`` of one group's cells.

    Parameters
    ----------
    adata
        AnnData-like object. ``adata.obs[groupby]`` holds the group labels,
        ``adata.X`` the expression matrix (rows = cells, columns aligned with
        ``adata.var_names``); both sparse and dense matrices are accepted.
    cell_type
        Label in ``adata.obs[groupby]`` selecting the cells to evaluate.
    expr_prop
        Minimum fraction of the selected cells in which a gene must be
        non-zero to be kept (inclusive threshold).
    groupby
        Name of the ``adata.obs`` column holding the labels. Defaults to
        ``"Cell States"``, the column previously hard-coded here.

    Returns
    -------
    numpy.ndarray
        Names of the retained genes, sorted alphabetically. Empty when no
        cell carries ``cell_type``.
    """
    subset = adata[adata.obs[groupby] == cell_type, :]
    matrix = subset.X
    n_cells = matrix.shape[0]

    # Number of cells with a non-zero value, per gene.
    if hasattr(matrix, "getnnz"):
        # scipy sparse matrices: count stored entries per column
        detected = np.asarray(matrix.getnnz(axis=0)).ravel()
    else:
        # dense arrays have no getnnz(); count non-zero entries instead
        detected = np.count_nonzero(np.asarray(matrix), axis=0)

    if n_cells:
        props = detected / n_cells
    else:
        # No cells in this group: mark every proportion as undefined so that no
        # gene passes the threshold (and no divide-by-zero warning is raised).
        props = np.full(detected.shape, np.nan)

    stats = pd.DataFrame({"genes": subset.var_names, "props": props}).sort_values("genes")

    # keep only the sufficiently expressed genes
    return stats.loc[stats["props"] >= expr_prop, "genes"].values

In [19]:
def _union_expressed(cell_types):
    """Union of the genes expressed (proportion >= 0.1) in any of ``cell_types``."""
    per_type = (
        get_expressed_genes(adata, cell_type=label, expr_prop=0.1)
        for label in cell_types
    )
    return reduce(np.union1d, per_type)


# Genes available on each side of the interaction
sender_expressed = _union_expressed(sender_celltypes)
receiver_expressed = _union_expressed(receiver_celltypes)

In [20]:
%%R -i sender_expressed -i receiver_expressed
# Every ligand (`from`) and receptor (`to`) listed in the prior-knowledge network
ligands <- unique(pull(lr_network, from))
receptors <- unique(pull(lr_network, to))

# Restrict both to genes that are actually expressed on the matching side of the data
expressed_ligands <- intersect(ligands, sender_expressed)
expressed_receptors <- intersect(receptors, receiver_expressed)

# Ligands with at least one pair in which ligand AND receptor are both expressed
expressed_pairs <- filter(
  lr_network,
  from %in% expressed_ligands,
  to %in% expressed_receptors
)
potential_ligands <- unique(pull(expressed_pairs, from))

+ Step 3. Define a gene set of interest in receiver cell type(s)

In [21]:
# Get pseudo-bulk profile: one profile per (Sample_ID, Cell States) combination,
# built from the raw `counts` layer rather than the log-normalised X.
# NOTE(review): `min_prop` / `min_smpls` appear to be deprecated in recent decoupler
# releases in favour of `dc.filter_by_prop` — confirm against the installed version (1.5.0).
pdata = dc.get_pseudobulk(
    adata,
    sample_col="Sample_ID",
    groups_col="Cell States",
    min_prop=0.1,
    min_smpls=3,
    layer="counts",
)

  cols = obs.groupby([sample_col, groups_col]).apply(lambda x: x.apply(lambda y: len(y.unique()) == 1)).all(0)


In [22]:
# Store the raw pseudo-bulk counts before X is normalised below
pdata.layers["counts"] = pdata.X.copy()

# Library size per pseudo-bulk profile, recorded before normalisation
# (e.g. to check whether PC1 captures a meaningful biological signal or a technical one)
pdata.obs["lib_size"] = pdata.X.sum(1)

# Normalize each profile to 1e4 total counts, then log1p-transform
sc.pp.normalize_total(pdata, target_sum=1e4)
sc.pp.log1p(pdata)
# Display the resulting object
pdata

normalizing counts per cell
    finished (0:00:00)


  pdata.layers["counts"] = pdata.X.copy()


AnnData object with n_obs × n_vars = 171 × 23275
    obs: 'Sample_ID', 'Study_name', 'Donor_ID', 'Diagnosis', 'Age', 'Region code', 'Fraction', 'Sex', 'Library_Preparation_Protocol', 'batch', 'Age_group', 'Location', 'Cell States', 'Chem', 'Layer', 'Cell States Kong', 'dataset', '_scvi_batch', 'psbulk_n_cells', 'psbulk_counts', 'lib_size'
    var: 'feature_types-0-0-0', 'gene_name-1-0-0', 'gene_id-0-0', 'GENE-1-0'
    uns: 'log1p'
    layers: 'psbulk_props', 'counts'

In [None]:
# Per-cell-state contrast (t-test) between a condition and a reference on the pseudo-bulk profiles.
# NOTE(review): this cell has not been executed. The `pdata.obs` columns shown in the repr
# above do not include "condition", and no "stim" / "ctrl" labels appear in the cells shown —
# these look like placeholders; replace them with a real obs column and its levels before running.
logFCs, pvals = dc.get_contrast(
    pdata,
    group_col="Cell States",
    condition_col="condition",
    condition="stim",
    reference="ctrl",
    method="t-test",
)