### Notebook for the label transfer from the reference mesenchymal dataset to the rest of the mesenchymal cells from the Gut Cell Atlas (GCA) using `scANVI`

- **Developed by:** Anna Maguza
- **Institute of Computational Biology - Computational Health Department - Helmholtz Munich**
- 27th February 2022

### Import required modules

In [1]:
import scvi
import anndata
import warnings
import numpy as np
import scanpy as sc
import pandas as pd
import matplotlib.pyplot as plt

Global seed set to 0
  new_rank_zero_deprecation(
  return new_rank_zero_deprecation(*args, **kwargs)


### Set up working environment

In [2]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [3]:
# Verbose scanpy logging (level 3) plus a printed record of the package versions in use.
sc.settings.verbosity = 3
sc.logging.print_versions()

# Global figure defaults applied to every scanpy plot in this notebook.
sc.settings.set_figure_params(
    dpi = 180,
    dpi_save = 300,
    color_map = 'magma_r',
    vector_friendly = True,
    format = 'svg',
)

-----
anndata     0.8.0
scanpy      1.9.1
-----
PIL                 9.2.0
absl                NA
asttokens           NA
attr                22.1.0
backcall            0.2.0
beta_ufunc          NA
binom_ufunc         NA
bottleneck          1.3.5
brotli              NA
certifi             2022.09.24
cffi                1.15.1
charset_normalizer  2.1.1
chex                0.1.5
contextlib2         NA
cycler              0.10.0
cython_runtime      NA
dateutil            2.8.2
debugpy             1.6.3
decorator           5.1.1
defusedxml          0.7.1
deprecate           0.3.2
docrep              0.3.2
entrypoints         0.4
executing           1.2.0
flax                0.6.2
fsspec              2022.11.0
google              NA
h5py                3.7.0
hypergeom_ufunc     NA
idna                3.4
ipykernel           6.17.1
ipython_genutils    0.2.0
ipywidgets          8.0.2
jax                 0.3.25
jaxlib              0.3.25
jedi                0.18.2
joblib              1.2.0
jupyt

In [4]:
# Ignore every warning for the rest of the session.
# NOTE(review): this blanket filter also hides deprecation warnings from scanpy/scvi — consider narrowing it.
warnings.simplefilter(action = 'ignore')
# Fix the scvi-tools global seed (the cell output reports "Global seed set to 1712").
scvi.settings.seed = 1712
# Inline figure rendering: white figure facecolor and 'retina' figure format.
%config InlineBackend.print_figure_kwargs = {'facecolor' : "w"}
%config InlineBackend.figure_format = 'retina'

Global seed set to 1712


In [5]:
# Network hyper-parameters collected in one mapping.
# NOTE(review): no cell visible in this notebook passes `arches_params` to a model — confirm whether it should be
# unpacked into the SCVI constructor or removed.
arches_params = {
    "use_layer_norm": "both",
    "use_batch_norm": "none",
    "encode_covariates": True,
    "dropout_rate": 0.2,
    "n_layers": 2,
}

### Read in Reference object

In [6]:
# Path of the reference .h5ad that is loaded in the next cell.
reference_input = '/lustre/groups/talaveralopez/workspace/anna.maguza/Processed_datasets/expi_map/Mesenchymal_reference_map_subset_after_geosketch2.h5ad'
# Intended output path for the reference object.
# NOTE(review): `reference_output` is not used by any cell visible in this notebook.
reference_output = '/lustre/groups/talaveralopez/workspace/anna.maguza/Processed_datasets/expi_map/Mesenchymal_reference_map_subset_after_geosketch2_output.h5ad'

In [7]:
# Load the reference AnnData object from disk.
reference = sc.read_h5ad(reference_input)
# Display the expression matrix summary (shape, dtype, sparse format) as the cell output.
reference.X

<107299x40144 sparse matrix of type '<class 'numpy.float32'>'
	with 218795216 stored elements in Compressed Sparse Row format>

In [8]:
# Display the per-cell metadata table of the reference object.
reference.obs

Unnamed: 0_level_0,UniqueCell_ID,Sample_ID,CellType,Study_name,Diagnosis,Age,Region code,Fraction,Gender,10X,...,doublet_scores,predicted_doublets,Age_group,Integrated_05,total_counts_ribo,Location,n_counts,percent_mito,percent_ribo,Subject
cell_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AACACGTTCTTGCATT_Ileum-1_Stem Cell,AACACGTTCTTGCATT_Ileum-1_Stem Cell,Ileum-1,Stem Cell,Wang,Healthy adult,,,,,,...,,,,Stem_Cells_ext,,,,,,
AACCGCGCATGAAGTA_Ileum-1_Stem Cell,AACCGCGCATGAAGTA_Ileum-1_Stem Cell,Ileum-1,Stem Cell,Wang,Healthy adult,,,,,,...,,,,Stem_Cells_ext,,,,,,
AACTCAGAGCGATCCC_Ileum-1_Stem Cell,AACTCAGAGCGATCCC_Ileum-1_Stem Cell,Ileum-1,Stem Cell,Wang,Healthy adult,,,,,,...,,,,Stem_Cells_ext,,,,,,
AACTCCCTCTCAACTT_Ileum-1_Stem Cell,AACTCCCTCTCAACTT_Ileum-1_Stem Cell,Ileum-1,Stem Cell,Wang,Healthy adult,,,,,,...,,,,Stem_Cells_ext,,,,,,
AACTCTTAGCTTCGCG_Ileum-1_Stem Cell,AACTCTTAGCTTCGCG_Ileum-1_Stem Cell,Ileum-1,Stem Cell,Wang,Healthy adult,,,,,,...,,,,Stem_Cells_ext,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TGACGGCTCCGTTGCT-1-4918STDY7718976,,,Mesenchymal,Gut Cell Atlas,fetal,10Wk,FTIL,SC-EPCAMP,M,3',...,0.043699421965317904,False,First trim,Stromal 1 (ADAMDEC1+),1721.0,SmallInt,5303.0,1.7605543,32.453327,BRC2134
TTAGTTCGTGTGGTTT-1-4918STDY7718976,,,Mesenchymal,Gut Cell Atlas,fetal,10Wk,FTIL,SC-EPCAMP,M,3',...,0.01658944851292781,False,First trim,Stromal 1 (ADAMDEC1+),1219.0,SmallInt,3829.0,1.1939639,31.835989,BRC2134
TTCCCAGGTGCGATAG-1-4918STDY7718976,,,Mesenchymal,Gut Cell Atlas,fetal,10Wk,FTIL,SC-EPCAMP,M,3',...,0.01797511138423721,False,First trim,Stromal 1 (ADAMDEC1+),1318.0,SmallInt,3603.0,1.5593741,36.580627,BRC2134
TTGCCGTTCCCAACGG-1-4918STDY7718976,,,Mesenchymal,Gut Cell Atlas,fetal,10Wk,FTIL,SC-EPCAMP,M,3',...,0.06678700361010828,False,First trim,Stromal 1 (ADAMDEC1+),1154.0,SmallInt,3010.0,1.9038773,38.33887,BRC2134


In [9]:
# Reference cells use their existing `Integrated_05` annotation as seed labels;
# `seed_labels` is the `labels_key` registered with scvi-tools further down.
reference.obs['seed_labels'] = reference.obs['Integrated_05'].copy()

### Read query object

In [10]:
# Path of the query .h5ad that is loaded in the next cell.
query_input = '/lustre/groups/talaveralopez/workspace/anna.maguza/Processed_datasets/expi_map/Mesenchymal_query_map_after_geosketch2.h5ad'
# Intended output path for the query object.
# NOTE(review): `query_output` is not used by any cell visible in this notebook.
query_output = '/lustre/groups/talaveralopez/workspace/anna.maguza/Processed_datasets/expi_map/Mesenchymal_query_map_after_geosketch2_output.h5ad'

In [11]:
# Load the query AnnData object from disk.
query = sc.read_h5ad(query_input)
# Display the expression matrix summary (shape, dtype, sparse format) as the cell output.
query.X

<50994x40144 sparse matrix of type '<class 'numpy.float32'>'
	with 76880166 stored elements in Compressed Sparse Row format>

In [12]:
# Every query cell gets the placeholder label 'Unknown', the same string that is
# passed as the unlabeled category to `SCANVI.from_scvi_model` further down.
query.obs['seed_labels'] = 'Unknown'

In [13]:
# Concatenate reference and query
# The origin of each cell is recorded in obs['dataset'] as 'reference' or 'query'.
# NOTE(review): `AnnData.concatenate` is the older API; anndata also offers `anndata.concat` — confirm which
# one should be used with the pinned anndata version before changing it.
adata = reference.concatenate(query, batch_key = 'dataset', batch_categories = ['reference', 'query'])

### Select HVGs

In [14]:
# Keep a full copy of the concatenated object before it is subset to HVGs.
# NOTE(review): `adata_raw` is not used by any later visible cell and duplicates the whole matrix in memory.
adata_raw = adata.copy()
# Store X in a `counts` layer; HVG selection and the scVI setup below both read this layer.
# NOTE(review): flavor "seurat_v3" expects count data — confirm X holds unnormalised counts at this point.
adata.layers['counts'] = adata.X.copy()

# Select the 7000 most variable genes (batch-aware) and subset `adata` to them in place.
# NOTE(review): `donor` is not among the obs columns shown in the (truncated) table above, and the scVI setup
# below uses `Sample_ID` as batch — confirm `donor` exists and is populated for both reference and query cells.
sc.pp.highly_variable_genes(
    adata,
    flavor = "seurat_v3",
    n_top_genes = 7000,
    layer = "counts",
    batch_key = "donor",
    subset = True
)

### Pre-train the `scVI` model used for annotation transfer with `scANVI`

In [None]:
# Register the raw-count layer, the per-sample batch covariate and the seed labels with scvi-tools.
scvi.model.SCVI.setup_anndata(
    adata,
    layer = 'counts',
    batch_key = 'Sample_ID',
    labels_key = "seed_labels",
)

In [None]:
# Build the scVI model on the registered AnnData.
# NOTE(review): the `arches_params` mapping defined earlier is not passed here.
scvi_model = scvi.model.SCVI(
    adata,
    n_layers = 3,
    n_latent = 50,
    gene_likelihood = 'nb',
    dispersion = 'gene-batch',
)

In [None]:
# Train scVI for 100 epochs.
scvi_model.train(max_epochs = 100)

### Label transfer with `scANVI` 

In [None]:
# Initialise scANVI from the trained scVI model; cells labelled 'Unknown' are treated as unlabeled.
scanvi_model = scvi.model.SCANVI.from_scvi_model(
    scvi_model,
    unlabeled_category = 'Unknown',
)

In [None]:
# Train scANVI for 25 epochs.
scanvi_model.train(max_epochs = 25)

In [None]:
# Predict a label for every cell in `adata` (reference and query) and store it in obs['C_scANVI'].
adata.obs["C_scANVI"] = scanvi_model.predict(adata)

- Extract latent representation

In [None]:
# Store the scANVI latent embedding in obsm['X_scANVI']; it is the `use_rep` of the neighbour graph below.
adata.obsm["X_scANVI"] = scanvi_model.get_latent_representation(adata)

- Visualise corrected dataset

In [None]:
# Neighbour graph built on the scANVI latent space.
sc.pp.neighbors(
    adata,
    n_neighbors = 50,
    use_rep = "X_scANVI",
    metric = 'minkowski',
)
# UMAP embedding with a fixed random state so the layout is reproducible.
sc.tl.umap(
    adata,
    spread = 2,
    min_dist = 0.2,
    random_state = 1712,
)

In [None]:
# Display the per-cell metadata of the combined object, including the new `C_scANVI` column.
adata.obs

In [None]:
# One UMAP panel per metadata column, laid out three per row.
umap_colour_keys = ['C_scANVI', 'Integrated_05', 'CellType', 'dataset', 'Diagnosis', 'Study_name']
sc.pl.umap(
    adata,
    color = umap_colour_keys,
    ncols = 3,
    size = 0.6,
    frameon = False,
    legend_fontsize = 5,
)

In [None]:
#Write anndata object to file
# The same path is read back at the start of the validation section below.
adata.write_h5ad('/lustre/groups/talaveralopez/workspace/anna.maguza/Processed_datasets/expi_map/Mesenchymal_cells_after_scanvi.h5ad')

### Model validation

In [None]:
# Path of the object written by the label-transfer section above.
input_file = '/lustre/groups/talaveralopez/workspace/anna.maguza/Processed_datasets/expi_map/Mesenchymal_cells_after_scanvi.h5ad'
# NOTE(review): `output_file` is not used by any cell visible in this notebook.
output_file = '/lustre/groups/talaveralopez/workspace/anna.maguza/Processed_datasets/expi_map/Mesenchymal_cells_after_scanvi_output.h5ad'
# Re-load the annotated object so validation can run without repeating the training cells.
adata = sc.read_h5ad(input_file)

In [None]:
# Number of cells assigned to each predicted scANVI label.
adata.obs['C_scANVI'].value_counts()

In [None]:
# Number of cells per existing `Integrated_05` annotation, for comparison with the predictions.
adata.obs['Integrated_05'].value_counts()

In [None]:
#Validation of the clusters
# Contingency table: rows are the annotated `Integrated_05` labels, columns the scANVI predictions.
df = adata.obs.groupby(['Integrated_05', "C_scANVI"]).size().unstack(fill_value=0)
# Scale each column by its total so the values within one predicted label sum to 1.
column_totals = df.sum(axis=0)
norm_df = df.div(column_totals, axis=1)

# Heatmap of the normalised table, with ticks centred on the cells.
fig, ax = plt.subplots(figsize=(8, 8))
ax.pcolor(norm_df)
ax.set_xticks(np.arange(0.5, len(df.columns), 1))
ax.set_xticklabels(df.columns, rotation=90)
ax.set_yticks(np.arange(0.5, len(df.index), 1))
ax.set_yticklabels(df.index)
ax.set_xlabel("Predicted")
ax.set_ylabel("Observed")

In [None]:
#Calculate accuracy
# Compare the label *strings*, not `.cat.codes`: category codes are positions inside each column's own
# category list, so equal codes in two different categorical columns do not mean equal labels.
# The ground truth is `Integrated_05`, the annotation that seeded scANVI and that the confusion matrix
# above uses; `CellType` holds a different annotation, as the obs table above shows.
# NOTE(review): this is computed over all cells, including reference cells whose labels were seen during
# training — restrict to `adata.obs['dataset'] == 'query'` if a held-out estimate is wanted.
observed_labels = adata.obs['Integrated_05'].astype(str).to_numpy()
predicted_labels = adata.obs['C_scANVI'].astype(str).to_numpy()
print(f"Acc: {np.mean(observed_labels == predicted_labels)}")