### Notebook for the generation of an annotated manifold with `scANVI` 

- **Developed by:** Carlos Talavera-López Ph.D
- **Institute of Systems Immunology (WüSI) - JMU-Würzburg**
- v230608

### Import required modules

In [None]:
import torch
import scvi
import anndata
import warnings
import numpy as np
import scanpy as sc
import pandas as pd
import matplotlib.pyplot as plt
from scib_metrics.benchmark import Benchmarker

### Set up working environment

In [None]:
# Set scanpy's verbosity to 3 and print the versions of the loaded packages.
sc.settings.verbosity = 3
sc.logging.print_versions()

# Default figure parameters applied to the scanpy plots in this notebook.
sc.settings.set_figure_params(
    dpi = 180,
    dpi_save = 300,
    color_map = 'magma_r',
    format = 'svg',
    vector_friendly = True,
)

In [None]:
# Ignore every Python warning from this point on.
warnings.simplefilter(action = 'ignore')
# Set the scvi-tools global seed.
scvi.settings.seed = 1712
# IPython magics: white figure background and 'retina' inline figure format.
%config InlineBackend.print_figure_kwargs = {'facecolor' : "w"}
%config InlineBackend.figure_format = 'retina'
# Set PyTorch's float32 matrix-multiplication precision to 'medium'.
torch.set_float32_matmul_precision('medium')

In [None]:
# Model-architecture keyword arguments gathered in one mapping.
# NOTE(review): no cell visible in this notebook passes `arches_params` to a
# model constructor - confirm whether it is still needed.
arches_params = {
    "use_layer_norm": "both",
    "use_batch_norm": "none",
    "encode_covariates": True,
    "dropout_rate": 0.2,
    "n_layers": 3,
}

### Read in Healthy data

In [None]:
# Load the input AnnData object from disk and display its summary.
epith_all = sc.read_h5ad(
    '../../../data/Marburg_All_ctl230404_leiden_states.raw.h5ad'
)
epith_all

In [None]:
# Show the category names available in obs['leiden_states'].
epith_all.obs['leiden_states'].cat.categories

In [None]:
# Keep only the cells whose `leiden_states` label is one of the selected
# 'Mixed_*' states.
# `.copy()` turns the subset into an independent AnnData object. Plain indexing
# returns a view, and the `.obs` assignments in the following cells would
# otherwise rely on anndata silently converting that view into a copy (the
# accompanying ImplicitModificationWarning is suppressed in this notebook).
epith_Mix = epith_all[
    epith_all.obs['leiden_states'].isin(['Mixed_11', 'Mixed_12', 'Mixed_13', 'Mixed_16', 'Mixed_17'])
].copy()
epith_Mix

In [None]:
# Initialise the seed labels as a copy of the `leiden_states` annotation.
epith_Mix.obs['seed_labels'] = epith_Mix.obs['leiden_states'].copy()
# Number of cells per seed label.
epith_Mix.obs['seed_labels'].value_counts()

### Relabel cells for annotation

In [None]:
# Boolean mask selecting every cell whose `group` is not 'healthy_ctrl'.
non_healthy_ctrl_indices = epith_Mix.obs['group'] != 'healthy_ctrl'
# 'Unknown' must be registered as a category before it can be assigned.
epith_Mix.obs['seed_labels'] = epith_Mix.obs['seed_labels'].cat.add_categories('Unknown')
# Cells outside the healthy control group lose their seed label.
epith_Mix.obs.loc[non_healthy_ctrl_indices, 'seed_labels'] = 'Unknown'
# Number of cells per seed label after relabelling.
epith_Mix.obs['seed_labels'].value_counts()

### Recalculate IAV-score

In [None]:
# Total counts against number of genes per cell, coloured by `group`.
sc.pl.scatter(
    epith_Mix,
    x = 'total_counts',
    y = 'n_genes',
    color = "group",
    frameon = False,
)

In [None]:
# Variables whose name starts with 'NC_'; they form the gene set scored as
# 'Viral_score' further down.
nc_genes = epith_Mix.var_names[epith_Mix.var_names.str.startswith('NC_')]
print(nc_genes)

In [None]:
# Work on a separate copy so that normalisation does not touch `epith_Mix`.
adata_log = epith_Mix.copy()

In [None]:
# Normalise every cell to a total of 1e6 (highly expressed genes excluded from
# the size-factor computation), then log1p-transform.
sc.pp.normalize_total(
    adata_log,
    target_sum = 1e6,
    exclude_highly_expressed = True,
)
sc.pp.log1p(adata_log)

# Score the 'NC_' gene set per cell; the result is stored in obs['Viral_score'].
sc.tl.score_genes(
    adata_log,
    nc_genes,
    score_name = 'Viral_score',
)

In [None]:
# Bring the score computed on the normalised copy back to the unnormalised object.
epith_Mix.obs['Viral_score'] = adata_log.obs['Viral_score'].copy()
# `adata` is the working object for HVG selection and model training below.
adata = epith_Mix.copy()

### Select HVGs

In [None]:
# Keep an untouched, all-genes copy for the export at the end of the notebook.
adata_raw = epith_Mix.copy()

# Preserve the current matrix in a 'counts' layer before subsetting genes.
adata.layers['counts'] = adata.X.copy()

# Select the top 7000 highly variable genes per `donor` batch (seurat_v3
# flavour, computed on the 'counts' layer) and subset `adata` to them in place.
sc.pp.highly_variable_genes(
    adata,
    layer = "counts",
    flavor = "seurat_v3",
    batch_key = "donor",
    n_top_genes = 7000,
    subset = True,
)
adata

### Train the base `scVI` model for annotation transfer

In [None]:
scvi.model.SCVI.setup_anndata(adata, categorical_covariate_keys = ["donor"], labels_key = "seed_labels", layer = 'counts')

In [None]:
scvi_model = scvi.model.SCVI(adata, n_latent = 50, n_layers = 3, dispersion = 'gene-batch', gene_likelihood = 'nb')

In [None]:
scvi_model.train()

### Label transfer with `scANVI` 

In [None]:
scanvi_model = scvi.model.SCANVI.from_scvi_model(scvi_model, 'Unknown')

In [None]:
scanvi_model.train()

In [None]:
adata.obs["C_scANVI_v2"] = scanvi_model.predict(adata)

- Extract latent representation

In [None]:
# Store the scANVI latent representation of every cell under obsm['X_scANVI'].
adata.obsm["X_scANVI"] = scanvi_model.get_latent_representation(adata)

- Visualise corrected dataset

In [None]:
# Neighbourhood graph on the scANVI latent space, then a seeded UMAP embedding.
sc.pp.neighbors(
    adata,
    use_rep = "X_scANVI",
    n_neighbors = 50,
    metric = 'minkowski',
)
sc.tl.umap(
    adata,
    min_dist = 0.4,
    spread = 4,
    random_state = 1712,
)

# Overview of the categorical annotations on the new manifold.
sc.pl.umap(
    adata,
    color = [
        'group',
        'disease',
        'infection',
        'C_scANVI',
        'C_scANVI_v2',
        'seed_labels',
        'donor',
        'bd_rhapsody',
        'illumina_stimunr',
    ],
    ncols = 4,
    size = 2,
    legend_fontsize = 5,
    frameon = False,
)

In [None]:
# QC metrics, technical covariates and viral scores on the same UMAP.
sc.pl.umap(
    adata,
    color = [
        'SMK',
        'n_genes',
        'doublet_scores',
        'batch',
        'n_genes_by_counts',
        'total_counts',
        'total_counts_mt',
        'pct_counts_mt',
        'total_counts_ribo',
        'pct_counts_ribo',
        'n_counts',
        'sample_group',
        'IAV_score',
        'Viral_score',
    ],
    ncols = 4,
    size = 2,
    legend_fontsize = 5,
    cmap = 'plasma',
    frameon = False,
)

### Visualise individual genes using new manifold 

In [None]:
# Copy the embeddings onto the log-normalised object so that gene expression
# can be drawn on the new manifold.
# NOTE(review): 'X_scVI' is not written by any cell visible in this notebook;
# presumably it is inherited from the input .h5ad - confirm it is not stale.
for _obsm_key in ('X_scVI', 'X_umap', 'X_scANVI'):
    adata_log.obsm[_obsm_key] = adata.obsm[_obsm_key].copy()

sc.pl.umap(
    adata_log,
    color = [
        'C_scANVI',
        'C_scANVI_v2',
        'Viral_score',
        'NC_026431.1',
        'NC_026432.1',
        'NC_026433.1',
        'NC_026434.1',
        'NC_026435.1',
        'NC_026436.1',
        'NC_026437.1',
        'NC_026438.1',
    ],
    ncols = 4,
    size = 2,
    legend_fontsize = 5,
    cmap = 'plasma',
    frameon = False,
)

### Compute integration metrics

In [None]:
# Benchmark the three embeddings for batch mixing across `donor` and for
# conservation of the scANVI-predicted labels.
# `batch_key` is passed as a plain column name: the Benchmarker API documents
# it as a single string, not a list of keys.
bm = Benchmarker(
    adata,
    batch_key = "donor",
    label_key = "C_scANVI_v2",
    embedding_obsm_keys = ["X_pca", "X_scVI", "X_scANVI"],
    n_jobs = -1,  # use all available cores
)
bm.benchmark()

In [None]:
# Draw the benchmark results table with min-max scaling turned off.
bm.plot_results_table(min_max_scale = False)

### Export annotated sample object 

In [None]:
# Shorten every cell name in `adata` to its first three '-'-separated fields.
adata.obs.index = pd.Index(
    ['-'.join(barcode.split('-')[:3]) for barcode in adata.obs.index]
)
adata.obs.index

In [None]:
# Shorten every cell name in `adata_raw` to its first three '-'-separated fields.
adata_raw.obs.index = pd.Index(
    ['-'.join(barcode.split('-')[:3]) for barcode in adata_raw.obs.index]
)
adata_raw.obs.index

In [None]:
# Cell names after truncation.
adata.obs_names

In [None]:
# Categories present in the predicted annotation.
adata.obs['C_scANVI_v2'].cat.categories

In [None]:
# Number of cells per predicted label.
adata.obs['C_scANVI_v2'].value_counts()

### Export annotated object with raw counts

In [None]:
# Summary of the HVG-subset, annotated object.
adata

In [None]:
# Summary of the all-genes copy taken before HVG selection.
adata_raw

In [None]:
# Combine the all-genes matrix and variable table from `adata_raw` with the
# annotated observations from `adata`. Both objects are copies of `epith_Mix`
# and had their cell names truncated in the same way above.
adata_export = anndata.AnnData(
    X = adata_raw.X,
    obs = adata.obs,
    var = adata_raw.var,
)

# Carry the embeddings over to the export object.
for _obsm_key in ('X_scVI', 'X_umap', 'X_scANVI'):
    adata_export.obsm[_obsm_key] = adata.obsm[_obsm_key].copy()

adata_export