## Notebook for exploratory analysis of fetal gut stem cell scRNA-Seq data using `scVI`

- **Developed by:** Anna Maguza
- **Place:** Wuerzburg Institute of Systems Immunology
- **Creation date:** 24th June 2024
- **Last modified date:** 24th June 2024

### Load required modules

In [None]:
import scvi
import torch
import anndata
import warnings
import numpy as np
import scanpy as sc
import pandas as pd
import plotnine as p
from pywaffle import Waffle
import matplotlib.pyplot as plt

##### Set up working environment

In [None]:
# Scanpy session defaults: logging verbosity level 3, a printout of the
# installed package versions, and the default figure parameters.
sc.settings.verbosity = 3
sc.logging.print_versions()
sc.settings.set_figure_params(
    dpi = 180,
    dpi_save = 300,
    format = 'svg',
    color_map = 'magma_r',
    vector_friendly = True,
)

In [None]:
# Display whether PyTorch can see a CUDA device in this session.
torch.cuda.is_available()

In [None]:
# Ignore all Python warnings from here on.
warnings.simplefilter(action = 'ignore')
# Set the global scvi-tools seed.
scvi.settings.seed = 1712
# Inline figures: white face colour and the 'retina' figure format.
%config InlineBackend.print_figure_kwargs = {'facecolor' : "w"}
%config InlineBackend.figure_format = 'retina'
# Request 'medium' float32 matrix-multiplication precision from PyTorch.
torch.set_float32_matmul_precision('medium')

In [None]:
# Architecture keyword arguments (scArches-style settings).
# NOTE(review): not referenced by any other cell visible in this notebook.
arches_params = {
    "use_layer_norm": "both",
    "use_batch_norm": "none",
    "encode_covariates": True,
    "dropout_rate": 0.2,
    "n_layers": 3,
}

In [None]:
def X_is_raw(adata):
    """Return True when every per-column sum of ``adata.X`` is a whole number.

    The column sums are compared with their integer-truncated version; any
    fractional sum makes the check fail. This is a heuristic on the sums,
    not an element-wise test of the matrix.
    """
    column_totals = adata.X.sum(axis=0)
    truncated_totals = column_totals.astype(int)
    return np.array_equal(truncated_totals, column_totals)

### Read in datasets

- Read in formatted object

In [None]:
# Load the filtered E-MTAB-9536 AnnData object from disk and display its summary.
adata = sc.read_h5ad('/mnt/LaCIE/annaM/gut_project/Processed_data/Gut_data/Fetal_all_cells/E-MTAB-9536_raw_velocity_anndata_filtered.h5ad')
adata

- Check if data is raw

In [None]:
# Heuristic check that adata.X holds integer counts (per-gene sums are whole numbers).
X_is_raw(adata)

- Save raw counts

In [None]:
# Keep an untouched copy of the loaded object; its velocity layers are re-attached before export.
adata_copy = adata.copy()

In [None]:
# Store the current (unfiltered-gene) state in .raw before any gene filtering or HVG subsetting.
adata.raw = adata

In [None]:
# Display the object summary after setting .raw.
adata

- Perform basic filtering

In [None]:
# Drop genes with no counts at all, then genes detected in fewer than 3 cells.
sc.pp.filter_genes(adata, min_counts = 1)
sc.pp.filter_genes(adata, min_cells = 3)

# Cell-level filter: the min_genes threshold is disabled; only cells with
# fewer than 3 total counts are removed.
#sc.pp.filter_cells(adata, min_genes = 50)
sc.pp.filter_cells(adata, min_counts = 3)

### Calculate HVGs

In [None]:
# Keep a copy of the current X matrix as the 'counts' layer used by HVG selection and scVI.
adata.layers['counts'] = adata.X.copy()

In [None]:
# Select the 5000 most variable genes from the 'counts' layer and keep only
# those genes in adata, then display the resulting object.
sc.pp.highly_variable_genes(
    adata,
    flavor = "seurat_v3",  # this flavor expects count data, hence layer = "counts"
    n_top_genes = 5000,
    layer = "counts",
    batch_key = "Library_Preparation_Protocol",  # library preparation protocol is treated as the batch
    subset = True,  # subsets adata in place to the selected genes
    span = 1  # loess span for the seurat_v3 variance fit (scanpy's default is 0.3)
)
adata

### Data integration with `scVI`

In [None]:
# Register adata with scVI: model the 'counts' layer, use Sample_ID as a
# categorical covariate, and register 'Cell Type' as the label column.
scvi.model.SCVI.setup_anndata(
    adata,
    labels_key = "Cell Type",
    categorical_covariate_keys = ["Sample_ID"],
    layer = "counts",
)

In [None]:
# Build the scVI model: 50 latent dimensions, 3 layers, negative-binomial
# likelihood.
# NOTE(review): dispersion is 'gene-batch' but no batch_key is registered in
# setup_anndata in this notebook — confirm this is intended.
scvi_model = scvi.model.SCVI(
    adata,
    gene_likelihood = 'nb',
    dispersion = 'gene-batch',
    n_layers = 3,
    n_latent = 50,
)

In [None]:
# Train for 100 epochs, validating after every epoch so that the train and
# validation ELBO curves can be compared afterwards.
# Use the first GPU when CUDA is available; otherwise fall back to CPU so the
# notebook still runs on machines without a GPU instead of failing here.
use_gpu = torch.cuda.is_available()
scvi_model.train(
    100,
    check_val_every_n_epoch = 1,
    enable_progress_bar = True,
    accelerator = "gpu" if use_gpu else "cpu",
    devices = [0] if use_gpu else "auto",
)

In [None]:
# Store the scVI latent representation of every cell under obsm['X_scvi'].
adata.obsm['X_scvi'] = scvi_model.get_latent_representation()

#### Evaluate model performance using the [_Svensson_](https://www.nxn.se/valent/2023/8/10/training-scvi-posterior-predictive-distributions-over-epochs) method

In [None]:
# Gather the per-epoch training and validation ELBO into one long-format
# table with columns 'epoch', 'variable' and 'value'.
elbo_train = scvi_model.history['elbo_train'].astype(float)
elbo_validation = scvi_model.history['elbo_validation'].astype(float)
history_df = elbo_train.join(elbo_validation).reset_index().melt(id_vars = ['epoch'])

p.options.figure_size = (12, 6)

# Draw both curves (train in black, validation in red), skipping epoch 0.
p_ = p.ggplot(p.aes(x = 'epoch', y = 'value', color = 'variable'), history_df.query('epoch > 0'))
p_ = p_ + p.geom_line()
p_ = p_ + p.geom_point()
p_ = p_ + p.scale_color_manual({'elbo_train': 'black', 'elbo_validation': 'red'})
p_ = p_ + p.theme_minimal()

# NOTE(review): depending on the plotnine version, print() may only show a
# text representation — confirm the figure renders.
print(p_)

In [None]:
# Build the neighbour graph on the scVI latent space, then embed it with UMAP
# using a fixed random state.
sc.pp.neighbors(
    adata,
    n_neighbors = 50,
    metric = 'minkowski',
    use_rep = "X_scvi",
)
sc.tl.umap(
    adata,
    random_state = 1712,
    spread = 1,
    min_dist = 0.3,
)

In [None]:
# UMAP coloured by cell type and sample/donor metadata, four panels per row.
sc.set_figure_params(dpi = 300, figsize = (5, 5))
sc.pl.umap(
    adata,
    color = [
        'Cell Type', 'Donor_ID', 'Diagnosis', 'Age', 'Region code',
        'Fraction', 'Sex', 'Library_Preparation_Protocol', 'Age_group',
        'Location',
    ],
    ncols = 4,
    size = 3,
    legend_fontsize = 5,
    frameon = False,
    color_map = 'magma_r',
)

In [None]:
# UMAP coloured by per-cell QC metrics.
sc.set_figure_params(dpi = 300, figsize = (10, 10))
sc.pl.umap(
    adata,
    color = ['n_genes_by_counts', 'total_counts', 'pct_counts_mt', 'pct_counts_ribo'],
    ncols = 4,
    size = 3,
    legend_fontsize = 5,
    frameon = False,
    color_map = 'magma_r',
)

In [None]:
# UMAP coloured by the 'Cell States' annotation.
sc.set_figure_params(dpi = 300, figsize = (5, 5))
sc.pl.umap(
    adata,
    color = ['Cell States'],
    ncols = 4,
    size = 5,
    legend_fontsize = 5,
    frameon = False,
    color_map = 'magma_r',
)

+ Export data

In [None]:
# Display the HVG-subset object before rebuilding it from .raw.
adata

In [None]:
# Rebuild the object from .raw, which was stored before gene filtering and
# HVG subsetting, so the exported object carries the full gene set again.
# NOTE(review): the 'counts' layer is not expected to survive this step — confirm in the summary below.
adata = adata.raw.to_adata()
adata

In [None]:
# Display the untouched copy whose velocity layers are merged back in the next cell.
adata_copy

In [None]:
# Merge the velocity layers from adata_copy into adata.
# adata was rebuilt from .raw and only contains the cells that survived
# filter_cells, while adata_copy still holds every cell that was loaded.
# Align the copy to adata's cells and genes by name before copying, so the
# layer shapes always match and rows/columns cannot be silently misaligned.
# NOTE(review): assumes cell and gene names are unique — confirm.
aligned_copy = adata_copy[adata.obs_names, adata.var_names]
for layer_name in ('ambiguous', 'spliced', 'unspliced'):
    adata.layers[layer_name] = aligned_copy.layers[layer_name].copy()

In [None]:
# Write the integrated object (raw X, scVI latent space, UMAP, velocity layers) to disk.
# NOTE(review): this is the same path the notebook reads from in the first
# data cell, so the input file is overwritten — confirm this is intended.
adata.write_h5ad('/mnt/LaCIE/annaM/gut_project/Processed_data/Gut_data/Fetal_all_cells/E-MTAB-9536_raw_velocity_anndata_filtered.h5ad')