## Notebook for healthy reference epithelial cell data integration and batch correction with `scVI`

- **Developed by**: Anna Maguza
- **Institute of Computational Biology - Computational Health Centre - Helmholtz Munich**
- 5th June 2023

### Load required modules

In [None]:
import sys
import scvi
import torch
import anndata
import matplotlib
import matplotlib.pyplot as plt
from matplotlib.colors import LinearSegmentedColormap
import seaborn as sns

import numpy as np
import scipy as sp
import pandas as pd
import scanpy as sc
import numpy.random as random


from umap import UMAP
import warnings; warnings.simplefilter('ignore')

import matplotlib.pyplot as plt

In [None]:
%matplotlib inline
matplotlib.rcParams["pdf.fonttype"] = 42
matplotlib.rcParams["ps.fonttype"] = 42

In [None]:
# Display whether PyTorch can see a CUDA device (value is shown as the cell output).
torch.cuda.is_available()

In [None]:
# Select the 'medium' internal precision mode for float32 matrix multiplications
# (see the PyTorch documentation for the speed/precision trade-off of each mode).
torch.set_float32_matmul_precision('medium')

In [None]:
# Scanpy logging level and a printout of package versions for the record.
sc.settings.verbosity = 3
sc.logging.print_versions()

# Global figure defaults: 180 dpi on screen, 300 dpi when saved, SVG output.
sc.settings.set_figure_params(
    dpi=180,
    dpi_save=300,
    color_map='magma_r',
    vector_friendly=True,
    format='svg',
)

In [None]:
# Model architecture keyword arguments (presumably intended for an
# scArches-compatible scVI model -- TODO confirm).
# NOTE(review): no later cell in this notebook references `arches_params`.
arches_params = {
    "use_layer_norm": "both",
    "use_batch_norm": "none",
    "encode_covariates": True,
    "dropout_rate": 0.2,
    "n_layers": 2,
}

In [None]:
def X_is_raw(adata):
    """Heuristically check whether ``adata.X`` looks like integer counts.

    The per-column sums of ``adata.X`` (``sum(axis=0)``) are compared with
    their integer-truncated version; the function returns ``True`` only when
    every column sum is unchanged by the truncation.

    Note that this inspects column sums, not individual entries.
    """
    column_totals = adata.X.sum(axis=0)
    truncated_totals = column_totals.astype(int)
    return np.array_equal(truncated_totals, column_totals)

### Read in datasets

In [None]:
# Location of the integrated healthy dataset to load.
# Named `input_path` (not `input`) so the builtin `input()` is not shadowed
# for the rest of the notebook session.
input_path = '/Users/anna.maguza/Desktop/Data/Processed_datasets/Cancer_dataset_integration/input_files/all_cells/Healthy_integrated_data_all_genes.h5ad'
Healthy_adata = sc.read_h5ad(input_path)

### Preprocess datasets

In [None]:
# Drop fetal and pediatric samples and keep only epithelial cells.
# The three row filters are combined into one boolean mask.
excluded_diagnoses = ['Fetal Healthy', 'Pediatric healthy']
keep_cells = (
    ~Healthy_adata.obs['Diagnosis'].isin(excluded_diagnoses)
    & (Healthy_adata.obs['Cell Type'] == 'Epithelial')
)
# Take an explicit copy so the new obs column below is written to a real
# AnnData object rather than to a view of the full dataset.
Healthy_adata = Healthy_adata[keep_cells, :].copy()

# Map study-specific cell-state labels onto one shared vocabulary.
# Labels that are not listed here are kept unchanged.
cell_state_mapping = {
    'Enterocytes TMIGD1 MEP1A': 'Enterocyte',
    'Enterocytes CA1 CA2 CA4-': 'Enterocyte',
    'Enterocytes TMIGD1 MEP1A GSTA1': 'Enterocyte',
    'Enterocytes BEST4': 'Enterocyte',
    'BEST4+ epithelial': 'Enterocyte',
    'Stem_Cells_GCA': 'Stem cells OLFM4',
    'Stem_Cells_ext': 'Stem cells OLFM4',
    'Tuft': 'Tuft cells',
    'Paneth': 'Paneth cells',
    'Epithelial Cycling cells': 'TA',
    'Goblet cells SPINK4': 'Goblet cells',
    'Goblet cell': 'Goblet cells',
    'Goblet cells MUC2 TFF1-': 'Goblet cells',
    'Goblet cells MUC2 TFF1': 'Goblet cells',
    'EC cells (TAC1+)': 'Enterochromaffin cells',
    'EECs': 'Enteroendocrine cells',
    'K cells (GIP+)': 'Enteroendocrine cells',
    'M/X cells (MLN/GHRL+)': 'Enteroendocrine cells',
    'Progenitor (NEUROG3+)': 'Enteroendocrine cells',
    'D cells (SST+)': 'Enteroendocrine cells',
    'I cells (CCK+)': 'Enteroendocrine cells',
    'N cells (NTS+)': 'Enteroendocrine cells',
    'L cells (PYY+)': 'L cells',
}

# Build the harmonised column and assign it back in one step. The previous
# `obs[col].replace(..., inplace=True)` was a chained in-place call on a
# column selection, which does not update `obs` under pandas Copy-on-Write.
# Going through object dtype avoids the deprecated many-to-one `replace` on
# categorical columns; the result is stored as a categorical.
Healthy_adata.obs['Unified Cell States'] = (
    Healthy_adata.obs['Cell States']
    .astype(object)
    .replace(cell_state_mapping)
    .astype('category')
)

In [None]:
# Snapshot the current object into `.raw` before the gene subsetting done below,
# so the full gene set remains accessible afterwards.
Healthy_adata.raw = Healthy_adata

In [None]:
# Keep a copy of the current X matrix in a dedicated "counts" layer.
Healthy_adata.layers['counts'] = Healthy_adata.X.copy()

# Select 5000 highly variable genes (Seurat v3 flavour) from the "counts"
# layer, using the library-preparation protocol as the batch key, and subset
# the object to the selected genes.
sc.pp.highly_variable_genes(
    Healthy_adata,
    layer='counts',
    flavor='seurat_v3',
    n_top_genes=5000,
    batch_key='Library_Preparation_Protocol',
    span=1,
    subset=True,
)

### Run scVI

In [None]:
# Register the AnnData object with scVI: samples act as batches and the
# unified cell states as labels.
scvi.model.SCVI.setup_anndata(
    Healthy_adata,
    batch_key='Sample_ID',
    labels_key='Unified Cell States',
)

In [None]:
# Build the scVI model: 50 latent dimensions, n_layers=3, 'nb' gene
# likelihood and 'gene-batch' dispersion.
# NOTE(review): `arches_params` defined earlier in this notebook is not passed
# here -- confirm whether that is intended.
scvi_model = scvi.model.SCVI(
    Healthy_adata,
    n_latent=50,
    n_layers=3,
    gene_likelihood='nb',
    dispersion='gene-batch',
)

In [None]:
# Train the scVI model with the library's default training settings.
scvi_model.train()

In [None]:
# Store the model's latent representation in `.obsm` under "X_scVI";
# the neighbour graph below is built on this embedding.
Healthy_adata.obsm["X_scVI"] = scvi_model.get_latent_representation()

### UMAP calculation

In [None]:
# Build the neighbour graph on the scVI latent space (50 neighbours,
# Minkowski metric).
sc.pp.neighbors(
    Healthy_adata,
    n_neighbors=50,
    use_rep='X_scVI',
    metric='minkowski',
)

In [None]:
# Compute the UMAP embedding from the neighbour graph; the fixed random
# state seeds the layout.
sc.tl.umap(
    Healthy_adata,
    min_dist=0.4,
    spread=4,
    random_state=1712,
)

In [None]:
# Display a summary of the AnnData object as the cell output.
Healthy_adata

In [None]:
# obs columns to colour the UMAP by, one panel per entry.
umap_color_keys = [
    'Library_Preparation_Protocol',
    'Study_name',
    'Unified Cell States',
    'Age',
    'Location',
    'Donor_ID',
    'Gender',
    'n_genes_by_counts',
    'total_counts',
    'pct_counts_mito',
    'pct_counts_ribo',
]
# Three panels per row, small points and a small legend font.
sc.pl.umap(
    Healthy_adata,
    color=umap_color_keys,
    frameon=False,
    size=1,
    legend_fontsize=5,
    ncols=3,
)

In [None]:
# Write the processed AnnData object to disk.
output_path = '/Users/anna.maguza/Desktop/Data/Processed_datasets/Cancer_dataset_integration/Healthy_epithelial_scVI/Healthy_epithelial_cells_with_predicted_annotations_after_scVI.h5ad'
Healthy_adata.write_h5ad(output_path)