## Notebook for integration and batch correction of the Joanito cancer epithelial cell data with `scVI`

- **Developed by**: Anna Maguza
- **Institute of Computational Biology - Computational Health Centre - Helmholtz Munich**
- 2nd June 2023

### Load required modules

In [None]:
import sys
import scvi
import torch
import anndata
import matplotlib
import matplotlib.pyplot as plt
from matplotlib.colors import LinearSegmentedColormap
import seaborn as sns

import numpy as np
import scipy as sp
import pandas as pd
import scanpy as sc
import numpy.random as random


from umap import UMAP
import warnings; warnings.simplefilter('ignore')

import matplotlib.pyplot as plt

In [None]:
# Render figures inline in the notebook output.
%matplotlib inline
# Font type 42 embeds TrueType fonts in PDF/PS exports, so text stays
# editable in vector-graphics editors.
matplotlib.rcParams["pdf.fonttype"] = 42
matplotlib.rcParams["ps.fonttype"] = 42

In [None]:
# Show whether PyTorch can use a CUDA device in this session (cell output is the boolean).
torch.cuda.is_available()

In [None]:
# Select PyTorch's 'medium' float32 matmul precision mode, which per the
# PyTorch documentation allows lower internal precision in exchange for speed.
torch.set_float32_matmul_precision('medium')

In [None]:
# Raise scanpy's logging verbosity to level 3 and print the versions of the
# installed packages so the run environment is recorded in the notebook.
sc.settings.verbosity = 3
sc.logging.print_versions()

# Global figure defaults: on-screen and saved resolution, colour map,
# vector-friendly rendering and SVG as the save format.
sc.settings.set_figure_params(
    dpi=180,
    dpi_save=300,
    color_map='magma_r',
    vector_friendly=True,
    format='svg',
)

In [None]:
# Architecture keyword arguments grouped in one dict.
# NOTE(review): the name suggests these are meant for scArches-style
# reference mapping; no visible cell passes this dict to a model — confirm
# whether it is still needed.
arches_params = {
    "use_layer_norm": "both",
    "use_batch_norm": "none",
    "encode_covariates": True,
    "dropout_rate": 0.2,
    "n_layers": 2,
}

In [None]:
def X_is_raw(adata):
    """Return True when every value in ``adata.X`` is integer-valued.

    This is a sanity check that ``adata.X`` holds raw counts rather than
    normalised or log-transformed values.  Each stored value is inspected
    individually: comparing only per-gene column sums can report ``True``
    for non-integer data whose fractions happen to cancel out (for example
    two cells with 0.5 each sum to 1).

    Parameters
    ----------
    adata
        Object exposing the expression matrix as ``.X`` (dense array-like or
        scipy sparse matrix).

    Returns
    -------
    bool
        ``True`` if all entries are whole numbers, ``False`` otherwise.
    """
    # Local import keeps the function self-contained regardless of how scipy
    # was imported at the top of the notebook.
    from scipy.sparse import issparse

    X = adata.X
    if issparse(X):
        # Implicit zeros are integers by definition, so only the explicitly
        # stored values need checking.
        values = X.tocsr().data
    else:
        values = np.asarray(X)

    # Boolean / integer dtypes cannot hold fractional values.
    if values.dtype.kind in "biu":
        return True

    # NaN and inf yield a NaN remainder, which compares unequal to 0 and is
    # therefore (correctly) reported as "not raw".
    return bool(np.all(np.mod(values, 1) == 0))

### Read in datasets

In [None]:
# Location of the epithelial-cell AnnData file with predicted annotations
# (the file name indicates it holds raw counts; this is checked in the next cell).
# Named `input_path` rather than `input` so the Python builtin `input()` is not shadowed.
input_path = '/Users/anna.maguza/Desktop/Data/Processed_datasets/Cancer_dataset_integration/Predicted_cancer_labels/Joanito/Joanito_epithelial_cells_with_predicted_annotations_raw.h5ad'
adata = sc.read(input_path)

In [None]:
# Display whether adata.X passes the raw-count check defined above.
# The result is only shown, not enforced.
X_is_raw(adata)

In [None]:
# Keep the full, unfiltered data reachable through adata.raw before the
# object is subset to highly variable genes in the next cell.
adata.raw = adata

In [None]:
# Keep a copy of the current matrix in a dedicated layer so the HVG
# selection below reads from it by name.
adata.layers['counts'] = adata.X.copy()

# Select the top 5000 highly variable genes with the "seurat_v3" flavour,
# computed per library-preparation protocol, and subset `adata` to them
# in place (subset=True).
# NOTE(review): the batch key here is "Library_Preparation_Protocol" while
# scVI is later set up with 'Sample_ID' — confirm this difference is intended.
sc.pp.highly_variable_genes(
    adata,
    layer="counts",
    flavor="seurat_v3",
    n_top_genes=5000,
    batch_key="Library_Preparation_Protocol",
    span=1,
    subset=True,
)

In [None]:
# Register the AnnData object with scVI.
#   layer      - read counts explicitly from the "counts" layer created before
#                HVG selection, so the model input does not depend on whatever
#                adata.X holds at this point.
#   batch_key  - each sample is treated as its own batch.
#   labels_key - cell-state annotation column registered as labels.
scvi.model.SCVI.setup_anndata(
    adata,
    layer = "counts",
    batch_key = 'Sample_ID',
    labels_key = "Unified Cell States",
)

In [None]:
# Build the scVI model on the registered AnnData: 50 latent dimensions,
# 3 layers, negative-binomial gene likelihood and a dispersion
# parameterised per gene and batch.
# NOTE(review): `arches_params` defined earlier is not passed here — confirm
# that this is deliberate.
scvi_model = scvi.model.SCVI(
    adata,
    n_latent=50,
    n_layers=3,
    dispersion='gene-batch',
    gene_likelihood='nb',
)

In [None]:
# Train the model with the library's default training settings.
# NOTE(review): no random seed is set in the visible cells, so results may
# differ between runs — confirm whether reproducibility is required.
scvi_model.train()

In [None]:
# Store the model's latent representation under obsm["X_scVI"]; the
# neighbour graph below is built from this key.
adata.obsm["X_scVI"] = scvi_model.get_latent_representation()

In [None]:
# Display the main matrix (restricted to the selected HVGs by the subset=True call above).
adata.X

In [None]:
# Display the matrix kept under .raw, which was assigned before HVG selection.
adata.raw.X

### UMAP calculation

In [None]:
# Build the neighbour graph on the scVI latent space (50 neighbours per
# cell, Minkowski distance).
sc.pp.neighbors(
    adata,
    n_neighbors=50,
    use_rep="X_scVI",
    metric='minkowski',
)

In [None]:
# Compute the UMAP embedding from the neighbour graph; the fixed
# random_state pins the seed passed to the embedding step.
sc.tl.umap(
    adata,
    min_dist=0.4,
    spread=4,
    random_state=1712,
)

In [None]:
# Display a summary of the AnnData object after integration and embedding.
adata

In [None]:
# Plot the UMAP once per metadata column (three panels per row) to inspect
# how cell states and technical/clinical covariates distribute after
# integration.  Panel order follows the list order.
sc.pl.umap(
    adata,
    color=[
        'Unified Cell States',
        'Library_Preparation_Protocol',
        'Location',
        'Side',
        'Group Stage',
        'Stage TNM',
        'Gender',
        'Tumor Stage',
        'Donor_ID',
        'MSS/MSI',
        'Sample_ID',
    ],
    frameon=False,
    size=1,
    legend_fontsize=5,
    ncols=3,
)

In [None]:
# Save the integrated object to disk as .h5ad. At this point it carries the
# HVG-subset data plus the scVI latent representation, neighbour graph and
# UMAP computed in the cells above.
adata.write('/Users/anna.maguza/Desktop/Data/Processed_datasets/Cancer_dataset_integration/Predicted_cancer_labels/Joanito/Joanito_epithelial_cells_with_predicted_annotations_after_scVI.h5ad')