## Notebook for exploratory analysis of _Cai Y et al 2020_ and _Cai Y et al 2022_ scRNA-Seq data using `scVI`

- **Developed by**: Carlos Talavera-López Ph.D

- **Modified by**: Mairi McClean
- **Institute of Computational Biology - Computational Health Centre - Helmholtz Munich**
- v221101; modified v230314

### Load required modules

In [None]:
import sys
import scvi
import torch
import anndata
import matplotlib
import matplotlib.pyplot as plt
from matplotlib.colors import LinearSegmentedColormap
import seaborn as sns

import numpy as np
import pandas as pd
import scanpy as sc
import numpy.random as random


from umap import UMAP
import warnings; warnings.simplefilter('ignore')

In [None]:
# Directory where notebook outputs are written.
# TODO: Should this be put on the server?
save_path = (
    "/Volumes/Lacie/data_lake/Mairi_example/notebook_output/paths/"
)

In [None]:
%matplotlib inline
matplotlib.rcParams["pdf.fonttype"] = 42
matplotlib.rcParams["ps.fonttype"] = 42

In [None]:
# Report whether a CUDA device is visible to PyTorch (displayed as the cell output).
torch.cuda.is_available()

In [None]:
# Verbose scanpy logging, plus a record of the package versions used for this run.
sc.settings.verbosity = 3
sc.logging.print_versions()

# Global figure defaults for every scanpy plot in this notebook.
sc.settings.set_figure_params(
    dpi=180,
    dpi_save=300,
    color_map='magma_r',
    vector_friendly=True,
    format='svg',
)

### Read in datasets

- Read in _Cai Y et al 2020_

In [None]:
# Cluster (lustre) location of the QC'd object, kept for reference:
# caiy2020 = sc.read_h5ad('/lustre/groups/talaveralopez/datasets/tuberculosis/rna-seq/mairi_data_lake/processed_files/abridged_qc/human/Cai2020_scRNA_PBMC_mm230306_qcd.h5ad')
# caiy2020

# Load the QC'd Cai et al. 2020 object from the local volume and display its summary.
caiy2020 = sc.read_h5ad(
    '/Volumes/Lacie/data_lake/Mairi_example/processed_files/abridged_qc/human/Cai2020_scRNA_PBMC_mm230315_qcd.h5ad'
)
caiy2020

- Read in _Cai Y et al 2022_

In [None]:
# Cluster (lustre) location of the QC'd object, kept for reference:
# caiy2022 = sc.read_h5ad('/lustre/groups/talaveralopez/datasets/tuberculosis/rna-seq/mairi_data_lake/processed_files/abridged_qc/human/Cai2022_scRNA_PBMC_mm230314_qcd.h5ad')
# caiy2022.obs['status'] = 'active_TB'
# caiy2022

# Load the QC'd Cai et al. 2022 object from the local volume.
caiy2022 = sc.read_h5ad(
    '/Volumes/Lacie/data_lake/Mairi_example/processed_files/abridged_qc/human/Cai2022_scRNA_PBMC_mm230315_qcd.h5ad'
)

# Label every cell of this dataset with the same `status` value.
caiy2022.obs['status'] = 'active_TB'
caiy2022

### Merge datasets

In [None]:
# Stack both studies into one object. `join='inner'` keeps only the genes
# present in both; the origin of each cell is recorded in obs['dataset'].
# NOTE(review): `AnnData.concatenate` is deprecated in recent anndata releases in
# favour of `anndata.concat` — consider migrating once outputs are confirmed equal.
caiy_tb = caiy2020.concatenate(
    caiy2022,
    join='inner',
    batch_key='dataset',
    batch_categories=['caiy2020', 'caiy2022'],
)
caiy_tb

### Check that anndata object only contains PBMC scRNA from healthy donors

In [None]:
# Display the per-cell metadata table of the merged object.
caiy_tb.obs

In [None]:
# Count cells per `data_type` value to see which assay types are present.
caiy_tb.obs['data_type'].value_counts()

In [None]:
# Count cells per `tissue` value to see which tissues are present.
caiy_tb.obs['tissue'].value_counts()

In [None]:
# caiy_healthy = 

### Calculate HVGs

In [None]:
# `caiy_tb_gex` is used here and again in the export section, but no cell in this
# notebook ever assigns it, so this cell failed with a NameError. The merged
# object created above is `caiy_tb`; bind the expected name to it.
# NOTE(review): if a gene-expression-only / tissue-specific subset was intended,
# apply that filter on this line instead (e.g. using the `data_type` / `tissue`
# columns inspected above) — confirm with the notebook author.
caiy_tb_gex = caiy_tb

# Work on an independent copy so HVG subsetting below does not alter the merged
# object, and keep an untouched copy of X in a dedicated layer.
# Assumes X holds raw counts at this point — TODO confirm.
adata = caiy_tb_gex.copy()
adata.layers['counts'] = adata.X.copy()

In [None]:
# Select the top 8000 highly variable genes from the `counts` layer, computed
# per `sample`. `subset=True` drops all other genes from `adata` in place.
sc.pp.highly_variable_genes(
    adata,
    layer="counts",
    flavor="seurat_v3",
    batch_key="sample",
    n_top_genes=8000,
    subset=True,
)

### Data integration with `scVI`

In [None]:
# Register `adata` with scVI: the model reads the `counts` layer, and receives
# `sample` as a categorical covariate plus `n_genes` / `n_counts` as continuous ones.
scvi.model.SCVI.setup_anndata(
    adata,
    layer="counts",
    continuous_covariate_keys=["n_genes", "n_counts"],
    categorical_covariate_keys=["sample"],
)

# Open question: why are the continuous covariate keys as such, and not 'percent mito'/'percent ribo'?


In [None]:
# Build the scVI model: 3 hidden layers, a 50-dimensional latent space and a
# negative-binomial gene likelihood. Parameters not listed keep their defaults.
model = scvi.model.SCVI(
    adata,
    n_latent=50,
    n_layers=3,
    dispersion='gene-batch',
    gene_likelihood="nb",
)
model

# Open questions:
# - Why have all of the parameters not been included?
# - What is the "dispersion" argument referring to specifically?
# NOTE(review): per the scvi-tools docs, 'gene-batch' lets the per-gene dispersion
# differ between batches; no `batch_key` is registered in `setup_anndata` in this
# notebook, so confirm this setting has the intended effect.


In [None]:
# Fit the model using the default training settings (no arguments are overridden).
model.train()

In [None]:
# Why was the model not saved here?

In [None]:
# Extract the latent embedding from the trained model and store it on `adata`
# under obsm['X_scVI'], where the neighbour graph step reads it.
latent = model.get_latent_representation()
adata.obsm["X_scVI"] = latent

In [None]:
# Neighbour graph on the scVI embedding rather than on expression values.
sc.pp.neighbors(
    adata,
    n_neighbors=50,
    use_rep="X_scVI",
    metric='minkowski',
)

# UMAP layout; the fixed seed keeps the embedding reproducible between runs.
sc.tl.umap(adata, random_state=1712, min_dist=0.2, spread=6)

# Overview panels: study design, QC and cell-cycle covariates on the UMAP.
sc.pl.umap(
    adata,
    color=[
        'study',
        'individual',
        'tissue',
        'status',
        'percent_chrY',
        'XIST-counts',
        'S_score',
        'G2M_score',
        'doublet_scores',
        'dataset',
        'sample',
    ],
    frameon=False,
    size=0.8,
    legend_fontsize=5,
    ncols=4,
)

In [None]:
# Expression of a panel of marker genes on the UMAP, four panels per row.
sc.pl.umap(
    adata,
    color=[
        'CD74',
        'CD3E',
        'CD40LG',
        'CD8A',
        'NKG7',
        'CLEC9A',
        'CD14',
        'FCGR3A',
    ],
    frameon=False,
    size=1,
    legend_fontsize=5,
    legend_loc='on data',
    ncols=4,
)

### Use `SCCAF` to select `leiden` resolution

> What is SCCAF? "Single Cell Clustering Assessment Framework"
>> It is a way to cluster cells based on gene expression. It repeatedly applies clustering and machine-learning models to gene expression profiles in order to identify distinct cell groups, together with a weighted list of feature genes for each group.

> What is the architecture/algorithm of the model?
>> ML [logistic regression, random forest, Gaussian process classification, support vector machine and decision tree] and 5-fold CV

In [None]:
# Leiden clustering on the neighbour graph; labels are written to obs['leiden'],
# which the SCCAF assessment reads. Seed fixed for reproducibility.
sc.tl.leiden(
    adata,
    random_state=1786,
    resolution=0.7,
)

In [None]:
from SCCAF import SCCAF_assessment, plot_roc
import matplotlib.pyplot as plt

# Assess how well the leiden labels can be predicted from the expression matrix.
# The returned probabilities, predictions, held-out labels, classifier,
# cross-validation score and accuracy feed the ROC plot.
(y_prob, y_pred, y_test, clf, cvsm, acc) = SCCAF_assessment(
    adata.X,
    adata.obs['leiden'],
    n=100,
)

In [None]:
# ROC curves for the SCCAF classifier, annotated with the CV score and accuracy.
plot_roc(
    y_prob,
    y_test,
    clf,
    acc=acc,
    cvsm=cvsm,
)
plt.show()

In [None]:
# Cluster labels next to disease status and CD74 expression.
sc.pl.umap(
    adata,
    color=['leiden', 'status', 'CD74'],
    frameon=False,
    size=0.8,
    legend_fontsize=5,
    legend_loc='on data',
)

In [None]:
# Cluster labels, metadata and a wider gene panel on the UMAP.
# NOTE(review): confirm this gene panel is the intended one for this dataset.
sc.pl.umap(
    adata,
    color=[
        'leiden',
        'status',
        'tissue',
        'ADH7',
        'CDH1',
        'CD74',
        'CD3E',
        'MUC20',
        'DUSP4',
        'FOXJ1',
        'MUC1',
        'FOXI1',
    ],
    frameon=False,
    size=1,
    legend_fontsize=5,
)

### Export clustered object

In [None]:
# Display the clustered, HVG-subset object (obs, obsm, obsp and uns now hold the results).
adata

In [None]:
# Display `caiy_tb_gex` for side-by-side comparison with `adata` before building the export object.
caiy_tb_gex

In [None]:
# Build a hybrid AnnData object for export from sections of both objects:
#   - X, var and layers come from `caiy_tb_gex`, because `adata` was reduced to
#     the highly variable genes (`subset=True` in the HVG step);
#   - obs, uns, obsm and obsp come from `adata`, which carries the scVI
#     embedding, neighbour graph, UMAP and leiden labels.
# Assumes both objects hold the same cells in the same order — TODO confirm.
adata_export = anndata.AnnData(
    # expression data and gene annotation
    X=caiy_tb_gex.X,
    var=caiy_tb_gex.var,
    layers=caiy_tb_gex.layers,
    # per-cell results of the analysis
    obs=adata.obs,
    obsm=adata.obsm,
    obsp=adata.obsp,
    uns=adata.uns,
)
adata_export