### Notebook for Kong_2023 data integration and batch correction with `scVI` and `scANVI`
- **Developed by**: Anna Maguza
- **Institute of Computational Biology - Computational Health Centre - Helmholtz Munich**
- Creation date: 4th July 2023
- Last modified date: 26th of May 2024

### Load required modules

In [None]:
import scvi
import torch
import anndata
import warnings
import numpy as np
import scanpy as sc
import pandas as pd
import plotnine as p
from pywaffle import Waffle
import matplotlib.pyplot as plt

In [None]:
# Report whether PyTorch can see a CUDA device (the training cells below request accelerator "gpu")
torch.cuda.is_available()

In [None]:
# Select PyTorch's 'medium' float32 matrix-multiplication precision mode
torch.set_float32_matmul_precision('medium')

In [None]:
# Verbose scanpy logging and a record of the package versions in use
sc.settings.verbosity = 3
sc.logging.print_versions()

# Notebook-wide figure defaults
sc.settings.set_figure_params(
    dpi = 180,
    dpi_save = 300,
    color_map = 'magma_r',
    format = 'svg',
    vector_friendly = True,
)

In [None]:
# Architecture keyword arguments kept in one place.
# NOTE(review): this dict is not passed to any model in the cells below — confirm whether it is still needed.
arches_params = {
    "use_layer_norm": "both",
    "use_batch_norm": "none",
    "encode_covariates": True,
    "dropout_rate": 0.2,
    "n_layers": 2,
}

In [None]:
def X_is_raw(adata):
    """Heuristic check that ``adata.X`` holds raw (integer) counts.

    The per-column sums of ``adata.X`` are compared with the same sums
    truncated to ``int``; the function returns True only when every column
    sum is already a whole number.
    """
    column_totals = adata.X.sum(axis=0)
    truncated_totals = column_totals.astype(int)
    return np.array_equal(truncated_totals, column_totals)

### Read in datasets

In [None]:
# Directory holding the Kong 2023 .h5ad inputs (also used for the integrated output written later)
input_dir = '/mnt/LaCIE/annaM/gut_project/raw_data/Kong_2023'
# Directory where the stem-cell figures are saved
fig_dir = '/mnt/LaCIE/annaM/gut_project/Processed_data/Gut_data/Plots/Finding_stem_cells'

In [None]:
# Load the healthy-donor object after QC (judging by the file name)
adata = sc.read_h5ad(f'{input_dir}/adata_Kong_2023_healthy_with_QC.h5ad')

In [None]:
# Load the raw Kong 2023 object; its X replaces adata.X in the next cell
adata_raw = sc.read_h5ad(f'{input_dir}/Kong_2023_raw_anndata.h5ad')

In [None]:
# Keep only the cells of adata_raw that are present in adata, in adata's cell order
adata_raw = adata_raw[adata.obs_names]

# Replace the expression matrix of adata with the raw counts from adata_raw
# NOTE(review): only cells are aligned here; this assumes adata_raw has the same genes
# in the same order as adata — confirm.
adata.X = adata_raw.X

# The raw object is no longer needed; drop the reference
del adata_raw

In [None]:
# Sanity check: column sums of adata.X should be whole numbers after the swap above
X_is_raw(adata)

In [None]:
# Store the current object in adata.raw before the HVG subsetting below;
# it is recovered later with adata.raw.to_adata()
adata.raw = adata

In [None]:
# List the obs column names. `obs_keys` is a method: without the call
# parentheses the cell only displays the bound method object.
adata.obs_keys()

In [None]:
# Number of cells per original 'cell_type' label
adata.obs['cell_type'].value_counts()

In [None]:
# Keep a copy of the current expression matrix as the 'counts' layer
adata.layers['counts'] = adata.X.copy()

# Select 5000 HVGs from the 'counts' layer, batched by library preparation
# protocol; subset = True restricts adata to the selected genes
hvg_settings = {
    'flavor': "seurat_v3",
    'n_top_genes': 5000,
    'layer': "counts",
    'batch_key': 'library_preparation_protocol__ontology_label',
    'subset': True,
    'span': 1,
}
sc.pp.highly_variable_genes(adata, **hvg_settings)

In [None]:
# Give the two annotation columns readable names in a single rename call
adata.obs.rename(
    columns = {
        'Celltype': 'Cell States',
        'cell_type': 'Cell Type',
    },
    inplace = True,
)

* Modify Cell Types column

In [None]:
# Cell States that are collapsed into the coarse 'Stem Cell' label
cell_states_list = [
    'Stem cells OLFM4 LGR5',
    'Stem cells OLFM4 PCNA',
    'Stem cells OLFM4 GSTA1',
    'Stem cells OLFM4'
]

# Register 'Stem Cell' as a category only when it is missing:
# `cat.add_categories` raises ValueError for an existing category,
# which made this cell fail when re-run.
if 'Stem Cell' not in adata.obs['Cell Type'].cat.categories:
    adata.obs['Cell Type'] = adata.obs['Cell Type'].cat.add_categories(['Stem Cell'])

# Update 'Cell Type' for every cell whose 'Cell States' is in the list
adata.obs.loc[adata.obs['Cell States'].isin(cell_states_list), 'Cell Type'] = 'Stem Cell'

In [None]:
# Cell States that are collapsed into the coarse 'B cells' label
cell_states_list = [
    'B cells',
    'B cells AICDA LRMP'
]

# Register 'B cells' as a category only when it is missing:
# `cat.add_categories` raises ValueError for an existing category,
# which made this cell fail when re-run.
if 'B cells' not in adata.obs['Cell Type'].cat.categories:
    adata.obs['Cell Type'] = adata.obs['Cell Type'].cat.add_categories(['B cells'])

# Update 'Cell Type' for every cell whose 'Cell States' is in the list
adata.obs.loc[adata.obs['Cell States'].isin(cell_states_list), 'Cell Type'] = 'B cells'

In [None]:
# Cell States that are collapsed into the coarse 'T cells' label
# NOTE(review): the list also contains NK cells, ILCs, IELs and 'Lymphatics';
# confirm that grouping all of these under 'T cells' is intended.
cell_states_list = [
    'T cells CD4 FOSB',
    'T cells CD4 IL17A',
    'T cells CD8',
    'T cells CD8 KLRG1',
    'T cells Naive CD4',
    'T cells OGT',
    'Tregs',
    'NK cells KLRF1 CD3G-',
    'NK-like cells ID3 ENTPD1',
    'ILCs',
    'IELs ID3 ENTPD1',
    'Lymphatics'
]

# Register 'T cells' as a category only when it is missing:
# `cat.add_categories` raises ValueError for an existing category,
# which made this cell fail when re-run.
if 'T cells' not in adata.obs['Cell Type'].cat.categories:
    adata.obs['Cell Type'] = adata.obs['Cell Type'].cat.add_categories(['T cells'])

# Update 'Cell Type' for every cell whose 'Cell States' is in the list
adata.obs.loc[adata.obs['Cell States'].isin(cell_states_list), 'Cell Type'] = 'T cells'

In [None]:
# Cell States that are collapsed into the coarse 'Plasma cells' label
cell_states_list = [
    'Plasma cells'
]

# Register 'Plasma cells' as a category only when it is missing:
# `cat.add_categories` raises ValueError for an existing category,
# which made this cell fail when re-run.
if 'Plasma cells' not in adata.obs['Cell Type'].cat.categories:
    adata.obs['Cell Type'] = adata.obs['Cell Type'].cat.add_categories(['Plasma cells'])

# Update 'Cell Type' for every cell whose 'Cell States' is in the list
adata.obs.loc[adata.obs['Cell States'].isin(cell_states_list), 'Cell Type'] = 'Plasma cells'

In [None]:
# Cell States that are collapsed into the coarse 'Myeloid' label
# NOTE(review): the generic 'Cycling cells' state is grouped here as well — confirm.
cell_states_list = [
    'Cycling cells',
    'DC1',
    'DC2 CD1D',
    'DC2 CD1D-',
    'Immune Cycling cells',
    'Macrophages',
    'Macrophages CCL3 CCL4',
    'Macrophages CXCL9 CXCL10',
    'Macrophages LYVE1',
    'Macrophages Metallothionein',
    'Macrophages PLA2G2D',
    'Mast cells',
    'Mature DCs',
    'Monocytes CHI3L1 CYP27A1',
    'Monocytes HBB',
    'Monocytes S100A8 S100A9'
]

# Register 'Myeloid' as a category only when it is missing:
# `cat.add_categories` raises ValueError for an existing category,
# which made this cell fail when re-run.
if 'Myeloid' not in adata.obs['Cell Type'].cat.categories:
    adata.obs['Cell Type'] = adata.obs['Cell Type'].cat.add_categories(['Myeloid'])

# Update 'Cell Type' for every cell whose 'Cell States' is in the list
adata.obs.loc[adata.obs['Cell States'].isin(cell_states_list), 'Cell Type'] = 'Myeloid'

In [None]:
# Cell States that are collapsed into the coarse 'Neuronal' label
cell_states_list = [
    'Glial cells'
]

# Register 'Neuronal' as a category only when it is missing:
# `cat.add_categories` raises ValueError for an existing category,
# which made this cell fail when re-run.
if 'Neuronal' not in adata.obs['Cell Type'].cat.categories:
    adata.obs['Cell Type'] = adata.obs['Cell Type'].cat.add_categories(['Neuronal'])

# Update 'Cell Type' for every cell whose 'Cell States' is in the list
adata.obs.loc[adata.obs['Cell States'].isin(cell_states_list), 'Cell Type'] = 'Neuronal'

In [None]:
# Cell States that are collapsed into the coarse 'Endothelial' label
cell_states_list = [
    'Endothelial cells CA4 CD36',
    'Endothelial cells CD36',
    'Endothelial cells DARC',
    'Endothelial cells LTC4S SEMA3G'
]

# Register 'Endothelial' as a category only when it is missing:
# `cat.add_categories` raises ValueError for an existing category,
# which made this cell fail when re-run.
if 'Endothelial' not in adata.obs['Cell Type'].cat.categories:
    adata.obs['Cell Type'] = adata.obs['Cell Type'].cat.add_categories(['Endothelial'])

# Update 'Cell Type' for every cell whose 'Cell States' is in the list
adata.obs.loc[adata.obs['Cell States'].isin(cell_states_list), 'Cell Type'] = 'Endothelial'

In [None]:
# Cell States that are collapsed into the coarse 'Mesenchymal' label
# NOTE(review): 'ADAMADEC1' in the first entry is spelled differently from
# 'ADAMDEC1' in the second; confirm it matches the label used in the dataset,
# otherwise those cells are silently left unchanged.
cell_states_list = [
    'Activated fibroblasts CCL19 ADAMADEC1',
    'Fibroblasts ADAMDEC1',
    'Fibroblasts KCNN3 LY6H',
    'Fibroblasts NPY SLITRK6',
    'Fibroblasts SFRP2 SLPI',
    'Fibroblasts SMOC2 PTGIS',
    'Inflammatory fibroblasts IL11 CHI3L1',
    'Pericytes HIGD1B STEAP4',
    'Pericytes RERGL NTRK2',
    'Stromal Cycling cells',
    'Myofibroblasts GREM1 GREM2',
    'Myofibroblasts HHIP NPNT'
]

# Register 'Mesenchymal' as a category only when it is missing:
# `cat.add_categories` raises ValueError for an existing category,
# which made this cell fail when re-run.
if 'Mesenchymal' not in adata.obs['Cell Type'].cat.categories:
    adata.obs['Cell Type'] = adata.obs['Cell Type'].cat.add_categories(['Mesenchymal'])

# Update 'Cell Type' for every cell whose 'Cell States' is in the list
adata.obs.loc[adata.obs['Cell States'].isin(cell_states_list), 'Cell Type'] = 'Mesenchymal'

In [None]:
# Use an underscore name; 'Cell_Type' is the labels_key passed to scVI/scANVI below
adata.obs.rename(columns = {'Cell Type': 'Cell_Type'}, inplace = True)

### Run Integration with scVI

In [None]:
# Register the data with scVI: counts layer as input, 'Cell_Type' as labels,
# and the biosample as a categorical covariate
scvi.model.SCVI.setup_anndata(
    adata,
    layer = "counts",
    labels_key = "Cell_Type",
    categorical_covariate_keys = ["biosample_id"],
)

In [None]:
# Build the scVI model: 50 latent dimensions, 3 layers, negative-binomial likelihood.
# NOTE(review): dispersion = 'gene-batch' is requested, but setup_anndata above
# registers no batch_key — confirm this is intended.
# NOTE(review): `arches_params` defined earlier is not passed here.
scvi_model = scvi.model.SCVI(adata, n_latent = 50, n_layers = 3, dispersion = 'gene-batch', gene_likelihood = 'nb')

In [None]:
# Train scVI on GPU device 0, validating after every epoch
# (the positional 50 is presumably the maximum number of epochs — confirm)
scvi_model.train(
    50,
    check_val_every_n_epoch = 1,
    enable_progress_bar = True,
    accelerator = "gpu",
    devices = [0],
)

In [None]:
# Collect the train and validation ELBO curves into one long-format table
train_elbo = scvi_model.history['elbo_train'].astype(float)
validation_elbo = scvi_model.history['elbo_validation'].astype(float)
history_df = train_elbo.join(validation_elbo).reset_index().melt(id_vars = ['epoch'])

p.options.figure_size = 12, 6

# Plot both curves, skipping epoch 0
curve_colors = {'elbo_train': 'black', 'elbo_validation': 'red'}
p_ = p.ggplot(p.aes(x = 'epoch', y = 'value', color = 'variable'), history_df.query('epoch > 0'))
p_ = p_ + p.geom_line()
p_ = p_ + p.geom_point()
p_ = p_ + p.scale_color_manual(curve_colors)
p_ = p_ + p.theme_minimal()

print(p_)

In [None]:
# Store the scVI latent representation of every cell in obsm
adata.obsm["X_scVI"] = scvi_model.get_latent_representation()

### Integration with scANVI

In [None]:
# Initialise scANVI from the trained scVI model, using 'Cell_Type' as labels.
# NOTE(review): no cell is assigned the label "Unknown" in the cells above —
# confirm that treating every cell as labelled is intended.
scanvi_model = scvi.model.SCANVI.from_scvi_model(
    scvi_model,
    adata=adata,
    labels_key="Cell_Type",
    unlabeled_category="Unknown",
)

In [None]:
# Train scANVI on GPU device 0, validating after every epoch
# (the positional 100 is presumably the maximum number of epochs — confirm)
scanvi_model.train(
    100,
    check_val_every_n_epoch = 1,
    enable_progress_bar = True,
    accelerator = "gpu",
    devices = [0],
)

In [None]:
# Collect the train and validation ELBO curves into one long-format table
train_elbo = scanvi_model.history['elbo_train'].astype(float)
validation_elbo = scanvi_model.history['elbo_validation'].astype(float)
history_df = train_elbo.join(validation_elbo).reset_index().melt(id_vars = ['epoch'])

p.options.figure_size = 12, 6

# Plot both curves, skipping epoch 0
curve_colors = {'elbo_train': 'black', 'elbo_validation': 'red'}
p_ = p.ggplot(p.aes(x = 'epoch', y = 'value', color = 'variable'), history_df.query('epoch > 0'))
p_ = p_ + p.geom_line()
p_ = p_ + p.geom_point()
p_ = p_ + p.scale_color_manual(curve_colors)
p_ = p_ + p.theme_minimal()

# Save the figure to the current working directory, then display it
p_.save('fig1.png', dpi = 300)

print(p_)

In [None]:
# Store the scANVI latent representation of every cell in obsm
adata.obsm["X_scANVI"] = scanvi_model.get_latent_representation(adata)

### UMAP calculation

In [None]:
# Build the neighbour graph on the scANVI latent space (50 neighbours, Minkowski metric)
sc.pp.neighbors(adata, use_rep = "X_scANVI", n_neighbors = 50, metric = 'minkowski')

In [None]:
# Compute the UMAP embedding; random_state is fixed for reproducibility
sc.tl.umap(adata, min_dist = 0.4, spread = 4, random_state = 1712)

In [None]:
# Rebuild the object from adata.raw, which was stored before the HVG subsetting above
adata = adata.raw.to_adata()

In [None]:
# Save the integrated object next to the input files
adata.write(f'{input_dir}/Kong_scVI_scANVI.h5ad')

In [None]:
# Reload the integrated object written above (allows resuming from here without retraining)
adata = sc.read_h5ad(f'{input_dir}/Kong_scVI_scANVI.h5ad')

+ Modify obs

In [None]:
# Harmonise the Kong 2023 obs values and column names with the GCA_Smillie_Wang dataset.

# Expand the site codes: 'CO' -> 'Colon', 'TI' -> 'Terminal Ileum', 'SB' -> 'Small Bowel'
adata.obs['Site'] = (
    adata.obs['Site']
    .replace('CO', 'Colon')
    .replace('TI', 'Terminal Ileum')
    .replace('SB', 'Small Bowel')
)

# Capitalise the sex labels and relabel the 'normal' diagnosis
adata.obs['sex'] = adata.obs['sex'].replace('male', 'Male').replace('female', 'Female')
adata.obs['disease__ontology_label'] = adata.obs['disease__ontology_label'].replace('normal', 'Healthy adult')

# Constant study identifier for every cell
adata.obs['Study_name'] = 'Kong 2023'

# Rename all columns in one call (previously seven separate in-place renames)
adata.obs.rename(
    columns = {
        'donor_id': 'Donor_ID',
        'biosample_id': 'Sample_ID',
        'Site': 'Location',
        'Cell States': 'Cell_States',
        'library_preparation_protocol__ontology_label': 'Library_Preparation_Protocol',
        'sex': 'Sex',
        'disease__ontology_label': 'Diagnosis',
    },
    inplace = True,
)

# Drop QC, scoring and redundant metadata columns; each label is listed once
# ('log1p_total_counts' used to appear twice)
adata.obs.drop(
    columns = [
        'log1p_n_genes_by_counts', 'log1p_total_counts',
        'pct_counts_in_top_50_genes', 'pct_counts_in_top_100_genes',
        'pct_counts_in_top_200_genes', 'pct_counts_in_top_500_genes',
        'total_counts',
        'G2M_score', 'percent_chrY', 'doublet_info', 'XIST-counts', 'S_score',
        'organ', 'tissue', 'Type', 'library_preparation_protocol', 'disease',
        'organ__ontology_label', 'species', 'species__ontology_label',
    ],
    inplace = True,
)

In [None]:
# UMAP overview coloured by the harmonised metadata columns.
# The cell-state column was renamed to 'Cell_States' in the previous cell,
# so the old name 'Cell States' would raise a KeyError here.
sc.set_figure_params(dpi=300)
sc.pl.umap(adata, frameon = False, color = ['Cell_Type', 'Donor_ID', 'Sample_ID', 'Location', 'Sex', 'Cell_States', 'Library_Preparation_Protocol'], size = 1, legend_fontsize = 5, ncols = 3)

In [None]:
# UMAP coloured by the coarse and fine annotations.
# The fine annotation column is named 'Cell_States' after the rename in the
# "Modify obs" cell; 'Cell States' no longer exists at this point.
sc.set_figure_params(dpi=300)
sc.pl.umap(adata, frameon = False, color = ['Cell_Type', 'Cell_States'], size = 1, legend_fontsize = 5, ncols = 3)

In [None]:
# UMAP coloured by cell type and sample-level metadata
sc.set_figure_params(dpi=300)
sc.pl.umap(
    adata,
    frameon = False,
    color = [
        'Cell_Type',
        'Donor_ID',
        'Sample_ID',
        'Location',
        'Sex',
        'Library_Preparation_Protocol',
    ],
    size = 1,
    legend_fontsize = 5,
    ncols = 3,
)

In [None]:
# Cast the QC columns to the types wanted for plotting
for qc_column, target_type in (
    ('predicted_doublets', str),
    ('n_counts', int),
    ('n_genes', int),
):
    adata.obs[qc_column] = adata.obs[qc_column].astype(target_type)

In [None]:
# List the obs column names. `obs_keys` is a method: without the call
# parentheses the cell only displays the bound method object.
adata.obs_keys()

In [None]:
# UMAP coloured by the QC metrics
sc.set_figure_params(dpi=300)
sc.pl.umap(
    adata,
    frameon = False,
    color = [
        'n_genes_by_counts',
        'n_counts',
        'pct_counts_mito',
        'pct_counts_ribo',
        'predicted_doublets',
    ],
    size = 1,
    legend_fontsize = 5,
    ncols = 3,
)

In [None]:
# Make a column 'Stem_cell' in adata.obs: True where adata.obs['Cell_Type'] == 'Stem Cell', False otherwise
adata.obs['Stem_cell'] = adata.obs['Cell_Type'] == 'Stem Cell'

In [None]:
# Convert the boolean flag to strings (presumably so it is plotted as a categorical variable)
adata.obs['Stem_cell'] = adata.obs['Stem_cell'].astype(str)

In [None]:
# UMAP of all cells, coloured by the 'Stem_cell' flag
sc.set_figure_params(dpi=300)
sc.pl.umap(adata, frameon = False, color = ['Stem_cell'], size = 1, legend_fontsize = 5, ncols = 3)

In [None]:
# Candidate stem-cell markers, each listed once ('GJB1' and 'ASCL2' used to appear twice).
# NOTE(review): several entries do not look like gene symbols (e.g. 'Telomerase Inhibitors',
# 'Musashi-1', 'DCAMKL-1', 'CD166', 'CD29', 'ALK3', 'ALDH1'); confirm they exist in
# adata.var_names or replace them with the corresponding symbols.
Stem_cells_markers = ['CD24', 'DCLK1', 'LGR5', 'CD166', 'CD44', 'DCAMKL-1', 'SOX9', 'ACAD10', 'ACVR1C', 'ADH1C', 'ALDH1', 'ALK3', 'ARSE',
'ASCL2', 'ATP10B', 'BMI1', 'C16orf89', 'C6orf136', 'CD29', 'CDCA7', 'CFTR', 'CHMP4C', 'CHP2', 'CLDN15', 'CLDN18', 'CLDN2', 'CPA6', 'DAPK2',
'DDC', 'EFNA3', 'EPHB2', 'EPYC', 'EVPL', 'F2RL1', 'FBLN2', 'FOXD2-AS1', 'GATA6-AS1', 'GDF15', 'GJB1', 'GOLT1A', 'GPX2', 'HNF1A',
'HSD17B2', 'ITPKC', 'LEFTY1', 'LHFPL3-AS2', 'LIPG', 'LY6G6D', 'MGST1', 'MSI1', 'MYOM3', 'Musashi-1', 'NOX1', 'OLFM4', 'PCSK9', 'PDZD3',
'PHLDA1', 'PKP2', 'PLAGL2', 'PLEKHH1', 'PPP1R1B', 'PTGDR', 'PTK7', 'RGMB', 'RNF157', 'RNF186', 'SFN', 'SLC27A2', 'SLC38A4', 'SLPI',
'SULT1B1', 'TAF4B', 'TANC1', 'TMEM171', 'TSPAN8', 'Telomerase Inhibitors', 'URB1-AS1', 'ZBED9', 'ZNF296', 'SMOC2']

# Score every cell for the marker set.
# NOTE(review): adata.X at this point appears to hold the counts restored from adata.raw;
# a later cell scores a normalised, log-transformed copy instead — confirm which is intended.
sc.tl.score_genes(adata, Stem_cells_markers, score_name = 'Stem_cells_markers_score')

sc.set_figure_params(dpi=300)
sc.pl.umap(adata, color= ['Stem_cells_markers_score'], color_map = "RdPu", size = 0.3, frameon = False)

In [None]:
# Reload the integrated object from a local copy.
# The path variable used to be called `input`, which shadowed the Python builtin.
input_path = '/Users/anna.maguza/Desktop/Data/Processed_datasets/1_QC/Kong_scVI_scANVI.h5ad'
adata = sc.read(input_path)

# Cast the QC columns to the types wanted for plotting
adata.obs['predicted_doublets'] = adata.obs['predicted_doublets'].astype(str)
adata.obs['n_counts'] = adata.obs['n_counts'].astype(int)
adata.obs['n_genes'] = adata.obs['n_genes'].astype(int)

# UMAP coloured by the QC metrics
sc.set_figure_params(dpi=300)
sc.pl.umap(adata, color=['n_genes_by_counts', 'n_counts', 'pct_counts_mito', 'pct_counts_ribo', 'predicted_doublets'],
             color_map = "RdPu", size = 1, frameon = False, ncols=6)

In [None]:
# Fold the 'Stem Cell' label back into 'Epithelial' in the coarse annotation
adata.obs['Cell_Type'] = adata.obs['Cell_Type'].replace('Stem Cell', 'Epithelial')

In [None]:
# UMAP coloured by the coarse cell type
sc.set_figure_params(dpi=300)
sc.pl.umap(
    adata,
    color=['Cell_Type'],
    color_map = "RdPu",
    size = 1,
    frameon = False,
    ncols=6,
)

In [None]:
# Number of cells per fine-grained cell state
adata.obs['Cell_States'].value_counts()

In [None]:
# Start the 'Stem_cell' column as a copy of the cell states; it is reduced to a single label in the next cell
adata.obs['Stem_cell'] = adata.obs['Cell_States'].copy()

In [None]:
# Restrict the column to the single category 'Stem cells', then label
# every cell whose state is one of the four OLFM4 stem-cell states
adata.obs['Stem_cell'] = adata.obs['Stem_cell'].cat.set_categories(['Stem cells'])
stem_states = [
    'Stem cells OLFM4',
    'Stem cells OLFM4 GSTA1',
    'Stem cells OLFM4 LGR5',
    'Stem cells OLFM4 PCNA',
]
adata.obs.loc[adata.obs['Cell_States'].isin(stem_states), 'Stem_cell'] = 'Stem cells'

In [None]:
# Convert the column to plain strings before plotting
# (presumably cells without the 'Stem cells' label become the string 'nan' — confirm)
adata.obs['Stem_cell'] = adata.obs['Stem_cell'].astype(str)

new_palette = ['#824670', '#759EB8']  # Two hex colours, one per value of 'Stem_cell'

# Assign the new color palette to your categories
adata.uns['Stem_cell_colors'] = new_palette

# Output directory for the figures (same value as defined at the top of the notebook)
fig_dir = '/mnt/LaCIE/annaM/gut_project/Processed_data/Gut_data/Plots/Finding_stem_cells'

# Draw the UMAP without showing it, then save the current figure to fig_dir
with plt.rc_context():
    sc.set_figure_params(dpi=300, figsize=(15, 15))
    sc.pl.umap(adata, frameon=False, color='Stem_cell', size=10, legend_fontsize=5, ncols=3, show=False)
    plt.savefig(f"{fig_dir}/Kong_stem_umap.png", bbox_inches="tight")

In [None]:
# Work on a copy so adata itself is left untouched
adata_log = adata.copy()
# Normalise each cell to a total of 1e6 (exclude_highly_expressed = True), then log1p-transform
sc.pp.normalize_total(adata_log, target_sum = 1e6, exclude_highly_expressed = True)
sc.pp.log1p(adata_log)

In [None]:
# Curated stem-cell marker genes used for scoring and for the dot plot below
stem_cells_markers = ['AXIN2', 'ASCL2', 'ATOH1', 'BMI1', 'CA12', 'CLU', 'GPX2', 'HMGCS2', 'LEFTY1', 'LGR5', 'LRIG1', 'MYC', 'OLFM4', 'SMOC2', 'TERT']

In [None]:
# Score every cell of the normalised, log-transformed copy for the curated marker set
sc.tl.score_genes(adata_log, stem_cells_markers, score_name = 'Stem_cells_markers_score')

In [None]:
# Draw the UMAP coloured by the marker score without showing it, then save the figure to fig_dir
with plt.rc_context():
    sc.set_figure_params(dpi=300, figsize=(15, 15))
    sc.pl.umap(adata_log, color= ['Stem_cells_markers_score'], color_map = "magma_r", frameon=False, size = 10, show=False)
    plt.savefig(f"{fig_dir}/Kong_stem_markers.png", bbox_inches="tight")

In [None]:
# Keep only the cells labelled 'Stem cells' and save a dot plot of the curated markers
is_stem = adata_log.obs['Stem_cell'] == 'Stem cells'
stem_cells = adata_log[is_stem]
with plt.rc_context():
    sc.set_figure_params(dpi=300, figsize=(15, 15))
    sc.pl.dotplot(
        stem_cells,
        stem_cells_markers,
        groupby='Stem_cell',
        cmap = 'magma_r',
        show=False,
    )
    plt.savefig(f"{fig_dir}/Kong_stem_markers_dotplot.png", bbox_inches="tight")