# scVI data integration of T cell compartment

In [None]:
import sys 
import os
from datetime import datetime
today = datetime.today().strftime('%Y-%m-%d')

import pandas as pd
import numpy as np
import scanpy as sc
import anndata as ad
import scvi
import hdf5plugin

import matplotlib.pyplot as plt
import seaborn as sns

# Add repo path to sys path (allows to access scripts and metadata from repo)
repo_path = '/lustre/scratch126/cellgen/team205/lm25/thymus_projects/thymus_ageing_atlas/T_NK_compartment'
sys.path.insert(1, repo_path) 
sys.path.insert(2, '/lustre/scratch126/cellgen/team205/lm25/thymus_projects/thymus_ageing_atlas/General_analysis/scripts')

import torch
torch.cuda.is_available()

%reload_ext autoreload

In [None]:
# Output locations inside the compartment repo (figures, data objects, trained models)
plots_path, data_path, model_path = (
    os.path.join(repo_path, subdir) for subdir in ('plots', 'data', 'models')
)
# Shared data directory of the general (all-compartment) analysis
general_data_path = '/lustre/scratch126/cellgen/team205/lm25/thymus_projects/thymus_ageing_atlas/General_analysis/data'

## Load data

In [None]:
# T cell object (stored with a `.zarr` suffix but read with the h5ad reader)
adata = ad.read_h5ad(f'{general_data_path}/compartmentSplits/thyAgeing_t_nkSplit_scvi_v3_2024-11-05.zarr')

# Replace any curated annotation columns ('taa*') already present in obs with the current curated labels.
# The left join keeps every cell: cells missing from the annotation table get NaN labels, they are not removed.
cell_labels = pd.read_csv(f'{general_data_path}/objects/rna/thyAgeing_all_scvi_v2_2024-08-22_curatedAnno_v4.csv', index_col=0)
taa_cols = [c for c in cell_labels.columns if 'taa' in c]
# errors='ignore' so the cell also works when the object does not carry (all of) these columns yet;
# without it, drop() raises a KeyError for every label column missing from obs.
adata.obs = adata.obs.drop(columns = taa_cols, errors = 'ignore').join(cell_labels[taa_cols], how = 'left')

# Add celltypist predictions to adata
celltypist_predictions = pd.read_csv(f'{general_data_path}/objects/rna/thyAgeing_all_scvi_v3_2024-11-05_celltypist_taa_l1.csv', index_col=0)
adata.obs = adata.obs.join(celltypist_predictions, how = 'left')

adata

In [None]:
# Fraction of cells per level-1 curated label (value_counts skips missing labels by default)
taa_l1_labels = adata.obs['taa_l1']
taa_l1_labels.value_counts(normalize=True)

## Additional QC filtering

Criteria:
- percent_mito < .08
- scrublet_score < .4

## Data integration

In [None]:
# Reload all modules before each execution so that edits to the wrapper script
# are picked up without restarting the kernel.
%autoreload 2
# NOTE(review): scvi_wrapper is presumably found via the scripts directory added to sys.path earlier — confirm.
from scvi_wrapper import run_scvi

In [None]:
# Cell number and mean gene count per study. Campinoti contributes very few cells, which caused
# singularities during hvg selection, hence `span = 0.5` is used for the integration.
per_study = adata.obs.groupby('study')
per_study.agg(
    n_cells=('study', 'count'),
    mean_genes=('n_genes', 'mean'),
)

First round of integration using 3,000 highly variable genes (trained for at most 400 epochs, with early stopping enabled).

In [None]:
# Version tag shared by all outputs of this integration round
object_version = f'v7_{today}'

# Keyword arguments for the wrapper, grouped by purpose (passed on in this order)

# Excluded genes
gene_filter_kwargs = dict(
    include_genes=[],
    exclude_cc_genes=True,
    exclude_mt_genes=True,
    exclude_vdjgenes=True,
    remove_cite=False,
)

# Highly variable gene selection
hvg_kwargs = dict(batch_hv="study", hvg=3000, span=0.5)

# scVI batch key and categorical covariates
# (cont_cov_scvi is not passed in this round; the commented-out candidates were
#  "percent_mito", 'percent_ribo', 'n_genes')
covariate_kwargs = dict(
    batch_scvi="sample",
    cat_cov_scvi=["donor", "chemistry_simple", "sex"],
)

# scVI training schedule
training_kwargs = dict(
    max_epochs=400,
    batch_size=2000,
    early_stopping=True,
    early_stopping_patience=15,
    early_stopping_min_delta=10.0,
    plan_kwargs={'lr': 0.001, 'reduce_lr_on_plateau': True, 'lr_patience': 10, 'lr_threshold': 20},
)

# scVI model architecture
architecture_kwargs = dict(n_layers=3, n_latent=30, dispersion='gene-batch')

# Leiden clustering (disabled here) and figure output
output_kwargs = dict(
    leiden_clustering=None,
    col_cell_type=['celltypist_pred_taa_l1'],
    fig_dir=f'{plots_path}/preprocessing',
    fig_prefix=f'thyAgeing_tSplit_scvi_{object_version}',
)

# Run scvi
scvi_run = run_scvi(
    adata,
    layer_raw='X',
    **gene_filter_kwargs,
    **hvg_kwargs,
    **covariate_kwargs,
    **training_kwargs,
    **architecture_kwargs,
    **output_kwargs,
)

In [None]:
# Save adata and scvi model
overwrite = True

# Single source of truth for the output locations: the existence check below must test
# the very file that is written (and later read back from `{data_path}/objects/rna/`),
# otherwise the guard can never detect an existing object.
adata_out = f'{data_path}/objects/rna/thyAgeing_tSplit_scvi_{object_version}.zarr'
model_out = f'{model_path}/thyAgeing_tSplit_scvi_{object_version}'

# Annotation columns (celltypist predictions/probabilities and curated 'taa' labels)
# are stripped before saving; they are joined again from their CSV files after loading.
anno_cols = [c for c in scvi_run['data'].obs.columns if '_pred_' in c or '_prob_' in c or 'taa' in c]
if not os.path.exists(adata_out) or overwrite:
    scvi_run['data'].obs = scvi_run['data'].obs.drop(columns=anno_cols)
    scvi_run['data'].write_h5ad(
        adata_out,
        compression=hdf5plugin.FILTERS["zstd"],
        compression_opts=hdf5plugin.Zstd(clevel=5).filter_options,
    )
    scvi_run['vae'].save(model_out, save_anndata=False, overwrite=overwrite)
else:
    print('File already exists')

## Leiden clustering

In [None]:
# Pin the version of the first integration round and load the saved object
object_version = 'v7_2024-11-06'
integrated_file = f'{data_path}/objects/rna/thyAgeing_tSplit_scvi_{object_version}.zarr'
adata = ad.read_h5ad(integrated_file)

adata

In [None]:
# Leiden clustering at each requested resolution; labels are stored in obs as `leiden_r<resolution>`
res_list = [2.5]
leiden_keys = []
for resolution in res_list:
    leiden_key = f"leiden_r{resolution}"
    sc.tl.leiden(adata, resolution=resolution, key_added=leiden_key)
    leiden_keys.append(leiden_key)

# Persist only the cluster assignments so they can be joined onto the saved object later
adata.obs[leiden_keys].to_csv(f'{data_path}/thyAgeing_tSplit_scvi_{object_version}_leidenClusters.csv')

## QC by cluster

In [None]:
# Start from the saved first-round object
object_version = 'v7_2024-11-06'
integrated_file = f'{data_path}/objects/rna/thyAgeing_tSplit_scvi_{object_version}.zarr'
adata = ad.read_h5ad(integrated_file)

# Attach the stored Leiden assignments and treat them as categorical labels
leiden_clus = pd.read_csv(
    f'{data_path}/thyAgeing_tSplit_scvi_{object_version}_leidenClusters.csv',
    index_col=0,
)
adata.obs = adata.obs.join(leiden_clus)
cluster_cols = leiden_clus.columns
adata.obs[cluster_cols] = adata.obs[cluster_cols].astype('category')

# Attach celltypist predictions (left join: every cell of adata is kept)
celltypist_predictions = pd.read_csv(
    f'{general_data_path}/objects/rna/thyAgeing_all_filtered_v3_2024-10-24_celltypist_taa_l1.csv',
    index_col=0,
)
adata.obs = adata.obs.join(celltypist_predictions, how='left')

adata

In [None]:
# UMAP coloured by every Leiden clustering column; the figure is written to disk
sc.pl.umap(
    adata,
    color=leiden_clus.columns,
    ncols=2,
    legend_fontsize=6,
    wspace=0.5,
    return_fig=True,
    show=False,
)
leiden_umap_png = f'{plots_path}/preprocessing/scvi/thyAgeing_tSplit_scvi_{object_version}_leidenClusters.png'
plt.savefig(leiden_umap_png, dpi=300, bbox_inches='tight')

In [None]:
# QC metrics inspected per cluster
qc_cols = ['n_counts','n_genes', 'percent_mito', 'percent_ribo', 'percent_hb', 'scrublet_score', 'celltypist_prob_taa_l1']

# Long format: one row per (cell, QC metric), keeping the cluster label as identifier
cluster_key = 'leiden_r2.5'
qc_long = adata.obs.melt(id_vars=cluster_key, value_vars=qc_cols)

# One violin panel per QC metric, stacked vertically with independent y axes
grid = sns.FacetGrid(qc_long, col='variable', col_wrap=1, sharey=False, height=4, aspect=2)
# NOTE(review): the palette is read from adata.uns; presumably it is only populated once a
# scanpy plot has been coloured by this cluster column — confirm the UMAP cell ran first.
grid.map_dataframe(
    sns.violinplot,
    x=cluster_key,
    y='value',
    hue=cluster_key,
    split=False,
    palette=adata.uns['leiden_r2.5_colors'],
    inner=None,
)
grid.add_legend()
plt.savefig(f'{plots_path}/preprocessing/scvi/thyAgeing_tSplit_scvi_{object_version}_qcViolin.png', dpi=300, bbox_inches='tight')

In [None]:
# UMAP coloured by each QC metric; the figure is written to disk
sc.pl.umap(
    adata,
    color=qc_cols,
    ncols=2,
    legend_fontsize=6,
    wspace=0.5,
    return_fig=True,
    show=False,
)
qc_umap_png = f'{plots_path}/preprocessing/scvi/thyAgeing_tSplit_scvi_{object_version}_qcUmap.png'
plt.savefig(qc_umap_png, dpi=300, bbox_inches='tight')

In [None]:
# Join the curated annotation columns ('taa*') onto obs; the left join keeps every cell
curated_anno_csv = f'{general_data_path}/objects/rna/thyAgeing_all_scvi_v2_2024-08-22_curatedAnno_v4.csv'
cell_labels = pd.read_csv(curated_anno_csv, index_col=0)
taa_label_cols = [col for col in cell_labels.columns if 'taa' in col]
adata.obs = adata.obs.join(cell_labels[taa_label_cols], how='left')

# Show the level-5 curated labels on the UMAP
sc.pl.umap(adata, color='taa_l5')

Clusters to remove (Leiden, resolution 2.5):
- 34, 30: high mitochondrial fraction
- 24, 28, 37: high scrublet score
- 32: high number of genes
- 9: high ribosomal fraction

## Marker expression by cluster

In [None]:
# Normalise each cell to a total of 1e4 counts, then log1p-transform; both calls modify adata in place.
# NOTE(review): re-running this cell would transform already-transformed values — reload adata before re-running.
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)

In [None]:
import pickle

# Load the marker collection (a mapping from marker-set name to the genes to plot).
# NOTE(review): pickle.load executes arbitrary code from the file — only load trusted files.
marker_file = f'{general_data_path}/markers/allMarkers_lowGranularity_vk8.pkl'
with open(marker_file, 'rb') as handle:
    all_mrkrs = pickle.load(handle)

# One dot plot per marker set, grouped by Leiden cluster and saved to disk
for set_name, marker_genes in all_mrkrs.items():
    dotplot = sc.pl.DotPlot(
        adata,
        groupby='leiden_r2.5',
        var_names=marker_genes,
        mean_only_expressed=True,
        cmap='viridis',
    )
    dotplot.add_totals().savefig(
        f'{plots_path}/preprocessing/scvi/thyAgeing_tSplit_{object_version}_{set_name}Markers_dotplot.png',
        dpi=300,
        bbox_inches='tight',
    )

In [None]:
# One UMAP panel grid per marker set, each saved to its own file
for set_name, marker_genes in all_mrkrs.items():
    sc.pl.umap(
        adata,
        color=marker_genes,
        ncols=3,
        legend_fontsize=6,
        wspace=0.5,
        return_fig=True,
        show=False,
    )
    marker_umap_png = f'{plots_path}/preprocessing/scvi/thyAgeing_tSplit_{object_version}_{set_name}Markers_umap.png'
    plt.savefig(marker_umap_png, dpi=300, bbox_inches='tight')

Clusters to remove (based on marker expression):
- 27: mixture of stromal cells
- 5: B cells

## Clean and reintegrate

In [None]:
# Reload the first-round object so cleaning starts from the saved (raw-count) state
object_version = 'v7_2024-11-06'
integrated_file = f'{data_path}/objects/rna/thyAgeing_tSplit_scvi_{object_version}.zarr'
adata = ad.read_h5ad(integrated_file)

# Attach the stored Leiden assignments and treat them as categorical labels
leiden_clus = pd.read_csv(
    f'{data_path}/thyAgeing_tSplit_scvi_{object_version}_leidenClusters.csv',
    index_col=0,
)
adata.obs = adata.obs.join(leiden_clus)
cluster_cols = leiden_clus.columns
adata.obs[cluster_cols] = adata.obs[cluster_cols].astype('category')

# Attach celltypist predictions (left join: every cell of adata is kept)
celltypist_predictions = pd.read_csv(
    f'{general_data_path}/objects/rna/thyAgeing_all_filtered_v3_2024-10-24_celltypist_taa_l1.csv',
    index_col=0,
)
adata.obs = adata.obs.join(celltypist_predictions, how='left')

adata

In [None]:
# Clusters flagged during QC and marker inspection.
# NOTE(review): integer labels are used; this presumably matches because the cluster column
# was read back from CSV — confirm the column does not hold string labels.
clusters_to_remove = [27,5,34,30,24,28,37,32,9]

# Preview the UMAP with the flagged clusters hidden (adata itself is left untouched here)
is_flagged = adata.obs['leiden_r2.5'].isin(clusters_to_remove)
sc.pl.umap(adata[~is_flagged], color=['leiden_r2.5'])

In [None]:
# Version tag shared by all outputs of the second integration round
object_version = f'v8_{today}'

# Drop the flagged clusters; the copy detaches the subset from the original object
is_flagged = adata.obs['leiden_r2.5'].isin(clusters_to_remove)
adata = adata[~is_flagged].copy()

# Keyword arguments for the wrapper, grouped by purpose (passed on in this order)

# Excluded genes
gene_filter_kwargs = dict(
    include_genes=[],
    exclude_cc_genes=True,
    exclude_mt_genes=True,
    exclude_vdjgenes=True,
    remove_cite=False,
)

# Highly variable gene selection
hvg_kwargs = dict(batch_hv="study", hvg=3000, span=0.5)

# scVI batch key and covariates (no continuous covariates)
covariate_kwargs = dict(
    batch_scvi="sample",
    cat_cov_scvi=["donor", "chemistry_simple", "sex"],
    cont_cov_scvi=[],
)

# scVI training schedule
training_kwargs = dict(
    max_epochs=400,
    batch_size=2000,
    early_stopping=True,
    early_stopping_patience=15,
    early_stopping_min_delta=10.0,
    plan_kwargs={'lr': 0.001, 'reduce_lr_on_plateau': True, 'lr_patience': 10, 'lr_threshold': 20},
)

# scVI model architecture
architecture_kwargs = dict(n_layers=3, n_latent=30, dispersion='gene-batch')

# Leiden clustering (disabled here) and figure output
output_kwargs = dict(
    leiden_clustering=None,
    col_cell_type=['celltypist_pred_taa_l1'],
    fig_dir=f'{plots_path}/preprocessing',
    fig_prefix=f'thyAgeing_tSplit_scvi_{object_version}',
)

# Run scvi
scvi_run = run_scvi(
    adata,
    layer_raw='X',
    **gene_filter_kwargs,
    **hvg_kwargs,
    **covariate_kwargs,
    **training_kwargs,
    **architecture_kwargs,
    **output_kwargs,
)

In [None]:
# Save adata and scvi model
overwrite = True

# Single source of truth for the output locations: the existence check below must test
# the very file that is written (and later read back from `{data_path}/objects/rna/`),
# otherwise the guard can never detect an existing object.
adata_out = f'{data_path}/objects/rna/thyAgeing_tSplit_scvi_{object_version}.zarr'
model_out = f'{model_path}/thyAgeing_tSplit_scvi_{object_version}'

# Annotation columns (celltypist predictions/probabilities and curated 'taa' labels)
# are stripped before saving; they are kept in separate CSV files.
anno_cols = [c for c in scvi_run['data'].obs.columns if '_pred_' in c or '_prob_' in c or 'taa' in c]
if not os.path.exists(adata_out) or overwrite:
    scvi_run['data'].obs = scvi_run['data'].obs.drop(columns=anno_cols)
    scvi_run['data'].write_h5ad(
        adata_out,
        compression=hdf5plugin.FILTERS["zstd"],
        compression_opts=hdf5plugin.Zstd(clevel=5).filter_options,
    )
    scvi_run['vae'].save(model_out, save_anndata=False, overwrite=overwrite)
else:
    print('File already exists')

In [None]:
# Pin the version of the second integration round and load the saved object
object_version = 'v8_2024-11-07'
reintegrated_file = f'{data_path}/objects/rna/thyAgeing_tSplit_scvi_{object_version}.zarr'
adata = ad.read_h5ad(reintegrated_file)

adata

In [None]:
# Leiden clustering at each requested resolution; labels are stored in obs as `leiden_r<resolution>`
res_list = [2.5]
leiden_keys = []
for resolution in res_list:
    leiden_key = f"leiden_r{resolution}"
    sc.tl.leiden(adata, resolution=resolution, key_added=leiden_key)
    leiden_keys.append(leiden_key)

# Persist only the cluster assignments so they can be joined onto the saved object later
adata.obs[leiden_keys].to_csv(f'{data_path}/thyAgeing_tSplit_scvi_{object_version}_leidenClusters.csv')