# Import modules

In [1]:
# JupyterLab starts this kernel with a module search path that differs from the
# one the terminal uses, so the search path is pinned to the terminal's here.
import sys

# Search path as the kernel sees it at start-up. Its site-packages entries are
# different from the terminal's, which is why the correct package was not found.
print(sys.path)

# Search path reported by the terminal inside scvi0190_env, obtained by executing
#   python -c "import sys; print('\n'.join(sys.path))"
TERMINAL_SYS_PATH = (
    "/nfs/team205/kk18/miniconda3/envs/scvi0190_env/lib/python39.zip",
    "/nfs/team205/kk18/miniconda3/envs/scvi0190_env/lib/python3.9",
    "/nfs/team205/kk18/miniconda3/envs/scvi0190_env/lib/python3.9/lib-dynload",
    "/nfs/team205/kk18/miniconda3/envs/scvi0190_env/lib/python3.9/site-packages",
)
# Replace (not extend) the kernel's search path with the terminal's entries.
sys.path = list(TERMINAL_SYS_PATH)

# Confirm what the kernel will use from now on.
print(sys.path)

['/nfs/team205/kk18/notebooks/Foetal/Trisomy21/RNA', '/nfs/team205/kk18/miniconda3/envs/scvi0190_env/lib/python39.zip', '/nfs/team205/kk18/miniconda3/envs/scvi0190_env/lib/python3.9', '/nfs/team205/kk18/miniconda3/envs/scvi0190_env/lib/python3.9/lib-dynload', '', '/nfs/users/nfs_k/kk18/.local/lib/python3.9/site-packages', '/nfs/team205/kk18/miniconda3/envs/scvi0190_env/lib/python3.9/site-packages']
['/nfs/team205/kk18/miniconda3/envs/scvi0190_env/lib/python39.zip', '/nfs/team205/kk18/miniconda3/envs/scvi0190_env/lib/python3.9', '/nfs/team205/kk18/miniconda3/envs/scvi0190_env/lib/python3.9/lib-dynload', '/nfs/team205/kk18/miniconda3/envs/scvi0190_env/lib/python3.9/site-packages']


In [2]:
# import warnings
# warnings.simplefilter("ignore", UserWarning)

import os

import anndata
import matplotlib.pyplot as plt
import seaborn as sns

import numpy as np
import numpy.random as random
import pandas as pd
import scanpy as sc

import scvi

Global seed set to 0
  jax.tree_util.register_keypaths(
  new_rank_zero_deprecation(
  return new_rank_zero_deprecation(*args, **kwargs)


In [3]:
import gc
import torch

In [4]:
# Display the session report produced by session_info.
# NOTE(review): presumably this lists the versions of the loaded packages; the
# output is not captured in this export, so confirm against the rendered notebook.
import session_info
session_info.show()

In [5]:
# Scanpy verbosity levels: errors (0), warnings (1), info (2), hints (3)
sc.settings.verbosity = 3

# Figure defaults used by the plots below
sc.settings.set_figure_params(
    dpi=80,
    facecolor='white',
    color_map='RdPu',
)

In [6]:
# Load the personal helper module directly from its file location
import importlib.util
import sys

# Name the module is registered under, and the file it is executed from
module_name = "module.name"
module_path = "/nfs/team205/kk18/function/python/utils.py"

spec = importlib.util.spec_from_file_location(module_name, module_path)
utils = importlib.util.module_from_spec(spec)
# Register the (still empty) module object first, then run the file's code in it
sys.modules[module_name] = utils
spec.loader.exec_module(utils)

In [7]:
# Show the kernel's current working directory; as the last expression of the
# cell, the returned path is echoed as the cell output.
import os
os.getcwd()

'/nfs/team205/kk18/notebooks/Foetal/Trisomy21/RNA'

# Read in anndata

In [8]:
# Location of the h5ad file that every scVI run below starts from
ADATA_PATH = '/nfs/team205/heart/anndata_objects/Foetal/trisomy21/Euploid_T21Hearts_Aug2024_sel_subsampled-per-cellstate-donor.h5ad'

adata = sc.read_h5ad(ADATA_PATH)
# Echo the object summary as the cell output
adata

AnnData object with n_obs × n_vars = 83477 × 36601
    obs: 'latent_RT_efficiency', 'latent_cell_probability', 'latent_scale', 'sangerID', 'combinedID', 'donor', 'region', 'age', 'facility', 'cell_or_nuclei', 'modality', 'kit_10x', 'scrublet_score', 'doublet_pval', 'doublet_bh_pval', 'n_genes', 'n_counts', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'total_counts_mt', 'log1p_total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'log1p_total_counts_ribo', 'pct_counts_ribo', 'HB_score', 'donor_by_library-prep', 'multiplexed', 'SOC | status', 'SOC | log_prob_singleton', 'SOC | log_prob_doublet', 'batch_key', '_scvi_batch', 'FACSgate', 'fine_grain', 'mid_grain', 'coarse_grain', 'sex', 'week', 'trimester', 'heart_or_greatvessels', 'cycling', 'S_score', 'G2M_score', 'phase', '_scvi_labels', 'stress_score', 'hb1_score', 'euploid_or_t21', 'donor_type', 'region_finest', 'flushed', 'scrublet_leiden', 'cluster_scrublet_score', 'leiden_scVI_0.1', 'leiden

# Run scVI, categorical covariates to correct: donor+region

In [15]:
# Train scVI with donor as the batch key and region as a categorical covariate,
# once per n_layers value, saving each trained model, its latent space and a
# training-ELBO curve.

# Setup anndata for scVI (on a copy, so `adata` itself is not the object that is set up)
bdata = adata.copy()
# NOTE(review): `layer` is left unset, so the model is set up on bdata.X —
# confirm that X holds the raw counts scVI expects.
scvi.model.SCVI.setup_anndata(
    bdata,
    # layer="counts",
    batch_key="donor",  # we want to correct for this
    categorical_covariate_keys=['region'],  # we want to correct for this
    continuous_covariate_keys=['total_counts', 'pct_counts_mt', 'pct_counts_ribo'],
)

# with three n_layer conditions
n_layer_list = [1, 2, 3]

for n_layers in n_layer_list:
    print(f'##### {n_layers} #####')

    # output locations for this run
    model_dir = f"/nfs/team205/heart/anndata_objects/Foetal/scVI/models/Euploid_T21Hearts_Aug2024_sel_subsampled-per-cellstate-donor_correcting-donor-region_n-layers-{n_layers}/"
    latent_path = f'/nfs/team205/heart/anndata_objects/Foetal/scVI/latent_variables/Euploid_T21Hearts_Aug2024_sel_subsampled-per-cellstate-donor_correcting-donor-region_n-layers-{n_layers}.h5ad'
    # Make sure the latent output folder exists *before* training, so that the
    # write at the end cannot fail on a missing directory after 400 epochs.
    os.makedirs(os.path.dirname(latent_path), exist_ok=True)

    # setup
    vae = scvi.model.SCVI(bdata, n_hidden=128, n_latent=50, n_layers=n_layers, dispersion='gene-batch')
    vae.view_anndata_setup(bdata)
    # train
    vae.train(max_epochs=400)
    # save trained model
    vae.save(model_dir, overwrite=True)

    # save latent space; obs keeps only the cell index (no columns), for simplicity
    latent = sc.AnnData(vae.get_latent_representation(),
                        obs=pd.DataFrame(index=bdata.obs_names))
    latent.write(latent_path)

    # Convergence plot: training ELBO per epoch (epochs numbered from 1)
    elbo_train = vae.history['elbo_train']
    epochs = np.arange(1, len(elbo_train) + 1)
    fig, ax = plt.subplots()
    ax.plot(epochs, elbo_train, label="train")
    ax.set_xlabel('epoch')
    ax.set_ylabel('elbo_train')
    ax.set_title(f'n_layers = {n_layers}')
    ax.legend()
    plt.show()
    # close the figure so figures do not pile up across loop iterations
    plt.close(fig)

    # drop the references to this run's model and latent space before the next run
    del vae, latent
    # free up GPU
    gc.collect()
    torch.cuda.empty_cache()
    print('')

No GPU/TPU found, falling back to CPU. (Set TF_CPP_MIN_LOG_LEVEL=0 and rerun for more info.)


# Run scVI, categorical covariates to correct: only region

In [None]:
# Train scVI with region as the only corrected categorical factor (batch key),
# once per n_layers value, saving each trained model, its latent space and a
# training-ELBO curve.

# Setup anndata for scVI (on a copy, so `adata` itself is not the object that is set up)
bdata = adata.copy()
# NOTE(review): `layer` is left unset, so the model is set up on bdata.X —
# confirm that X holds the raw counts scVI expects.
scvi.model.SCVI.setup_anndata(
    bdata,
    # layer="counts",
    batch_key="region",  # we want to correct for this
    continuous_covariate_keys=['total_counts', 'pct_counts_mt', 'pct_counts_ribo'],
)

# with three n_layer conditions
n_layer_list = [1, 2, 3]

for n_layers in n_layer_list:
    print(f'##### {n_layers} #####')

    # output locations for this run
    model_dir = f"/nfs/team205/heart/anndata_objects/Foetal/scVI/models/Euploid_T21Hearts_Aug2024_sel_subsampled-per-cellstate-donor_correcting-region_n-layers-{n_layers}/"
    latent_path = f'/nfs/team205/heart/anndata_objects/Foetal/scVI/latent_variables/Euploid_T21Hearts_Aug2024_sel_subsampled-per-cellstate-donor_correcting-region_n-layers-{n_layers}.h5ad'
    # Make sure the latent output folder exists *before* training, so that the
    # write at the end cannot fail on a missing directory after 400 epochs.
    os.makedirs(os.path.dirname(latent_path), exist_ok=True)

    # setup
    vae = scvi.model.SCVI(bdata, n_hidden=128, n_latent=50, n_layers=n_layers, dispersion='gene-batch')
    vae.view_anndata_setup(bdata)
    # train
    vae.train(max_epochs=400)
    # save trained model
    vae.save(model_dir, overwrite=True)

    # save latent space; obs keeps only the cell index (no columns), for simplicity
    latent = sc.AnnData(vae.get_latent_representation(),
                        obs=pd.DataFrame(index=bdata.obs_names))
    latent.write(latent_path)

    # Convergence plot: training ELBO per epoch (epochs numbered from 1)
    elbo_train = vae.history['elbo_train']
    epochs = np.arange(1, len(elbo_train) + 1)
    fig, ax = plt.subplots()
    ax.plot(epochs, elbo_train, label="train")
    ax.set_xlabel('epoch')
    ax.set_ylabel('elbo_train')
    ax.set_title(f'n_layers = {n_layers}')
    ax.legend()
    plt.show()
    # close the figure so figures do not pile up across loop iterations
    plt.close(fig)

    # drop the references to this run's model and latent space before the next run
    del vae, latent
    # free up GPU
    gc.collect()
    torch.cuda.empty_cache()
    print('')

# Run scVI, categorical covariates to correct: only donor

In [None]:
# Train scVI with donor as the only corrected categorical factor (batch key),
# once per n_layers value, saving each trained model, its latent space and a
# training-ELBO curve.

# Setup anndata for scVI (on a copy, so `adata` itself is not the object that is set up)
bdata = adata.copy()
# NOTE(review): `layer` is left unset, so the model is set up on bdata.X —
# confirm that X holds the raw counts scVI expects.
scvi.model.SCVI.setup_anndata(
    bdata,
    # layer="counts",
    batch_key="donor",  # we want to correct for this
    continuous_covariate_keys=['total_counts', 'pct_counts_mt', 'pct_counts_ribo'],
)

# with three n_layer conditions
n_layer_list = [1, 2, 3]

for n_layers in n_layer_list:
    print(f'##### {n_layers} #####')

    # output locations for this run
    model_dir = f"/nfs/team205/heart/anndata_objects/Foetal/scVI/models/Euploid_T21Hearts_Aug2024_sel_subsampled-per-cellstate-donor_correcting-donor_n-layers-{n_layers}/"
    latent_path = f'/nfs/team205/heart/anndata_objects/Foetal/scVI/latent_variables/Euploid_T21Hearts_Aug2024_sel_subsampled-per-cellstate-donor_correcting-donor_n-layers-{n_layers}.h5ad'
    # Make sure the latent output folder exists *before* training, so that the
    # write at the end cannot fail on a missing directory after 400 epochs.
    os.makedirs(os.path.dirname(latent_path), exist_ok=True)

    # setup
    vae = scvi.model.SCVI(bdata, n_hidden=128, n_latent=50, n_layers=n_layers, dispersion='gene-batch')
    vae.view_anndata_setup(bdata)
    # train
    vae.train(max_epochs=400)
    # save trained model
    vae.save(model_dir, overwrite=True)

    # save latent space; obs keeps only the cell index (no columns), for simplicity
    latent = sc.AnnData(vae.get_latent_representation(),
                        obs=pd.DataFrame(index=bdata.obs_names))
    latent.write(latent_path)

    # Convergence plot: training ELBO per epoch (epochs numbered from 1)
    elbo_train = vae.history['elbo_train']
    epochs = np.arange(1, len(elbo_train) + 1)
    fig, ax = plt.subplots()
    ax.plot(epochs, elbo_train, label="train")
    ax.set_xlabel('epoch')
    ax.set_ylabel('elbo_train')
    ax.set_title(f'n_layers = {n_layers}')
    ax.legend()
    plt.show()
    # close the figure so figures do not pile up across loop iterations
    plt.close(fig)

    # drop the references to this run's model and latent space before the next run
    del vae, latent
    # free up GPU
    gc.collect()
    torch.cuda.empty_cache()
    print('')