In [1]:
import os
import sys

import random 
import numpy as np
import matplotlib.pyplot as plt

import scanpy as sc
import scvi
import scgen

import torch
from lightning.pytorch.loggers import CSVLogger

import pandas as pd

import session_info
import warnings
from pyprojroot.here import here

# --- Reproducibility -------------------------------------------------------
# A single seed value is kept in `random_seed` and handed to scvi-tools.
random_seed = 42
scvi.settings.seed = random_seed

# --- Compute settings ------------------------------------------------------
# Use the 'medium' float32 matmul precision setting in torch.
torch.set_float32_matmul_precision('medium')
# Data-loader worker count is intentionally left at the scvi-tools default:
#scvi.settings.dl_num_workers = 8

# Record the library version alongside the results.
print("scvi-tools version:", scvi.__version__)

[rank: 0] Seed set to 42


scvi-tools version: 1.1.2


### Parameters

In [2]:
# Columns of `.obs` used to configure scGen.
batch_key: str = 'chemistry'   # copied to obs['batch'] before model setup
annotation: str = 'Level1'     # copied to obs['cell_type'] before model setup

# Number of principal components computed on the corrected expression matrix.
n_latent: int = 30

## Loading data 

**MAIN adata**

In [3]:
# Location of the MAIN dataset, given relative to the project via `here`.
main_adata_path = here("03_downstream_analysis/02_gene_universe_definition/results/04_MAIN_geneUniverse_noRBCnPlatelets.log1p.h5ad")

# Read the AnnData object and show its summary as the cell output.
adataM = sc.read_h5ad(main_adata_path)
adataM

AnnData object with n_obs × n_vars = 4279352 × 8253
    obs: 'studyID', 'libraryID', 'sampleID', 'chemistry', 'disease', 'sex', 'binned_age', 'Level1', 'Level2'
    var: 'hgnc_id', 'symbol', 'locus_group', 'HUGO_status', 'highly_variable'
    uns: 'log1p'

**scGen preprocessing**

In [4]:
# Copy the configured batch / annotation columns into the fixed column names
# that are passed to `setup_anndata` below.
for scgen_column, source_column in (('batch', batch_key), ('cell_type', annotation)):
    adataM.obs[scgen_column] = adataM.obs[source_column].tolist()

# Register the AnnData with scGen using those two columns.
scgen.SCGEN.setup_anndata(adataM, batch_key="batch", labels_key="cell_type")

**Loading trained scGen model**

In [5]:
# Directory holding the previously trained scGen model.
# NOTE(review): the recorded output of this cell reports a model.pt under a
# differently named results directory than the one used here — confirm that
# this path points at the intended model.
scgen_model_dir = here('03_downstream_analysis/04_integration_with_annotation/results/scGen_model_noRBCnPlat')

# Restore the model and attach the prepared AnnData to it.
model_scGen = scgen.SCGEN.load(scgen_model_dir, adata = adataM)

[34mINFO    [0m File                                                                                                      
         [35m/scratch_isilon/groups/singlecell/shared/projects/Inflammation-PBMCs-Atlas/03_downstream_analysis/03_scA[0m
         [35mNVI_integration_with_annotation/results/scGen_model_noRBCnPlat/[0m[95mmodel.pt[0m already downloaded                


/scratch_isilon/groups/singlecell/shared/conda_env/scvi-v112/lib/python3.9/site-packages/lightning/fabric/plugins/environments/slurm.py:191: The `srun` command is available on your system but is not used. HINT: If your intention is to run Lightning on SLURM, prepend your python command with `srun` like so: srun python3.9 /scratch_isilon/groups/singlecell/shared/conda_en ...


## Generating embedding spaces

### Correcting adata

In [6]:
# Run scGen's batch-effect removal. No AnnData is passed, so the model
# presumably uses the one supplied at load time (adataM) — confirm against
# the scgen documentation.
# Per the recorded output, the result has the same n_obs × n_vars as the
# input and carries obsm['latent'] and obsm['corrected_latent'].
adata_scGen = model_scGen.batch_removal()
adata_scGen

  all_shared_ann = AnnData.concatenate(
  corrected.obsm["latent"] = all_shared_ann[corrected.obs_names,:].X


[34mINFO    [0m Input AnnData not setup with scvi-tools. attempting to transfer AnnData setup                             


AnnData object with n_obs × n_vars = 4279352 × 8253
    obs: 'studyID', 'libraryID', 'sampleID', 'chemistry', 'disease', 'sex', 'binned_age', 'Level1', 'Level2', 'batch', 'cell_type', '_scvi_batch', '_scvi_labels'
    uns: '_scvi_uuid', '_scvi_manager_uuid'
    obsm: 'latent', 'corrected_latent'

### PCA on corrected expression matrix

In [12]:
# PCA on the batch-corrected AnnData, keeping `n_latent` components; the
# resulting scores are read later from obsm['X_pca'].
# NOTE(review): the PCA scores and obsm['corrected_latent'] are later stored
# as layers of one AnnData, so n_comps apparently has to equal the width of
# the scGen latent space — confirm.
sc.pp.pca(adata_scGen, n_comps = n_latent)
adata_scGen

In [18]:
# Gather the two per-cell embeddings produced above:
#   - the scGen batch-corrected latent space
#   - the principal components of the batch-corrected expression matrix
corrected_latent = adata_scGen.obsm['corrected_latent']
corrected_expression_pcs = adata_scGen.obsm['X_pca']

# Both matrices are stored in a single AnnData (one as X, both as layers), and
# every layer has to match the shape of X. Check this explicitly so that a
# mismatch between `n_latent` and the model's latent size fails here with an
# actionable message instead of inside the AnnData constructor.
if corrected_latent.shape != corrected_expression_pcs.shape:
    raise ValueError(
        "Cannot store both embeddings in one AnnData: scGen corrected latent has shape "
        f"{corrected_latent.shape} but the PCA of the corrected expression has shape "
        f"{corrected_expression_pcs.shape}. Use a number of PCA components equal to the "
        "model's latent dimensionality."
    )

adata_scgen_emb = sc.AnnData(
    X=corrected_latent,
    layers={
        'scgen_corrected_latent': corrected_latent,
        'scgen_corrected_expression_pc': corrected_expression_pcs,
    },
    # Empty column selection: keep the cell index only, without obs columns.
    obs=adata_scGen.obs[[]],
)
adata_scgen_emb

AnnData object with n_obs × n_vars = 4279352 × 30
    layers: 'scgen_corrected_latent', 'scgen_corrected_expression_pc'

### Save the results

In [7]:
# Persist the batch-corrected AnnData (gzip-compressed h5ad).
corrected_adata_path = here('03_downstream_analysis/04_integration_with_annotation/results/02_scGen_batchRemoved_noRBCnPlat.h5ad')
adata_scGen.write(corrected_adata_path, compression='gzip')

In [12]:
# Persist the compact embedding-only AnnData (gzip-compressed h5ad).
embedding_adata_path = here('03_downstream_analysis/04_integration_with_annotation/results/02_scGen_corr_latent_expression_noRBCnPlat.h5ad')
adata_scgen_emb.write(embedding_adata_path, compression='gzip')