In [6]:
# All of the code and comments below are written for single-cell data.

# Import packages. Make sure the correct environment is activated before running this cell.

import os
import tempfile

import anndata
import muon
import numpy as np
import pooch
import scanpy as sc
import scvi
import seaborn as sns
import torch

In [7]:
##DATA LOADING AND PREPARATION##

# Load the PBMC3k dataset from a local .h5ad file into an AnnData object
# (RNA only, no protein measurements).
pbmc3k = sc.read_h5ad("./pbmc3k.h5ad")

# A bare expression in the middle of a cell is evaluated and then discarded;
# only the final expression of a cell is echoed. Render the object summary
# explicitly. `display` is provided by the IPython/Jupyter kernel.
display(pbmc3k)

# Preview the first rows of the per-gene annotation table (.var)
pbmc3k.var.head()

Unnamed: 0_level_0,gene_ids
index,Unnamed: 1_level_1
MIR1302-10,ENSG00000243485
FAM138A,ENSG00000237613
OR4F5,ENSG00000186092
RP11-34P13.7,ENSG00000238009
RP11-34P13.8,ENSG00000239945


In [8]:
# Load the PBMC5k dataset, which has both protein and RNA data.
# Nothing is downloaded here: the file is read from the working directory.

# Read the 10x HDF5 file; the result exposes per-modality data under .mod
# (the "rna" and "prot" entries are used by later cells).
pbmc5k = muon.read_10x_h5("./pbmc5k_protein_filtered_feature_bc_matrix.h5")

# Make sure that gene names are unique for downstream processing.
# The duplicate-name warnings shown in the output are emitted around this step.
pbmc5k.var_names_make_unique()

# Last expression of the cell: echoes the summary of the loaded object
pbmc5k

  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")


In [9]:
# Concatenate the PBMC3k data with the RNA modality of PBMC5k.
# join="inner" keeps only the genes shared by both inputs, and label="batch"
# adds an obs column "batch" that records which input each cell came from
# (shown as 0 / 1 in the preview below).
adata = anndata.concat([pbmc3k, pbmc5k.mod["rna"]], join="inner", label="batch")

# Preview five randomly chosen cells. The fixed random_state makes the preview
# identical on every Restart & Run All instead of changing between runs.
adata.obs.sample(n=5, random_state=0)

Unnamed: 0,batch
TGTAGTCTTGCACA-1,0
GCGGAAAGTTGTTTGG-1,1
TTGGATGCACCCTGAG-1,1
GAGTTGTGTATGCG-1,0
TGCTCCAAGGGAGGGT-1,1


In [10]:
##PREPROCESSING DATA##

# Drop sparse genes and cells, reporting the matrix shape on either side.
print("# cells, # genes before filtering:", adata.shape)

# Genes are filtered first, then cells, both with the same minimum total count.
# Each call modifies `adata` in place, so the cell filter sees the reduced gene set.
for drop_sparse in (sc.pp.filter_genes, sc.pp.filter_cells):
    drop_sparse(adata, min_counts=3)

print("# cells, # genes after filtering:", adata.shape)

# Keep an independent copy of the current X (still raw counts at this point)
# under layers["counts"], so later in-place transforms of X do not lose it.
adata.layers["counts"] = adata.X.copy()


# cells, # genes before filtering: (7947, 20453)
# cells, # genes after filtering: (7947, 14309)


In [11]:
# Normalize per-cell totals and log-transform the data with scanpy.
# Neither return value is captured; both calls act on `adata` directly.
# NOTE(review): this cell does not look idempotent — re-running it without
# re-running the cells above would presumably transform X a second time.
sc.pp.normalize_total(adata)
sc.pp.log1p(adata)

# Store the log-normalized state in adata.raw (this is .raw, not a layer;
# the untouched counts live in adata.layers["counts"] from the previous cell).
adata.raw = adata

In [12]:
# Build a MuData object that stores two versions of the same data as separate
# modalities: "rna" (counts) and "log_norm_rna" (log-normalized).
# No scvi function is called in this cell; each modality starts as an
# independent copy of `adata`.
# NOTE(review): axis=-1 presumably declares that both cells and genes are
# shared between the modalities — confirm against the MuData documentation.
mdata = muon.MuData({"rna": adata.copy(), "log_norm_rna": adata.copy()}, axis=-1)
# Make "rna" count-based by restoring X from the saved "counts" layer;
# "log_norm_rna" keeps the log-normalized X it was copied with.
mdata.mod["rna"].X = mdata.mod["rna"].layers["counts"]
# Remove .raw from "rna" and the "counts" layer from both modalities, leaving
# X as the only data matrix in each.
del mdata.mod["rna"].raw
del mdata.mod["rna"].layers["counts"]
del mdata.mod["log_norm_rna"].layers["counts"]
# Last expression of the cell: echoes the MuData summary
mdata

In [13]:
#DATA SETUP FOR RUNNING MODELS

#Mudata

# Convert the protein matrix to a dense array before registering it.
# `.toarray()` is the supported spelling of the deprecated `.A` shorthand on
# SciPy sparse containers. The hasattr guard skips the conversion when X is
# already dense, so re-running this cell no longer raises AttributeError.
if hasattr(pbmc5k.mod["prot"].X, "toarray"):
    pbmc5k.mod["prot"].X = pbmc5k.mod["prot"].X.toarray()

# Register the MuData with totalVI. `modalities` maps each data field to the
# modality that holds it; the layer arguments are None, i.e. no named layer is
# selected for either modality (per the scvi-tools docs, .X is used in that case).
scvi.model.TOTALVI.setup_mudata(
    pbmc5k,
    protein_layer=None,
    rna_layer=None,
    modalities={"protein_layer": "prot", "rna_layer": "rna"},
)


For instance checks, use `isinstance(X, (anndata.experimental.CSRDataset, anndata.experimental.CSCDataset))` instead.

For creation, use `anndata.experimental.sparse_dataset(X)` instead.

  return _abc_instancecheck(cls, instance)

For instance checks, use `isinstance(X, (anndata.experimental.CSRDataset, anndata.experimental.CSCDataset))` instead.

For creation, use `anndata.experimental.sparse_dataset(X)` instead.

  return _abc_instancecheck(cls, instance)


In [14]:
# AnnData-based setup: the protein data is attached to the RNA AnnData through
# .obsm and the combined object is registered with totalVI.

# No copy is made here: adata_pbm5k is the same object as pbmc5k.mod["rna"],
# so the .obsm assignment below also modifies that modality inside pbmc5k.
adata_pbm5k = pbmc5k.mod["rna"]
# Store the protein modality as a DataFrame; the log output shows its column
# names being picked up from adata.obsm['prot'].
adata_pbm5k.obsm["prot"] = pbmc5k.mod["prot"].to_df()

# Register the AnnData, pointing totalVI at the .obsm entry holding the proteins
scvi.model.TOTALVI.setup_anndata(
    adata_pbm5k,
    protein_expression_obsm_key="prot",
)

[34mINFO    [0m Using column names from columns of adata.obsm[1m[[0m[32m'prot'[0m[1m][0m                                                     



For instance checks, use `isinstance(X, (anndata.experimental.CSRDataset, anndata.experimental.CSCDataset))` instead.

For creation, use `anndata.experimental.sparse_dataset(X)` instead.

  return _abc_instancecheck(cls, instance)


In [15]:
# Instantiate a totalVI model on the registered AnnData and print its setup
# summary.
# NOTE(review): no training happens in this cell — there is no model.train()
# call here; training, if any, must occur in a later cell.
model = scvi.model.TOTALVI(adata_pbm5k)
model.view_anndata_setup(adata_pbm5k)

[34mINFO    [0m Computing empirical prior initialization for protein background.                                          
