In [3]:
#Importing Packages

import os
import tempfile

import anndata
import muon
import numpy as np
import pooch
import scanpy as sc
import scvi
import seaborn as sns
import torch

In [9]:
## DATA LOADING AND PREPARATION ##

# Directory where the downloaded datasets are cached. The original location is
# kept as the default, but it can be overridden with the PBMC_DATA_DIR
# environment variable so the notebook is not tied to one user's machine.
data_dir = os.environ.get("PBMC_DATA_DIR", "/Users/srivalli/Desktop/Heart")
pbmc3k_path = os.path.join(data_dir, "pbmc3k.h5ad")

# Load the PBMC3k dataset (a plain AnnData file); `backup_url` is passed to
# scanpy as the fallback download location for `pbmc3k_path`.
pbmc3k = sc.read(
    filename=pbmc3k_path, backup_url="http://falexwolf.de/data/pbmc3k_raw.h5ad"
)
# Only the last expression of a cell is rendered, so print the summary explicitly.
print(pbmc3k)

# Preview the first rows of the per-gene annotation table.
pbmc3k.var.head()

100%|██████████| 5.58M/5.58M [00:00<00:00, 29.9MB/s]


Unnamed: 0_level_0,gene_ids
index,Unnamed: 1_level_1
MIR1302-10,ENSG00000243485
FAM138A,ENSG00000237613
OR4F5,ENSG00000186092
RP11-34P13.7,ENSG00000238009
RP11-34P13.8,ENSG00000239945


In [13]:
# PBMC5k dataset: holds both protein and RNA measurements.
def download_data(
    data_dir="/Users/srivalli/Desktop/Heart",
    fname: str = "pbmc5k_protein_filtered_feature_bc_matrix.h5",
) -> str:
    """Retrieve the 10x Genomics 5k PBMC protein v3 filtered matrix with pooch.

    Parameters
    ----------
    data_dir
        Directory forwarded to ``pooch.retrieve`` as ``path``.
    fname
        File name forwarded to ``pooch.retrieve`` as ``fname``.

    Returns
    -------
    Whatever ``pooch.retrieve`` returns for the requested file.
    """
    source_url = (
        "https://cf.10xgenomics.com/samples/cell-exp/3.0.2/5k_pbmc_protein_v3/"
        "5k_pbmc_protein_v3_filtered_feature_bc_matrix.h5"
    )
    # Hash passed to pooch as `known_hash` for the file at `source_url`.
    expected_hash = "7695e6b1888bdae6f53b3a28a99f0a0cdf387d1685e330a597cdd4b5541f8abd"

    local_file = pooch.retrieve(
        url=source_url,
        known_hash=expected_hash,
        fname=fname,
        path=data_dir,
    )
    return local_file

# Fetch the 10x HDF5 file into `data_dir` and keep the path that was returned.
h5_path = download_data(data_dir=data_dir)

# Parse the 10x HDF5 file with muon.
pbmc5k = muon.read_10x_h5(h5_path)

# De-duplicate variable (gene) names; downstream steps need them to be unique.
pbmc5k.var_names_make_unique()

# Display the variable annotation table.
pbmc5k.var

  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")


Unnamed: 0,gene_ids,feature_types,genome
MIR1302-2HG,ENSG00000243485,Gene Expression,GRCh38
FAM138A,ENSG00000237613,Gene Expression,GRCh38
OR4F5,ENSG00000186092,Gene Expression,GRCh38
AL627309.1,ENSG00000238009,Gene Expression,GRCh38
AL627309.3,ENSG00000239945,Gene Expression,GRCh38
...,...,...,...
HLA-DR_TotalSeqB,HLA-DR,Antibody Capture,
TIGIT_TotalSeqB,TIGIT,Antibody Capture,
IgG1_control_TotalSeqB,IgG1,Antibody Capture,
IgG2a_control_TotalSeqB,IgG2a,Antibody Capture,


In [14]:
# Concatenate the PBMC3k data with the RNA modality of PBMC5k.
# join="inner" keeps the variables shared by both datasets; label="batch" adds
# an obs column recording which input each cell came from.
adata = anndata.concat([pbmc3k, pbmc5k.mod["rna"]], join="inner", label="batch")

# Preview a few random cells; the fixed seed keeps the displayed rows stable
# across Restart & Run All.
adata.obs.sample(n=5, random_state=0)

Unnamed: 0,batch
GAGGGATGGGAAAT-1,0
AAGTACCTCCGGTTCT-1,1
GCATTAGAGGTCATAA-1,1
CATCGCTGGGATCT-1,0
CAGGCCAAGAGGCCAT-1,1


In [18]:
## PREPROCESSING DATA ##

# Guard against out-of-order execution: this cell filters on counts and then
# snapshots adata.X as the "counts" layer, so adata.X must still hold raw
# (integer-valued) counts. If the normalization cell has already been run,
# fail loudly instead of silently storing normalized values as counts.
# Assumes adata.X is a NumPy array or a sparse matrix exposing `.data`.
_x_values = adata.X if isinstance(adata.X, np.ndarray) else adata.X.data
if not np.array_equal(_x_values, np.rint(_x_values)):
    raise ValueError(
        "adata.X is not integer-valued; re-run the concatenation cell so that "
        "filtering and the 'counts' layer are computed from raw counts."
    )

# Drop genes and cells with fewer than 3 total counts.
print("# cells, # genes before filtering:", adata.shape)

sc.pp.filter_genes(adata, min_counts=3)
sc.pp.filter_cells(adata, min_counts=3)

print("# cells, # genes after filtering:", adata.shape)

# Keep a copy of the count matrix in a layer before adata.X is normalized.
adata.layers["counts"] = adata.X.copy()


# cells, # genes before filtering: (7947, 14309)
# cells, # genes after filtering: (7947, 13482)


In [16]:
# Log-normalization with scanpy. The result replaces adata.X (it is not stored
# as a separate layer); the counts remain available in adata.layers["counts"].

# Start from the stored counts so that re-running this cell does not normalize
# and log-transform already-transformed values. When the notebook is run top to
# bottom, adata.X already equals this layer, so this is a no-op.
adata.X = adata.layers["counts"].copy()
sc.pp.normalize_total(adata)
sc.pp.log1p(adata)

# Freeze the log-normalized state in adata.raw.
adata.raw = adata

In [19]:
# Store the count data and the log-normalized data as two MuData modalities.
# Both modalities start as independent copies of `adata`.
mdata = muon.MuData(
    {"rna": adata.copy(), "log_norm_rna": adata.copy()},
    axis=-1,
)

# "rna" becomes count-based: its X is replaced by the "counts" layer, which is
# removed from the layers in the same step; its .raw is dropped as well.
mdata.mod["rna"].X = mdata.mod["rna"].layers.pop("counts")
del mdata.mod["rna"].raw

# "log_norm_rna" keeps the log-normalized X; only its "counts" layer is removed.
del mdata.mod["log_norm_rna"].layers["counts"]

mdata