In [1]:
# Core dependencies: scanpy (as sc), warnings, numpy (as np) and pandas (as pd).
import scanpy as sc, warnings, numpy as np, pandas as pd
# Default figure style for scanpy plots: 80 dpi, white background, no frame.
sc.settings.set_figure_params(dpi=80, facecolor="white", frameon=False)
from tqdm.auto import tqdm
# Set scanpy's logging verbosity to 3.
sc.settings.verbosity = 3
# NOTE(review): this silences every Python warning for the whole session,
# including ones raised while subsetting/writing AnnData objects below.
warnings.filterwarnings("ignore")

### Save separate datasets

In [2]:
%%time

# Load the combined kidney AnnData object from disk.
adata = sc.read_h5ad("../data/Kidney_Combined_cellbender_v2.h5ad")

# Remove every .obs column whose name contains one of these substrings.
unwanted_markers = ("RNA_snn", "_prob")
columns_to_drop = [
    name
    for name in adata.obs.columns
    if any(marker in name for marker in unwanted_markers)
]
adata.obs.drop(columns=columns_to_drop, inplace=True)

# Display the object summary as the cell output.
adata

CPU times: user 4.87 s, sys: 8.88 s, total: 13.8 s
Wall time: 27.7 s


AnnData object with n_obs × n_vars = 2340396 × 36601
    obs: 'batch', 'harmonized_celltype', 'donor', 'Sample', 'nCount_RNA', 'nFeature_RNA', 'Cell_type', 'Subcategory', 'barcode', 'celltype', 'donor_id', 'sample_uuid', 'library_uuid', 'cell_type_ontology_term_id', 'author_cell_type', 'doublet_id', 'cell_type', 'assay', 'sex', 'tissue', 'development_stage', 'Patient', 'cancer', 'CD45', 'Stage annotations', 'Grade annotations', 'file integrity', 'orig.ident', 'seurat_clusters', 'patient', 'gene_clustering', 'l', 'original_seurat_clusters', 'CN_Column', 'anno', 'cluster', 'doublet_score', 'gene_per_1kUMI', 'label', 'no_genes', 'no_genes_log10', 'pct_MT', 'sample', 'total_UMI', 'Project_ID', 'percent.mt', 'percent.ribo', 'integrated_snn_res.0.2', 'SNV_GT', 'stim', 'ident', 'Suffix', 'type', 'region', 'Sample2', 'cluster_name', 'UMAP1', 'UMAP2', 'Sample_name'

In [4]:
# Drop cells with fewer than 1 expressed gene and genes with fewer than 1 count.
# The return values are not assigned, so this relies on scanpy filtering
# `adata` in place.
sc.pp.filter_cells(adata, min_genes=1)
sc.pp.filter_genes(adata, min_counts=1)

filtered out 364830 cells that have less than 1 genes expressed
filtered out 2464 genes that are detected in less than 1 counts


In [5]:
# Projects whose cells are tagged "snrna" below; cells from every other
# project are tagged "scrna".
snrna_dataset = [
    'Kidney_Muto2',
    'Kidney_Muto',
    'Kidney_Wilson2',
    'Kidney_Wilson',
    'Kidney_Lake',
    'Kidney_Wu'
]

project_ids = adata.obs["Project_ID"]

# Fail early, naming the offending entries, if a listed project is absent
# from the data (e.g. because of a typo in the list above).
missing_projects = sorted(set(snrna_dataset) - set(project_ids.dropna().unique()))
assert not missing_projects, (
    f"Project_ID values not found in adata.obs: {missing_projects}"
)

# Vectorised membership test instead of a Python-level loop over every cell.
# Converted to a plain boolean array so the assignment is purely positional.
mask = project_ids.isin(snrna_dataset).to_numpy()
adata.obs["modality"] = "scrna"
adata.obs.loc[mask, "modality"] = "snrna"

In [7]:
%%time
# Save only the cells tagged "snrna" to their own gzip-compressed h5ad file.
# Only the mask is named; the subset itself stays a temporary.
is_single_nucleus = adata.obs["modality"] == "snrna"
adata[is_single_nucleus].write("../data/Kidney_Combined_sn.h5ad", compression="gzip")

  df[key] = c


In [8]:
%%time
# Save only the cells tagged "scrna" to their own gzip-compressed h5ad file.
# Only the mask is named; the subset itself stays a temporary.
is_single_cell = adata.obs["modality"] == "scrna"
adata[is_single_cell].write("../data/Kidney_Combined_sc.h5ad", compression="gzip")

  df[key] = c


### Check the completed integration runs

#### Single nucleus

In [9]:
# Load the single-nucleus HVG file. It is not written by any cell visible
# above, so it is presumably produced by a separate integration step.
# NOTE: this rebinds `adata`, replacing the full combined object.
sn_hvg5k_path = "../data/Kidney_Combined_sn_hvg5k.h5ad"
adata = sc.read_h5ad(sn_hvg5k_path)
adata

AnnData object with n_obs × n_vars = 362175 × 5000
    obs: 'batch', 'harmonized_celltype', 'donor', 'Sample', 'nCount_RNA', 'nFeature_RNA', 'Cell_type', 'Subcategory', 'barcode', 'celltype', 'donor_id', 'sample_uuid', 'library_uuid', 'cell_type_ontology_term_id', 'author_cell_type', 'doublet_id', 'cell_type', 'assay', 'sex', 'tissue', 'development_stage', 'Patient', 'cancer', 'CD45', 'Stage annotations', 'Grade annotations', 'file integrity', 'orig.ident', 'seurat_clusters', 'patient', 'gene_clustering', 'l', 'original_seurat_clusters', 'CN_Column', 'anno', 'cluster', 'doublet_score', 'gene_per_1kUMI', 'label', 'no_genes', 'no_genes_log10', 'pct_MT', 'sample', 'total_UMI', 'Project_ID', 'percent.mt', 'percent.ribo', 'integrated_snn_res.0.2', 'SNV_GT', 'stim', 'ident', 'Suffix', 'type', 'region', 'Sample2', 'cluster_name', 'UMAP1', 'UMAP2', 'Sample_name', 'n_genes', 'modality', '_scvi_batch', '_scvi_labels', 'conditions_combined'
    var: 'n_counts', 'highly_variable', 'highly_variable

In [10]:
# Display the keys of the per-cell arrays stored in .obsm.
adata.obsm

AxisArrays with keys: Harmony, Scanorama, X_bbknn, X_pca, X_scANVI, X_scVI, X_tsne, X_umap, scPoli

In [11]:
# Remove the scANVI and scPoli entries from .obsm.
# The deletions are guarded because this object is written back to the same
# file it was loaded from; on a later fresh run the keys are already gone and
# an unconditional `del` would raise KeyError.
for obsm_key in ("X_scANVI", "scPoli"):
    if obsm_key in adata.obsm:
        del adata.obsm[obsm_key]

In [12]:
# NOTE: this overwrites the same file `adata` was loaded from, now without
# the .obsm entries deleted in the previous cell.
adata.write_h5ad("../data/Kidney_Combined_sn_hvg5k.h5ad", compression="gzip")

In [12]:
# Projects whose cells are eligible for the labeled subset.
selected_dataset = [
    'Kidney_BenPublished',
    'Kidney_BenUnpublished',
    'Kidney_Krebs',
    'Kidney_Liao',
    'Kidney_Malone',
    'Kidney_Muto',
    'Kidney_Raji',
    'Kidney_Wilson',
    'Kidney_Krishna',
    'Kidney_Wu'
]

# Keep cells that have a harmonized label and belong to a selected project.
# A missing label is recognised both as the literal string 'nan' and as a
# real NaN value (NaN != 'nan' evaluates to True, so the string comparison
# alone would let real NaN through).
celltype_labels = adata.obs["harmonized_celltype"]
has_label = celltype_labels.notna() & (celltype_labels != 'nan')

# Vectorised membership test instead of a Python-level loop over every cell.
in_selected_project = adata.obs["Project_ID"].isin(selected_dataset)

adata_labeled = adata[(has_label & in_selected_project).to_numpy()]
adata_labeled

View of AnnData object with n_obs × n_vars = 46010 × 5000
    obs: 'batch', 'harmonized_celltype', 'donor', 'Sample', 'nCount_RNA', 'nFeature_RNA', 'Cell_type', 'Subcategory', 'barcode', 'celltype', 'donor_id', 'sample_uuid', 'library_uuid', 'cell_type_ontology_term_id', 'author_cell_type', 'doublet_id', 'cell_type', 'assay', 'sex', 'tissue', 'development_stage', 'Patient', 'cancer', 'CD45', 'Stage annotations', 'Grade annotations', 'file integrity', 'orig.ident', 'seurat_clusters', 'patient', 'gene_clustering', 'l', 'original_seurat_clusters', 'CN_Column', 'anno', 'cluster', 'doublet_score', 'gene_per_1kUMI', 'label', 'no_genes', 'no_genes_log10', 'pct_MT', 'sample', 'total_UMI', 'Project_ID', 'percent.mt', 'percent.ribo', 'integrated_snn_res.0.2', 'SNV_GT', 'stim', 'ident', 'Suffix', 'type', 'region', 'Sample2', 'cluster_name', 'UMAP1', 'UMAP2', 'Sample_name', 'n_genes', 'modality', '_scvi_batch', '_scvi_labels', 'conditions_combined'
    var: 'n_counts', 'highly_variable', 'highly_v

In [13]:
# Harmonized cell-type labels that define the labeled subset saved below.
# Presumably transcribed from this spreadsheet (verify against it):
# https://docs.google.com/spreadsheets/d/1yS3yNlVnIlsGclgao0R9EeTPQTKF8Nx4wPupjzM3mrg/edit?gid=0#gid=0
# The list intentionally keeps repeated entries; it is only used for
# membership tests and for the unique count shown as this cell's output.
cell_types = [
    "SULF1+ EC-AEA", "EC-AEA", "Macro", "Macro", "cycMacro", "CCD-IC-A",
    "OMCD-IC-A", "CNT-PC", "CNT-PC", "DTL", "DTL", "DTL", "EC-DVR",
    "SLC6A6+ EC-AEA", "EC-GC", "EC-PTC", "EC-PTC", "EC-AVR", "IC-B",
    "VSMC", "Pericyte", "MAST", "MC", "MD", "cycNK/T", "cycNK/T", "DC1",
    "DC2", "IC-B|CNT doub", "POD", "POD", "PEC", "PT-S1/2", "PT-S1/2_nuc",
    "PT-S3", "dPT", "dPT", "TAL", "TAL", "TAL", "ATL", "Treg", "CD4 T",
    "CD4 T", "CD4 T", "Th17", "CD8 T", "CD8 T", "CD8 T", "ILC3", "NKT",
    "Ciliated", "DCT", "EC-LYM", "OMCD-PC", "IMCD-PC", "MFAP5+aFIB",
    "aFIB", "aFIB", "aFIB", "aPT", "aPT", "CCD-PC", "CCD-PC", "DCT",
    "Neuron", "RBC", "B", "B", "PL", "TAL", "VWF+ EC-AVR", "VWF+ EC-AVR",
    "cycEC", "cycPT", "CD16+ NK", "CD56bright NK", "pDC", "cMono", "ncMono",
    "cycPapE", "PapE", "γδT", "dC-IC-A"
]

# Number of distinct labels in the list.
len(set(cell_types))

62

In [14]:
# Count the labeled cells whose harmonized label appears in `cell_types`.
# Vectorised membership test instead of a Python-level loop over every cell.
adata_labeled.obs["harmonized_celltype"].isin(cell_types).sum()

41465

In [15]:
# Keep only the labeled cells whose harmonized label appears in `cell_types`.
# Vectorised membership test instead of a Python-level loop over every cell.
is_listed_celltype = adata_labeled.obs["harmonized_celltype"].isin(cell_types).to_numpy()
adata_harmonized = adata_labeled[is_listed_celltype]
adata_harmonized

View of AnnData object with n_obs × n_vars = 41465 × 5000
    obs: 'batch', 'harmonized_celltype', 'donor', 'Sample', 'nCount_RNA', 'nFeature_RNA', 'Cell_type', 'Subcategory', 'barcode', 'celltype', 'donor_id', 'sample_uuid', 'library_uuid', 'cell_type_ontology_term_id', 'author_cell_type', 'doublet_id', 'cell_type', 'assay', 'sex', 'tissue', 'development_stage', 'Patient', 'cancer', 'CD45', 'Stage annotations', 'Grade annotations', 'file integrity', 'orig.ident', 'seurat_clusters', 'patient', 'gene_clustering', 'l', 'original_seurat_clusters', 'CN_Column', 'anno', 'cluster', 'doublet_score', 'gene_per_1kUMI', 'label', 'no_genes', 'no_genes_log10', 'pct_MT', 'sample', 'total_UMI', 'Project_ID', 'percent.mt', 'percent.ribo', 'integrated_snn_res.0.2', 'SNV_GT', 'stim', 'ident', 'Suffix', 'type', 'region', 'Sample2', 'cluster_name', 'UMAP1', 'UMAP2', 'Sample_name', 'n_genes', 'modality', '_scvi_batch', '_scvi_labels', 'conditions_combined'
    var: 'n_counts', 'highly_variable', 'highly_v

In [16]:
# Save the filtered subset (selected projects, labels listed in `cell_types`)
# of the single-nucleus HVG object as a gzip-compressed h5ad file.
adata_harmonized.write_h5ad("../data/Kidney_Combined_sn_hvg5k_LabeledSubset.h5ad", compression="gzip")

# Kidney_Combined_sn_hvg5k

#### Single cell

In [13]:
# Load the single-cell HVG file. It is not written by any cell visible
# above, so it is presumably produced by a separate integration step.
# NOTE: this rebinds `adata`, replacing the single-nucleus object.
sc_hvg5k_path = "../data/Kidney_Combined_sc_hvg5k.h5ad"
adata = sc.read_h5ad(sc_hvg5k_path)
adata

AnnData object with n_obs × n_vars = 1613391 × 5000
    obs: 'batch', 'harmonized_celltype', 'donor', 'Sample', 'nCount_RNA', 'nFeature_RNA', 'Cell_type', 'Subcategory', 'barcode', 'celltype', 'donor_id', 'sample_uuid', 'library_uuid', 'cell_type_ontology_term_id', 'author_cell_type', 'doublet_id', 'cell_type', 'assay', 'sex', 'tissue', 'development_stage', 'Patient', 'cancer', 'CD45', 'Stage annotations', 'Grade annotations', 'file integrity', 'orig.ident', 'seurat_clusters', 'patient', 'gene_clustering', 'l', 'original_seurat_clusters', 'CN_Column', 'anno', 'cluster', 'doublet_score', 'gene_per_1kUMI', 'label', 'no_genes', 'no_genes_log10', 'pct_MT', 'sample', 'total_UMI', 'Project_ID', 'percent.mt', 'percent.ribo', 'integrated_snn_res.0.2', 'SNV_GT', 'stim', 'ident', 'Suffix', 'type', 'region', 'Sample2', 'cluster_name', 'UMAP1', 'UMAP2', 'Sample_name', 'n_genes', 'modality', '_scvi_batch', '_scvi_labels', 'conditions_combined'
    var: 'n_counts', 'highly_variable', 'highly_variabl

In [14]:
# Remove the scANVI and scPoli entries from .obsm.
# The deletions are guarded because this object is written back to the same
# file it was loaded from; on a later fresh run the keys are already gone and
# an unconditional `del` would raise KeyError.
for obsm_key in ("X_scANVI", "scPoli"):
    if obsm_key in adata.obsm:
        del adata.obsm[obsm_key]

In [15]:
%%time
# NOTE: this overwrites the same file `adata` was loaded from, now without
# the .obsm entries deleted in the previous cell.
adata.write_h5ad("../data/Kidney_Combined_sc_hvg5k.h5ad", compression="gzip")

CPU times: user 3min 12s, sys: 1.82 s, total: 3min 13s
Wall time: 3min 15s


In [19]:
# Keep cells that have a harmonized label and belong to a selected project
# (`selected_dataset` is defined in the single-nucleus section above).
# A missing label is recognised both as the literal string 'nan' and as a
# real NaN value (NaN != 'nan' evaluates to True, so the string comparison
# alone would let real NaN through).
celltype_labels = adata.obs["harmonized_celltype"]
has_label = celltype_labels.notna() & (celltype_labels != 'nan')

# Vectorised membership test instead of a Python-level loop over every cell.
in_selected_project = adata.obs["Project_ID"].isin(selected_dataset)

adata_labeled = adata[(has_label & in_selected_project).to_numpy()]
adata_labeled

View of AnnData object with n_obs × n_vars = 448627 × 5000
    obs: 'batch', 'harmonized_celltype', 'donor', 'Sample', 'nCount_RNA', 'nFeature_RNA', 'Cell_type', 'Subcategory', 'barcode', 'celltype', 'donor_id', 'sample_uuid', 'library_uuid', 'cell_type_ontology_term_id', 'author_cell_type', 'doublet_id', 'cell_type', 'assay', 'sex', 'tissue', 'development_stage', 'Patient', 'cancer', 'CD45', 'Stage annotations', 'Grade annotations', 'file integrity', 'orig.ident', 'seurat_clusters', 'patient', 'gene_clustering', 'l', 'original_seurat_clusters', 'CN_Column', 'anno', 'cluster', 'doublet_score', 'gene_per_1kUMI', 'label', 'no_genes', 'no_genes_log10', 'pct_MT', 'sample', 'total_UMI', 'Project_ID', 'percent.mt', 'percent.ribo', 'integrated_snn_res.0.2', 'SNV_GT', 'stim', 'ident', 'Suffix', 'type', 'region', 'Sample2', 'cluster_name', 'UMAP1', 'UMAP2', 'Sample_name', 'n_genes', 'modality', '_scvi_batch', '_scvi_labels', 'conditions_combined'
    var: 'n_counts', 'highly_variable', 'highly_

In [20]:
# Keep only the labeled cells whose harmonized label appears in `cell_types`
# (the list is defined in the single-nucleus section above).
# Vectorised membership test instead of a Python-level loop over every cell.
is_listed_celltype = adata_labeled.obs["harmonized_celltype"].isin(cell_types).to_numpy()
adata_harmonized = adata_labeled[is_listed_celltype]
adata_harmonized

View of AnnData object with n_obs × n_vars = 370373 × 5000
    obs: 'batch', 'harmonized_celltype', 'donor', 'Sample', 'nCount_RNA', 'nFeature_RNA', 'Cell_type', 'Subcategory', 'barcode', 'celltype', 'donor_id', 'sample_uuid', 'library_uuid', 'cell_type_ontology_term_id', 'author_cell_type', 'doublet_id', 'cell_type', 'assay', 'sex', 'tissue', 'development_stage', 'Patient', 'cancer', 'CD45', 'Stage annotations', 'Grade annotations', 'file integrity', 'orig.ident', 'seurat_clusters', 'patient', 'gene_clustering', 'l', 'original_seurat_clusters', 'CN_Column', 'anno', 'cluster', 'doublet_score', 'gene_per_1kUMI', 'label', 'no_genes', 'no_genes_log10', 'pct_MT', 'sample', 'total_UMI', 'Project_ID', 'percent.mt', 'percent.ribo', 'integrated_snn_res.0.2', 'SNV_GT', 'stim', 'ident', 'Suffix', 'type', 'region', 'Sample2', 'cluster_name', 'UMAP1', 'UMAP2', 'Sample_name', 'n_genes', 'modality', '_scvi_batch', '_scvi_labels', 'conditions_combined'
    var: 'n_counts', 'highly_variable', 'highly_

In [21]:
# Save the filtered subset (selected projects, labels listed in `cell_types`)
# of the single-cell HVG object as a gzip-compressed h5ad file.
adata_harmonized.write_h5ad("../data/Kidney_Combined_sc_hvg5k_LabeledSubset.h5ad", compression="gzip")