In [1]:
import rapids_singlecell as rsc
import scanpy as sc 
import pandas as pd
import seaborn as sns
import numpy as np
from scipy.stats import median_abs_deviation
import os
import warnings
# Ignore warnings whose message starts with 'Expected' (the second argument
# is a regex matched against the beginning of the warning message).
warnings.filterwarnings('ignore', 'Expected')
# Blanket filter: ignores every warning category. Because it matches
# everything, the targeted filter above is effectively redundant.
# NOTE(review): this also hides deprecation and numerical warnings from
# the libraries imported in this cell; confirm that is intended.
warnings.simplefilter('ignore')

import anndata 
from anndata import AnnData
import matplotlib

import time
import cuda
import cudf
import cupy

from cuml.decomposition import PCA
from cuml.manifold import TSNE
from cuml.cluster import KMeans
from cuml.preprocessing import StandardScaler

In [2]:
import rmm
from rmm.allocators.cupy import rmm_cupy_allocator

# Re-create the RMM memory resource with managed memory enabled.
rmm.reinitialize(managed_memory=True)

# Make CuPy allocate its device memory through RMM instead of its own pool.
cupy.cuda.set_allocator(rmm_cupy_allocator)

In [3]:
# Folder whose contents are listed below.
directory = '/home/supakorn/rscript/results/GSE178265'
# Entry names inside that folder, in the arbitrary order os.listdir returns.
file = os.listdir(path=directory)
file

['GSE178265_results.h5ad',
 'GSE178265_violin_filtered.png',
 'GSE178265_tsne.png',
 'GSE178265_umap.png',
 'GS178265_hvg.png',
 'GSE178265_embedding.png']

In [3]:
# Root folder that holds one sub-folder per dataset.
# Change it here once when the notebook is run on another machine.
RESULTS_DIR = '/home/supakorn/rscript/results'

# Every result file follows <root>/<accession>/<accession>_results.h5ad, so the
# four paths are derived from the accession IDs instead of being copy-pasted.
h5ad1, h5ad2, h5ad3, h5ad4 = (
    f'{RESULTS_DIR}/{accession}/{accession}_results.h5ad'
    for accession in ('GSE178265', 'GSE157783', 'GSE178146', 'GSE184950')
)

In [4]:
# Load the four result files in order; each becomes its own AnnData object.
adata1, adata2, adata3, adata4 = map(
    sc.read_h5ad,
    (h5ad1, h5ad2, h5ad3, h5ad4),
)

In [5]:
# Show the summary (shape and annotation keys) of the object read from h5ad1.
adata1

AnnData object with n_obs × n_vars = 433186 × 3000
    obs: 'patient', 'status', 'age', 'sex', 'n_genes_by_counts', 'total_counts', 'log1p_n_genes_by_counts', 'log1p_total_counts', 'total_counts_mt', 'pct_counts_mt', 'log1p_total_counts_mt', 'leiden', 'louvain', 'kmeans', 'umap_density_patient'
    var: 'gene_name', 'mt', 'n_cells_by_counts', 'total_counts', 'mean_counts', 'pct_dropout_by_counts', 'log1p_total_counts', 'log1p_mean_counts', 'highly_variable', 'highly_variable_rank', 'means', 'variances', 'variances_norm'
    uns: 'hvg', 'kmeans_colors', 'leiden', 'leiden_colors', 'log1p', 'louvain', 'louvain_colors', 'neighbors', 'pca', 'rank_genes_groups', 'tsne', 'umap', 'umap_density_patient_params'
    obsm: 'X_pca', 'X_pca_harmony', 'X_tsne', 'X_umap'
    varm: 'PCs'
    layers: 'counts'
    obsp: 'connectivities', 'distances'

In [6]:
# Show the summary (shape and annotation keys) of the object read from h5ad2.
adata2

AnnData object with n_obs × n_vars = 41433 × 2000
    obs: 'patient', 'cell_ontology', 'condition', 'n_genes_by_counts', 'total_counts', 'log1p_n_genes_by_counts', 'log1p_total_counts', 'total_counts_mt', 'pct_counts_mt', 'log1p_total_counts_mt', 'leiden', 'louvain', 'kmeans', 'umap_density_patient'
    var: 'gene', 'mt', 'n_cells_by_counts', 'total_counts', 'mean_counts', 'pct_dropout_by_counts', 'log1p_total_counts', 'log1p_mean_counts', 'highly_variable', 'highly_variable_rank', 'means', 'variances', 'variances_norm'
    uns: 'cell_ontology_colors', 'hvg', 'kmeans_colors', 'leiden', 'leiden_colors', 'log1p', 'louvain', 'louvain_colors', 'neighbors', 'pca', 'rank_genes_groups', 'tsne', 'umap', 'umap_density_patient_params'
    obsm: 'X_pca', 'X_pca_harmony', 'X_tsne', 'X_umap'
    varm: 'PCs'
    layers: 'counts'
    obsp: 'connectivities', 'distances'

In [7]:
# Show the summary (shape and annotation keys) of the object read from h5ad3.
adata3

AnnData object with n_obs × n_vars = 353569 × 5000
    obs: 'Patient', 'Conditions', 'n_genes_by_counts', 'total_counts', 'log1p_n_genes_by_counts', 'log1p_total_counts', 'total_counts_mt', 'pct_counts_mt', 'log1p_total_counts_mt', 'leiden', 'louvain', 'kmeans', 'umap_density_Patient', 'cell_type'
    var: 'gene_ids', 'feature_types', 'mt', 'n_cells_by_counts', 'total_counts', 'mean_counts', 'pct_dropout_by_counts', 'log1p_total_counts', 'log1p_mean_counts', 'highly_variable', 'highly_variable_rank', 'means', 'variances', 'variances_norm'
    uns: 'cell_type_colors', 'dendrogram_leiden', 'hvg', 'kmeans_colors', 'leiden', 'leiden_colors', 'log1p', 'louvain', 'louvain_colors', 'neighbors', 'pca', 'tsne', 'umap', 'umap_density_Patient_params'
    obsm: 'X_pca', 'X_pca_harmony', 'X_tsne', 'X_umap', 'ora_estimate', 'ora_pvals'
    varm: 'PCs'
    layers: 'counts'
    obsp: 'connectivities', 'distances'

In [8]:
# Show the summary (shape and annotation keys) of the object read from h5ad4.
adata4

AnnData object with n_obs × n_vars = 267550 × 5000
    obs: 'Patient', 'Type', 'Sex', 'n_genes_by_counts', 'total_counts', 'log1p_n_genes_by_counts', 'log1p_total_counts', 'total_counts_mt', 'pct_counts_mt', 'log1p_total_counts_mt', 'leiden', 'louvain', 'kmeans', 'umap_density_Patient'
    var: 'gene_ids', 'feature_types', 'mt', 'n_cells_by_counts', 'total_counts', 'mean_counts', 'pct_dropout_by_counts', 'log1p_total_counts', 'log1p_mean_counts', 'highly_variable', 'highly_variable_rank', 'means', 'variances', 'variances_norm'
    uns: 'hvg', 'kmeans_colors', 'leiden', 'leiden_colors', 'log1p', 'louvain', 'louvain_colors', 'neighbors', 'pca', 'rank_genes_groups', 'tsne', 'umap', 'umap_density_Patient_params'
    obsm: 'X_pca', 'X_pca_harmony', 'X_tsne', 'X_umap'
    varm: 'PCs'
    layers: 'counts'
    obsp: 'connectivities', 'distances'

In [10]:
# Per-cell patient labels of the first dataset. This references adata1
# explicitly: no variable named `adata` is defined in the notebook, so the
# previous form only worked with leftover kernel state and fails on a
# fresh Restart & Run All.
adata1.obs['patient']

Barcode
AATGCCACACAAGCAG-1          pPDCN4340DAPIA030419
AGACTCAGTCACAATC-1          pPDCN4340DAPIA030419
ATACTTCCAGCGTTGC-1          pPDCN4340DAPIA030419
ATATCCTGTGTGTTTG-1          pPDCN4340DAPIA030419
ATCCACCGTGGGTTGA-1          pPDCN4340DAPIA030419
                                 ...            
CATACTTGTCTGATCA-1    pPDsHSrSNxi3298d200429PosB
TTTGTTGTCCTTGAAG-1    pPDsHSrSNxi3298d200429PosB
ACACTGACACTAGAGG-1    pPDsHSrSNxi3298d200429PosB
CAACAACAGCTCGAAG-1    pPDsHSrSNxi3298d200429PosB
TGAGCATCAGACAAGC-1    pPDsHSrSNxi3298d200429PosB
Name: patient, Length: 433186, dtype: category
Categories (97, object): ['pPDCN3839DAPIA030419', 'pPDCN3839DAPIB030419', 'pPDCN3898DAPIA030419', 'pPDCN3898DAPIB030419', ..., 'pPDsHSrSNxi5610d200429Pos', 'pPDsHSrSNxi6173d200429DAPIA', 'pPDsHSrSNxi6173d200429PosA', 'pPDsHSrSNxi6173d200429PosB']