In [1]:
import scanpy as sc
import pandas as pd
import numpy as np

In [2]:
# load the integrated bone marrow object from disk
integrated_path = '../data/bone_marrow_integrated.h5ad'
adata = sc.read(integrated_path)

In [3]:
# number of cells recorded for each timepoint label
adata.obs.loc[:, 'timepoint'].value_counts()

healthy    115445
D0          97737
R           46450
MRD         33008
DG          31889
D113         2713
D29          1951
D31          1785
D18          1670
D171         1402
D14          1345
D15          1201
D20           949
D35           916
D41           386
D37           223
D34           200
D97            59
D49            52
Name: timepoint, dtype: int64

In [4]:
# split out the healthy cells and the diagnosis cells (timepoints 'D0' and 'DG')
timepoint = adata.obs['timepoint']
healthy = adata[timepoint == 'healthy'].copy()
print(healthy.shape)
d0 = adata[timepoint.isin(['D0', 'DG'])].copy()
print(d0.shape)

(115445, 16723)
(129626, 16723)


In [5]:
# define function to subsample by cell types
def subsample_by_type(adata_obj, type_col, n_obs=1000, random_state=123):
    """Cap every group in ``adata_obj.obs[type_col]`` at ``n_obs`` cells.

    The object is split by the unique values of ``type_col`` (visited in the
    sorted order returned by ``np.unique``). Groups with at most ``n_obs``
    cells are kept whole; larger groups are randomly downsampled with
    ``sc.pp.subsample``. The per-group pieces are then joined with the
    ``concatenate`` method of the first piece.

    Parameters
    ----------
    adata_obj
        Annotated data object to subsample. Each group is copied out of it,
        so the input itself is left untouched.
    type_col
        Name of the ``.obs`` column holding the group labels.
    n_obs
        Maximum number of cells kept per group (default 1000).
    random_state
        Seed forwarded to ``sc.pp.subsample`` (default 123). The same seed is
        reused for every group.

    Returns
    -------
    The concatenation of all per-group subsets.

    Raises
    ------
    IndexError
        If ``type_col`` contains no values, so there is nothing to concatenate.
    """
    labels = adata_obj.obs[type_col]
    pieces = []
    for label in np.unique(labels):
        group = adata_obj[labels == label].copy()
        # Only groups above the cap are downsampled; smaller ones pass through whole.
        if group.n_obs > n_obs:
            group = sc.pp.subsample(
                group, n_obs=n_obs, copy=True, random_state=random_state
            )
        pieces.append(group)
    # The remaining pieces are handed over as a single list argument.
    return pieces[0].concatenate(pieces[1:])

In [6]:
# cap each cell type in the healthy subset via subsample_by_type
healthy_sub = subsample_by_type(adata_obj=healthy, type_col='celltype_final')
healthy_sub

AnnData object with n_obs × n_vars = 16816 × 16723
    obs: 'dataset', 'sample', 'donor', 'timepoint', 'celltype', 'batch', 'size_factors', 'doublet', 'dbl_score', 'umap1', 'umap2', 'leiden', 'celltype1', 'dummy', 'n_counts', 'keep', 'leiden_r1.5', 'ct'
    obsm: 'X_umap'
    layers: 'counts'

In [7]:
# cap each cell type in the diagnosis subset via subsample_by_type
d0_sub = subsample_by_type(adata_obj=d0, type_col='celltype_final')
d0_sub

AnnData object with n_obs × n_vars = 15860 × 16723
    obs: 'dataset', 'sample', 'donor', 'timepoint', 'celltype', 'batch', 'size_factors', 'doublet', 'dbl_score', 'umap1', 'umap2', 'leiden', 'celltype1', 'dummy', 'n_counts', 'keep', 'leiden_r1.5', 'ct'
    obsm: 'X_umap'
    layers: 'counts'

In [8]:
# restrict .obs of both subsets to the same five metadata columns
obs_keep = ['dataset', 'sample', 'donor', 'timepoint', 'celltype_final']
healthy_sub.obs = healthy_sub.obs[obs_keep]
d0_sub.obs = d0_sub.obs[obs_keep]

In [9]:
# write both subsampled objects to disk, healthy first and diagnosis second
for out_path, subset in (
    ('../data/healthy_sub.h5ad', healthy_sub),
    ('../data/d0_sub.h5ad', d0_sub),
):
    subset.write(out_path)

... storing 'dataset' as categorical
... storing 'sample' as categorical
... storing 'donor' as categorical
... storing 'ct' as categorical
... storing 'dataset' as categorical
... storing 'sample' as categorical
... storing 'donor' as categorical
... storing 'ct' as categorical
