* INCLUDE mixed donor sample
* REMOVED 5-prime sample
* Add lower filtering threshold for 'total_counts' (not only 'n_genes')
* Add AVN samples 

## Import modules

In [1]:
import numpy as np
import pandas as pd
import scanpy as sc
import matplotlib.pyplot as plt
import scrublet as scr
import session_info

In [2]:
# Verbose scanpy logging for the interactive session
sc.settings.verbosity = 3
# Figure defaults: 160 dpi on screen, 180 dpi when saved, 'RdPu' colour map, SVG output
sc.settings.set_figure_params(dpi = 160, color_map = 'RdPu', dpi_save = 180, vector_friendly = True, format = 'svg')
# Record the session/package information alongside the analysis
session_info.show()

## Functions

In [3]:
"""Functions for downstream work with outputs of remove-background."""

# to read cellbender outputs
# https://github.com/broadinstitute/CellBender/issues/57

import tables
import numpy as np
import scipy.sparse as sp
from typing import Dict


def dict_from_h5(file: str) -> Dict[str, np.ndarray]:
    """Load every array node of an h5 file into a dictionary.

    Args:
        file: Path of the h5 file to open.

    Returns:
        A dictionary mapping each array node's name to the data read from it.
        The whole file is walked from the root; nodes are keyed by their bare
        name, so a later node with the same name replaces an earlier one.

    """
    with tables.open_file(file) as h5:
        return {node.name: node.read() for node in h5.walk_nodes("/", "Array")}


def anndata_from_h5(file: str,
                    analyzed_barcodes_only: bool = True) -> 'anndata.AnnData':
    """Build an AnnData object from a remove-background output h5 file.

    Args:
        file: Path of the h5 file.
        analyzed_barcodes_only: When True, keep only the barcodes listed under
            'barcodes_analyzed_inds' (or, failing that,
            'barcode_indices_for_latents'), so that per-barcode arrays line up
            with the count matrix and land in adata.obs / adata.obsm. When
            False, keep every barcode stored in the file. If True is requested
            but neither key exists, a warning is printed and all barcodes are
            kept.

    Returns:
        adata: AnnData with barcodes as obs index and gene names as var index.
            Every remaining array in the file is stored in obs, obsm, var or
            uns depending on its shape.

    Raises:
        ImportError: If the anndata package is not installed.

    """

    try:
        import anndata
    except ImportError:
        raise ImportError('The anndata package must be installed to use the '
                          'function anndata_from_h5()')

    contents = dict_from_h5(file)

    # The matrix is stored column-compressed as genes x barcodes; transpose it
    # to barcodes x genes and convert to CSR.
    values = contents.pop('data')
    minor_indices = contents.pop('indices')
    pointers = contents.pop('indptr')
    dims = contents.pop('shape')
    counts = sp.csc_matrix((values, minor_indices, pointers),
                           shape=dims).transpose().tocsr()

    if analyzed_barcodes_only:
        # First matching key wins, in this order of preference.
        index_key = None
        for candidate in ('barcodes_analyzed_inds', 'barcode_indices_for_latents'):
            if candidate in contents:
                index_key = candidate
                break

        if index_key is None:
            print('Warning: analyzed_barcodes_only=True, but the key '
                  '"barcodes_analyzed_inds" or "barcode_indices_for_latents" '
                  'is missing from the h5 file. '
                  'Will output all barcodes, and proceed as if '
                  'analyzed_barcodes_only=False')
        else:
            keep = contents[index_key]
            counts = counts[keep, :]
            contents['barcodes'] = contents['barcodes'][keep]

    # Assemble the AnnData object around the count matrix.
    barcode_names = contents.pop('barcodes').astype(str)
    gene_key = 'gene_names' if 'gene_names' in contents else 'name'
    gene_names = contents.pop(gene_key).astype(str)
    adata = anndata.AnnData(X=counts,
                            obs={'barcode': barcode_names},
                            var={'gene_name': gene_names})
    adata.obs.set_index('barcode', inplace=True)
    adata.var.set_index('gene_name', inplace=True)

    # Route every remaining array to the slot that matches its shape.
    n_barcodes, n_genes = counts.shape
    for key, value in contents.items():
        try:
            value = np.asarray(value)
            if value.ndim == 0:
                # scalars
                adata.uns[key] = value
            elif value.shape[0] == n_barcodes:
                # one entry per barcode: a column if 1-D (or a single column), else a matrix
                if value.ndim >= 2 and value.shape[1] >= 2:
                    adata.obsm[key] = value
                else:
                    adata.obs[key] = value
            elif value.shape[0] == n_genes:
                # one entry per gene; byte strings are decoded to str
                is_bytes = value.dtype.name.startswith('bytes')
                adata.var[key] = value.astype(str) if is_bytes else value
            else:
                adata.uns[key] = value
        except Exception:
            print('Unable to load data into AnnData: ', key, value, type(value))

    if analyzed_barcodes_only:
        # The index arrays used for subsetting are not useful per-barcode annotations.
        index_prefixes = ('barcodes_analyzed', 'barcode_indices')
        for col in [c for c in adata.obs.columns if c.startswith(index_prefixes)]:
            try:
                del adata.obs[col]
            except Exception:
                pass

    return adata

In [4]:
# from https://github.com/Teichlab/mapcloud/blob/master/scripts/starsolo/postprocess.py

from statsmodels.stats.multitest import multipletests
# from emptydrops.matrix import CountMatrix
# from emptydrops import find_nonambient_barcodes
import scrublet as scr
import scanpy as sc
import pandas as pd
import numpy as np
import scipy

#some functions that Ni uses in scanpy scripts to run scrublet
#which in turn are inspired by my original notebook on the matter
#(extracted from scanpy_scripts 0.2.10 to get around scanpy version incompatibility)
def test_outlier(x, upper_mad_only=True):
    """Compute upper-tail outlier p-values for ``x`` under a robust normal fit.

    A normal distribution is fitted with the median of ``x`` as location and a
    scaled median absolute deviation (MAD) as scale; each value is then given
    its upper-tail probability under that distribution.

    Args:
        x: 1-D array-like of values (here: per-cluster median scrublet scores).
        upper_mad_only: If True, estimate the MAD only from values strictly
            above the median; otherwise use all absolute deviations.

    Returns:
        (pvals, bh_pvals): the raw upper-tail p-values and their
        Benjamini-Hochberg adjusted counterparts, both aligned with ``x``.
        When the scale cannot be estimated (empty input, no value above the
        median, zero spread, or NaNs in ``x``) no value can be called an
        outlier, so both arrays are filled with 1.0 rather than NaN.
    """
    # Imported explicitly: a bare ``import scipy`` does not guarantee that the
    # ``scipy.stats`` submodule has been loaded.
    from scipy import stats

    x = np.asarray(x, dtype=float)
    if x.size == 0:
        return np.ones(0), np.ones(0)

    med = np.median(x)
    if upper_mad_only:
        upper_dev = x[x > med] - med
        # np.median of an empty array is NaN (with a RuntimeWarning); make it explicit
        mad = np.median(upper_dev) * 1.4826 if upper_dev.size else np.nan
    else:
        # 1.4826 rescales the MAD to a standard-deviation estimate for normal data
        mad = np.median(np.abs(x - med)) * 1.4826

    if not np.isfinite(mad) or mad <= 0:
        # A non-positive or undefined scale would only yield NaN p-values.
        return np.ones(x.shape), np.ones(x.shape)

    # sf() is the upper-tail probability; unlike ``1 - cdf`` it does not
    # round to exactly 0 for very extreme values.
    pvals = stats.norm.sf(x, loc=med, scale=mad)
    bh_pvals = multipletests(pvals, method='fdr_bh')[1]
    return pvals, bh_pvals

def run_scrublet(adata, resolution_function=None):
    """Run scrublet on ``adata`` and add per-cell and per-cluster doublet statistics.

    Scrublet scores are computed on ``adata.X``. A working copy is then
    normalised, reduced to highly variable genes, clustered with leiden and
    every cluster is sub-clustered once more. The median scrublet score of each
    resulting cluster is tested for being an upper outlier with
    ``test_outlier``.

    The following columns are written to ``adata.obs`` (``adata.X`` is left
    untouched): 'scrublet_score', 'scrublet_leiden', 'cluster_scrublet_score',
    'doublet_pval' and 'doublet_bh_pval'.

    Args:
        adata: AnnData object holding the counts of a single sample.
        resolution_function: Callable mapping a cluster size to the leiden
            resolution used to sub-cluster it. Defaults to
            ``max(max(log10(n) - 1, 0) ** 2, 0.1)``.

    Returns:
        None. If scrublet itself fails, a warning is issued and the function
        returns early without adding any column.
    """
    import warnings

    if resolution_function is None:
        def resolution_function(x):
            return np.maximum(np.maximum(np.log10(x)-1, 0)**2, 0.1)

    old_verbosity = sc.settings.verbosity
    sc.settings.verbosity = 1
    # try/finally so the caller's verbosity is restored on every exit path,
    # including the early return below and any exception further down
    try:
        scrub = scr.Scrublet(adata.X)
        # this has the potential to brick for poor quality data
        # if so, abort it and everything downstream
        try:
            doublet_scores, _predicted = scrub.scrub_doublets(verbose=False)
        except Exception as err:
            warnings.warn('scrublet failed, no doublet columns were added: '
                          + repr(err), stacklevel=2)
            return
        adata.obs['scrublet_score'] = doublet_scores

        # cluster a processed copy so that adata itself keeps its raw counts
        adata_copy = adata.copy()
        sc.pp.filter_genes(adata_copy, min_cells=3)
        sc.pp.normalize_total(adata_copy, target_sum=1e4)
        sc.pp.log1p(adata_copy)
        sc.pp.highly_variable_genes(adata_copy, min_mean=0.0125, max_mean=3, min_disp=0.5, subset=True)
        sc.pp.scale(adata_copy, zero_center=False)
        sc.pp.pca(adata_copy, svd_solver='arpack', zero_center=False)
        sc.pp.neighbors(adata_copy, n_pcs=30)
        sc.tl.umap(adata_copy)
        sc.tl.leiden(adata_copy, resolution=1)

        # sub-cluster each first-round cluster, with a resolution that depends on its size
        for clst in np.unique(adata_copy.obs['leiden']):
            clst_size = (adata_copy.obs['leiden'] == clst).sum()
            sc.tl.leiden(adata_copy, restrict_to=('leiden', [clst]), resolution=resolution_function(clst_size), key_added='leiden_R')
            adata_copy.obs['leiden'] = adata_copy.obs['leiden_R']

        # median scrublet score per final cluster
        final_clusters = np.unique(adata_copy.obs['leiden'])
        clst_meds = []
        for clst in final_clusters:
            in_clst = adata_copy.obs['leiden'] == clst
            clst_med = np.median(adata_copy.obs.loc[in_clst, 'scrublet_score'])
            adata_copy.obs.loc[in_clst, 'cluster_scrublet_score'] = clst_med
            clst_meds.append(clst_med)

        # one p-value per cluster, broadcast back to the cells of that cluster
        pvals, bh_pvals = test_outlier(np.array(clst_meds))
        for clst, pval, bh_pval in zip(final_clusters, pvals, bh_pvals):
            in_clst = adata_copy.obs['leiden'] == clst
            adata_copy.obs.loc[in_clst, 'pval'] = pval
            adata_copy.obs.loc[in_clst, 'bh_pval'] = bh_pval
    finally:
        sc.settings.verbosity = old_verbosity

    # need to also export the clustering, for soupx purposes
    adata.obs['scrublet_leiden'] = adata_copy.obs['leiden']
    adata.obs['scrublet_score'] = adata_copy.obs['scrublet_score']
    adata.obs['cluster_scrublet_score'] = adata_copy.obs['cluster_scrublet_score']
    adata.obs['doublet_pval'] = adata_copy.obs['pval']
    adata.obs['doublet_bh_pval'] = adata_copy.obs['bh_pval']
    del adata_copy

## Create anndata object

In [5]:
# read in metadata (one row per sample/modality entry of the sample sheet)
metadata = pd.read_csv('/nfs/team205/heart/anndata_objects/8regions/metadata/HeartTeamSamples_Mappeddata_20220531.csv', sep = ',', index_col = None)
# keep only the rows whose 'Publication' column is '8regions'
metadata = metadata[metadata['Publication']=='8regions']

# number of rows per modality, to see what is available before selecting one
print(metadata['modality'].value_counts())

snRNA            94
scRNA            54
Visium           46
Multiome-ATAC    30
Multiome-RNA     30
snATAC           21
Visium-FFPE       4
Name: modality, dtype: int64


In [6]:
# select modality: keep only the rows whose 'modality' is 'scRNA'
metadata = metadata[metadata['modality']=='scRNA']

In [7]:
########### edited: put id to var_names ###########

# Sample-level metadata columns that are copied onto every barcode of the sample
sample_obs_columns = ['sangerID','combinedID', 'donor', 'donor_type', 'region', 'region_finest', 'age',
                      'gender', 'facility', 'cell_or_nuclei', 'modality', 'kit_10x','flushed']

# read in: one AnnData per metadata row, built from its CellBender filtered output
adatas=[]
for _, sample in metadata.iterrows():
    sanger_id = sample['sangerID']
    print(sanger_id)

    # the h5 file lives inside the CellBender output folder and is prefixed with the folder name
    cellbender_dir = sample['CellBender_out']
    sample_adata = anndata_from_h5(cellbender_dir + '/' + cellbender_dir.split('/')[-1] + '_cellbender_out_filtered.h5', analyzed_barcodes_only=False)

    # replace var_names with the gene ID column ('id' if present, otherwise 'genes');
    # the previous index (gene_name) is kept as a regular var column
    sample_adata.var.reset_index(inplace=True)
    if 'id' in sample_adata.var.columns:
        sample_adata.var.set_index('id',inplace=True)
    else:
        sample_adata.var.set_index('genes',inplace=True)

    # modify barcodes: prefix with the sample ID so they stay unique after concatenation
    sample_adata.obs.index = sanger_id + '_' + sample_adata.obs.index

    # add metadata
    for col in sample_obs_columns:
        sample_adata.obs[col] = sample[col]

    adatas.append(sample_adata)

del sample_adata, cellbender_dir


# concatenate (barcodes are already unique, so no suffix and no batch column are added)
adata = adatas[0].concatenate(adatas[1:], index_unique = None, batch_key=None)
adata.shape

HCAHeart7606896
HCAHeart7656534
HCAHeart7656535
HCAHeart7656536
HCAHeart7656537
HCAHeart7656538
HCAHeart7728604
HCAHeart7728605
HCAHeart7728606
HCAHeart7728607
HCAHeart7728608
HCAHeart7728609
HCAHeart7745966
HCAHeart7745967
HCAHeart7745968
HCAHeart7745969
HCAHeart7745970
HCAHeart7751845
HCAHeart7843999
HCAHeart7844000
HCAHeart7844001
HCAHeart7844002
HCAHeart7844003
HCAHeart7844004
HCAHeart7850539
HCAHeart7850540
HCAHeart7850541
HCAHeart7850542
HCAHeart7850543
HCAHeart7850544
HCAHeart7850545
HCAHeart7850546
HCAHeart7850547
HCAHeart7850548
HCAHeart7850549
HCAHeart7850551
HCAHeart7905327
HCAHeart7905328
HCAHeart7905329
HCAHeart7905330
HCAHeart7905331
HCAHeart7905332
HCAHeart8102857
HCAHeart8102858
HCAHeart8102859
HCAHeart8102860
HCAHeart8102861
HCAHeart8102862
HCAHeart8102863
HCAHeart8102864
HCAHeart8102865
HCAHeart8102866
HCAHeart8102867
HCAHeart8102868


(345070, 33538)

In [8]:
# Show which var columns each per-sample object carries (they differ between samples)
for sample_adata in adatas:
    print(sample_adata.var.columns)

Index(['gene_name', 'ambient_expression'], dtype='object')
Index(['gene_name', 'ambient_expression', 'feature_type'], dtype='object')
Index(['gene_name', 'ambient_expression', 'feature_type'], dtype='object')
Index(['gene_name', 'ambient_expression', 'feature_type'], dtype='object')
Index(['gene_name', 'ambient_expression', 'feature_type'], dtype='object')
Index(['gene_name', 'ambient_expression', 'feature_type'], dtype='object')
Index(['gene_name', 'ambient_expression', 'feature_type'], dtype='object')
Index(['gene_name', 'ambient_expression', 'feature_type'], dtype='object')
Index(['gene_name', 'ambient_expression', 'feature_type'], dtype='object')
Index(['gene_name', 'ambient_expression', 'feature_type'], dtype='object')
Index(['gene_name', 'ambient_expression', 'feature_type'], dtype='object')
Index(['gene_name', 'ambient_expression', 'feature_type'], dtype='object')
Index(['gene_name', 'ambient_expression', 'feature_type'], dtype='object')
Index(['gene_name', 'ambient_expression',

In [9]:
# Number of barcodes per sample (sangerID) in the concatenated object
adata.obs['sangerID'].value_counts()

HCAHeart8102861    15000
HCAHeart8102860    15000
HCAHeart8102862    15000
HCAHeart8102857    15000
HCAHeart7728605    15000
HCAHeart8102859    15000
HCAHeart7844003    15000
HCAHeart7844001    14995
HCAHeart8102866    14993
HCAHeart8102868    14993
HCAHeart8102858    14554
HCAHeart7844004    14277
HCAHeart7745968    12255
HCAHeart7728606    12014
HCAHeart7745969     8804
HCAHeart7656536     8019
HCAHeart8102865     7855
HCAHeart7843999     7738
HCAHeart8102867     7695
HCAHeart7905328     7632
HCAHeart7728607     6500
HCAHeart7905327     5900
HCAHeart7905332     5896
HCAHeart7850549     5412
HCAHeart7745967     5096
HCAHeart7844000     4851
HCAHeart7850551     4838
HCAHeart7850546     4758
HCAHeart7850548     4288
HCAHeart7850547     3910
HCAHeart8102864     3829
HCAHeart7850540     3826
HCAHeart7905331     3674
HCAHeart7844002     3605
HCAHeart7656538     2661
HCAHeart7745966     2332
HCAHeart7751845     2213
HCAHeart7905330     2166
HCAHeart7905329     1791
HCAHeart7850541     1731


## Run scrublet

In [12]:
%%time

# per rxn: scrublet is run separately on every sample (sangerID)
per_sample_obs = []
for ID in adata.obs['sangerID'].unique():
    print(ID)

    ad = adata[adata.obs['sangerID'] == ID].copy()
    # adds the scrublet columns to ad.obs in place
    run_scrublet(ad)
    per_sample_obs.append(ad.obs)
    del ad

# concatenate once at the end instead of re-copying the growing table in every iteration
meta = pd.concat(per_sample_obs)
del per_sample_obs

HCAHeart7606896


  w.setdiag(float(target_total) / tots_use)


HCAHeart7656534


  w.setdiag(float(target_total) / tots_use)


HCAHeart7656535


  w.setdiag(float(target_total) / tots_use)


HCAHeart7656536


  w.setdiag(float(target_total) / tots_use)


HCAHeart7656537


  w.setdiag(float(target_total) / tots_use)


HCAHeart7656538


  w.setdiag(float(target_total) / tots_use)


HCAHeart7728604


  w.setdiag(float(target_total) / tots_use)


HCAHeart7728605
HCAHeart7728606


  w.setdiag(float(target_total) / tots_use)


HCAHeart7728607


  w.setdiag(float(target_total) / tots_use)


HCAHeart7728608


  w.setdiag(float(target_total) / tots_use)


HCAHeart7728609
HCAHeart7745966


  w.setdiag(float(target_total) / tots_use)
  w.setdiag(float(target_total) / tots_use)


HCAHeart7745967


  w.setdiag(float(target_total) / tots_use)


HCAHeart7745968


  w.setdiag(float(target_total) / tots_use)


HCAHeart7745969


  w.setdiag(float(target_total) / tots_use)


HCAHeart7745970


  w.setdiag(float(target_total) / tots_use)


HCAHeart7751845


  w.setdiag(float(target_total) / tots_use)


HCAHeart7843999


  w.setdiag(float(target_total) / tots_use)


HCAHeart7844000


  w.setdiag(float(target_total) / tots_use)


HCAHeart7844001
HCAHeart7844002


  w.setdiag(float(target_total) / tots_use)


HCAHeart7844003
HCAHeart7844004


  w.setdiag(float(target_total) / tots_use)


HCAHeart7850539


  w.setdiag(float(target_total) / tots_use)


HCAHeart7850540


  w.setdiag(float(target_total) / tots_use)


HCAHeart7850541


  w.setdiag(float(target_total) / tots_use)


HCAHeart7850542


  w.setdiag(float(target_total) / tots_use)


HCAHeart7850543


  w.setdiag(float(target_total) / tots_use)


HCAHeart7850544


  w.setdiag(float(target_total) / tots_use)


HCAHeart7850545


  w.setdiag(float(target_total) / tots_use)


HCAHeart7850546


  w.setdiag(float(target_total) / tots_use)


HCAHeart7850547


  w.setdiag(float(target_total) / tots_use)


HCAHeart7850548


  w.setdiag(float(target_total) / tots_use)


HCAHeart7850549


  w.setdiag(float(target_total) / tots_use)


HCAHeart7850551


  w.setdiag(float(target_total) / tots_use)


HCAHeart7905327


  w.setdiag(float(target_total) / tots_use)


HCAHeart7905328


  w.setdiag(float(target_total) / tots_use)


HCAHeart7905329


  w.setdiag(float(target_total) / tots_use)


HCAHeart7905330


  w.setdiag(float(target_total) / tots_use)


HCAHeart7905331


  w.setdiag(float(target_total) / tots_use)


HCAHeart7905332


  w.setdiag(float(target_total) / tots_use)


HCAHeart8102857


  gLog = lambda input: np.log(input[1] * np.exp(-input[0]) + input[2])


HCAHeart8102858


  w.setdiag(float(target_total) / tots_use)


HCAHeart8102859
HCAHeart8102860
HCAHeart8102861
HCAHeart8102862
HCAHeart8102863


  w.setdiag(float(target_total) / tots_use)


HCAHeart8102864


  w.setdiag(float(target_total) / tots_use)


HCAHeart8102865


  w.setdiag(float(target_total) / tots_use)


HCAHeart8102866


  w.setdiag(float(target_total) / tots_use)


HCAHeart8102867


  w.setdiag(float(target_total) / tots_use)


HCAHeart8102868


  w.setdiag(float(target_total) / tots_use)


CPU times: user 41min 12s, sys: 22min 33s, total: 1h 3min 46s
Wall time: 31min 6s


In [13]:
# Inspect the collected obs table (including scrublet columns) for one sample
meta[meta['sangerID']=='HCAHeart8102857']

Unnamed: 0_level_0,latent_RT_efficiency,latent_cell_probability,latent_scale,sangerID,combinedID,donor,donor_type,region,region_finest,age,...,facility,cell_or_nuclei,modality,kit_10x,flushed,scrublet_score,scrublet_leiden,cluster_scrublet_score,doublet_pval,doublet_bh_pval
barcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
HCAHeart8102857_CCACAAATCGAGCCAC,2.080856,0.995525,9312.976562,HCAHeart8102857,na,D11,DCD,RA,RA,60-65,...,Sanger,Cell,scRNA,3prime-v3,no,0.050000,08,0.073798,0.972721,0.974016
HCAHeart8102857_TTTCATGTCGTTGCCT,2.068729,0.965059,9545.163086,HCAHeart8102857,na,D11,DCD,RA,RA,60-65,...,Sanger,Cell,scRNA,3prime-v3,no,0.043252,08,0.073798,0.972721,0.974016
HCAHeart8102857_TCTACCGGTCACTTCC,1.973510,0.999803,6143.513672,HCAHeart8102857,na,D11,DCD,RA,RA,60-65,...,Sanger,Cell,scRNA,3prime-v3,no,0.369803,0156,0.245342,0.088301,0.942501
HCAHeart8102857_AGAACCTAGAGCAGAA,1.948684,0.999806,6100.782227,HCAHeart8102857,na,D11,DCD,RA,RA,60-65,...,Sanger,Cell,scRNA,3prime-v3,no,0.176895,239,0.223022,0.177393,0.942501
HCAHeart8102857_CACCAAAGTATCAGGG,1.912834,0.999845,6095.844238,HCAHeart8102857,na,D11,DCD,RA,RA,60-65,...,Sanger,Cell,scRNA,3prime-v3,no,0.203753,025,0.209877,0.250000,0.942501
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
HCAHeart8102857_ACAAGCTAGCGATGAC,0.823448,0.998622,4563.112793,HCAHeart8102857,na,D11,DCD,RA,RA,60-65,...,Sanger,Cell,scRNA,3prime-v3,no,0.047002,1111,0.121591,0.843827,0.958460
HCAHeart8102857_GGGTGAAGTCTAACTG,0.832393,0.999128,4355.219727,HCAHeart8102857,na,D11,DCD,RA,RA,60-65,...,Sanger,Cell,scRNA,3prime-v3,no,0.073798,10,0.081897,0.961455,0.966597
HCAHeart8102857_CTGTCGTGTGGCTGAA,0.857743,0.998777,4552.802246,HCAHeart8102857,na,D11,DCD,RA,RA,60-65,...,Sanger,Cell,scRNA,3prime-v3,no,0.039758,083,0.110919,0.887623,0.958460
HCAHeart8102857_GTAGGAGGTTAGGGTG,0.844368,0.998850,4245.892578,HCAHeart8102857,na,D11,DCD,RA,RA,60-65,...,Sanger,Cell,scRNA,3prime-v3,no,0.133595,1109,0.132015,0.791429,0.958460


## Add scrublet outputs to adata

In [14]:
# Keep only the columns written by run_scrublet; copy so later edits do not touch `meta`
meta_scrub = meta[['scrublet_score', 'scrublet_leiden', 'cluster_scrublet_score', 'doublet_pval', 'doublet_bh_pval']].copy()
# Row count should equal the number of barcodes in adata
meta_scrub.shape

(345070, 5)

In [15]:
# Verify that meta_scrub and adata describe exactly the same set of barcodes.
# (Comparing the index of `meta_scrub.reindex(adata.obs.index)` with
# `adata.obs.index` cannot detect a mismatch: reindex always returns the
# requested index and fills barcodes it does not know with NaN.)
same_barcodes = (
    meta_scrub.index.is_unique
    and adata.obs.index.is_unique
    and len(meta_scrub.index) == len(adata.obs.index)
    and adata.obs.index.isin(meta_scrub.index).all()
)
if same_barcodes:
    # align the scrublet columns to adata's barcode order before appending them
    adata.obs = pd.concat([adata.obs, meta_scrub.reindex(adata.obs.index)], axis=1)
else:
    raise Exception('Different barcodes in meta and adata')

In [16]:
# Display the per-barcode annotation table
adata.obs

Unnamed: 0_level_0,latent_RT_efficiency,latent_cell_probability,latent_scale,sangerID,combinedID,donor,donor_type,region,region_finest,age,...,facility,cell_or_nuclei,modality,kit_10x,flushed,scrublet_score,scrublet_leiden,cluster_scrublet_score,doublet_pval,doublet_bh_pval
barcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
HCAHeart7606896_CTACGTCTCCGTCAAA,8.889344,0.612278,886.427673,HCAHeart7606896,na,D1,DBD,AX,AX,50-55,...,Sanger,Cell,scRNA,3prime-v2,no,0.047619,60,0.047619,0.937444,0.937444
HCAHeart7606896_GATGAGGCACGGCTAC,8.281336,0.976983,615.699524,HCAHeart7606896,na,D1,DBD,AX,AX,50-55,...,Sanger,Cell,scRNA,3prime-v2,no,0.228571,20,0.184933,0.084434,0.832720
HCAHeart7606896_TCAGGATCAGCTCGAC,7.195173,0.934280,702.436768,HCAHeart7606896,na,D1,DBD,AX,AX,50-55,...,Sanger,Cell,scRNA,3prime-v2,no,0.228571,20,0.184933,0.084434,0.832720
HCAHeart7606896_CAAGATCGTCTCACCT,7.452532,0.982054,528.663391,HCAHeart7606896,na,D1,DBD,AX,AX,50-55,...,Sanger,Cell,scRNA,3prime-v2,no,0.172414,20,0.184933,0.084434,0.832720
HCAHeart7606896_GCAAACTAGCTAGCCC,6.711821,0.992885,490.788574,HCAHeart7606896,na,D1,DBD,AX,AX,50-55,...,Sanger,Cell,scRNA,3prime-v2,no,0.228571,20,0.184933,0.084434,0.832720
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
HCAHeart8102868_CGATCGGTCCCTCGAT,0.723355,0.987070,3964.553711,HCAHeart8102868,na,D11,DCD,AX,AX,60-65,...,Sanger,Cell,scRNA,3prime-v3,no,0.262295,121,0.197903,0.132598,0.771329
HCAHeart8102868_ATTGGGTCAGGTCCCA,0.951182,0.915336,3725.749756,HCAHeart8102868,na,D11,DCD,AX,AX,60-65,...,Sanger,Cell,scRNA,3prime-v3,no,0.029891,520,0.047002,0.619535,0.771329
HCAHeart8102868_GCAGTTACAGCCCACA,0.936107,0.855016,3798.067139,HCAHeart8102868,na,D11,DCD,AX,AX,60-65,...,Sanger,Cell,scRNA,3prime-v3,no,0.075342,510,0.067977,0.542644,0.771329
HCAHeart8102868_AGTACCACATGAATAG,0.930775,0.854279,3793.943604,HCAHeart8102868,na,D11,DCD,AX,AX,60-65,...,Sanger,Cell,scRNA,3prime-v3,no,0.029213,145,0.027888,0.685779,0.771329


In [17]:
# Write the current AnnData object to disk (the file name suggests this is the pre-filtering snapshot)
adata.write('/nfs/team205/heart/anndata_objects/8regions/QC/scRNA_adult_prefilter.h5ad')

... storing 'sangerID' as categorical
... storing 'combinedID' as categorical
... storing 'donor' as categorical
... storing 'donor_type' as categorical
... storing 'region' as categorical
... storing 'region_finest' as categorical
... storing 'age' as categorical
... storing 'gender' as categorical
... storing 'facility' as categorical
... storing 'cell_or_nuclei' as categorical
... storing 'modality' as categorical
... storing 'kit_10x' as categorical
... storing 'flushed' as categorical
... storing 'scrublet_leiden' as categorical
... storing 'gene_name' as categorical
... storing 'feature_type-1' as categorical
... storing 'feature_type-10' as categorical
... storing 'feature_type-11' as categorical
... storing 'feature_type-12' as categorical
... storing 'feature_type-13' as categorical
... storing 'feature_type-14' as categorical
... storing 'feature_type-15' as categorical
... storing 'feature_type-16' as categorical
... storing 'feature_type-17' as categorical
... storing 'feat

In [18]:
# Display a summary of the AnnData object (dimensions, obs and var columns)
adata

AnnData object with n_obs × n_vars = 345070 × 33538
    obs: 'latent_RT_efficiency', 'latent_cell_probability', 'latent_scale', 'sangerID', 'combinedID', 'donor', 'donor_type', 'region', 'region_finest', 'age', 'gender', 'facility', 'cell_or_nuclei', 'modality', 'kit_10x', 'flushed', 'scrublet_score', 'scrublet_leiden', 'cluster_scrublet_score', 'doublet_pval', 'doublet_bh_pval'
    var: 'gene_name', 'ambient_expression-0', 'ambient_expression-1', 'feature_type-1', 'ambient_expression-10', 'feature_type-10', 'ambient_expression-11', 'feature_type-11', 'ambient_expression-12', 'feature_type-12', 'ambient_expression-13', 'feature_type-13', 'ambient_expression-14', 'feature_type-14', 'ambient_expression-15', 'feature_type-15', 'ambient_expression-16', 'feature_type-16', 'ambient_expression-17', 'feature_type-17', 'ambient_expression-18', 'feature_type-18', 'ambient_expression-19', 'feature_type-19', 'ambient_expression-2', 'feature_type-2', 'ambient_expression-20', 'feature_type-20', 'amb

In [19]:
# Number of barcodes per donor
adata.obs['donor'].value_counts()

D11    140122
D6      87525
D4      36411
D7      35120
D5      31537
D3      13734
D1        621
Name: donor, dtype: int64