* INCLUDE mixed donor sample
* REMOVED 5-prime sample
* Add lower filtering threshold for 'total_counts' (not only 'n_genes')
* Add AVN samples 

## Import modules

In [1]:
import numpy as np
import pandas as pd
import scanpy as sc
import matplotlib.pyplot as plt
import scrublet as scr
import session_info

In [2]:
# Raise scanpy's logging verbosity to level 3 for the rest of the notebook.
sc.settings.verbosity = 3
# Figure defaults: 160 dpi on screen, 180 dpi when saved, 'RdPu' colour map,
# SVG output with vector_friendly enabled.
sc.settings.set_figure_params(dpi = 160, color_map = 'RdPu', dpi_save = 180, vector_friendly = True, format = 'svg')
# Record the session details (package versions) in the notebook output.
session_info.show()

## Functions

In [3]:
"""Functions for downstream work with outputs of remove-background."""

# to read cellbender outputs
# https://github.com/broadinstitute/CellBender/issues/57

import tables
import numpy as np
import scipy.sparse as sp
from typing import Dict


def dict_from_h5(file: str) -> Dict[str, np.ndarray]:
    """Collect every Array node of an h5 file into a ``name -> contents`` dict.

    The file is walked from the root and each array is stored under its own
    node name (not its full path), so a node visited later replaces an
    earlier node that has the same name.

    Args:
        file: Path of the h5 file to read.

    Returns:
        Dictionary mapping each array node's name to the data read from it.
    """
    with tables.open_file(file) as h5:
        return {node.name: node.read()
                for node in h5.walk_nodes("/", "Array")}


def anndata_from_h5(file: str,
                    analyzed_barcodes_only: bool = True) -> 'anndata.AnnData':
    """Build an AnnData object from a remove-background output h5 file.

    The sparse count matrix becomes ``adata.X``; barcodes and gene names
    become the ``obs`` and ``var`` indices.  Every other array in the file is
    attached to the slot whose length it matches (see the loop below).

    Args:
        file: The h5 file
        analyzed_barcodes_only: True (default) keeps only the barcodes listed
            under 'barcodes_analyzed_inds' or 'barcode_indices_for_latents',
            so that per-barcode arrays of that length can be stored in
            adata.obs / adata.obsm.  False keeps every barcode in the file;
            arrays whose length matches neither axis then go to adata.uns.

    Returns:
        adata: The anndata object, populated with the additional arrays found
            in the file.

    Raises:
        ImportError: If the anndata package is not installed.

    """

    try:
        import anndata
    except ImportError:
        raise ImportError('The anndata package must be installed to use the '
                          'function anndata_from_h5()')

    contents = dict_from_h5(file)

    # Rebuild the stored CSC matrix, then transpose it so that rows line up
    # with the barcodes (rows are indexed by barcode indices further down).
    data = contents.pop('data')
    indices = contents.pop('indices')
    indptr = contents.pop('indptr')
    shape = contents.pop('shape')
    counts = sp.csc_matrix((data, indices, indptr),
                           shape=shape).transpose().tocsr()

    if analyzed_barcodes_only:
        # First of the two known index arrays that is present in the file.
        index_key = next((name for name in ('barcodes_analyzed_inds',
                                            'barcode_indices_for_latents')
                          if name in contents), None)
        if index_key is not None:
            keep = contents[index_key]
            counts = counts[keep, :]
            contents['barcodes'] = contents['barcodes'][keep]
        else:
            print('Warning: analyzed_barcodes_only=True, but the key '
                  '"barcodes_analyzed_inds" or "barcode_indices_for_latents" '
                  'is missing from the h5 file. '
                  'Will output all barcodes, and proceed as if '
                  'analyzed_barcodes_only=False')

    # Construct the count matrix.
    barcodes = contents.pop('barcodes').astype(str)
    gene_key = 'gene_names' if 'gene_names' in contents else 'name'
    gene_names = contents.pop(gene_key).astype(str)
    adata = anndata.AnnData(X=counts,
                            obs={'barcode': barcodes},
                            var={'gene_name': gene_names})
    adata.obs.set_index('barcode', inplace=True)
    adata.var.set_index('gene_name', inplace=True)

    # Route each remaining array by the length of its first axis:
    # n_obs -> obs (obsm when it has 2+ columns), n_vars -> var, otherwise uns.
    n_obs, n_vars = counts.shape
    for key, value in contents.items():
        try:
            value = np.asarray(value)
            if value.ndim > 0 and value.shape[0] == n_obs:
                if value.ndim >= 2 and value.shape[1] >= 2:
                    adata.obsm[key] = value
                else:
                    adata.obs[key] = value
            elif value.ndim > 0 and value.shape[0] == n_vars:
                # Byte strings are decoded so var columns hold ordinary str.
                is_bytes = value.dtype.name.startswith('bytes')
                adata.var[key] = value.astype(str) if is_bytes else value
            else:
                # Scalars and arrays matching neither axis.
                adata.uns[key] = value
        except Exception:
            print('Unable to load data into AnnData: ', key, value, type(value))

    if analyzed_barcodes_only:
        # The index arrays used for subsetting may have landed in obs above;
        # drop them again (best effort).
        obs_columns = adata.obs.columns
        is_index_column = (obs_columns.str.startswith('barcodes_analyzed')
                           | obs_columns.str.startswith('barcode_indices'))
        for col in obs_columns[is_index_column]:
            try:
                del adata.obs[col]
            except Exception:
                pass

    return adata

In [4]:
# from https://github.com/Teichlab/mapcloud/blob/master/scripts/starsolo/postprocess.py

from statsmodels.stats.multitest import multipletests
# from emptydrops.matrix import CountMatrix
# from emptydrops import find_nonambient_barcodes
import scrublet as scr
import scanpy as sc
import pandas as pd
import numpy as np
import scipy

#some functions that Ni uses in scanpy scripts to run scrublet
#which in turn are inspired by my original notebook on the matter
#(extracted from scanpy_scripts 0.2.10 to get around scanpy version incompatibility)
def test_outlier(x, upper_mad_only=True):
    """Compute upper-tail outlier p-values for the values in ``x``.

    A normal distribution is centred on the median of ``x`` with a scale
    derived from the median absolute deviation (MAD, multiplied by 1.4826).
    Each value's p-value is the upper-tail probability under that
    distribution, so large values get small p-values.

    Args:
        x: NumPy array of values (boolean-mask indexing and arithmetic are
            applied to it directly).
        upper_mad_only: If True, the MAD is computed only from values strictly
            above the median; otherwise from the absolute deviations of all
            values.

    Returns:
        Tuple ``(pvals, bh_pvals)``: the raw upper-tail p-values and their
        Benjamini-Hochberg ('fdr_bh') adjusted counterparts, in input order.

    NOTE(review): if no value lies above the median (e.g. a single value or
    all values equal) the MAD is NaN or 0 and the p-values come out as NaN -
    confirm that callers treat NaN as "not significant".
    """
    # Imported here so the function does not rely on scipy.stats having been
    # loaded implicitly by a bare ``import scipy``.
    from scipy.stats import norm

    med = np.median(x)
    if upper_mad_only:
        deviations = x[x > med] - med
    else:
        deviations = np.abs(x - med)
    mad = np.median(deviations) * 1.4826
    # Survival function instead of ``1 - cdf``: same quantity, but it keeps
    # precision for extreme outliers, where ``1 - cdf`` rounds to exactly 0.
    pvals = norm.sf(x, loc=med, scale=mad)
    bh_pvals = multipletests(pvals, method='fdr_bh')[1]
    return pvals, bh_pvals

def run_scrublet(adata, resolution_function=None):
    """Add Scrublet scores and cluster-level doublet p-values to ``adata.obs``.

    Scrublet is run on ``adata.X``.  A working copy of the object is then
    normalised, reduced and clustered with Leiden; every first-pass cluster is
    sub-clustered once more, the median Scrublet score of each final cluster
    is computed, and those medians are tested for being outliers with
    ``test_outlier``.

    Columns written to ``adata.obs`` (in place): 'scrublet_score',
    'scrublet_leiden', 'cluster_scrublet_score', 'doublet_pval' and
    'doublet_bh_pval'.

    Args:
        adata: AnnData object to annotate; its X is passed to Scrublet as is.
        resolution_function: Callable mapping a cluster size to the Leiden
            resolution used when sub-clustering that cluster.  Defaults to
            ``max(max(log10(size) - 1, 0) ** 2, 0.1)``.

    Returns:
        None.  If Scrublet itself fails, a warning is issued and the function
        returns without adding any column.
    """
    import warnings

    if resolution_function is None:
        def resolution_function(x):
            return np.maximum(np.maximum(np.log10(x)-1, 0)**2, 0.1)

    old_verbosity = sc.settings.verbosity
    sc.settings.verbosity = 1
    # try/finally so the caller's verbosity is restored on every exit path,
    # including the early return below and any scanpy error.
    try:
        scrub = scr.Scrublet(adata.X)
        #this has the potential to brick for poor quality data
        #if so, abort it and everything downstream
        try:
            doublet_scores, _ = scrub.scrub_doublets(verbose=False)
        except Exception as err:
            warnings.warn('Scrublet failed ({!r}); no doublet columns were '
                          'added to adata.obs'.format(err))
            return
        adata.obs['scrublet_score'] = doublet_scores

        # Cluster on a copy so the caller's matrix is left untouched.
        adata_copy = adata.copy()
        sc.pp.filter_genes(adata_copy, min_cells=3)
        sc.pp.normalize_total(adata_copy, target_sum=1e4)
        sc.pp.log1p(adata_copy)
        sc.pp.highly_variable_genes(adata_copy, min_mean=0.0125, max_mean=3, min_disp=0.5, subset=True)
        sc.pp.scale(adata_copy, zero_center=False)
        sc.pp.pca(adata_copy, svd_solver='arpack', zero_center=False)
        sc.pp.neighbors(adata_copy, n_pcs=30)
        sc.tl.umap(adata_copy)
        sc.tl.leiden(adata_copy, resolution=1)

        # Sub-cluster each first-pass cluster; the label list is fixed before
        # the loop because 'leiden' is overwritten on every iteration.
        for clst in np.unique(adata_copy.obs['leiden']):
            clst_size = int((adata_copy.obs['leiden'] == clst).sum())
            sc.tl.leiden(adata_copy, restrict_to=('leiden', [clst]), resolution=resolution_function(clst_size), key_added='leiden_R')
            adata_copy.obs['leiden'] = adata_copy.obs['leiden_R']

        # Median Scrublet score per final cluster.
        clusters = np.unique(adata_copy.obs['leiden'])
        clst_meds = []
        for clst in clusters:
            in_cluster = adata_copy.obs['leiden'] == clst
            clst_med = np.median(adata_copy.obs.loc[in_cluster, 'scrublet_score'])
            adata_copy.obs.loc[in_cluster, 'cluster_scrublet_score'] = clst_med
            clst_meds.append(clst_med)

        # Test the cluster medians and broadcast the result to the cells.
        pvals, bh_pvals = test_outlier(np.array(clst_meds))
        for clst, pval, bh_pval in zip(clusters, pvals, bh_pvals):
            in_cluster = adata_copy.obs['leiden'] == clst
            adata_copy.obs.loc[in_cluster, 'pval'] = pval
            adata_copy.obs.loc[in_cluster, 'bh_pval'] = bh_pval

        #need to also export the clustering, for soupx purposes
        adata.obs['scrublet_leiden'] = adata_copy.obs['leiden']
        adata.obs['cluster_scrublet_score'] = adata_copy.obs['cluster_scrublet_score']
        adata.obs['doublet_pval'] = adata_copy.obs['pval']
        adata.obs['doublet_bh_pval'] = adata_copy.obs['bh_pval']
    finally:
        sc.settings.verbosity = old_verbosity

## Create anndata object

In [5]:
# Load the sample sheet and keep only the rows of the '8regions' publication.
metadata = pd.read_csv(
    '/nfs/team205/heart/anndata_objects/8regions/metadata/HeartTeamSamples_Mappeddata_20220531.csv',
    sep=',',
    index_col=None,
).loc[lambda df: df['Publication'] == '8regions']

# Number of remaining rows per modality.
print(metadata['modality'].value_counts())

snRNA            94
scRNA            54
Visium           46
Multiome-RNA     30
Multiome-ATAC    30
snATAC           21
Visium-FFPE       4
Name: modality, dtype: int64


In [6]:
# Restrict the sample sheet to the Multiome-RNA rows.
metadata = metadata.loc[metadata['modality'] == 'Multiome-RNA']

In [7]:
########### edited: put id to var_names ###########

# Sample-sheet columns that are copied onto every barcode of a sample.
obs_columns = ['sangerID','combinedID', 'donor', 'donor_type', 'region', 'region_finest', 'age',
               'gender', 'facility', 'cell_or_nuclei', 'modality', 'kit_10x','flushed']

# read in: one AnnData per row of the sample sheet
adatas = []
for _row_label, sample in metadata.iterrows():
    print(sample['combinedID'])

    # The filtered h5 is named after the last component of the output folder.
    path = sample['CellBender_out']
    ad = anndata_from_h5(path + '/' + path.split('/')[-1] + '_cellbender_out_filtered.h5', analyzed_barcodes_only=False)

    # replace var_names with gene_id (the gene names stay as a var column)
    ad.var.reset_index(inplace=True)
    ad.var.set_index('id', inplace=True)

    # modify barcodes: prefix each one with the sample's combinedID
    ad.obs.index = sample['combinedID'] + '_' + ad.obs.index

    # add metadata
    for col in obs_columns:
        ad.obs[col] = sample[col]

    adatas.append(ad)


# concatenate
# NOTE(review): newer anndata releases deprecate AnnData.concatenate in favour
# of anndata.concat; the defaults differ, so verify the result before switching.
adata = adatas[0].concatenate(adatas[1:], index_unique = None, batch_key=None)
adata.shape

HCAHeart9508627_HCAHeart9508819
HCAHeart9508628_HCAHeart9508820
HCAHeart9508629_HCAHeart9508821
HCAHeart9845431_HCAHeart9917173
HCAHeart9845432_HCAHeart9917174
HCAHeart9845433_HCAHeart9917175
HCAHeart9845434_HCAHeart9917176
HCAHeart9845435_HCAHeart9917177
HCAHeart9845436_HCAHeart9917178
HCAHeartST10773165_HCAHeartST10781062
HCAHeartST10773166_HCAHeartST10781063
HCAHeartST10773167_HCAHeartST10781064
HCAHeartST10773168_HCAHeartST10781065
HCAHeartST10773169_HCAHeartST10781446
HCAHeartST10773170_HCAHeartST10781447
HCAHeartST10773171_HCAHeartST10781448
HCAHeartST11064574_HCAHeartST11023239
HCAHeartST11064575_HCAHeartST11023240
HCAHeartST11064576_HCAHeartST11023241
HCAHeartST11064577_HCAHeartST11023242
HCAHeartST11350184_HCAHeartST11121842
HCAHeartST11350186_HCAHeartST11121844
HCAHeartST11350187_HCAHeartST11121845
HCAHeartST11350192_HCAHeartST11445769
HCAHeartST11350193_HCAHeartST11445770
HCAHeartST11350194_HCAHeartST11445771
HCAHeartST11350195_HCAHeartST11445772
HCAHeartST11350196_HCAHeartS

(235093, 36601)

In [8]:
# Number of barcodes contributed by each combinedID.
adata.obs['combinedID'].value_counts()

HCAHeartST11350194_HCAHeartST11445771    18132
HCAHeartST11064577_HCAHeartST11023242    17528
HCAHeartST10773168_HCAHeartST10781065    15618
HCAHeartST10773171_HCAHeartST10781448    15000
HCAHeartST10773166_HCAHeartST10781063    14353
HCAHeartST10773170_HCAHeartST10781447    13330
HCAHeart9508628_HCAHeart9508820          12703
HCAHeart9508629_HCAHeart9508821          10389
HCAHeartST11350192_HCAHeartST11445769     9707
HCAHeart9845434_HCAHeart9917176           9401
HCAHeart9845431_HCAHeart9917173           9020
HCAHeart9845436_HCAHeart9917178           8517
HCAHeartST10773167_HCAHeartST10781064     7800
HCAHeart9845435_HCAHeart9917177           7681
HCAHeartST11350195_HCAHeartST11445772     7333
HCAHeartST11064575_HCAHeartST11023240     7141
HCAHeartST10773165_HCAHeartST10781062     6743
HCAHeartST11350197_HCAHeartST11445774     6469
HCAHeartST11350198_HCAHeartST11445775     6090
HCAHeart9845433_HCAHeart9917175           5031
HCAHeart9845432_HCAHeart9917174           4583
HCAHeartST107

## Run scrublet

In [9]:
%%time

# per rxn: run scrublet on each reaction separately and collect the obs tables
obs_per_rxn = []
for ID in adata.obs['combinedID'].unique():
    print(ID)

    # Work on a copy; run_scrublet writes its results into ad.obs.
    ad = adata[adata.obs['combinedID'] == ID].copy()
    run_scrublet(ad)
    obs_per_rxn.append(ad.obs)
    del ad

# Concatenate once at the end instead of re-copying the growing table on
# every iteration.
meta = pd.concat(obs_per_rxn)
del obs_per_rxn

HCAHeart9508627_HCAHeart9508819


  w.setdiag(float(target_total) / tots_use)


HCAHeart9508628_HCAHeart9508820


  w.setdiag(float(target_total) / tots_use)


HCAHeart9508629_HCAHeart9508821


  w.setdiag(float(target_total) / tots_use)


HCAHeart9845431_HCAHeart9917173


  w.setdiag(float(target_total) / tots_use)


HCAHeart9845432_HCAHeart9917174
HCAHeart9845433_HCAHeart9917175
HCAHeart9845434_HCAHeart9917176


  w.setdiag(float(target_total) / tots_use)


HCAHeart9845435_HCAHeart9917177


  w.setdiag(float(target_total) / tots_use)


HCAHeart9845436_HCAHeart9917178


  w.setdiag(float(target_total) / tots_use)


HCAHeartST10773165_HCAHeartST10781062


  w.setdiag(float(target_total) / tots_use)


HCAHeartST10773166_HCAHeartST10781063


  w.setdiag(float(target_total) / tots_use)


HCAHeartST10773167_HCAHeartST10781064


  w.setdiag(float(target_total) / tots_use)


HCAHeartST10773168_HCAHeartST10781065


  w.setdiag(float(target_total) / tots_use)


HCAHeartST10773169_HCAHeartST10781446


  w.setdiag(float(target_total) / tots_use)


HCAHeartST10773170_HCAHeartST10781447


  w.setdiag(float(target_total) / tots_use)


HCAHeartST10773171_HCAHeartST10781448
HCAHeartST11064574_HCAHeartST11023239


  w.setdiag(float(target_total) / tots_use)


HCAHeartST11064575_HCAHeartST11023240


  w.setdiag(float(target_total) / tots_use)


HCAHeartST11064576_HCAHeartST11023241


  w.setdiag(float(target_total) / tots_use)


HCAHeartST11064577_HCAHeartST11023242


  w.setdiag(float(target_total) / tots_use)


HCAHeartST11350184_HCAHeartST11121842


  w.setdiag(float(target_total) / tots_use)


HCAHeartST11350186_HCAHeartST11121844


  w.setdiag(float(target_total) / tots_use)


HCAHeartST11350187_HCAHeartST11121845


  w.setdiag(float(target_total) / tots_use)


HCAHeartST11350192_HCAHeartST11445769


  w.setdiag(float(target_total) / tots_use)


HCAHeartST11350193_HCAHeartST11445770


  w.setdiag(float(target_total) / tots_use)


HCAHeartST11350194_HCAHeartST11445771


  w.setdiag(float(target_total) / tots_use)


HCAHeartST11350195_HCAHeartST11445772


  w.setdiag(float(target_total) / tots_use)


HCAHeartST11350196_HCAHeartST11445773


  w.setdiag(float(target_total) / tots_use)


HCAHeartST11350197_HCAHeartST11445774


  w.setdiag(float(target_total) / tots_use)


HCAHeartST11350198_HCAHeartST11445775


  w.setdiag(float(target_total) / tots_use)


CPU times: user 51min 30s, sys: 36min 27s, total: 1h 27min 57s
Wall time: 27min 2s


## Add scrublet outputs to adata

In [10]:
# Keep only the columns that run_scrublet added to the per-reaction obs tables.
scrublet_columns = [
    'scrublet_score',
    'scrublet_leiden',
    'cluster_scrublet_score',
    'doublet_pval',
    'doublet_bh_pval',
]
meta_scrub = meta.loc[:, scrublet_columns].copy()
meta_scrub.shape

(235093, 5)

In [11]:
# Validate before merging.  Comparing the index of a reindexed frame with the
# index it was reindexed to is always true, and barcodes missing from
# meta_scrub would silently be filled with NaN, so compare the barcode sets.
mismatched_barcodes = adata.obs.index.symmetric_difference(meta_scrub.index)
if len(mismatched_barcodes) > 0:
    raise ValueError('Different barcodes in meta and adata')

# Align the scrublet columns to adata's barcode order and append them.
adata.obs = pd.concat([adata.obs, meta_scrub.reindex(adata.obs.index)], axis=1)

In [12]:
# Preview the first rows of the per-barcode annotations after the merge.
adata.obs.head()

Unnamed: 0_level_0,latent_RT_efficiency,latent_cell_probability,latent_scale,sangerID,combinedID,donor,donor_type,region,region_finest,age,...,facility,cell_or_nuclei,modality,kit_10x,flushed,scrublet_score,scrublet_leiden,cluster_scrublet_score,doublet_pval,doublet_bh_pval
barcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
HCAHeart9508627_HCAHeart9508819_AGGAAACGTTTATCGC-1,10.730612,1.0,42174.875,HCAHeart9508627,HCAHeart9508627_HCAHeart9508819,D3,DBD,LV,LV,55-60,...,Sanger,Nuclei,Multiome-RNA,Multiome-v1,no,0.19685,12,0.208791,0.133687,0.776992
HCAHeart9508627_HCAHeart9508819_CACTAGCCATAATGAG-1,10.228636,1.0,42687.246094,HCAHeart9508627,HCAHeart9508627_HCAHeart9508819,D3,DBD,LV,LV,55-60,...,Sanger,Nuclei,Multiome-RNA,Multiome-v1,no,0.208791,12,0.208791,0.133687,0.776992
HCAHeart9508627_HCAHeart9508819_CGTTTCTCAACTAACT-1,9.976335,1.0,36491.726562,HCAHeart9508627,HCAHeart9508627_HCAHeart9508819,D3,DBD,LV,LV,55-60,...,Sanger,Nuclei,Multiome-RNA,Multiome-v1,no,0.166667,63,0.221902,0.108982,0.776992
HCAHeart9508627_HCAHeart9508819_TGTAAGTGTAAGCACC-1,9.076878,1.0,42754.671875,HCAHeart9508627,HCAHeart9508627_HCAHeart9508819,D3,DBD,LV,LV,55-60,...,Sanger,Nuclei,Multiome-RNA,Multiome-v1,no,0.19685,12,0.208791,0.133687,0.776992
HCAHeart9508627_HCAHeart9508819_ACGTTACAGCATTTCT-1,10.063547,1.0,31437.021484,HCAHeart9508627,HCAHeart9508627_HCAHeart9508819,D3,DBD,LV,LV,55-60,...,Sanger,Nuclei,Multiome-RNA,Multiome-v1,no,0.19685,612,0.19685,0.159319,0.776992


In [13]:
# Write the merged object, including the scrublet columns, to disk.
adata.write('/nfs/team205/heart/anndata_objects/8regions/QC/multiome_RNA_adult_prefilter.h5ad')

... storing 'sangerID' as categorical
... storing 'combinedID' as categorical
... storing 'donor' as categorical
... storing 'donor_type' as categorical
... storing 'region' as categorical
... storing 'region_finest' as categorical
... storing 'age' as categorical
... storing 'gender' as categorical
... storing 'facility' as categorical
... storing 'cell_or_nuclei' as categorical
... storing 'modality' as categorical
... storing 'kit_10x' as categorical
... storing 'flushed' as categorical
... storing 'scrublet_leiden' as categorical
... storing 'gene_name' as categorical
... storing 'feature_type' as categorical


In [14]:
# Display the summary of the merged object (dimensions, obs and var fields).
adata

AnnData object with n_obs × n_vars = 235093 × 36601
    obs: 'latent_RT_efficiency', 'latent_cell_probability', 'latent_scale', 'sangerID', 'combinedID', 'donor', 'donor_type', 'region', 'region_finest', 'age', 'gender', 'facility', 'cell_or_nuclei', 'modality', 'kit_10x', 'flushed', 'scrublet_score', 'scrublet_leiden', 'cluster_scrublet_score', 'doublet_pval', 'doublet_bh_pval'
    var: 'gene_name', 'feature_type', 'ambient_expression-0', 'ambient_expression-1', 'ambient_expression-10', 'ambient_expression-11', 'ambient_expression-12', 'ambient_expression-13', 'ambient_expression-14', 'ambient_expression-15', 'ambient_expression-16', 'ambient_expression-17', 'ambient_expression-18', 'ambient_expression-19', 'ambient_expression-2', 'ambient_expression-20', 'ambient_expression-21', 'ambient_expression-22', 'ambient_expression-23', 'ambient_expression-24', 'ambient_expression-25', 'ambient_expression-26', 'ambient_expression-27', 'ambient_expression-28', 'ambient_expression-29', 'ambient

In [15]:
# Number of barcodes per donor.
adata.obs['donor'].value_counts()

AH1        104998
A61         55325
D8          44233
D7          23092
AH1-A61      3748
D3           3697
Name: donor, dtype: int64