* only SAN/AVN cells
* only New data

## Import modules

In [1]:
import numpy as np
import pandas as pd
import scanpy as sc
import matplotlib.pyplot as plt
import scrublet as scr
import session_info

In [2]:
# Logging verbosity and default figure appearance for the whole notebook,
# followed by a dump of package versions for provenance.
sc.settings.verbosity = 3
sc.settings.set_figure_params(
    dpi=160,
    dpi_save=180,
    color_map='RdPu',
    vector_friendly=True,
    format='svg',
)
session_info.show()

## Functions

In [3]:
"""Functions for downstream work with outputs of remove-background."""

# to read cellbender outputs
# https://github.com/broadinstitute/CellBender/issues/57

import tables
import numpy as np
import scipy.sparse as sp
from typing import Dict


def dict_from_h5(file: str) -> Dict[str, np.ndarray]:
    """Collect every Array node of an h5 file into a ``{node name: contents}`` dict.

    Args:
        file: Path of the h5 file to open.

    Returns:
        Dictionary keyed by node name. Nodes are visited by walking the file
        from the root, so a later node silently replaces an earlier one that
        has the same name.
    """
    with tables.open_file(file) as h5:
        # The comprehension is evaluated while the file is still open.
        return {node.name: node.read() for node in h5.walk_nodes("/", "Array")}


def anndata_from_h5(file: str,
                    analyzed_barcodes_only: bool = True) -> 'anndata.AnnData':
    """Build an AnnData object from an output h5 file.

    The sparse count matrix stored in the file is transposed so that rows are
    barcodes and columns are genes. Every remaining array in the file is then
    placed in the slot whose size it matches (``obs``/``obsm`` for per-barcode
    data, ``var`` for per-gene data, ``uns`` for everything else).

    Args:
        file: The h5 file
        analyzed_barcodes_only: True restricts the matrix and the barcode list
            to the barcodes the algorithm analyzed, which lets per-barcode
            latent variables line up with ``adata.obs`` / ``adata.obsm``.
            False keeps all barcodes so the AnnData matches the size of the
            input raw count matrix; latent variables whose length does not
            match then end up in ``adata.uns``.

    Returns:
        adata: The anndata object, populated with inferred latent variables
            and metadata.

    Raises:
        ImportError: If the anndata package is not installed.
    """

    try:
        import anndata
    except ImportError:
        raise ImportError('The anndata package must be installed to use the '
                          'function anndata_from_h5()')

    d = dict_from_h5(file)

    # The file stores the matrix as genes x barcodes (CSC); flip it to
    # barcodes x genes in CSR form.
    X = sp.csc_matrix((d.pop('data'), d.pop('indices'), d.pop('indptr')),
                      shape=d.pop('shape')).transpose().tocsr()

    if analyzed_barcodes_only:
        # Two alternative key names are accepted for the analyzed-barcode
        # indices; the first one found is used.
        for inds_key in ('barcodes_analyzed_inds', 'barcode_indices_for_latents'):
            if inds_key in d:
                keep = d[inds_key]
                X = X[keep, :]
                d['barcodes'] = d['barcodes'][keep]
                break
        else:
            print('Warning: analyzed_barcodes_only=True, but the key '
                  '"barcodes_analyzed_inds" or "barcode_indices_for_latents" '
                  'is missing from the h5 file. '
                  'Will output all barcodes, and proceed as if '
                  'analyzed_barcodes_only=False')

    # Construct the count matrix, indexed by barcode (obs) and gene name (var).
    barcodes = d.pop('barcodes').astype(str)
    gene_key = 'gene_names' if 'gene_names' in d else 'name'
    gene_names = d.pop(gene_key).astype(str)
    adata = anndata.AnnData(X=X,
                            obs={'barcode': barcodes},
                            var={'gene_name': gene_names})
    adata.obs.set_index('barcode', inplace=True)
    adata.var.set_index('gene_name', inplace=True)

    # Route each leftover array to the slot whose length it matches.
    n_obs, n_var = X.shape
    for key, value in d.items():
        try:
            value = np.asarray(value)
            if value.ndim == 0:
                # scalars
                adata.uns[key] = value
            elif value.shape[0] == n_obs:
                # per-barcode: vectors (or single-column arrays) become obs
                # columns, wider arrays go to obsm
                is_single_column = value.ndim < 2 or value.shape[1] < 2
                if is_single_column:
                    adata.obs[key] = value
                else:
                    adata.obsm[key] = value
            elif value.shape[0] == n_var:
                # per-gene: byte strings are decoded to str first
                if value.dtype.name.startswith('bytes'):
                    value_for_var = value.astype(str)
                else:
                    value_for_var = value
                adata.var[key] = value_for_var
            else:
                adata.uns[key] = value
        except Exception:
            print('Unable to load data into AnnData: ', key, value, type(value))

    if analyzed_barcodes_only:
        # The index arrays themselves are not useful as obs columns.
        obs_cols = adata.obs.columns
        is_index_col = (obs_cols.str.startswith('barcodes_analyzed')
                        | obs_cols.str.startswith('barcode_indices'))
        for col in obs_cols[is_index_col]:
            try:
                del adata.obs[col]
            except Exception:
                pass

    return adata

In [4]:
# from https://github.com/Teichlab/mapcloud/blob/master/scripts/starsolo/postprocess.py

from statsmodels.stats.multitest import multipletests
# from emptydrops.matrix import CountMatrix
# from emptydrops import find_nonambient_barcodes
import scrublet as scr
import scanpy as sc
import pandas as pd
import numpy as np
import scipy

#some functions that Ni uses in scanpy scripts to run scrublet
#which in turn are inspired by my original notebook on the matter
#(extracted from scanpy_scripts 0.2.10 to get around scanpy version incompatibility)
def test_outlier(x, upper_mad_only=True):
    """Test each value of ``x`` for being an upper-tail outlier.

    A normal distribution is fitted robustly: its location is the median of
    ``x`` and its scale is a median absolute deviation (MAD) multiplied by
    1.4826 (the conventional factor that makes a MAD comparable to a normal
    standard deviation). Each value then gets the one-sided probability of
    observing something at least as large under that distribution.

    Args:
        x: 1-D array-like of numeric values.
        upper_mad_only: If True, the MAD is the median distance to the median
            of only those values strictly above the median; otherwise it is
            the median absolute deviation of all values.

    Returns:
        A tuple ``(pvals, bh_pvals)``: the raw upper-tail p-values and their
        Benjamini-Hochberg adjusted counterparts, both in the order of ``x``.

    Note:
        If no spread can be estimated (e.g. no value lies above the median,
        or the MAD is 0) the scale is NaN or 0 and the p-values are not
        meaningful; this case is not special-cased here.
    """
    # `import scipy` alone does not guarantee that the `stats` submodule has
    # been loaded, so import what is needed explicitly.
    from scipy.stats import norm

    x = np.asarray(x, dtype=float)
    med = np.median(x)
    if upper_mad_only:
        mad = np.median(x[x > med] - med) * 1.4826
    else:
        mad = np.median(np.abs(x - med)) * 1.4826
    # The survival function equals 1 - cdf but keeps precision in the far
    # upper tail, where `1 - cdf` rounds to exactly 0.
    pvals = norm.sf(x, loc=med, scale=mad)
    bh_pvals = multipletests(pvals, method='fdr_bh')[1]
    return pvals, bh_pvals

def run_scrublet(adata, resolution_function=None):
    """Score doublets with Scrublet and test over-clustered groups for doublet enrichment.

    Steps:
      1. Run Scrublet on ``adata.X`` to get a per-cell doublet score.
      2. On a copy of the data: filter genes, normalise, log-transform, keep
         highly variable genes, scale, PCA, neighbours, UMAP and Leiden
         clustering, then sub-cluster every Leiden cluster with a resolution
         that depends on its size.
      3. Take the median doublet score of every resulting cluster and test the
         cluster medians for upper-tail outliers with ``test_outlier``.

    Args:
        adata: AnnData object to annotate; it is modified in place.
            ``adata.X`` is handed to Scrublet as-is (presumably raw counts -
            TODO confirm against the caller).
        resolution_function: Callable mapping a cluster size to the Leiden
            resolution used to sub-cluster it. Defaults to
            ``max(max(log10(n) - 1, 0) ** 2, 0.1)``.

    Returns:
        None. On success the following ``adata.obs`` columns are written:
        ``scrublet_score``, ``scrublet_leiden``, ``cluster_scrublet_score``,
        ``doublet_pval`` and ``doublet_bh_pval``. If Scrublet itself fails, a
        warning is issued and ``adata`` is left without these columns.
    """
    import warnings

    if resolution_function is None:
        resolution_function = lambda x: np.maximum(np.maximum(np.log10(x)-1, 0)**2, 0.1)

    old_verbosity = sc.settings.verbosity
    sc.settings.verbosity = 1
    try:
        scrub = scr.Scrublet(adata.X)
        # this has the potential to brick for poor quality data
        # if so, abort it and everything downstream
        try:
            # second return value (predicted doublet calls) is not used
            doublet_scores, _predicted_doublets = scrub.scrub_doublets(verbose=False)
        except Exception as err:
            warnings.warn(
                f'Scrublet failed ({err!r}); no doublet annotations were added.'
            )
            return
        adata.obs['scrublet_score'] = doublet_scores

        # All preprocessing and clustering happens on a copy so that the
        # caller's matrix is left untouched.
        adata_copy = adata.copy()
        sc.pp.filter_genes(adata_copy, min_cells=3)
        sc.pp.normalize_total(adata_copy, target_sum=1e4)
        sc.pp.log1p(adata_copy)
        sc.pp.highly_variable_genes(adata_copy, min_mean=0.0125, max_mean=3, min_disp=0.5, subset=True)
        sc.pp.scale(adata_copy, zero_center=False)
        sc.pp.pca(adata_copy, svd_solver='arpack', zero_center=False)
        sc.pp.neighbors(adata_copy, n_pcs=30)
        sc.tl.umap(adata_copy)
        sc.tl.leiden(adata_copy, resolution=1)

        # Sub-cluster each first-pass cluster in turn; the refined labels
        # replace 'leiden' after every pass.
        for clst in np.unique(adata_copy.obs['leiden']):
            clst_size = (adata_copy.obs['leiden'] == clst).sum()
            sc.tl.leiden(adata_copy, restrict_to=('leiden', [clst]),
                         resolution=resolution_function(clst_size), key_added='leiden_R')
            adata_copy.obs['leiden'] = adata_copy.obs['leiden_R']

        # Median doublet score per refined cluster.
        clusters = np.unique(adata_copy.obs['leiden'])
        clst_meds = []
        for clst in clusters:
            k = adata_copy.obs['leiden'] == clst
            clst_med = np.median(adata_copy.obs.loc[k, 'scrublet_score'])
            adata_copy.obs.loc[k, 'cluster_scrublet_score'] = clst_med
            clst_meds.append(clst_med)
        clst_meds = np.array(clst_meds)

        # One p-value per cluster, broadcast back to the cells of that cluster.
        pvals, bh_pvals = test_outlier(clst_meds)
        for i, clst in enumerate(clusters):
            k = adata_copy.obs['leiden'] == clst
            adata_copy.obs.loc[k, 'pval'] = pvals[i]
            adata_copy.obs.loc[k, 'bh_pval'] = bh_pvals[i]

        # need to also export the clustering, for soupx purposes
        adata.obs['scrublet_leiden'] = adata_copy.obs['leiden']
        adata.obs['scrublet_score'] = adata_copy.obs['scrublet_score']
        adata.obs['cluster_scrublet_score'] = adata_copy.obs['cluster_scrublet_score']
        adata.obs['doublet_pval'] = adata_copy.obs['pval']
        adata.obs['doublet_bh_pval'] = adata_copy.obs['bh_pval']
        del adata_copy
    finally:
        # Restore the caller's verbosity on every exit path, including the
        # early return above and any exception raised during preprocessing.
        sc.settings.verbosity = old_verbosity

## Create anndata object

In [6]:
# Read the sample sheet and keep only the rows belonging to the
# '8regions_revision' publication.
metadata = (
    pd.read_csv(
        '/nfs/team205/heart/anndata_objects/8regions/metadata/HeartTeamSamples_Mappeddata_20221208.csv',
        sep=',',
        index_col=None,
    )
    .loc[lambda df: df['Publication'] == '8regions_revision']
)

# How many rows remain per modality.
print(metadata['modality'].value_counts())

Multiome-ATAC    17
Multiome-RNA     17
Visium            8
Name: modality, dtype: int64


In [7]:
# Narrow the sample sheet in two steps: first to one modality, then to the
# two regions of interest.
metadata = (
    metadata
    .loc[lambda df: df['modality'] == 'Multiome-RNA']    # select modality
    .loc[lambda df: df['region'].isin(['SAN', 'AVN'])]   # select SAN and AVN
)
metadata

Unnamed: 0,ID,CombinedID_Multiome,Publication,Foetal_or_Adult,Provider,Chemistry,Mapping_ver,Reference_genome,Mapping_date,Mapping_iRODS,...,region,region_finest,age,gender,facility,cell_or_nuclei,modality,kit_10x,flushed,status
473,HCAHeartST13146201,HCAHeartST13146201_HCAHeartST13129134,8regions_revision,Adult,"Sanger, Heart",Single Cell Multiome ATAC + Gene Expression v1,cellranger-arc-2.0.0,GRCh38-2020-A-2.0.0,,/seq/illumina/cellranger-arc/cellranger-arc200...,...,SAN,na,20-25,Male,Sanger,Nuclei,Multiome-RNA,Multiome-v1,yes,Healthy
474,HCAHeartST13146202,HCAHeartST13146202_HCAHeartST13129135,8regions_revision,Adult,"Sanger, Heart",Single Cell Multiome ATAC + Gene Expression v1,cellranger-arc-2.0.0,GRCh38-2020-A-2.0.0,,/seq/illumina/cellranger-arc/cellranger-arc200...,...,SAN,na,20-25,Male,Sanger,Nuclei,Multiome-RNA,Multiome-v1,yes,Healthy
475,HCAHeartST13146203,HCAHeartST13146203_HCAHeartST13129136,8regions_revision,Adult,"Sanger, Heart",Single Cell Multiome ATAC + Gene Expression v1,cellranger-arc-2.0.0,GRCh38-2020-A-2.0.0,,/seq/illumina/cellranger-arc/cellranger-arc200...,...,SAN,na,20-25,Male,Sanger,Nuclei,Multiome-RNA,Multiome-v1,yes,Healthy
476,HCAHeartST13146204,HCAHeartST13146204_HCAHeartST13129137,8regions_revision,Adult,"Sanger, Heart",Single Cell Multiome ATAC + Gene Expression v1,cellranger-arc-2.0.0,GRCh38-2020-A-2.0.0,,/seq/illumina/cellranger-arc/cellranger-arc200...,...,SAN,na,45-50,Male,Sanger,Nuclei,Multiome-RNA,Multiome-v1,yes,Healthy
477,HCAHeartST13146205,HCAHeartST13146205_HCAHeartST13129138,8regions_revision,Adult,"Sanger, Heart",Single Cell Multiome ATAC + Gene Expression v1,cellranger-arc-2.0.0,GRCh38-2020-A-2.0.0,,/seq/illumina/cellranger-arc/cellranger-arc200...,...,SAN,na,45-50,Male,Sanger,Nuclei,Multiome-RNA,Multiome-v1,yes,Healthy
478,HCAHeartST13146206,HCAHeartST13146206_HCAHeartST13129139,8regions_revision,Adult,"Sanger, Heart",Single Cell Multiome ATAC + Gene Expression v1,cellranger-arc-2.0.0,GRCh38-2020-A-2.0.0,,/seq/illumina/cellranger-arc/cellranger-arc200...,...,SAN,na,45-50,Male,Sanger,Nuclei,Multiome-RNA,Multiome-v1,yes,Healthy
479,HCAHeartST13146207,HCAHeartST13146207_HCAHeartST13129140,8regions_revision,Adult,"Sanger, Heart",Single Cell Multiome ATAC + Gene Expression v1,cellranger-arc-2.0.0,GRCh38-2020-A-2.0.0,,/seq/illumina/cellranger-arc/cellranger-arc200...,...,SAN,na,60-65,Female,Sanger,Nuclei,Multiome-RNA,Multiome-v1,yes,Healthy
480,HCAHeartST13146208,HCAHeartST13146208_HCAHeartST13129141,8regions_revision,Adult,"Sanger, Heart",Single Cell Multiome ATAC + Gene Expression v1,cellranger-arc-2.0.0,GRCh38-2020-A-2.0.0,,/seq/illumina/cellranger-arc/cellranger-arc200...,...,SAN,na,60-65,Female,Sanger,Nuclei,Multiome-RNA,Multiome-v1,yes,Healthy
543,HCAHeartST13168796,HCAHeartST13168796_HCAHeartST13167740,8regions_revision,Adult,"Sanger, Heart",Single Cell Multiome ATAC + Gene Expression v1,cellranger-arc-2.0.0,GRCh38-2020-A-2.0.0,,/seq/illumina/cellranger-arc/cellranger-arc200...,...,SAN,SAN,45-50,Female,Sanger,Nuclei,Multiome-RNA,Multiome-v1,yes,Healthy
548,HCAHeartST13189995,HCAHeartST13189995_HCAHeartST13188800,8regions_revision,Adult,"Sanger, Heart",Single Cell Multiome ATAC + Gene Expression v1,cellranger-arc-2.0.0,GRCh38-2020-A-2.0.0,,/seq/illumina/cellranger-arc/cellranger-arc200...,...,SAN,SAN,70-75,Female,Sanger,Nuclei,Multiome-RNA,Multiome-v1,yes,Healthy


In [8]:
########### edited: put id to var_names ###########

# Load one AnnData per row of the sample sheet, tag it with that row's
# metadata, then stack everything into a single object.
adatas = []
for i in range(len(metadata)):
    sample = metadata.iloc[i]
    combined_id = sample['combinedID']
    print(combined_id)

    # <CellBender_out>/<last path component>_cellbender_out_filtered.h5
    cellbender_dir = sample['CellBender_out']
    h5_path = cellbender_dir + '/' + cellbender_dir.split('/')[-1] + '_cellbender_out_filtered.h5'
    sample_adata = anndata_from_h5(h5_path, analyzed_barcodes_only=False)

    # replace var_names with gene_id (the previous index is kept as a column)
    sample_adata.var.reset_index(inplace=True)
    sample_adata.var.set_index('id', inplace=True)

    # modify barcodes: prefix with the sample's combinedID
    sample_adata.obs.index = combined_id + '_' + sample_adata.obs.index

    # add metadata: the same value for every barcode of this sample
    for col in ['sangerID', 'combinedID', 'donor', 'donor_type', 'region', 'region_finest', 'age',
                'gender', 'facility', 'cell_or_nuclei', 'modality', 'kit_10x', 'flushed']:
        sample_adata.obs[col] = sample[col]

    adatas.append(sample_adata)


# concatenate; barcodes already carry the sample prefix, so no suffix and no
# batch column are added
adata = adatas[0].concatenate(
    adatas[1:],
    index_unique=None,
    batch_key=None,
)
adata.shape

HCAHeartST13146201_HCAHeartST13129134
HCAHeartST13146202_HCAHeartST13129135
HCAHeartST13146203_HCAHeartST13129136
HCAHeartST13146204_HCAHeartST13129137
HCAHeartST13146205_HCAHeartST13129138
HCAHeartST13146206_HCAHeartST13129139
HCAHeartST13146207_HCAHeartST13129140
HCAHeartST13146208_HCAHeartST13129141
HCAHeartST13168796_HCAHeartST13167740
HCAHeartST13189995_HCAHeartST13188800
HCAHeartST13189996_HCAHeartST13188801
HCAHeartST13189997_HCAHeartST13188802
HCAHeartST13180615_HCAHeartST13177112
HCAHeartST13180616_HCAHeartST13177113
HCAHeartST13180617_HCAHeartST13177114
HCAHeartST13180618_HCAHeartST13177115
HCAHeartST13180619_HCAHeartST13177116


(146652, 36601)

In [9]:
# Number of barcodes in the concatenated object per combinedID.
adata.obs['combinedID'].value_counts()

HCAHeartST13180617_HCAHeartST13177114    16625
HCAHeartST13180615_HCAHeartST13177112    16497
HCAHeartST13189996_HCAHeartST13188801    15965
HCAHeartST13180616_HCAHeartST13177113    12300
HCAHeartST13180618_HCAHeartST13177115    10480
HCAHeartST13189997_HCAHeartST13188802    10159
HCAHeartST13189995_HCAHeartST13188800     9365
HCAHeartST13180619_HCAHeartST13177116     9001
HCAHeartST13168796_HCAHeartST13167740     8619
HCAHeartST13146207_HCAHeartST13129140     7593
HCAHeartST13146204_HCAHeartST13129137     6206
HCAHeartST13146205_HCAHeartST13129138     5833
HCAHeartST13146206_HCAHeartST13129139     4770
HCAHeartST13146208_HCAHeartST13129141     4392
HCAHeartST13146202_HCAHeartST13129135     4286
HCAHeartST13146203_HCAHeartST13129136     3515
HCAHeartST13146201_HCAHeartST13129134     1046
Name: combinedID, dtype: int64

## Run scrublet

In [10]:
%%time

# per rxn
for i,ID in enumerate(adata.obs['combinedID'].unique()):
    print(ID)
    
    ad = adata[adata.obs['combinedID'] == ID].copy()
    run_scrublet(ad)
    if i==0:
        meta = ad.obs
    else:
        meta = pd.concat([meta, ad.obs])
    del ad

HCAHeartST13146201_HCAHeartST13129134


  w.setdiag(float(target_total) / tots_use)


HCAHeartST13146202_HCAHeartST13129135


  w.setdiag(float(target_total) / tots_use)


HCAHeartST13146203_HCAHeartST13129136
HCAHeartST13146204_HCAHeartST13129137
HCAHeartST13146205_HCAHeartST13129138


  w.setdiag(float(target_total) / tots_use)


HCAHeartST13146206_HCAHeartST13129139


  w.setdiag(float(target_total) / tots_use)


HCAHeartST13146207_HCAHeartST13129140


  w.setdiag(float(target_total) / tots_use)


HCAHeartST13146208_HCAHeartST13129141


  w.setdiag(float(target_total) / tots_use)


HCAHeartST13168796_HCAHeartST13167740


  w.setdiag(float(target_total) / tots_use)


HCAHeartST13189995_HCAHeartST13188800


  w.setdiag(float(target_total) / tots_use)


HCAHeartST13189996_HCAHeartST13188801


  w.setdiag(float(target_total) / tots_use)


HCAHeartST13189997_HCAHeartST13188802


  w.setdiag(float(target_total) / tots_use)


HCAHeartST13180615_HCAHeartST13177112


  w.setdiag(float(target_total) / tots_use)


HCAHeartST13180616_HCAHeartST13177113


  w.setdiag(float(target_total) / tots_use)


HCAHeartST13180617_HCAHeartST13177114


  w.setdiag(float(target_total) / tots_use)


HCAHeartST13180618_HCAHeartST13177115


  w.setdiag(float(target_total) / tots_use)


HCAHeartST13180619_HCAHeartST13177116


  w.setdiag(float(target_total) / tots_use)


CPU times: user 32min 23s, sys: 16min 30s, total: 48min 54s
Wall time: 12min 21s


## Add scrublet outputs to adata

In [11]:
# Keep only the columns that run_scrublet added, as an independent copy.
scrublet_cols = [
    'scrublet_score',
    'scrublet_leiden',
    'cluster_scrublet_score',
    'doublet_pval',
    'doublet_bh_pval',
]
meta_scrub = meta.loc[:, scrublet_cols].copy()
meta_scrub.shape

(146652, 5)

In [12]:
# Attach the scrublet columns to adata.obs, aligned on barcode.
#
# reindex() always returns exactly the index it is given, so comparing the
# reindexed index with adata.obs.index can never fail. Instead verify that both
# tables hold the same set of unique barcodes before aligning them.
barcodes_match = (
    meta_scrub.index.is_unique
    and adata.obs.index.is_unique
    and len(meta_scrub) == adata.n_obs
    and meta_scrub.index.isin(adata.obs.index).all()
)
if not barcodes_match:
    raise ValueError('Different barcodes in meta and adata')

# Drop scrublet columns left over from a previous run of this cell so that
# re-running it does not create duplicate column names.
adata.obs = pd.concat(
    [
        adata.obs.drop(columns=meta_scrub.columns, errors='ignore'),
        meta_scrub.reindex(adata.obs.index),
    ],
    axis=1,
)

In [13]:
# Preview the first rows of the per-barcode annotation table.
adata.obs.head()

Unnamed: 0_level_0,latent_RT_efficiency,latent_cell_probability,latent_scale,sangerID,combinedID,donor,donor_type,region,region_finest,age,...,facility,cell_or_nuclei,modality,kit_10x,flushed,scrublet_score,scrublet_leiden,cluster_scrublet_score,doublet_pval,doublet_bh_pval
barcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
HCAHeartST13146201_HCAHeartST13129134_GCAAACCGTCAAGTGC-1,3.620071,1.0,24020.279297,HCAHeartST13146201,HCAHeartST13146201_HCAHeartST13129134,AV10,DBD,SAN,na,20-25,...,Sanger,Nuclei,Multiome-RNA,Multiome-v1,yes,0.225806,63,0.254438,4.3e-05,0.000586
HCAHeartST13146201_HCAHeartST13129134_ACAGCCGGTTAGGCGT-1,3.613875,1.0,23438.445312,HCAHeartST13146201,HCAHeartST13146201_HCAHeartST13129134,AV10,DBD,SAN,na,20-25,...,Sanger,Nuclei,Multiome-RNA,Multiome-v1,yes,0.136531,63,0.254438,4.3e-05,0.000586
HCAHeartST13146201_HCAHeartST13129134_AGTGTGGCAAAGCTCC-1,3.296565,1.0,15958.253906,HCAHeartST13146201,HCAHeartST13146201_HCAHeartST13129134,AV10,DBD,SAN,na,20-25,...,Sanger,Nuclei,Multiome-RNA,Multiome-v1,yes,0.254438,62,0.254438,4.3e-05,0.000586
HCAHeartST13146201_HCAHeartST13129134_CAAGTGAAGCATGAAG-1,3.371856,1.0,18051.541016,HCAHeartST13146201,HCAHeartST13146201_HCAHeartST13129134,AV10,DBD,SAN,na,20-25,...,Sanger,Nuclei,Multiome-RNA,Multiome-v1,yes,0.289474,70,0.225806,0.000674,0.006062
HCAHeartST13146201_HCAHeartST13129134_TGACTCCTCGTAATCA-1,3.205887,1.0,15712.057617,HCAHeartST13146201,HCAHeartST13146201_HCAHeartST13129134,AV10,DBD,SAN,na,20-25,...,Sanger,Nuclei,Multiome-RNA,Multiome-v1,yes,0.254438,72,0.213888,0.001828,0.014099


In [14]:
# Save the concatenated object to disk.
adata.write('/nfs/team205/heart/anndata_objects/8regions/QC/multiome_RNA_adult_new-SAN-AVN_prefilter.h5ad')

... storing 'sangerID' as categorical
... storing 'combinedID' as categorical
... storing 'donor' as categorical
... storing 'donor_type' as categorical
... storing 'region' as categorical
... storing 'region_finest' as categorical
... storing 'age' as categorical
... storing 'gender' as categorical
... storing 'facility' as categorical
... storing 'cell_or_nuclei' as categorical
... storing 'modality' as categorical
... storing 'kit_10x' as categorical
... storing 'flushed' as categorical
... storing 'scrublet_leiden' as categorical
... storing 'gene_name' as categorical
... storing 'feature_type' as categorical


In [15]:
# Display the AnnData summary.
adata

AnnData object with n_obs × n_vars = 146652 × 36601
    obs: 'latent_RT_efficiency', 'latent_cell_probability', 'latent_scale', 'sangerID', 'combinedID', 'donor', 'donor_type', 'region', 'region_finest', 'age', 'gender', 'facility', 'cell_or_nuclei', 'modality', 'kit_10x', 'flushed', 'scrublet_score', 'scrublet_leiden', 'cluster_scrublet_score', 'doublet_pval', 'doublet_bh_pval'
    var: 'gene_name', 'feature_type', 'ambient_expression-0', 'ambient_expression-1', 'ambient_expression-10', 'ambient_expression-11', 'ambient_expression-12', 'ambient_expression-13', 'ambient_expression-14', 'ambient_expression-15', 'ambient_expression-16', 'ambient_expression-2', 'ambient_expression-3', 'ambient_expression-4', 'ambient_expression-5', 'ambient_expression-6', 'ambient_expression-7', 'ambient_expression-8', 'ambient_expression-9'
    obsm: 'latent_gene_encoding'

In [16]:
# Number of barcodes per donor.
adata.obs['donor'].value_counts()

AH2     64903
AV13    35489
AV14    16809
AV3     11985
AV10     8847
AH1      8619
Name: donor, dtype: int64