# Prepare Compass inputs, random groups within cell states

Because many micropool clusters are contaminated with cells from other annotated groups, we instead divide the cells within each cell type into random groups of 20 (cardiomyocytes, CM) and 50 (all cells).

## Setup

In [1]:
import numpy as np
import pandas as pd
import scanpy as sc
import anndata as ad
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.pyplot import rc_context
import bbknn
import re
import sctk
import milopy
import milopy.core as milo

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Scanpy verbosity levels: errors (0), warnings (1), info (2), hints (3)
sc.settings.verbosity = 0
sc.logging.print_header()
# Figure defaults: on-screen dpi, saved-figure dpi, background, colour map and size
sc.settings.set_figure_params(
    dpi=80,
    dpi_save=300,
    facecolor='white',
    color_map='Reds',
    figsize=(5, 5),
)

scanpy==1.9.1 anndata==0.8.0 umap==0.5.3 numpy==1.22.4 scipy==1.8.1 pandas==1.4.3 scikit-learn==1.1.1 statsmodels==0.13.2 python-igraph==0.9.11 pynndescent==0.5.7


In [3]:
# Perform pseudobulk on sc data

import scipy.sparse as sp

def pseudo_bulk(
        adata, groupby, use_rep='X', highly_variable=False, FUN=np.mean):
    """Make pseudo bulk data from grouped sc data

    Rows of the chosen representation are grouped by an ``obs`` annotation
    and every group is collapsed to a single profile with ``FUN``.

    Parameters
    ----------
    adata
        AnnData-like object. Depending on the arguments, ``obs``, ``var``,
        ``var_names``, ``X``, ``raw``, ``layers``, ``obsm`` and ``shape``
        are read.
    groupby
        Name of the ``adata.obs`` column defining the groups. A categorical
        column contributes all of its categories; any other dtype is cast to
        ``str`` and its unique values are used.
    use_rep
        ``'X'``, ``'raw'``, a key of ``adata.layers``, a key of
        ``adata.obsm``, or a ``numpy.ndarray`` with one row per observation.
    highly_variable
        Falsy to keep every feature; a list/tuple of feature names to keep;
        or any other truthy value to use the ``adata.var['highly_variable']``
        column as a mask. The mask is only applied to the ``'X'``, ``'raw'``
        and layer representations.
    FUN
        Reduction, called as ``FUN(block, axis=0)`` on sparse blocks and as
        ``FUN(block, axis=0, keepdims=True)`` on dense blocks.

    Returns
    -------
    pandas.DataFrame
        One row per feature and one column per group.

    Raises
    ------
    KeyError
        If ``use_rep`` names no known representation, or is an array whose
        row count differs from ``adata.shape[0]``.
    """
    if adata.obs[groupby].dtype.name == 'category':
        group_attr = adata.obs[groupby].values
        groups = adata.obs[groupby].cat.categories.values
    else:
        group_attr = adata.obs[groupby].astype(str).values
        groups = np.unique(group_attr)
    n_level = len(groups)

    # An array has to be recognised before any string comparison or mapping
    # lookup: comparing it with 'X' does not give a plain bool and it is not
    # a usable key for the layers/obsm lookups, so checking it last (as the
    # code used to) meant the array branch could never be reached.
    rep_is_array = isinstance(use_rep, np.ndarray)

    if highly_variable:
        if isinstance(highly_variable, (list, tuple)):
            if not rep_is_array and use_rep == 'raw':
                k_hv = adata.raw.var_names.isin(highly_variable)
            else:
                k_hv = adata.var_names.isin(highly_variable)
        else:
            k_hv = adata.var['highly_variable'].values

    if rep_is_array:
        if use_rep.shape[0] != adata.shape[0]:
            raise KeyError(
                f'use_rep array has {use_rep.shape[0]} rows but adata has '
                f'{adata.shape[0]} observations.')
        x = use_rep
        features = np.arange(x.shape[1])
    elif use_rep == 'X':
        x = adata.X
        features = adata.var_names.values
        if highly_variable:
            x = x[:, k_hv]
            features = features[k_hv]
    elif use_rep == 'raw':
        x = adata.raw.X
        features = adata.raw.var_names.values
        if highly_variable:
            x = x[:, k_hv]
            features = features[k_hv]
    elif use_rep in adata.layers.keys():
        x = adata.layers[use_rep]
        features = adata.var_names.values
        if highly_variable:
            x = x[:, k_hv]
            features = features[k_hv]
    elif use_rep in adata.obsm.keys():
        # Embedding columns have no names, so they are numbered instead
        x = adata.obsm[use_rep]
        features = np.arange(x.shape[1])
    else:
        raise KeyError(f'{use_rep} invalid.')

    # One row per group while filling; transposed to features x groups below
    summarised = np.zeros((n_level, x.shape[1]))
    for i, grp in enumerate(groups):
        k_grp = group_attr == grp
        if sp.issparse(x):
            summarised[i] = FUN(x[k_grp, :], axis=0)
        else:
            summarised[i] = FUN(x[k_grp, :], axis=0, keepdims=True)
    return pd.DataFrame(summarised.T, columns=groups, index=features)

def _partition_indices(n_items, partition_size, method, rng):
    """Return one integer partition index per item for a block of ``n_items``."""
    # At least one partition, even when the block is smaller than half a partition
    n_partition = max(int(np.round(n_items / partition_size)), 1)
    if method == "random":
        # Independent draws: partition sizes vary and some may stay empty
        return rng.randint(low=0, high=n_partition, size=n_items)
    if method == "random_even":
        # Sizes differ by at most one; shuffling assigns them to random items
        part_sizes = [len(chunk) for chunk in np.array_split(np.arange(n_items), n_partition)]
        part_idx = np.repeat(np.arange(n_partition), part_sizes)
        rng.shuffle(part_idx)
        return part_idx
    raise NotImplementedError(method)


def random_partition(
    adata,
    partition_size,
    groupby=None,
    method="random_even",
    key_added="partition_labels",
    random_state=0,
):
    """Randomly split observations into partitions of roughly ``partition_size``.

    The labels are written to ``adata.obs[key_added]``; nothing is returned.

    Parameters
    ----------
    adata
        AnnData-like object; ``obs`` (and ``n_obs`` when ``groupby`` is not
        given) are used.
    partition_size
        Target number of observations per partition. The number of partitions
        is ``round(size / partition_size)``, with a minimum of one.
    groupby
        Optional ``adata.obs`` column. When given, each group is partitioned
        separately and labels have the form ``"<group>,<index>"``; otherwise
        the labels are the partition indices as strings.
    method
        ``"random_even"`` for near-equal partition sizes, ``"random"`` for
        independent uniform draws.
    key_added
        Name of the ``adata.obs`` column that receives the labels.
    random_state
        Seed for a private ``numpy.random.RandomState``. The labels are the
        same as those obtained by seeding the global NumPy generator with this
        value, but the global generator is left untouched.

    Raises
    ------
    KeyError
        If ``groupby`` is not a column of ``adata.obs``.
    NotImplementedError
        If ``method`` is not one of the supported names.
    ValueError
        If ``partition_size`` is not positive.
    """
    if groupby and groupby not in adata.obs.columns:
        raise KeyError(f"{groupby} is not a valid obs annotation.")
    if method not in ("random", "random_even"):
        raise NotImplementedError(method)
    if partition_size <= 0:
        raise ValueError(f"partition_size must be positive, got {partition_size}.")

    # A private generator keeps the call reproducible without reseeding the
    # process-wide NumPy RNG for everything that runs afterwards.
    rng = np.random.RandomState(random_state)

    if groupby:
        groups = adata.obs[groupby].unique()
        # Start from the stringified group names; each group's rows are then
        # overwritten with "<group>,<partition index>"
        label_df = adata.obs[[groupby]].astype(str).rename(columns={groupby: key_added})
        for grp in groups:
            k = adata.obs[groupby] == grp
            grp_size = int(k.sum())
            part_idx = _partition_indices(grp_size, partition_size, method, rng)
            label_df.loc[k, key_added] = [f"{grp},{i}" for i in part_idx]
        adata.obs[key_added] = label_df[key_added]
    else:
        part_idx = _partition_indices(adata.n_obs, partition_size, method, rng)
        adata.obs[key_added] = part_idx.astype(str)

## Import suspension data

In [4]:
# Load the annotated fetal heart object, then display its summary
fetal = sc.read(
    '/nfs/team205/heart/anndata_objects/Foetal/final_annotation/FetalHearts13Aug.h5ad'
)
fetal

AnnData object with n_obs × n_vars = 176050 × 3311
    obs: 'latent_RT_efficiency', 'latent_cell_probability', 'latent_scale', 'sangerID', 'combinedID', 'donor', 'donor_type', 'region_finest', 'facility', 'cell_or_nuclei', 'modality', 'kit_10x', 'flushed', 'scrublet_score', 'scrublet_leiden', 'cluster_scrublet_score', 'doublet_pval', 'doublet_bh_pval', 'n_genes', 'n_counts', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'total_counts_mt', 'log1p_total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'log1p_total_counts_ribo', 'pct_counts_ribo', 'HB_score', 'batch_key', '_scvi_batch', '_scvi_labels', 'annotation_Semih', 'leiden_scVI', 'cell_type_pre', 'pth', 'ito', 'FACSgate', 'leiden_scVI_0.05', 'leiden_scVI_0.01', 'leiden_scVI_0.1', 'leiden_scVI_0.15', 'leiden_scVI_0.2', 'leiden_scVI_0.25', 'leiden_scVI_0.3', 'leiden_scVI_0.35', 'leiden_scVI_0.4', 'cell_type', 'coarse_grain', 'fine_grain', 'region', 'sex', 'age', 'cycling', 'S_score', 'G2M_scor

In [5]:
# Recover log-normed data for all genes to adata.X (allows Compass to calculate reactions across all genes, not just HVG-subsetted!)
print(fetal.shape)
# The object returned by raw.to_adata() carries no .raw of its own, so the
# guard lets this cell be re-run without failing on a missing .raw.
if fetal.raw is not None:
    fetal = fetal.raw.to_adata()
print(fetal.shape)

(176050, 3311)
(176050, 36601)


In [6]:
# Sanity check: var_names should be gene symbols and X should hold log-normalised values
print(fetal.var.index[:2], fetal.X.data[:5], sep='\n')

Index(['MIR1302-2HG', 'FAM138A'], dtype='object', name='gene_name_multiome-1')
[0.09802545 0.09802545 0.41538525 0.41538525 0.6009918 ]


In [7]:
# Load the adult heart object, then display its summary
adult = sc.read(
    '/nfs/team205/heart/anndata_objects/HCAv1_6_region/global_raw.h5ad'
)
adult

AnnData object with n_obs × n_vars = 486134 × 33538
    obs: 'NRP', 'age_group', 'cell_source', 'cell_type', 'donor', 'gender', 'n_counts', 'n_genes', 'percent_mito', 'percent_ribo', 'region', 'sample', 'scrublet_score', 'source', 'type', 'version', 'cell_states', 'Used'
    var: 'gene_ids-Harvard-Nuclei', 'feature_types-Harvard-Nuclei', 'gene_ids-Sanger-Nuclei', 'feature_types-Sanger-Nuclei', 'gene_ids-Sanger-Cells', 'feature_types-Sanger-Cells', 'gene_ids-Sanger-CD45', 'feature_types-Sanger-CD45'
    uns: 'cell_type_colors'
    obsm: 'X_pca', 'X_umap'

In [8]:
# Check var_names are gene symbols, and whether the data is log-normed
# NOTE(review): the values printed for adult.X are whole numbers, which looks
# like raw counts rather than log-normalised data (the fetal check printed
# fractional values) — confirm the scale before combining the two datasets.
print(adult.var.index[:2])
print(adult.X.data[:5])

Index(['MIR1302-2HG', 'FAM138A'], dtype='object')
[2. 1. 1. 1. 1.]


# Subset to CMs only

In [9]:
# Keep only atrial and ventricular cardiomyocytes; shapes are printed before and after
print(adult.shape)
# .copy() makes the subset an independent AnnData instead of a view, because
# adult.obs is assigned to later on and that would otherwise write into a view.
adult = adult[adult.obs['cell_type'].isin(['Atrial_Cardiomyocyte', 'Ventricular_Cardiomyocyte'])].copy()
print(adult.shape)

(486134, 33538)
(148772, 33538)


In [10]:
# Keep only cells whose coarse_grain label is 'Cardiomyocytes'; shapes are printed before and after
print(fetal.shape)
# .copy() makes the subset an independent AnnData instead of a view, because
# fetal.obs is assigned to later on and that would otherwise write into a view.
fetal = fetal[fetal.obs['coarse_grain'].isin(['Cardiomyocytes'])].copy()
print(fetal.shape)

(176050, 36601)
(64751, 36601)


In [11]:
# Concatenate adult and fetal CM data

In [32]:
# List the distinct cell-state labels present in the adult cardiomyocytes
adult.obs['cell_states'].unique()

['vCM1', 'vCM2', 'vCM3', 'vCM4', 'vCM5', 'aCM1', 'aCM4', 'aCM2', 'aCM5', 'aCM3']
Categories (10, object): ['aCM1', 'aCM2', 'aCM3', 'aCM4', ..., 'vCM2', 'vCM3', 'vCM4', 'vCM5']

In [33]:
# Expose the fetal fine_grain annotation under 'cell_states', the column name the adult object already uses
fetal.obs['cell_states'] = fetal.obs['fine_grain']

In [34]:
# Shapes of the two cardiomyocyte subsets (adult first, then fetal)
print(adult.shape, fetal.shape, sep='\n')

(148772, 33538)
(64751, 36601)


In [35]:
# Tag every cell with its dataset of origin. Note that the adult object already
# has a 'type' column in obs, which this assignment overwrites.
adult.obs['type']='adult'
fetal.obs['type']='fetal'
# join="inner" keeps only what the two objects have in common; index_unique=None
# leaves the cell barcodes exactly as they are.
# NOTE(review): the earlier checks printed fractional (log-scale) values for
# fetal.X but whole-number values for adult.X — confirm both datasets are on
# the same scale before they are stacked and averaged together.
combined = ad.concat([adult, fetal], join="inner", index_unique=None)
combined

AnnData object with n_obs × n_vars = 213523 × 31915
    obs: 'cell_type', 'donor', 'n_counts', 'n_genes', 'region', 'scrublet_score', 'type', 'cell_states'
    obsm: 'X_umap'

In [36]:
# Number of cells per cell state in the combined object
combined.obs['cell_states'].value_counts()

vCM1                                     71162
vCM2                                     25658
vCM3                                     20168
VentricularCardiomyocytesCompact         13740
aCM1                                     12885
VentricularCardiomyocytesPRRX1pos        11627
VentricularCardiomyocytesCycling         11297
vCM4                                      7651
VentricularCardiomyocytesTrabeculated     6936
AtrialCardiomyocytesLeft                  6304
AtrialCardiomyocytesRight                 5452
Cardiofibromyocytes                       5278
aCM2                                      4733
aCM3                                      4096
aCM4                                      1581
VentricularConductionSystemDistal         1568
SinoatrialNodeCardiomyocytes               814
VentricularConductionSystemProximal        730
AtrioventricularNodeCardiomyocytes         689
vCM5                                       650
AtrialCardiomyocytesCycling                316
aCM5         

In [37]:
# Name used by the remaining cells; this binds a second name to the same object, it does not copy it
adata = combined

In [38]:
# Make random partitions of ~20 cells within each cell state, using the
# random_partition function defined earlier in this notebook (the call does
# not go through the sctk package). Labels are written to adata.obs['partition_20'].
random_partition(adata, partition_size=20, groupby="cell_states", key_added="partition_20")

In [39]:
# Inspect obs to confirm the new partition_20 labels ("<cell state>,<partition index>")
adata.obs

Unnamed: 0,cell_type,donor,n_counts,n_genes,region,scrublet_score,type,cell_states,partition_20
AAACCCAAGCAAACAT-1-H0015_apex,Ventricular_Cardiomyocyte,H5,3216.0,1365,AX,0.147122,adult,vCM1,"vCM1,2959"
AAACCCAAGCTACTGT-1-H0015_apex,Ventricular_Cardiomyocyte,H5,3182.0,1521,AX,0.185751,adult,vCM1,"vCM1,1294"
AAACCCATCAAACCCA-1-H0015_apex,Ventricular_Cardiomyocyte,H5,3804.0,1584,AX,0.108062,adult,vCM2,"vCM2,422"
AAACCCATCTGGTCAA-1-H0015_apex,Ventricular_Cardiomyocyte,H5,3906.0,1677,AX,0.102990,adult,vCM2,"vCM2,960"
AAACGAAAGATAGGGA-1-H0015_apex,Ventricular_Cardiomyocyte,H5,3139.0,1523,AX,0.155556,adult,vCM2,"vCM2,552"
...,...,...,...,...,...,...,...,...,...
BHF_F_Hea11933675_BHF_F_Hea11596628_CCTGTTGGTTGTGACA-1,Cardiomyocyte,C98,2086.0,1081,WholeHeart,0.034335,fetal,VentricularCardiomyocytesPRRX1pos,"VentricularCardiomyocytesPRRX1pos,37"
BHF_F_Hea11933675_BHF_F_Hea11596628_GCGCTAGGTGTTGCTT-1,Cardiomyocyte,C98,1806.0,1298,WholeHeart,0.131579,fetal,Cardiofibromyocytes,"Cardiofibromyocytes,57"
BHF_F_Hea11933675_BHF_F_Hea11596628_AGCTACGTCTTGTCTG-1,Cardiomyocyte,C98,1334.0,889,WholeHeart,0.041667,fetal,VentricularCardiomyocytesCycling,"VentricularCardiomyocytesCycling,129"
BHF_F_Hea11933675_BHF_F_Hea11596628_CTGCTATGTTTGGTTC-1,Cardiomyocyte,C98,1251.0,792,WholeHeart,0.037862,fetal,VentricularCardiomyocytesPRRX1pos,"VentricularCardiomyocytesPRRX1pos,222"


In [40]:
# Pseudobulk each random partition using Ni's pseudo_bulk function (defined above).
# The result is stored under the name `pseudo_bulk`, which the following cells
# read, so that name stops referring to the function after the first run.
# Keeping a separate handle on the function lets this cell be re-run without
# trying to call the DataFrame left over from the previous run.
if callable(pseudo_bulk):
    pseudo_bulk_fn = pseudo_bulk
pseudo_bulk = pseudo_bulk_fn(adata,
                             groupby='partition_20')

In [41]:
# Dimensions are (features, groups); then preview the first three rows
print(pseudo_bulk.shape)
pseudo_bulk.iloc[:3]

(31915, 10675)


Unnamed: 0,"AtrialCardiomyocytesCycling,0","AtrialCardiomyocytesCycling,1","AtrialCardiomyocytesCycling,10","AtrialCardiomyocytesCycling,11","AtrialCardiomyocytesCycling,12","AtrialCardiomyocytesCycling,13","AtrialCardiomyocytesCycling,14","AtrialCardiomyocytesCycling,15","AtrialCardiomyocytesCycling,2","AtrialCardiomyocytesCycling,3",...,"vCM5,29","vCM5,3","vCM5,30","vCM5,31","vCM5,4","vCM5,5","vCM5,6","vCM5,7","vCM5,8","vCM5,9"
MIR1302-2HG,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
FAM138A,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
OR4F5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [42]:
import os

# All Compass input files for this run go into one directory; create it if it
# does not exist yet so the writes below cannot fail on a missing folder.
compass_dir = '/nfs/team205/heart/compass/fetal_adult_combined_rand20'
os.makedirs(compass_dir, exist_ok=True)

# make sample_names.tsv (one pseudobulk group label per line)
pseudo_bulk.columns.to_frame().to_csv(os.path.join(compass_dir, 'sample_names.tsv'), sep="\t", index=False, header=False)


# make genes.tsv (one gene name per line, in the row order of the matrix)
genes = pd.DataFrame(pseudo_bulk.index.to_list())
genes.to_csv(os.path.join(compass_dir, 'genes.tsv'), sep="\t", index = False, header=False)

# make expression.mtx
from scipy.io import mmwrite, mmread
from scipy.sparse import csr_matrix

# Expression matrix: genes x pseudobulk groups, all genes (not HVG subsetted).
# NOTE(review): the fetal input was checked to be log-normalised, but the adult
# values printed earlier in this notebook look like integer counts — confirm
# the scale of the adult columns before describing this matrix as log-normed.
sparce_mtx = csr_matrix(pseudo_bulk)
mmwrite(os.path.join(compass_dir, 'expression.mtx'), sparce_mtx)

# Export cell metadata

In [43]:
# Writes the full obs table, one row per cell, indexed by barcode.
# Alter so it includes things you need (a reference e.g. barcode and a cell type e.g. in fine_grain)
adata.obs.to_csv(
    '/nfs/team205/heart/compass/fetal_adult_combined_rand20/cell_metadata.csv'
)