# Prepare Compass inputs, random groups within cell states

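Because a large number of micropool clusters were contaminated with cells from other annotated groups, we instead divide the cells within each cell type into random groups of fixed size: 20 (CM) and 50 (all cells). Note that the code below currently uses `partition_size=10` on the cardiomyocyte subset.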

## Setup

In [1]:
import numpy as np
import pandas as pd
import scanpy as sc
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.pyplot import rc_context
import bbknn
import re
import sctk

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Logging level: 0 = errors only (1 = warnings, 2 = info, 3 = hints)
sc.settings.verbosity = 0
sc.logging.print_header()

# Figure defaults shared by every plot in this notebook
sc.settings.set_figure_params(
    dpi=80,
    dpi_save=300,
    facecolor='white',
    color_map='Reds',
    figsize=(5, 5),
)

scanpy==1.9.1 anndata==0.8.0 umap==0.5.3 numpy==1.22.4 scipy==1.8.1 pandas==1.4.3 scikit-learn==1.1.1 statsmodels==0.13.2 python-igraph==0.9.11 pynndescent==0.5.7


In [3]:
# Perform pseudobulk on sc data

import scipy.sparse as sp

def pseudo_bulk(
        adata, groupby, use_rep='X', highly_variable=False, FUN=np.mean):
    """Make pseudo bulk data from grouped sc data.

    Parameters
    ----------
    adata
        AnnData-like object. Depending on the arguments, ``obs``, ``var``,
        ``var_names``, ``X``, ``raw``, ``layers``, ``obsm`` and ``shape``
        are read; nothing is modified.
    groupby
        Column of ``adata.obs`` defining the groups. Categorical columns are
        summarised per category, other dtypes per unique string value.
    use_rep
        Which matrix to summarise: ``'X'``, ``'raw'``, a key of
        ``adata.layers``, a key of ``adata.obsm``, or a ``numpy.ndarray``
        with one row per observation.
    highly_variable
        Falsy: keep all features. A list/tuple: keep the named features.
        Any other truthy value: keep features flagged in
        ``adata.var['highly_variable']``. Ignored for ``obsm`` keys and
        arrays, which have no feature names.
    FUN
        Reducer called as ``FUN(block, axis=0)`` on sparse blocks and
        ``FUN(block, axis=0, keepdims=True)`` on dense blocks.

    Returns
    -------
    pandas.DataFrame
        Features (rows) x groups (columns).

    Raises
    ------
    KeyError
        If ``use_rep`` is not a recognised key, or is an array whose row
        count differs from ``adata.shape[0]``.
    """
    obs_col = adata.obs[groupby]
    if obs_col.dtype.name == 'category':
        group_attr = obs_col.values
        groups = obs_col.cat.categories.values
    else:
        group_attr = obs_col.astype(str).values
        groups = np.unique(group_attr)

    # Resolve the data matrix. Arrays are handled first: comparing an array
    # with a string, or looking it up in a mapping, raises before a later
    # isinstance check could ever be reached.
    var_names = None
    if isinstance(use_rep, np.ndarray):
        if use_rep.shape[0] != adata.shape[0]:
            raise KeyError(
                f'use_rep array has {use_rep.shape[0]} rows, '
                f'expected {adata.shape[0]}.')
        x = use_rep
    elif not isinstance(use_rep, str):
        raise KeyError(f'{use_rep} invalid.')
    elif use_rep == 'X':
        x, var_names = adata.X, adata.var_names
    elif use_rep == 'raw':
        x, var_names = adata.raw.X, adata.raw.var_names
    elif use_rep in adata.layers.keys():
        x, var_names = adata.layers[use_rep], adata.var_names
    elif use_rep in adata.obsm.keys():
        x = adata.obsm[use_rep]
    else:
        raise KeyError(f'{use_rep} invalid.')

    if var_names is None:
        features = np.arange(x.shape[1])
    else:
        features = var_names.values
        if highly_variable:
            if isinstance(highly_variable, (list, tuple)):
                k_hv = var_names.isin(highly_variable)
            else:
                k_hv = adata.var['highly_variable'].values
                # adata.raw may hold more genes than adata.var; a positional
                # mask would then not fit, so match by gene name instead.
                if len(k_hv) != len(var_names):
                    k_hv = var_names.isin(adata.var_names[k_hv])
            x = x[:, k_hv]
            features = features[k_hv]

    x_is_sparse = sp.issparse(x)
    summarised = np.zeros((len(groups), x.shape[1]))
    for i, grp in enumerate(groups):
        block = x[group_attr == grp, :]
        if x_is_sparse and block.shape[0] == 0:
            # Empty group (e.g. an unused category): sparse reductions such
            # as mean fail on zero rows, so take the dense path, which costs
            # nothing for an empty block.
            block = block.toarray()
        if sp.issparse(block):
            grp_summary = FUN(block, axis=0)
        else:
            grp_summary = FUN(block, axis=0, keepdims=True)
        if sp.issparse(grp_summary):
            # Some reducers return a sparse result on sparse input.
            grp_summary = grp_summary.toarray()
        # Flatten (1, n) matrices/arrays to a plain length-n row.
        summarised[i] = np.asarray(grp_summary).ravel()
    return pd.DataFrame(summarised.T, columns=groups, index=features)

def _partition_index(n_items, partition_size, method, rng):
    """Return one partition index per item, for about n_items / partition_size partitions."""
    n_partition = max(int(np.round(n_items / partition_size)), 1)
    if method == "random":
        # Independent draws: partition sizes vary.
        return rng.randint(low=0, high=n_partition, size=n_items)
    if method == "random_even":
        # Near-equal partition sizes, then shuffle which item gets which index.
        part_sizes = [
            len(chunk)
            for chunk in np.array_split(np.arange(n_items), n_partition)
        ]
        part_idx = np.repeat(np.arange(n_partition), part_sizes)
        rng.shuffle(part_idx)
        return part_idx
    raise NotImplementedError(method)


def random_partition(
    adata,
    partition_size,
    groupby=None,
    method="random_even",
    key_added="partition_labels",
    random_state=0,
):
    """Randomly assign observations to partitions and store the labels in ``adata.obs``.

    Parameters
    ----------
    adata
        AnnData-like object; ``obs`` (and ``n_obs`` when ``groupby`` is not
        given) are used. ``adata.obs[key_added]`` is written in place.
    partition_size
        Target number of observations per partition. The number of
        partitions is ``round(n / partition_size)``, with a minimum of 1.
    groupby
        Optional ``obs`` column. When given, partitioning is done separately
        within each group and labels are ``"<group>,<index>"``; otherwise
        labels are the partition index as a string.
    method
        ``"random_even"`` for near-equal partition sizes, ``"random"`` for
        independent uniform draws.
    key_added
        Name of the ``obs`` column that receives the labels.
    random_state
        Seed for a private ``numpy.random.RandomState``. The global NumPy
        random state is left untouched.

    Raises
    ------
    ValueError
        If ``partition_size`` is not positive.
    KeyError
        If ``groupby`` is not a column of ``adata.obs``.
    NotImplementedError
        If ``method`` is not recognised.
    """
    if partition_size <= 0:
        raise ValueError(f"partition_size must be positive, got {partition_size}.")
    # A private generator yields the same stream as seeding the global one,
    # without reseeding np.random for the rest of the session.
    rng = np.random.RandomState(random_state)
    if groupby:
        if groupby not in adata.obs.columns:
            raise KeyError(f"{groupby} is not a valid obs annotation.")
        group_col = adata.obs[groupby]
        label_df = adata.obs[[groupby]].astype(str).rename(columns={groupby: key_added})
        for grp in group_col.unique():
            k = group_col == grp
            part_idx = _partition_index(int(k.sum()), partition_size, method, rng)
            label_df.loc[k, key_added] = [f"{grp},{i}" for i in part_idx]
        adata.obs[key_added] = label_df[key_added]
    else:
        part_idx = _partition_index(adata.n_obs, partition_size, method, rng)
        adata.obs[key_added] = part_idx.astype(str)

## Import suspension data

In [4]:
# Load the annotated foetal heart AnnData object from the team share
fetal_h5ad_path = '/nfs/team205/heart/anndata_objects/Foetal/final_annotation/FetalHearts13Aug.h5ad'
adata = sc.read(fetal_h5ad_path)
adata

AnnData object with n_obs × n_vars = 176050 × 3311
    obs: 'latent_RT_efficiency', 'latent_cell_probability', 'latent_scale', 'sangerID', 'combinedID', 'donor', 'donor_type', 'region_finest', 'facility', 'cell_or_nuclei', 'modality', 'kit_10x', 'flushed', 'scrublet_score', 'scrublet_leiden', 'cluster_scrublet_score', 'doublet_pval', 'doublet_bh_pval', 'n_genes', 'n_counts', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'total_counts_mt', 'log1p_total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'log1p_total_counts_ribo', 'pct_counts_ribo', 'HB_score', 'batch_key', '_scvi_batch', '_scvi_labels', 'annotation_Semih', 'leiden_scVI', 'cell_type_pre', 'pth', 'ito', 'FACSgate', 'leiden_scVI_0.05', 'leiden_scVI_0.01', 'leiden_scVI_0.1', 'leiden_scVI_0.15', 'leiden_scVI_0.2', 'leiden_scVI_0.25', 'leiden_scVI_0.3', 'leiden_scVI_0.35', 'leiden_scVI_0.4', 'cell_type', 'coarse_grain', 'fine_grain', 'region', 'sex', 'age', 'cycling', 'S_score', 'G2M_scor

In [5]:
# adata.X is HVG-subsetted; swap in the log-normed data for all genes held in
# adata.raw so Compass can calculate reactions across every gene.
shape_hvg = adata.shape
adata = adata.raw.to_adata()
shape_all_genes = adata.shape
print(shape_hvg)
print(shape_all_genes)

(176050, 3311)
(176050, 36601)


In [6]:
# Sanity check: var_names should be gene symbols and the values log-normed
first_gene_names = adata.var.index[:2]
first_stored_values = adata.X.data[:5]
print(first_gene_names)
print(first_stored_values)

Index(['MIR1302-2HG', 'FAM138A'], dtype='object', name='gene_name_multiome-1')
[0.09802545 0.09802545 0.41538525 0.41538525 0.6009918 ]


# Subset to CMs only

In [8]:
print(adata.shape)
# .copy() turns the subset into a real AnnData object rather than a view.
# random_partition later writes a new column to adata.obs; on a view that
# triggers an implicit copy and a warning.
adata = adata[adata.obs['coarse_grain'] == 'Cardiomyocytes'].copy()
print(adata.shape)

(176050, 36601)
(64751, 36601)


# Prepare data for Compass
"The input gene expression matrix can be either a tab-delimited text file (tsv) or a matrix market format (mtx) containing gene expression estimates (CPM, TPM, or similar scaled units) with one row per gene, one column per sample."

## .mtx method (gene x sampleID, 3 files)
This allows you to specify the 'samples', so rather than calculating reactions in all single cells individually (or micro-pooled) we can make our own "samples" (pseudobulking them)

We need to ensure that each 'sample' is representative of the cells it summarises. Here we use a new batch key of cell state and the previous batch key to form the 'samples'.
This requires 3 files, to be passed as follows: ```--data expression.mtx genes.tsv sample_names.tsv```

Another option would be to use Milo or Metacell to make small samples. Below I am just going to use a grouping key to make samples within each cell state (in the code this is the `partition_10` column of random groups).

In [9]:
# Cells per fine-grained cell state; these are the groups that get randomly
# partitioned below (partition_size=10)
adata.obs['fine_grain'].value_counts()

VentricularCardiomyocytesCompact         13740
VentricularCardiomyocytesPRRX1pos        11627
VentricularCardiomyocytesCycling         11297
VentricularCardiomyocytesTrabeculated     6936
AtrialCardiomyocytesLeft                  6304
AtrialCardiomyocytesRight                 5452
Cardiofibromyocytes                       5278
VentricularConductionSystemDistal         1568
SinoatrialNodeCardiomyocytes               814
VentricularConductionSystemProximal        730
AtrioventricularNodeCardiomyocytes         689
AtrialCardiomyocytesCycling                316
Name: fine_grain, dtype: int64

In [10]:
# Randomly split each fine_grain cell state into groups of ~10 cells using the
# random_partition function defined above; labels go to adata.obs['partition_10']
random_partition(
    adata,
    partition_size=10,
    groupby="fine_grain",
    key_added="partition_10",
)

  adata.obs[key_added] = label_df[key_added]


In [11]:
# Inspect obs: the new partition_10 column holds "<fine_grain>,<partition index>" labels
adata.obs

Unnamed: 0_level_0,latent_RT_efficiency,latent_cell_probability,latent_scale,sangerID,combinedID,donor,donor_type,region_finest,facility,cell_or_nuclei,...,region,sex,age,cycling,S_score,G2M_score,phase,stress_score,hb1_score,partition_10
barcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
BHF_F_Hea10402917_GATCGATAGACCCACC-1,3.064704,0.999994,10902.040039,BHF_F_Hea10402917,na,C86,na,Heart,Sanger,Cell,...,WholeHeart,Male,12W5D,No,-0.057931,-0.131408,G1,0.640494,-0.196018,"VentricularCardiomyocytesPRRX1pos,965"
BHF_F_Hea10402917_CATCAAGCATGGGACA-1,3.019039,0.999993,9811.644531,BHF_F_Hea10402917,na,C86,na,Heart,Sanger,Cell,...,WholeHeart,Male,12W5D,Yes,1.306764,0.343963,S,-0.167570,-0.169170,"VentricularCardiomyocytesCycling,900"
BHF_F_Hea10402917_CACCACTAGAGTGAGA-1,2.600170,0.999989,8616.544922,BHF_F_Hea10402917,na,C86,na,Heart,Sanger,Cell,...,WholeHeart,Male,12W5D,No,-0.029281,-0.075899,G1,-0.012796,-0.177667,"VentricularCardiomyocytesCompact,955"
BHF_F_Hea10402917_TCTTTCCTCTGGTGTA-1,2.481375,0.999992,8874.579102,BHF_F_Hea10402917,na,C86,na,Heart,Sanger,Cell,...,WholeHeart,Male,12W5D,No,-0.015760,-0.130530,G1,-0.192063,-0.178246,"VentricularCardiomyocytesTrabeculated,176"
BHF_F_Hea10402917_AAATGCCGTCAACATC-1,2.400682,0.999959,7570.651855,BHF_F_Hea10402917,na,C86,na,Heart,Sanger,Cell,...,WholeHeart,Male,12W5D,No,-0.198699,-0.225604,G1,1.522051,-0.289346,"Cardiofibromyocytes,18"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
BHF_F_Hea11933675_BHF_F_Hea11596628_CCTGTTGGTTGTGACA-1,0.977603,0.998388,2734.687744,BHF_F_Hea11933675,BHF_F_Hea11933675_BHF_F_Hea11596628,C98,na,Heart,Sanger,Nuclei,...,WholeHeart,Male,9W1D,No,-0.071211,-0.173691,G1,-0.367560,0.247539,"VentricularCardiomyocytesPRRX1pos,1079"
BHF_F_Hea11933675_BHF_F_Hea11596628_GCGCTAGGTGTTGCTT-1,0.861791,0.997482,4039.140381,BHF_F_Hea11933675,BHF_F_Hea11933675_BHF_F_Hea11596628,C98,na,Heart,Sanger,Nuclei,...,WholeHeart,Male,9W1D,No,0.054354,-0.197448,S,-0.389992,0.570969,"Cardiofibromyocytes,157"
BHF_F_Hea11933675_BHF_F_Hea11596628_AGCTACGTCTTGTCTG-1,0.811526,0.990161,4187.259766,BHF_F_Hea11933675,BHF_F_Hea11933675_BHF_F_Hea11596628,C98,na,Heart,Sanger,Nuclei,...,WholeHeart,Male,9W1D,Yes,-0.081391,-0.079086,G1,0.232271,0.579496,"VentricularCardiomyocytesCycling,468"
BHF_F_Hea11933675_BHF_F_Hea11596628_CTGCTATGTTTGGTTC-1,0.807019,0.987438,3982.637451,BHF_F_Hea11933675,BHF_F_Hea11933675_BHF_F_Hea11596628,C98,na,Heart,Sanger,Nuclei,...,WholeHeart,Male,9W1D,Yes,-0.070203,-0.121220,G1,-0.365374,0.868231,"VentricularCardiomyocytesPRRX1pos,273"


In [12]:
# Pseudobulk use Ni's psudobulk function
pseudo_bulk = pseudo_bulk(adata,
                         groupby='partition_10')

In [13]:
print(pseudo_bulk.shape)
pseudo_bulk.head(3)

(36601, 6476)


Unnamed: 0,"AtrialCardiomyocytesCycling,0","AtrialCardiomyocytesCycling,1","AtrialCardiomyocytesCycling,10","AtrialCardiomyocytesCycling,11","AtrialCardiomyocytesCycling,12","AtrialCardiomyocytesCycling,13","AtrialCardiomyocytesCycling,14","AtrialCardiomyocytesCycling,15","AtrialCardiomyocytesCycling,16","AtrialCardiomyocytesCycling,17",...,"VentricularConductionSystemProximal,66","VentricularConductionSystemProximal,67","VentricularConductionSystemProximal,68","VentricularConductionSystemProximal,69","VentricularConductionSystemProximal,7","VentricularConductionSystemProximal,70","VentricularConductionSystemProximal,71","VentricularConductionSystemProximal,72","VentricularConductionSystemProximal,8","VentricularConductionSystemProximal,9"
MIR1302-2HG,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
FAM138A,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
OR4F5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [14]:
# make sample_names.tsv
pseudo_bulk.columns.to_frame().to_csv('/nfs/team205/heart/compass/fetal_allCM_random10/sample_names.tsv', sep="\t", index=False, header=False)


# make genes.tsv
genes = pseudo_bulk.index.to_list()
genes = pd.DataFrame(genes)
genes.to_csv('/nfs/team205/heart/compass/fetal_allCM_random10/genes.tsv', sep="\t", index = False, header=False)

# make expression.mtx
from scipy.io import mmwrite, mmread
from scipy.sparse import csr_matrix

# Count Matrix (log normed, all genes, not HVG subsetted)
sparce_mtx = csr_matrix(pseudo_bulk)
sparce_mtx = sparce_mtx
mmwrite('/nfs/team205/heart/compass/fetal_allCM_random10/expression.mtx', sparce_mtx)

# Export cell metadata

In [15]:
# Alter so it includes things you need (a reference e.g. barcode and a cell type e.g. in fine_grain)
cell_metadata_path = '/nfs/team205/heart/compass/fetal_allCM_random10/cell_metadata.csv'
adata.obs.to_csv(cell_metadata_path)

# Run Compass

## bsub command

```
bsub \
-G teichlab \
-q basement \
-M300000 \
-R "select[mem>300000] rusage[mem=300000] span[hosts=1]" \
-n 30 \
-J all_pool50 \
-o 'log/compass.%J.out' \
-e 'log/compass.%J.err' \
'/software/singularity-v3.6.4/bin/singularity run -B /nfs,/lustre -B ~/COMPASSResources:/usr/local/lib/python3.8/site-packages/compass/Resources /nfs/cellgeni/singularity/images/compass-v0.9.10.2-cplex-v20.10.sif compass --data /nfs/team205/heart/compass/fetal_allcells/fetal_allcells_count_matrix.tsv --species homo_sapiens --precache --calc-metabolites --output-dir /nfs/team205/heart/compass/fetal_allcells/out/ --detailed-perf --num-processes 30 --microcluster-size 50'
```

remove --microcluster-size flag to run on single-cell data

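If using the .mtx method, change the `--data` input to the three files, e.g. "--data expression.mtx genes.tsv sample_names.tsv" (this notebook writes them to `/nfs/team205/heart/compass/fetal_allCM_random10/`).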