In [1]:
import scanpy as sc
import pandas as pd
import numpy as np
import anndata
import scipy
import os

from scipy.io import mmwrite, mmread
from scipy.sparse import csr_matrix

import sctk

In [2]:
import session_info
session_info.show()

In [3]:
sc.settings.set_figure_params(dpi=120)

# Variables

In [17]:
job_name="aCMs_AnyRegion" #  corresponds to name on input adata(s) .h5ad files. E.g. cell type or bespoke job like "aCMs_WholeSample"

In [18]:
path_to_RNA_object='/nfs/team205/heart/anndata_objects/Foetal/Feb28ObjectRaw.h5ad' # global object
path_to_ATAC_object='/nfs/team205/heart/anndata_objects/Foetal/multiome_ATAC/ArchR/project_output/PeakMatrix/Foetal_Peaks.h5ad' # global object

In [19]:
celltype_key = 'fine_grain'
celltypes_to_analyse = ['AtrialCardiomyocytesRight','AtrialCardiomyocytesLeft']

In [20]:
region_subset=False

if region_subset:
    region_key='region'
    regions_to_analyse = ['whole sample']

In [21]:
obs_to_keep=['sangerID', 'combinedID', 'donor', 
                       'facility', 'cell_or_nuclei', 'modality', 'kit_10x', 
                       'scrublet_score', 'doublet_pval', 'doublet_bh_pval', 'n_genes', 'n_counts',
                       'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts',
                       'log1p_total_counts', 'total_counts_mt', 'log1p_total_counts_mt',
                       'pct_counts_mt', 'total_counts_ribo', 'log1p_total_counts_ribo',
                       'pct_counts_ribo', 'HB_score', 'multiplexed',
                        'batch_key','region','FACSgate','sex','week','trimester',
                     'fine_grain','mid_grain','coarse_grain',
                     'heart_or_greatvessels','cycling','S_score','G2M_score','phase','stress_score','hb1_score']

In [22]:
RNA_var_to_keep=['gene_name_scRNA-0','gene_id']

In [23]:
paired_adata_out_path=os.path.join("/lustre/scratch126/cellgen/team205/heart/objects/fetal/RNA",f"{job_name}_paired.h5ad")
paired_adata_out_path

'/lustre/scratch126/cellgen/team205/heart/objects/fetal/RNA/aCMs_AnyRegion_paired.h5ad'

In [24]:
multiome_only=True

# Prepare paired anndata

In [25]:
# rna
adata_rna = sc.read_h5ad(path_to_RNA_object)

# subset cells
adata_rna = adata_rna[adata_rna.obs[celltype_key].isin(celltypes_to_analyse)]

# subset regions
if region_subset:
    adata_rna = adata_rna[adata_rna.obs[region_key].isin(regions_to_analyse)]

# add modality to .var
adata_rna.var['modality']='Gene Expression'

adata_rna

  adata_rna.var['modality']='Gene Expression'


AnnData object with n_obs × n_vars = 17302 × 36601
    obs: 'latent_RT_efficiency', 'latent_cell_probability', 'latent_scale', 'sangerID', 'combinedID', 'donor', 'region', 'age', 'facility', 'cell_or_nuclei', 'modality', 'kit_10x', 'scrublet_score', 'doublet_pval', 'doublet_bh_pval', 'n_genes', 'n_counts', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'total_counts_mt', 'log1p_total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'log1p_total_counts_ribo', 'pct_counts_ribo', 'HB_score', 'donor_by_library-prep', 'multiplexed', 'SOC | status', 'SOC | log_prob_singleton', 'SOC | log_prob_doublet', 'batch_key', '_scvi_batch', 'FACSgate', 'fine_grain', 'mid_grain', 'coarse_grain', 'sex', 'week', 'trimester', 'heart_or_greatvessels', 'cycling', 'S_score', 'G2M_score', 'phase', '_scvi_labels', 'stress_score', 'hb1_score'
    var: 'gene_name_scRNA-0', 'gene_id', 'modality'
    uns: 'FACSgate_colors', '_scvi_manager_uuid', '_scvi_uuid', 'age_colors', 'c

In [26]:
pd.crosstab(adata_rna.obs[celltype_key],adata_rna.obs['region'])

region,aorta,apex,atria,atrial septum,base,whole sample,heart without node,node,outflow tract,pulmonary arches and branches
fine_grain,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
AtrialCardiomyocytesLeft,8,16,474,956,342,4017,360,273,5,155
AtrialCardiomyocytesRight,159,9,457,162,665,3738,100,5389,6,11


In [27]:
pd.crosstab(adata_rna.obs[celltype_key],adata_rna.obs['kit_10x'])

kit_10x,3prime-v3,5prime-v2,Multiome-v1
fine_grain,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
AtrialCardiomyocytesLeft,358,443,5805
AtrialCardiomyocytesRight,674,661,9361


In [28]:
pd.crosstab(adata_rna.obs[celltype_key],adata_rna.obs['cycling'])

cycling,no,yes
fine_grain,Unnamed: 1_level_1,Unnamed: 2_level_1
AtrialCardiomyocytesLeft,4802,1804
AtrialCardiomyocytesRight,7574,3122


In [29]:
# atac
# ie only Multiome data
adata_atac = sc.read_h5ad(path_to_ATAC_object)

# subset cells
adata_atac = adata_atac[adata_atac.obs[celltype_key].isin(celltypes_to_analyse)]
print(adata_atac.obs[celltype_key].value_counts())

# subset regions
if region_subset:
    adata_atac = adata_atac[adata_atac.obs[region_key].isin(regions_to_analyse)]

# add modality to .var
adata_atac.var['modality']='Peaks'

adata_atac

AtrialCardiomyocytesRight    8187
AtrialCardiomyocytesLeft     4777
Name: fine_grain, dtype: int64


  adata_atac.var['modality']='Peaks'


AnnData object with n_obs × n_vars = 12964 × 508040
    obs: 'sangerID', 'combinedID', 'donor', 'facility', 'cell_or_nuclei', 'modality', 'kit_10x', 'multiplexed', 'batch_key', 'region', 'FACSgate', 'sex', 'week', 'trimester', 'fine_grain', 'mid_grain', 'coarse_grain', 'heart_or_greatvessels'
    var: 'seqnames', 'start', 'end', 'width', 'strand', 'score', 'replicateScoreQuantile', 'groupScoreQuantile', 'Reproducibility', 'GroupReplicate', 'distToGeneStart', 'nearestGene', 'peakType', 'distToTSS', 'nearestTSS', 'GC', 'idx', 'N', 'modality'

In [30]:
pd.crosstab(adata_atac.obs[celltype_key],adata_atac.obs['region'])

region,aorta,atria,atrial septum,heart without node,node,pulmonary arches and branches,whole sample
fine_grain,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
AtrialCardiomyocytesLeft,2,430,668,286,245,105,3041
AtrialCardiomyocytesRight,75,431,109,80,4862,7,2623


In [31]:
# shared barcodes, ie Multiome data (not all, since MultiomeRNA has a bit more data than MultiomeATAC)
barcodes_inter=list(set(adata_rna.obs_names).intersection(adata_atac.obs_names))
len(barcodes_inter)

# take shared cells
multiome_rna=adata_rna[barcodes_inter,:]
multiome_atac=adata_atac[barcodes_inter,:]

# concatenate
adata_paired=anndata.AnnData(
    X=scipy.sparse.hstack([multiome_rna.X,multiome_atac.X]).tocsr(), # concatenate sparse matrix and convert to Compressed Sparse Row format
    obs=multiome_rna.obs[obs_to_keep],
    var=pd.concat([multiome_rna.var[['modality']+RNA_var_to_keep], \
                   multiome_atac.var[['modality']]
                  ],axis=0)
)
adata_paired

AnnData object with n_obs × n_vars = 12964 × 544641
    obs: 'sangerID', 'combinedID', 'donor', 'facility', 'cell_or_nuclei', 'modality', 'kit_10x', 'scrublet_score', 'doublet_pval', 'doublet_bh_pval', 'n_genes', 'n_counts', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'total_counts_mt', 'log1p_total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'log1p_total_counts_ribo', 'pct_counts_ribo', 'HB_score', 'multiplexed', 'batch_key', 'region', 'FACSgate', 'sex', 'week', 'trimester', 'fine_grain', 'mid_grain', 'coarse_grain', 'heart_or_greatvessels', 'cycling', 'S_score', 'G2M_score', 'phase', 'stress_score', 'hb1_score'
    var: 'modality', 'gene_name_scRNA-0', 'gene_id'

In [32]:
# modify modality columns in .obs
adata_paired.obs['modality'] = 'Multiome'
adata_paired.obs['modality'].value_counts()

Multiome    12964
Name: modality, dtype: int64

In [33]:
adata_paired.obs

Unnamed: 0_level_0,sangerID,combinedID,donor,facility,cell_or_nuclei,modality,kit_10x,scrublet_score,doublet_pval,doublet_bh_pval,...,fine_grain,mid_grain,coarse_grain,heart_or_greatvessels,cycling,S_score,G2M_score,phase,stress_score,hb1_score
barcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
BHF_F_Hea11933666_BHF_F_Hea11596619_GCTTTACGTTGCATCT-1,BHF_F_Hea11933666,BHF_F_Hea11933666_BHF_F_Hea11596619,Hst33,Sanger,nuclei,Multiome,Multiome-v1,0.036105,0.772012,0.913587,...,AtrialCardiomyocytesLeft,AtrialCardiomyocytes,Cardiomyocytes,heart,no,-0.119442,-0.146085,G1,-0.420342,-0.254106
BHF_F_Hea11933668_BHF_F_Hea11596621_CCTTGTTCAATTGACT-1,BHF_F_Hea11933668,BHF_F_Hea11933668_BHF_F_Hea11596621,Hst33,Sanger,nuclei,Multiome,Multiome-v1,0.028430,0.869914,0.925448,...,AtrialCardiomyocytesLeft,AtrialCardiomyocytes,Cardiomyocytes,heart,no,-0.041229,-0.138849,G1,-0.364514,-0.265534
HCAHeartST13386009_HCAHeartST13303419_GTTTACCGTTGCATCT-1,HCAHeartST13386009,HCAHeartST13386009_HCAHeartST13303419,Hst41,Sanger,nuclei,Multiome,Multiome-v1,0.054002,0.620244,0.733829,...,AtrialCardiomyocytesLeft,AtrialCardiomyocytes,Cardiomyocytes,heart,no,-0.055126,-0.117691,G1,-0.311239,0.444615
HCAHeartST13386009_HCAHeartST13303419_GTCATGAGTAGCAGCT-1,HCAHeartST13386009,HCAHeartST13386009_HCAHeartST13303419,Hst41,Sanger,nuclei,Multiome,Multiome-v1,0.035455,0.670376,0.733829,...,AtrialCardiomyocytesLeft,AtrialCardiomyocytes,Cardiomyocytes,heart,no,-0.040533,-0.100181,G1,-0.228039,0.719630
7089STDY13216927_BHF_F_Hea13242534_AGGTGAGGTTAGTTGG-1,7089STDY13216927,7089STDY13216927_BHF_F_Hea13242534,Hst36,Sanger,nuclei,Multiome,Multiome-v1,0.030818,0.639032,0.841935,...,AtrialCardiomyocytesRight,AtrialCardiomyocytes,Cardiomyocytes,heart,yes,0.197882,-0.118771,S,-0.299509,-0.173325
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7089STDY13216920_BHF_F_Hea13242527_AACGGTAAGTTAGGCT-1,7089STDY13216920,7089STDY13216920_BHF_F_Hea13242527,Hst40,Sanger,nuclei,Multiome,Multiome-v1,0.079812,0.563518,0.722573,...,AtrialCardiomyocytesRight,AtrialCardiomyocytes,Cardiomyocytes,heart,yes,-0.014605,0.002866,G2M,-0.297443,-0.144262
BHF_F_Hea13188317_BHF_F_Hea13187619_GCTTAGTAGCTTAACA-1,BHF_F_Hea13188317,BHF_F_Hea13188317_BHF_F_Hea13187619,Hst41,Sanger,nuclei,Multiome,Multiome-v1,0.092437,0.639402,0.901964,...,AtrialCardiomyocytesRight,AtrialCardiomyocytes,Cardiomyocytes,heart,no,-0.084742,-0.185395,G1,-0.481188,-0.147605
BHF_F_Hea11933668_BHF_F_Hea11596621_TATTTGCTCCTTAAGA-1,BHF_F_Hea11933668,BHF_F_Hea11933668_BHF_F_Hea11596621,Hst33,Sanger,nuclei,Multiome,Multiome-v1,0.037171,0.841468,0.925448,...,AtrialCardiomyocytesLeft,AtrialCardiomyocytes,Cardiomyocytes,heart,no,-0.059559,-0.124683,G1,-0.041850,-0.163431
BHF_F_Hea11933669_BHF_F_Hea11596622_TAGTTTGAGGTCAAAG-1,BHF_F_Hea11933669,BHF_F_Hea11933669_BHF_F_Hea11596622,Hst33,Sanger,nuclei,Multiome,Multiome-v1,0.038835,0.778765,0.881283,...,AtrialCardiomyocytesRight,AtrialCardiomyocytes,Cardiomyocytes,heart,no,-0.090159,-0.061663,G1,-0.293251,-0.271267


In [34]:
adata_paired.var

Unnamed: 0,modality,gene_name_scRNA-0,gene_id
MIR1302-2HG,Gene Expression,MIR1302-2HG,ENSG00000243485
FAM138A,Gene Expression,FAM138A,ENSG00000237613
OR4F5,Gene Expression,OR4F5,ENSG00000186092
AL627309.1,Gene Expression,AL627309.1,ENSG00000238009
AL627309.3,Gene Expression,AL627309.3,ENSG00000239945
...,...,...,...
chrX:155820072_155820572,Peaks,,
chrX:155874490_155874990,Peaks,,
chrX:155880548_155881048,Peaks,,
chrX:155881064_155881564,Peaks,,


In [35]:
adata_paired.X.data[:10]

array([1., 1., 1., 1., 1., 1., 2., 1., 2., 3.], dtype=float32)

In [36]:
# save
adata_paired.write(paired_adata_out_path)
print(f"saved to {paired_adata_out_path}")

saved to /lustre/scratch126/cellgen/team205/heart/objects/fetal/RNA/aCMs_AnyRegion_paired.h5ad


# Prepare unimodality data

In [37]:
# If there is an object of different modality e.g. scRNAseq then make that too
if multiome_only:
    print("only making a single multiome object...")
else:
    print("making unimodality data too")
    modality_col="scRNA" # define what you want to go into modality column
    unimodality_adata_out_path=os.path.join("/lustre/scratch126/cellgen/team205/heart/objects/fetal/RNA",f"{job_name}_unimodality.h5ad")

    # select single-cell data (3p and 5p)
    adata_unimodality = adata_rna[adata_rna.obs['kit_10x'].isin(['3prime-v3','5prime-v2'])]
    adata_unimodality.obs['kit_10x'].value_counts()

    # make adata
    adata_unimodality=anndata.AnnData(
        X=adata_scrna.X, # concatenate sparse matrix and convert to Compressed Sparse Row format
        obs=adata_scrna.obs[obs_to_keep],
        var=adata_scrna.var[['modality']+RNA_var_to_keep]
    )

    # modify modality columns in .obs
    adata_unimodality.obs['modality'] = modality_col
    adata_unimodality.obs['modality'].value_counts()

    adata_unimodality.write(unimodality_adata_out_path)
    print(f"saved to {unimodality_adata_out_path}")

only making a single multiome object...
