# Import modules

In [1]:
import warnings
warnings.simplefilter("ignore", UserWarning)

import anndata
import matplotlib.pyplot as plt
import matplotlib as mpl
# enables correct plotting of text
mpl.rcParams['pdf.fonttype'] = 42
mpl.rcParams['ps.fonttype'] = 42 
import seaborn as sns

import numpy as np
import numpy.random as random
import pandas as pd
import scanpy as sc

In [2]:
# Scanpy log level 3 = hints (0: errors, 1: warnings, 2: info, 3: hints)
sc.settings.verbosity = 3
# Default figure style: 80 dpi, white background, 'RdPu' colour map
sc.settings.set_figure_params(
    dpi=80,
    facecolor='white',
    color_map='RdPu',
)

In [3]:
# import own function
# Load the helper file at a fixed absolute path as a module object bound to `utils`
# (used further down as utils.txt2list and utils.sctk_subsample).
import importlib.util
import sys
spec = importlib.util.spec_from_file_location("module.name", "/nfs/team205/kk18/function/python/utils.py")
utils = importlib.util.module_from_spec(spec)
# the module is registered in sys.modules under the key "module.name", not "utils"
sys.modules["module.name"] = utils
# execute the file's top-level code so its definitions become attributes of `utils`
spec.loader.exec_module(utils)

In [4]:
# Output directories for figures and tables.
# NOTE(review): neither variable is referenced in the cells visible here — confirm they are used elsewhere.
figdir = '/nfs/team205/heart/fetal_ms_figures/raw_plots_kk/Aug2024'
table_dir = '/nfs/team205/heart/fetal_ms_figures/tables/Aug2024'

In [5]:
# Show the directory the kernel is currently running in
import os
os.getcwd()

'/nfs/team205/kk18/notebooks/Foetal/Trisomy21/RNA'

# Read in anndata

## Euploid

In [6]:
# Load the euploid foetal heart object from disk and show its summary
eup = sc.read_h5ad(
    '/nfs/team205/heart/anndata_objects/Foetal/Feb28ObjectRaw_finegrain_updated.h5ad'
)
eup

AnnData object with n_obs × n_vars = 297473 × 36601
    obs: 'latent_RT_efficiency', 'latent_cell_probability', 'latent_scale', 'sangerID', 'combinedID', 'donor', 'region', 'age', 'facility', 'cell_or_nuclei', 'modality', 'kit_10x', 'scrublet_score', 'doublet_pval', 'doublet_bh_pval', 'n_genes', 'n_counts', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'total_counts_mt', 'log1p_total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'log1p_total_counts_ribo', 'pct_counts_ribo', 'HB_score', 'donor_by_library-prep', 'multiplexed', 'SOC | status', 'SOC | log_prob_singleton', 'SOC | log_prob_doublet', 'batch_key', '_scvi_batch', 'FACSgate', 'fine_grain', 'mid_grain', 'coarse_grain', 'sex', 'week', 'trimester', 'heart_or_greatvessels', 'cycling', 'S_score', 'G2M_score', 'phase', '_scvi_labels', 'stress_score', 'hb1_score'
    var: 'gene_name_scRNA-0', 'gene_id'
    uns: 'FACSgate_colors', '_scvi_manager_uuid', '_scvi_uuid', 'age_colors', 'cell_or_nucl

In [7]:
# Keep only the observations whose 10x kit is 'Multiome-v1' (all genes retained)
eup = eup[eup.obs['kit_10x'] == 'Multiome-v1', :]
eup.shape

(211145, 36601)

In [8]:
# Nuclei per (age, sex) in the euploid Multiome subset
pd.crosstab(index=eup.obs['age'], columns=eup.obs['sex'])

sex,female,male
age,Unnamed: 1_level_1,Unnamed: 2_level_1
4W3D,0,2490
5W4D,13770,0
7W0D,0,2660
8W4D,3907,0
9W2D,2436,0
9W3D,408,0
12W0D,7268,0
13W0D,38234,0
14W0D,0,9708
15W0D,25386,0


In [9]:
# Nuclei per (age, region) in the euploid Multiome subset
pd.crosstab(index=eup.obs['age'], columns=eup.obs['region'])

region,aorta,apex,atria,atrial septum,ductus arteriosus,whole sample,heart without node,left ventricle,node,outflow tract,pericardium,pulmonary arches and branches,right ventricle,right ventricle and papillaries
age,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
4W3D,0,986,0,0,0,0,0,0,0,1504,0,0,0,0
5W4D,0,0,0,0,0,13770,0,0,0,0,0,0,0,0
7W0D,0,0,0,0,0,2660,0,0,0,0,0,0,0,0
8W4D,0,0,0,0,0,3907,0,0,0,0,0,0,0,0
9W2D,0,0,0,0,0,2436,0,0,0,0,0,0,0,0
9W3D,0,0,0,0,0,408,0,0,0,0,0,0,0,0
12W0D,0,0,0,0,0,7268,0,0,0,0,0,0,0,0
13W0D,0,0,0,0,0,38234,0,0,0,0,0,0,0,0
14W0D,0,0,0,0,0,0,4273,0,5435,0,0,0,0,0
15W0D,15766,0,0,0,0,0,5384,0,4236,0,0,0,0,0


In [10]:
# Nuclei per (donor, region) in the euploid Multiome subset
pd.crosstab(index=eup.obs['donor'], columns=eup.obs['region'])

region,aorta,apex,atria,atrial septum,ductus arteriosus,whole sample,heart without node,left ventricle,node,outflow tract,pericardium,pulmonary arches and branches,right ventricle,right ventricle and papillaries
donor,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
C82,0,0,0,0,0,408,0,0,0,0,0,0,0,0
C85,0,0,0,0,0,2436,0,0,0,0,0,0,0,0
C87,0,0,0,0,0,3907,0,0,0,0,0,0,0,0
C92,0,0,0,0,0,13770,0,0,0,0,0,0,0,0
C98,0,0,0,0,0,2660,0,0,0,0,0,0,0,0
C104,0,986,0,0,0,0,0,0,0,1504,0,0,0,0
Hst33,0,0,0,0,0,38234,0,0,0,0,0,0,0,0
Hst36,0,0,0,0,0,7268,0,0,0,0,0,0,0,0
Hst39,15766,0,0,0,0,0,5384,0,4236,0,0,0,0,0
Hst40,0,0,0,0,0,0,4273,0,5435,0,0,0,0,0


In [11]:
# Nuclei per (donor, age) in the euploid Multiome subset
pd.crosstab(index=eup.obs['donor'], columns=eup.obs['age'])

age,4W3D,5W4D,7W0D,8W4D,9W2D,9W3D,12W0D,13W0D,14W0D,15W0D,20W0D
donor,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
C82,0,0,0,0,0,408,0,0,0,0,0
C85,0,0,0,0,2436,0,0,0,0,0,0
C87,0,0,0,3907,0,0,0,0,0,0,0
C92,0,13770,0,0,0,0,0,0,0,0,0
C98,0,0,2660,0,0,0,0,0,0,0,0
C104,2490,0,0,0,0,0,0,0,0,0,0
Hst33,0,0,0,0,0,0,0,38234,0,0,0
Hst36,0,0,0,0,0,0,7268,0,0,0,0
Hst39,0,0,0,0,0,0,0,0,0,25386,0
Hst40,0,0,0,0,0,0,0,0,9708,0,0


## Trisomy21

In [12]:
# Load the trisomy 21 heart object from disk and show its summary
t21 = sc.read_h5ad(
    '/nfs/team205/heart/anndata_objects/Foetal/trisomy21/T21Hearts_cleaned_July2024.h5ad'
)
t21

AnnData object with n_obs × n_vars = 110693 × 36601
    obs: 'latent_RT_efficiency', 'latent_cell_probability', 'latent_scale', 'sangerID', 'combinedID', 'donor', 'donor_type', 'region', 'region_finest', 'age', 'facility', 'cell_or_nuclei', 'modality', 'kit_10x', 'flushed', 'scrublet_score', 'scrublet_leiden', 'cluster_scrublet_score', 'doublet_pval', 'doublet_bh_pval', 'n_genes', 'n_counts', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'total_counts_mt', 'log1p_total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'log1p_total_counts_ribo', 'pct_counts_ribo', 'HB_score', 'batch_key', 'leiden_scVI_0.1', 'leiden_scVI_0.2', 'leiden_scVI_0.3', 'leiden_scVI_0.5', 'leiden_scVI_0.7', 'leiden_scVI_1', 'celltypist_coarse', 'conf_score_coarse', 'cells_to_remove', 'week', 'trimester', 'sex', 'celltypist_fine', 'conf_score_fine'
    var: 'feature_type', 'ambient_expression-0', 'ambient_expression-1', 'ambient_expression-10', 'ambient_expression-11', 'amb

In [13]:
# Nuclei per (age, sex) in the trisomy 21 object
pd.crosstab(index=t21.obs['age'], columns=t21.obs['sex'])

sex,female,male
age,Unnamed: 1_level_1,Unnamed: 2_level_1
11W0D,0,5209
13W0D,16730,22386
14W0D,0,66368


In [14]:
# Nuclei per (age, region) in the trisomy 21 object
pd.crosstab(index=t21.obs['age'], columns=t21.obs['region'])

region,Great vessels,Heart,Left atria,Left ventricle,Pericardial sac + RA + LA + IAS,Pericardium,Right ventricle
age,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
11W0D,0,5209,0,0,0,0,0
13W0D,0,39116,0,0,0,0,0
14W0D,6042,0,16452,6755,10761,18409,7949


In [15]:
# Per-donor nuclei counts in the trisomy 21 object, split by region, sex and age in turn
for obs_key in ('region', 'sex', 'age'):
    display(pd.crosstab(t21.obs['donor'], t21.obs[obs_key]))

region,Great vessels,Heart,Left atria,Left ventricle,Pericardial sac + RA + LA + IAS,Pericardium,Right ventricle
donor,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Hst32,0,16730,0,0,0,0,0
Hst42,6042,0,16452,6755,10761,18409,7949
Hst44,0,22386,0,0,0,0,0
Hst48,0,5209,0,0,0,0,0


sex,female,male
donor,Unnamed: 1_level_1,Unnamed: 2_level_1
Hst32,16730,0
Hst42,0,66368
Hst44,0,22386
Hst48,0,5209


age,11W0D,13W0D,14W0D
donor,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Hst32,0,16730,0
Hst42,0,0,66368
Hst44,0,22386,0
Hst48,5209,0,0


# Select samples

## Euploid

* Age-matched with T21 samples
* Region: heart regions, not pericardium

In [16]:
# Ages to keep from the euploid object
age_sel = ['12W0D','13W0D','14W0D']
# Heart regions to keep; 'aorta', 'ductus arteriosus', 'pericardium' and
# 'pulmonary arches and branches' are deliberately left out of this list
region_sel = ['apex','atria','atrial septum','heart without node',
                 'left ventricle',
                 'node',
                 'outflow tract','right ventricle',
                 'right ventricle and papillaries',
                 'whole sample']

mask1 = eup.obs['age'].isin(age_sel)
mask2 = eup.obs['region'].isin(region_sel)
# .copy() materialises the selection: `eup` is already a view at this point, and a
# later cell assigns a new column into eup.obs, which on a view only works through an
# implicit copy (whose warning may be hidden by the UserWarning filter set at the top).
eup = eup[mask1&mask2].copy()
eup

View of AnnData object with n_obs × n_vars = 55210 × 36601
    obs: 'latent_RT_efficiency', 'latent_cell_probability', 'latent_scale', 'sangerID', 'combinedID', 'donor', 'region', 'age', 'facility', 'cell_or_nuclei', 'modality', 'kit_10x', 'scrublet_score', 'doublet_pval', 'doublet_bh_pval', 'n_genes', 'n_counts', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'total_counts_mt', 'log1p_total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'log1p_total_counts_ribo', 'pct_counts_ribo', 'HB_score', 'donor_by_library-prep', 'multiplexed', 'SOC | status', 'SOC | log_prob_singleton', 'SOC | log_prob_doublet', 'batch_key', '_scvi_batch', 'FACSgate', 'fine_grain', 'mid_grain', 'coarse_grain', 'sex', 'week', 'trimester', 'heart_or_greatvessels', 'cycling', 'S_score', 'G2M_score', 'phase', '_scvi_labels', 'stress_score', 'hb1_score'
    var: 'gene_name_scRNA-0', 'gene_id'
    uns: 'FACSgate_colors', '_scvi_manager_uuid', '_scvi_uuid', 'age_colors', 'cell_

In [17]:
# Per-donor nuclei counts in the selected euploid data, split by region, sex and age in turn
for obs_key in ('region', 'sex', 'age'):
    display(pd.crosstab(eup.obs['donor'], eup.obs[obs_key]))

region,whole sample,heart without node,node
donor,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Hst33,38234,0,0
Hst36,7268,0,0
Hst40,0,4273,5435


sex,female,male
donor,Unnamed: 1_level_1,Unnamed: 2_level_1
Hst33,38234,0
Hst36,7268,0
Hst40,0,9708


age,12W0D,13W0D,14W0D
donor,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Hst33,0,38234,0
Hst36,7268,0,0
Hst40,0,0,9708


## Trisomy 21

* Region: remove "pericardium" sample
* The "Pericardium" region is not represented in the "Heart" or other region samples, so it is really different (ref: notebook no. 2).

In [18]:
# Drop every nucleus whose region is 'Pericardium'; all other regions are kept.
# `~` negates the boolean mask directly instead of comparing it with `== False`.
mask = ~t21.obs['region'].isin(['Pericardium'])
# .copy() materialises the selection so that the later column assignment into t21.obs
# acts on a real AnnData object rather than triggering an implicit copy of a view.
t21 = t21[mask].copy()
t21

View of AnnData object with n_obs × n_vars = 92284 × 36601
    obs: 'latent_RT_efficiency', 'latent_cell_probability', 'latent_scale', 'sangerID', 'combinedID', 'donor', 'donor_type', 'region', 'region_finest', 'age', 'facility', 'cell_or_nuclei', 'modality', 'kit_10x', 'flushed', 'scrublet_score', 'scrublet_leiden', 'cluster_scrublet_score', 'doublet_pval', 'doublet_bh_pval', 'n_genes', 'n_counts', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'total_counts_mt', 'log1p_total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'log1p_total_counts_ribo', 'pct_counts_ribo', 'HB_score', 'batch_key', 'leiden_scVI_0.1', 'leiden_scVI_0.2', 'leiden_scVI_0.3', 'leiden_scVI_0.5', 'leiden_scVI_0.7', 'leiden_scVI_1', 'celltypist_coarse', 'conf_score_coarse', 'cells_to_remove', 'week', 'trimester', 'sex', 'celltypist_fine', 'conf_score_fine'
    var: 'feature_type', 'ambient_expression-0', 'ambient_expression-1', 'ambient_expression-10', 'ambient_expression-11

In [19]:
# Per-donor nuclei counts after removing the pericardium sample, split by region, sex and age
for obs_key in ('region', 'sex', 'age'):
    display(pd.crosstab(t21.obs['donor'], t21.obs[obs_key]))

region,Great vessels,Heart,Left atria,Left ventricle,Pericardial sac + RA + LA + IAS,Right ventricle
donor,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Hst32,0,16730,0,0,0,0
Hst42,6042,0,16452,6755,10761,7949
Hst44,0,22386,0,0,0,0
Hst48,0,5209,0,0,0,0


sex,female,male
donor,Unnamed: 1_level_1,Unnamed: 2_level_1
Hst32,16730,0
Hst42,0,47959
Hst44,0,22386
Hst48,0,5209


age,11W0D,13W0D,14W0D
donor,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Hst32,0,16730,0
Hst42,0,0,47959
Hst44,0,22386,0
Hst48,5209,0,0


# Concatenate

In [20]:
# Tag every nucleus with its group so the two sources stay distinguishable after merging
eup.obs['euploid_or_t21'] = 'euploid'
t21.obs['euploid_or_t21'] = 'tri21'
# Harmonise the spelling of the 'cell_or_nuclei' value ('Nuclei' -> 'nuclei') in the T21 metadata
t21.obs.replace({'cell_or_nuclei':{'Nuclei':'nuclei'}},inplace=True)

# index_unique=None keeps the barcodes unsuffixed, and later cells select rows by
# obs_name (adata.obs.loc[...], adata[names]). Duplicate names would silently hit the
# wrong rows, so fail loudly here instead.
assert eup.obs_names.is_unique, "euploid obs_names are not unique"
assert t21.obs_names.is_unique, "trisomy 21 obs_names are not unique"
_shared_barcodes = eup.obs_names.intersection(t21.obs_names)
assert len(_shared_barcodes) == 0, (
    f"{len(_shared_barcodes)} obs_names occur in both the euploid and trisomy 21 objects"
)

# NOTE(review): AnnData.concatenate is deprecated in favour of anndata.concat. It is kept
# here because the result below carries the union of obs columns (T21-only columns such as
# the celltypist labels are needed later); anndata.concat(join='inner') appears to apply the
# inner join to obs columns as well — confirm before migrating.
adata = eup.concatenate(t21, 
            join='inner', 
            batch_key=None, 
            index_unique=None)
adata


See the tutorial for concat at: https://anndata.readthedocs.io/en/latest/concatenation.html


AnnData object with n_obs × n_vars = 147494 × 36601
    obs: 'latent_RT_efficiency', 'latent_cell_probability', 'latent_scale', 'sangerID', 'combinedID', 'donor', 'region', 'age', 'facility', 'cell_or_nuclei', 'modality', 'kit_10x', 'scrublet_score', 'doublet_pval', 'doublet_bh_pval', 'n_genes', 'n_counts', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'total_counts_mt', 'log1p_total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'log1p_total_counts_ribo', 'pct_counts_ribo', 'HB_score', 'donor_by_library-prep', 'multiplexed', 'SOC | status', 'SOC | log_prob_singleton', 'SOC | log_prob_doublet', 'batch_key', '_scvi_batch', 'FACSgate', 'fine_grain', 'mid_grain', 'coarse_grain', 'sex', 'week', 'trimester', 'heart_or_greatvessels', 'cycling', 'S_score', 'G2M_score', 'phase', '_scvi_labels', 'stress_score', 'hb1_score', 'euploid_or_t21', 'donor_type', 'region_finest', 'flushed', 'scrublet_leiden', 'cluster_scrublet_score', 'leiden_scVI_0.1', 'leide

In [24]:
# Build one coarse label column for all nuclei:
# euploid rows keep 'coarse_grain' (as strings), T21 rows take 'celltypist_coarse'.
t21_obsnames = adata.obs_names[adata.obs['euploid_or_t21'] == 'tri21']
adata.obs['coarse_grain_merged'] = adata.obs['coarse_grain'].astype('str')
adata.obs.loc[t21_obsnames, 'coarse_grain_merged'] = adata.obs.loc[t21_obsnames, 'celltypist_coarse']
# distinct labels present after merging
set(adata.obs['coarse_grain_merged'])

{'Cardiomyocytes',
 'Endothelium',
 'Epicardium',
 'Leukocytes',
 'Mesenchymal',
 'Neural'}

In [25]:
# merge fine grain labels
# Euploid rows keep 'fine_grain' (as strings); T21 rows take 'celltypist_fine'.
adata.obs['fine_grain_merged'] = adata.obs['fine_grain'].astype('str')
t21_obsnames = adata.obs_names[adata.obs['euploid_or_t21']=='tri21']
adata.obs.loc[t21_obsnames,'fine_grain_merged'] = adata.obs.loc[t21_obsnames,'celltypist_fine']
# astype('str') turns a missing euploid label into the literal text 'nan', which isna()
# cannot detect; count real NaN (possible on T21 rows) and the 'nan' text together so the
# check cannot report 0 while labels are actually missing.
merged_labels = adata.obs['fine_grain_merged']
n_missing = int((merged_labels.isna() | merged_labels.eq('nan')).sum())
print(f"number of NaN in the merged label: {n_missing}")
with pd.option_context('display.max_rows', None):
    display(pd.crosstab(adata.obs['fine_grain_merged'],adata.obs['euploid_or_t21']))

number of NaN in the merged label: 0


euploid_or_t21,euploid,tri21
fine_grain_merged,Unnamed: 1_level_1,Unnamed: 2_level_1
AtrialCardiomyocytesCycling,972,2487
AtrialCardiomyocytesLeft,2989,8392
AtrialCardiomyocytesRight,4901,5629
AtrioventricularNodePacemakerCells,432,650
BCells,15,31
BCellsMS4A1pos,21,6
ChromaffinCells,14,16
CoronaryArterialEndothelialCells,70,97
CoronaryCapillaryEndothelialCells,1222,2174
CoronaryPericytes,337,595


In [26]:
# Persist the merged euploid + trisomy 21 object (selected samples, before subsampling)
adata.write(
    '/nfs/team205/heart/anndata_objects/Foetal/trisomy21/Euploid_T21Hearts_Aug2024_sel.h5ad'
)

# Subsample cells

* Since the tissue dissection strategy is not consistent between euploid and trisomy 21, or among donors, we cannot compare the abundance of each cell state between euploid and trisomy 21.
* For each fine-grained cell type, if we balance the nucleus numbers between euploid and trisomy 21, the nucleus-neighbourhood-level abundance test will be valid, since the scale of dissection is generally larger than that of the fine-grained cell type level.
* Take nuclei which also passed ATAC QC

In [33]:
# Cell IDs that passed ATAC QC — trisomy 21 project
tri21_atac_cells = utils.txt2list(
    "/nfs/team205/heart/anndata_objects/Foetal/trisomy21/ArchR/project_output/project_cellIDs_removed-low-cellnumer-celltypes.txt"
)
print(len(tri21_atac_cells), tri21_atac_cells[:5], sep='\n')

# Cell IDs that passed ATAC QC — euploid project
eup_atac_cells = utils.txt2list(
    '/nfs/team205/heart/anndata_objects/Foetal/multiome_ATAC/ArchR/project_output_peak-revised-Feb2024/project_cellIDs.txt'
)
print(len(eup_atac_cells), eup_atac_cells[:5], sep='\n')

# combine both ID lists (euploid first, then trisomy 21)
atac_cells = eup_atac_cells + tri21_atac_cells
print(len(atac_cells))

89416
['BHF_F_Hea13188320_BHF_F_Hea13187622_GGGTCACTCATAGCCG-1', 'BHF_F_Hea13188320_BHF_F_Hea13187622_TAATCACCATTGGGAG-1', 'BHF_F_Hea13188320_BHF_F_Hea13187622_CGAGCTGGTCTCACTG-1', 'BHF_F_Hea13188320_BHF_F_Hea13187622_AAGGTATAGGCTACTG-1', 'BHF_F_Hea13188320_BHF_F_Hea13187622_GCGCGATTCATAATCG-1']
167022
['BHF_F_Hea11064670_BHF_F_Hea11031823_CAGCCTAAGTCTTGAA-1', 'BHF_F_Hea11064670_BHF_F_Hea11031823_GGTGTTGTCAGGCCTA-1', 'BHF_F_Hea11064670_BHF_F_Hea11031823_CGTGTGTCAAGCCACT-1', 'BHF_F_Hea11064670_BHF_F_Hea11031823_AGCTTTAAGCTAAAGG-1', 'BHF_F_Hea11064670_BHF_F_Hea11031823_GTTTCCTCAGGTCCTG-1']
256438


In [36]:
# Nuclei present in both the RNA object and the ATAC-QC ID list, kept in adata's own
# row order. A plain set intersection returns the names in an arbitrary order that can
# differ between kernel sessions (string hashing is randomised), which would reorder
# adata_sub and may change which nuclei the seeded subsampling further down selects.
shared_obsnames = list(adata.obs_names[adata.obs_names.isin(atac_cells)])
print(len(shared_obsnames))

122174


In [37]:
# Restrict to the nuclei that also passed ATAC QC; print shape before and after
adata_sub = adata[shared_obsnames]
print(adata.shape, adata_sub.shape, sep='\n')

(147494, 36601)
(122174, 36601)


In [38]:
# Nuclei per fine-grained cell type and group after the ATAC-QC filter (all rows shown)
with pd.option_context('display.max_rows', None):
    display(pd.crosstab(index=adata_sub.obs['fine_grain_merged'],
                        columns=adata_sub.obs['euploid_or_t21']))

euploid_or_t21,euploid,tri21
fine_grain_merged,Unnamed: 1_level_1,Unnamed: 2_level_1
AtrialCardiomyocytesCycling,863,1992
AtrialCardiomyocytesLeft,2505,6555
AtrialCardiomyocytesRight,4302,4575
AtrioventricularNodePacemakerCells,405,539
BCells,14,27
BCellsMS4A1pos,19,0
ChromaffinCells,13,0
CoronaryArterialEndothelialCells,60,82
CoronaryCapillaryEndothelialCells,1060,1844
CoronaryPericytes,309,523


In [42]:
# Nuclei per fine-grained cell type and donor after the ATAC-QC filter (all rows shown)
with pd.option_context('display.max_rows', None):
    display(pd.crosstab(index=adata_sub.obs['fine_grain_merged'],
                        columns=adata_sub.obs['donor']))

donor,Hst32,Hst33,Hst36,Hst40,Hst42,Hst44,Hst48
fine_grain_merged,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
AtrialCardiomyocytesCycling,450,229,72,562,1205,214,123
AtrialCardiomyocytesLeft,529,2057,193,255,4951,922,153
AtrialCardiomyocytesRight,1226,1868,179,2255,2537,472,340
AtrioventricularNodePacemakerCells,111,127,6,272,325,67,36
BCells,2,13,1,0,5,17,3
BCellsMS4A1pos,0,12,4,3,0,0,0
ChromaffinCells,0,7,5,1,0,0,0
CoronaryArterialEndothelialCells,12,19,22,19,34,25,11
CoronaryCapillaryEndothelialCells,432,714,185,161,941,403,68
CoronaryPericytes,135,204,74,31,181,175,32


In [62]:
# Thresholds for the per-cell-state, per-donor subsampling
MIN_NUCLEI_PER_GROUP = 30   # a cell state needs at least this many nuclei in each group
DONOR_CAP_RANK = 3          # per-donor cap = the 3rd largest per-donor count

# select cell states which have at least MIN_NUCLEI_PER_GROUP nuclei in both euploid and trisomy21
ctab = pd.crosstab(adata_sub.obs['fine_grain_merged'],adata_sub.obs['euploid_or_t21'])
mask = (ctab>=MIN_NUCLEI_PER_GROUP).sum(axis=1)==2   # both group columns pass
celltype_sel = ctab.index[mask]

# for the cell types which passed above, cap the nuclei taken from each donor at the
# DONOR_CAP_RANK-th largest per-donor count, so the largest donors do not dominate
selected_obsnames = []
for celltype in celltype_sel:
    ad = adata_sub[adata_sub.obs['fine_grain_merged']==celltype]
    # value_counts() sorts descending; use .iloc for an explicit positional lookup
    # (an integer key on a donor-labelled index relies on a deprecated fallback)
    n_subsample = ad.obs['donor'].value_counts().iloc[DONOR_CAP_RANK - 1]
    ad = utils.sctk_subsample(ad, fraction=1, groupby='donor', max_n=n_subsample, random_state=0)
    selected_obsnames.extend(ad.obs_names)   # in-place append, no list re-copy per cell type
    del ad
# NOTE: this overwrites adata_sub, so re-running the cell subsamples the already
# subsampled object; re-run the ATAC-QC subsetting cell first.
adata_sub = adata_sub[selected_obsnames]
adata_sub

View of AnnData object with n_obs × n_vars = 83477 × 36601
    obs: 'latent_RT_efficiency', 'latent_cell_probability', 'latent_scale', 'sangerID', 'combinedID', 'donor', 'region', 'age', 'facility', 'cell_or_nuclei', 'modality', 'kit_10x', 'scrublet_score', 'doublet_pval', 'doublet_bh_pval', 'n_genes', 'n_counts', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'total_counts_mt', 'log1p_total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'log1p_total_counts_ribo', 'pct_counts_ribo', 'HB_score', 'donor_by_library-prep', 'multiplexed', 'SOC | status', 'SOC | log_prob_singleton', 'SOC | log_prob_doublet', 'batch_key', '_scvi_batch', 'FACSgate', 'fine_grain', 'mid_grain', 'coarse_grain', 'sex', 'week', 'trimester', 'heart_or_greatvessels', 'cycling', 'S_score', 'G2M_score', 'phase', '_scvi_labels', 'stress_score', 'hb1_score', 'euploid_or_t21', 'donor_type', 'region_finest', 'flushed', 'scrublet_leiden', 'cluster_scrublet_score', 'leiden_scVI_0.1',

In [63]:
# Nuclei per fine-grained cell type after subsampling: first by group, then by donor
with pd.option_context('display.max_rows', None):
    for split_by in ('euploid_or_t21', 'donor'):
        display(pd.crosstab(adata_sub.obs['fine_grain_merged'], adata_sub.obs[split_by]))

euploid_or_t21,euploid,tri21
fine_grain_merged,Unnamed: 1_level_1,Unnamed: 2_level_1
AtrialCardiomyocytesCycling,751,1237
AtrialCardiomyocytesLeft,1370,2526
AtrialCardiomyocytesRight,3915,3906
AtrioventricularNodePacemakerCells,260,341
CoronaryArterialEndothelialCells,60,67
CoronaryCapillaryEndothelialCells,778,1335
CoronaryPericytes,280,517
CoronarySmoothMuscleCells,70,117
CoronaryVenousEndothelialCells,147,241
CoronaryVesselAdventitialFibroblasts,712,872


donor,Hst32,Hst33,Hst36,Hst40,Hst42,Hst44,Hst48
fine_grain_merged,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
AtrialCardiomyocytesCycling,450,229,72,450,450,214,123
AtrialCardiomyocytesLeft,529,922,193,255,922,922,153
AtrialCardiomyocytesRight,1226,1868,179,1868,1868,472,340
AtrioventricularNodePacemakerCells,111,127,6,127,127,67,36
CoronaryArterialEndothelialCells,12,19,22,19,22,22,11
CoronaryCapillaryEndothelialCells,432,432,185,161,432,403,68
CoronaryPericytes,135,175,74,31,175,175,32
CoronarySmoothMuscleCells,38,38,29,3,38,34,7
CoronaryVenousEndothelialCells,69,83,19,45,83,83,6
CoronaryVesselAdventitialFibroblasts,218,285,213,214,285,285,84


In [64]:
# Total nuclei retained per donor and group after subsampling
pd.crosstab(index=adata_sub.obs['donor'], columns=adata_sub.obs['euploid_or_t21'])

euploid_or_t21,euploid,tri21
donor,Unnamed: 1_level_1,Unnamed: 2_level_1
Hst32,0,14270
Hst33,18171,0
Hst36,6262,0
Hst40,7922,0
Hst42,0,18334
Hst44,0,14444
Hst48,0,4074


In [65]:
# Peek at the first ten stored values of X before saving
adata_sub.X.data[0:10]

array([1., 1., 2., 1., 2., 2., 1., 1., 1., 1.], dtype=float32)

In [66]:
# Persist the subsampled object (per cell state and donor)
adata_sub.write(
    '/nfs/team205/heart/anndata_objects/Foetal/trisomy21/Euploid_T21Hearts_Aug2024_sel_subsampled-per-cellstate-donor.h5ad'
)

In [76]:
# Export the per-nucleus metadata table of the subsampled object as CSV for later use
adata_sub.obs.to_csv(
    '/nfs/team205/heart/anndata_objects/Foetal/trisomy21/Euploid_T21Hearts_Aug2024_sel_subsampled-per-cellstate-donor.obs.csv'
)