# Notebook for merging annotated datasets - Cardiomyocytes

**Created by** : Srivalli Kolla

**Created on** : 19 November, 2024

**Modified on** : 19 November, 2024

**Institute of Systems Immunology, University of Würzburg**

Env : scanpy

# Import Packages

In [1]:
import anndata as ad
import scanpy as sc
import numpy as np
import datetime

In [2]:
# Logging: verbosity level 3, then print the versions of the loaded packages.
sc.settings.verbosity = 3
sc.logging.print_versions()

# Figure defaults used for any plot produced in this notebook.
sc.settings.set_figure_params(
    dpi=180,
    dpi_save=300,
    color_map='magma_r',
    vector_friendly=True,
    format='svg',
)

# Run timestamp (day_month_year,hour:minute) that is embedded in the output file name.
# NOTE(review): the ',' and ':' characters end up in the file name; ':' is not
# accepted in file names on some platforms (e.g. Windows) - confirm this is intended.
timestamp = datetime.datetime.now().strftime("%d_%m_%y,%H:%M")

-----
anndata     0.10.8
scanpy      1.10.2
-----
PIL                 10.3.0
asttokens           NA
attr                23.2.0
cffi                1.16.0
colorama            0.4.6
comm                0.2.2
cycler              0.12.1
cython_runtime      NA
dateutil            2.9.0.post0
debugpy             1.8.2
decorator           5.1.1
defusedxml          0.7.1
distutils           3.12.4
django              5.0.6
executing           2.0.1
h5py                3.11.0
igraph              0.11.5
ipykernel           6.29.5
ipython_genutils    0.2.0
ipywidgets          8.1.3
jedi                0.19.1
joblib              1.4.2
kiwisolver          1.4.5
legacy_api_wrap     NA
leidenalg           0.10.2
llvmlite            0.43.0
louvain             0.8.2
matplotlib          3.8.4
mpl_toolkits        NA
natsort             8.4.0
numba               0.60.0
numexpr             2.10.1
numpy               1.26.4
packaging           24.1
pandas              2.2.2
parso               0.8.4
pkg_res

# Functions

In [3]:
def X_is_raw(adata):
    """Return True if ``adata.X`` looks like raw counts (every entry is integer-valued).

    The check is performed on the individual matrix entries rather than on
    per-gene column sums: fractional values can add up to a whole number
    (e.g. 0.5 + 0.5), so integer column sums alone do not prove that the
    entries themselves are integers.

    Parameters
    ----------
    adata
        Object exposing the expression matrix as ``adata.X``; either a
        dense array or a SciPy sparse matrix.

    Returns
    -------
    bool
        True when all entries are finite and integer-valued, False otherwise.
    """
    from scipy import sparse  # local import: SciPy is not imported at the top of this notebook

    matrix = adata.X
    if sparse.issparse(matrix):
        # Implicit zeros are integer-valued, so only the stored values need checking.
        # CSR/CSC/COO keep them in a flat ``.data`` array; other formats are converted first.
        if matrix.format not in ("csr", "csc", "coo"):
            matrix = matrix.tocsr()
        values = matrix.data
    else:
        values = np.asarray(matrix)

    # NaN/inf are rejected explicitly; otherwise a value is integral iff it equals its floor.
    return bool(np.isfinite(values).all() and (values == np.floor(values)).all())

# Import data

In [4]:
# Load the first annotated cardiomyocyte dataset from disk and display its summary.
human_wt = sc.read_h5ad('../../data/dmd_annotated_human_wt_cmc_1k_hvg_25_10_24,11:40.h5ad')
human_wt

AnnData object with n_obs × n_vars = 56602 × 32285
    obs: 'sangerID', 'combinedID', 'donor', 'donor_type', 'region', 'region_finest', 'age', 'gender', 'facility', 'cell_or_nuclei', 'modality', 'kit_10x', 'flushed', 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'pct_counts_ribo', 'scrublet_score', 'scrublet_leiden', 'cluster_scrublet_score', 'doublet_pval', 'doublet_bh_pval', 'batch_key', 'leiden_scVI', 'cell_type', 'cell_state_HCAv1', 'cell_state_scNym', 'cell_state_scNym_confidence', 'cell_state', 'latent_RT_efficiency', 'latent_cell_probability', 'latent_scale', 'n_counts', '_scvi_batch', '_scvi_labels', 'clus20', 'doublet_cls', 'original_or_new', 'batch', 'scANVI_predictions', 'leiden_scArches', 'sample', 'seed_labels', 'genotype', 'doublet_scores', 'predicted_doublets', 'percent_chrY', 'XIST-counts', 'XIST-percentage', 'gender_check_cov', 'S_score', 'G2M_score', 'phase', 'C_scANVI'
    var: 'mt', 'ribo', 'n_cells_by_count

In [5]:
# Check whether human_wt.X passes the X_is_raw check.
X_is_raw(human_wt)

True

In [6]:
# Load the second annotated cardiomyocyte dataset from disk and display its summary.
wt_others = sc.read_h5ad('../../data/dmd_annotated_wt_others_cmc_5k_hvg_25_10_24,11:49.h5ad')
wt_others

AnnData object with n_obs × n_vars = 28090 × 32285
    obs: 'sangerID', 'combinedID', 'donor', 'donor_type', 'region', 'region_finest', 'age', 'gender', 'facility', 'cell_or_nuclei', 'modality', 'kit_10x', 'flushed', 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'pct_counts_ribo', 'scrublet_score', 'scrublet_leiden', 'cluster_scrublet_score', 'doublet_pval', 'doublet_bh_pval', 'batch_key', 'leiden_scVI', 'cell_type', 'cell_state_HCAv1', 'cell_state_scNym', 'cell_state_scNym_confidence', 'cell_state', 'latent_RT_efficiency', 'latent_cell_probability', 'latent_scale', 'n_counts', '_scvi_batch', '_scvi_labels', 'clus20', 'doublet_cls', 'original_or_new', 'batch', 'scANVI_predictions', 'leiden_scArches', 'sample', 'seed_labels', 'genotype', 'doublet_scores', 'predicted_doublets', 'percent_chrY', 'XIST-counts', 'XIST-percentage', 'gender_check_cov', 'S_score', 'G2M_score', 'phase', 'C_scANVI'
    var: 'mt', 'ribo', 'n_cells_by_count

In [7]:
# Check whether wt_others.X passes the X_is_raw check.
X_is_raw(wt_others)

False

In [8]:
# Put the stored raw counts back into X, then re-run the raw-count check.
# The layer is copied so that X and layers['raw_counts'] do not refer to the same
# matrix object; otherwise an in-place change to X would also change the layer.
wt_others.X = wt_others.layers['raw_counts'].copy()
X_is_raw(wt_others)

True

# Concatenation

In [16]:
# Concatenate both datasets into one object.
# The mapping keys ('human', 'wt') are written to the new obs column 'ref',
# so every cell records which input dataset it came from.
all_cmc = sc.concat(
    {'human': human_wt, 'wt': wt_others},
    join='inner',
    merge='same',
    label='ref',
)
all_cmc

AnnData object with n_obs × n_vars = 84692 × 29378
    obs: 'sangerID', 'combinedID', 'donor', 'donor_type', 'region', 'region_finest', 'age', 'gender', 'facility', 'cell_or_nuclei', 'modality', 'kit_10x', 'flushed', 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'pct_counts_ribo', 'scrublet_score', 'scrublet_leiden', 'cluster_scrublet_score', 'doublet_pval', 'doublet_bh_pval', 'batch_key', 'leiden_scVI', 'cell_type', 'cell_state_HCAv1', 'cell_state_scNym', 'cell_state_scNym_confidence', 'cell_state', 'latent_RT_efficiency', 'latent_cell_probability', 'latent_scale', 'n_counts', '_scvi_batch', '_scvi_labels', 'clus20', 'doublet_cls', 'original_or_new', 'batch', 'scANVI_predictions', 'leiden_scArches', 'sample', 'seed_labels', 'genotype', 'doublet_scores', 'predicted_doublets', 'percent_chrY', 'XIST-counts', 'XIST-percentage', 'gender_check_cov', 'S_score', 'G2M_score', 'phase', 'C_scANVI', 'ref'
    var: 'mt', 'ribo', 'n_cells_b

In [17]:
# Check whether the concatenated object's X passes the X_is_raw check.
X_is_raw(all_cmc)

True

In [18]:
# Display the obs table of the concatenated object.
all_cmc.obs

Unnamed: 0,sangerID,combinedID,donor,donor_type,region,region_finest,age,gender,facility,cell_or_nuclei,...,predicted_doublets,percent_chrY,XIST-counts,XIST-percentage,gender_check_cov,S_score,G2M_score,phase,C_scANVI,ref
CTCAACCAGATCCAAA-1,,,,,,,,,,,...,False,0.039135,0.0,0.000000,Male,-0.055892,-0.207692,G1,vCM1,human
CGGCAGTGTGGCCTCA-1,,,,,,,,,,,...,False,0.022938,0.0,0.000000,Male,-0.119855,-0.174615,G1,vCM1,human
CCTTCAGGTCCTTGTC-1,,,,,,,,,,,...,False,0.028959,0.0,0.000000,Male,-0.015537,-0.200513,G1,vCM1,human
GAAGGACCAGCAGACA-1,,,,,,,,,,,...,False,0.034706,2.0,0.011569,Male,0.010291,-0.034103,S,vCM5,human
TATCGCCAGCATTTGC-1,,,,,,,,,,,...,False,0.052641,0.0,0.000000,Male,-0.088075,-0.216282,G1,vCM1,human
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TGGATCATCCCAAGTA-1-query-query,,,,,,,,,,,...,False,0.000000,0.0,0.000000,,-0.002119,-0.008333,G1,vCM1,wt
TGTTCATGTAGGACTG-1-query-query,,,,,,,,,,,...,False,0.000000,0.0,0.000000,,-0.006356,0.014231,G2M,vCM1,wt
TTTAGTCAGGATGGCT-1-query-query,,,,,,,,,,,...,False,0.000000,0.0,0.000000,,-0.006356,-0.006667,G1,vCM1,wt
GACTCAACACGTCATA-1-query-query,,,,,,,,,,,...,False,0.000000,0.0,0.000000,,-0.006356,0.012564,G2M,vCM1,wt


In [19]:
# Display the var table of the concatenated object.
all_cmc.var

Unnamed: 0,mt,ribo,n_cells_by_counts,mean_counts,pct_dropout_by_counts,total_counts
Xkr4,False,False,2530,0.043516,98.656214,8193
Gm1992,False,False,300,0.002071,99.840658,390
Gm19938,False,False,317,0.001859,99.831628,350
Gm37381,False,False,72,0.000600,99.961758,113
Rp1,False,False,892,0.009895,99.526222,1863
...,...,...,...,...,...,...
Il11ra2-2,False,False,0,0.000000,100.000000,0
Ccl19-1,False,False,0,0.000000,100.000000,0
Ccl21a-1,False,False,0,0.000000,100.000000,0
Gm10931,False,False,80,0.000446,99.957509,84


In [20]:
# Count how many cells carry each 'cell_state' label.
all_cmc.obs['cell_state'].value_counts()

cell_state
vCM1    50283
vCM4    29192
vCM2     3599
vCM3     1617
vCM5        1
Name: count, dtype: int64

In [21]:
# Count how many cells carry each 'cell_type' label.
all_cmc.obs['cell_type'].value_counts()

cell_type
Ventricular Cardiomyocyte    84692
Name: count, dtype: int64

In [22]:
# Count how many cells came from each input dataset ('ref' is set during concatenation).
all_cmc.obs['ref'].value_counts()

ref
human    56602
wt       28090
Name: count, dtype: int64

In [23]:
# Save the concatenated object; the run timestamp is part of the file name.
all_cmc.write_h5ad(f'../../data/dmd_annotated_cmc_concatenated_{timestamp}.h5ad')