In [24]:
import pandas as pd
import scanpy as sc

In [25]:
# Read the AnnData object from disk, then end the cell on the object itself
# so the notebook renders its summary (dimensions, obs/var columns, obsm keys).
cmc_h5ad_path = '../data/heart_mm_nuclei-23-0092_CMC_states_ctl240131.raw.h5ad'
cmc = sc.read_h5ad(cmc_h5ad_path)
cmc

AnnData object with n_obs × n_vars = 8257 × 32285
    obs: 'cell_source', 'cell_type', 'donor', 'n_counts', 'n_genes', 'percent_mito', 'percent_ribo', 'region', 'sample', 'scrublet_score', 'cell_states', 'seed_labels', 'genotype', 'batch', 'doublet_scores', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'pct_counts_ribo', 'percent_mt2', 'percent_chrY', 'XIST-counts', 'S_score', 'G2M_score', '_scvi_batch', '_scvi_labels', 'C_scANVI'
    var: 'gene_ids', 'feature_types', 'genome', 'mt', 'ribo', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts'
    obsm: 'X_scANVI', 'X_scVI', 'X_umap'

In [26]:
# Tally how many cells each value of obs['sample'] contributes.
cells_per_sample = cmc.obs['sample'].value_counts()
cells_per_sample

sample
A10_2    2656
B1_2     2100
A12_2    1294
A11_2    1215
A9_2      763
B2_2      229
Name: count, dtype: int64

In [27]:
# Tally how many cells carry each label in obs['C_scANVI'].
cells_per_scanvi_label = cmc.obs['C_scANVI'].value_counts()
cells_per_scanvi_label

C_scANVI
vCM2    2869
vCM1    2733
vCM4    2548
vCM3     107
Name: count, dtype: int64

In [28]:
# Copy the 'C_scANVI' labels into obs['cell_type'], then tally cells per label.
# NOTE: this replaces the 'cell_type' column that was loaded with the object.
c_scanvi_labels = cmc.obs['C_scANVI']
cmc.obs['cell_type'] = c_scanvi_labels
cmc.obs['cell_type'].value_counts()

cell_type
vCM2    2869
vCM1    2733
vCM4    2548
vCM3     107
Name: count, dtype: int64

In [29]:
# Contingency table of cell counts: one row per cell_type, one column per genotype.
pd.crosstab(index=cmc.obs['cell_type'], columns=cmc.obs['genotype'])

genotype,Mdx,MdxSCID,WT
cell_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
vCM1,1038,838,857
vCM2,1435,980,454
vCM3,7,34,66
vCM4,29,477,2042


In [30]:
# Cell-type labels to include in the per-genotype gene tally.
cardiomyocyte_types = ['vCM1', 'vCM2', 'vCM3', 'vCM4']

# Restrict the object to cells whose 'cell_type' is one of those labels.
cardiomyocytes = cmc[cmc.obs['cell_type'].isin(cardiomyocyte_types), :]

# Distinct genotype labels present in the subset.
genotypes = cardiomyocytes.obs['genotype'].unique()

# One independent, initially empty {genotype: count} mapping per cell type;
# the counting loop in the next cell fills these in.
gene_counts = dict((state, {}) for state in cardiomyocyte_types)

In [31]:
def _count_detected_genes(adata):
    """Return how many genes have a value > 0 in at least one cell of `adata`.

    Works directly on `adata.X` without converting it to a dense DataFrame,
    so it handles both scipy sparse matrices and dense numpy arrays and
    avoids allocating a cells x genes dense copy for every group.
    An empty selection (zero cells) yields 0.
    """
    # Per-gene number of cells in which the gene is detected (value > 0).
    cells_detecting_gene = (adata.X > 0).sum(axis=0)
    # A gene counts as detected if at least one cell expresses it.
    return int((cells_detecting_gene > 0).sum())


# Build the result mapping in this cell so that it does not depend on (or
# accumulate stale entries in) a dictionary initialised by another cell.
gene_counts = {}
obs = cardiomyocytes.obs

for cell_type in cardiomyocyte_types:
    # The cell-type mask does not change across genotypes; compute it once.
    in_cell_type = obs['cell_type'] == cell_type
    gene_counts[cell_type] = {}

    for genotype in genotypes:
        genotype_cells = cardiomyocytes[(obs['genotype'] == genotype) & in_cell_type]
        gene_counts[cell_type][genotype] = _count_detected_genes(genotype_cells)

# Outer keys (cell types) become columns, inner keys (genotypes) become rows.
counts_df = pd.DataFrame(gene_counts)

# CAVEAT: the number of detected genes grows with the number of cells in a
# group, and group sizes here range from 7 to 2042 cells (see the
# cell_type x genotype crosstab). Treat these raw counts as descriptive only;
# comparing groups fairly would need e.g. down-sampling to equal cell numbers.
print("Unique Gene Counts DataFrame:")
print(counts_df)

Unique Gene Counts DataFrame:
          vCM1   vCM2   vCM3   vCM4
WT       17975  18411  12120  17948
Mdx      17183  20129   7597   8644
MdxSCID  18426  20335  11741  16327
