# Notebook to download heart data from 'CellxGene' 

**Developed by** :Srivalli Kolla

**Created on** : 27 June, 2024

**Last modified** : 27 June, 2024

**Würzburg Institute for Systems Immunology & Julius-Maximilian-Universität Würzburg**

# Importing packages

In [1]:
import cellxgene_census
import numpy as np
import pandas as pd
import scanpy as sc
import os

# Data import and overview

## Query to check the census about dataset required

In [2]:
census = cellxgene_census.open_soma()
summary_table = census["census_info"]["summary_cell_counts"].read().concat().to_pandas()

summary_table.query("organism == 'Homo sapiens' & label =='heart'")

The "stable" release is currently 2023-12-15. Specify 'census_version="2023-12-15"' in future calls to open_soma() to ensure data consistency.


Unnamed: 0,soma_joinid,organism,category,ontology_term_id,unique_cell_count,total_cell_count,label
783,783,Homo sapiens,tissue,UBERON:0000948,112532,137624,heart
1000,1000,Homo sapiens,tissue_general,UBERON:0000948,1776351,3629952,heart


## Loading data from census

In [3]:
heart_obs = (
    census["census_data"]["homo_sapiens"]
    .obs.read(value_filter="tissue_general == 'heart' or tissue == 'heart' and is_primary_data == True")
    .concat()
    .to_pandas()
)
heart_obs

Unnamed: 0,soma_joinid,dataset_id,assay,assay_ontology_term_id,cell_type,cell_type_ontology_term_id,development_stage,development_stage_ontology_term_id,disease,disease_ontology_term_id,...,suspension_type,tissue,tissue_ontology_term_id,tissue_general,tissue_general_ontology_term_id,raw_sum,nnz,raw_mean_nnz,raw_variance_nnz,n_measured_vars
0,2517346,d567b692-c374-4628-a508-8008f6778f22,10x 3' v2,EFO:0009899,endothelial cell,CL:0000115,sixth decade human stage,HsapDv:0000240,normal,PATO:0000461,...,cell,apex of heart,UBERON:0002098,heart,UBERON:0000948,7123.0,2467,2.887313,62.875374,31071
1,2517347,d567b692-c374-4628-a508-8008f6778f22,10x 3' v2,EFO:0009899,mural cell,CL:0008034,sixth decade human stage,HsapDv:0000240,normal,PATO:0000461,...,cell,apex of heart,UBERON:0002098,heart,UBERON:0000948,6091.0,2054,2.965433,282.782536,31071
2,2517348,d567b692-c374-4628-a508-8008f6778f22,10x 3' v2,EFO:0009899,endothelial cell,CL:0000115,sixth decade human stage,HsapDv:0000240,normal,PATO:0000461,...,cell,apex of heart,UBERON:0002098,heart,UBERON:0000948,6248.0,2426,2.575433,44.868741,31071
3,2517349,d567b692-c374-4628-a508-8008f6778f22,10x 3' v2,EFO:0009899,mural cell,CL:0008034,sixth decade human stage,HsapDv:0000240,normal,PATO:0000461,...,cell,apex of heart,UBERON:0002098,heart,UBERON:0000948,5862.0,1819,3.222650,94.875042,31071
4,2517350,d567b692-c374-4628-a508-8008f6778f22,10x 3' v2,EFO:0009899,endothelial cell,CL:0000115,sixth decade human stage,HsapDv:0000240,normal,PATO:0000461,...,cell,apex of heart,UBERON:0002098,heart,UBERON:0000948,4920.0,2330,2.111588,23.759119,31071
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3629947,62887152,2adb1f8a-a6b1-4909-8ee8-484814e2d4bf,microwell-seq,EFO:0030002,endothelial cell,CL:0000115,11th week post-fertilization human stage,HsapDv:0000048,normal,PATO:0000461,...,cell,heart,UBERON:0000948,heart,UBERON:0000948,560.0,402,1.393035,2.174315,26454
3629948,62887153,2adb1f8a-a6b1-4909-8ee8-484814e2d4bf,microwell-seq,EFO:0030002,endothelial cell,CL:0000115,11th week post-fertilization human stage,HsapDv:0000048,normal,PATO:0000461,...,cell,heart,UBERON:0000948,heart,UBERON:0000948,758.0,515,1.471845,2.724400,26454
3629949,62887154,2adb1f8a-a6b1-4909-8ee8-484814e2d4bf,microwell-seq,EFO:0030002,endothelial cell,CL:0000115,11th week post-fertilization human stage,HsapDv:0000048,normal,PATO:0000461,...,cell,heart,UBERON:0000948,heart,UBERON:0000948,568.0,414,1.371981,1.648220,26454
3629950,62887155,2adb1f8a-a6b1-4909-8ee8-484814e2d4bf,microwell-seq,EFO:0030002,endothelial cell,CL:0000115,11th week post-fertilization human stage,HsapDv:0000048,normal,PATO:0000461,...,cell,heart,UBERON:0000948,heart,UBERON:0000948,418.0,300,1.393333,2.312999,26454


## Data overview

In [4]:
heart_obs['assay'].unique()

array(["10x 3' v2", "10x 3' v3", 'Smart-seq2', 'sci-RNA-seq',
       'microwell-seq'], dtype=object)

In [5]:
heart_obs['disease'].unique()

array(['normal', 'myocardial infarction', 'dilated cardiomyopathy',
       'arrhythmogenic right ventricular cardiomyopathy',
       'non-compaction cardiomyopathy'], dtype=object)

In [6]:
heart_obs[["suspension_type"]].value_counts()

suspension_type
nucleus            3151688
cell                478264
Name: count, dtype: int64

In [7]:
heart_obs[["cell_type"]].value_counts().head(20)

cell_type                          
cardiac muscle cell                    746616
mural cell                             430361
regular ventricular cardiac myocyte    393506
endothelial cell                       365728
fibroblast of cardiac tissue           337813
fibroblast                             214619
pericyte                               173035
myeloid cell                           150168
capillary endothelial cell             120582
native cell                            102501
regular atrial cardiac myocyte          73489
cardiac muscle myoblast                 66335
lymphocyte                              50971
cardiac endothelial cell                44999
smooth muscle cell                      43926
endothelial cell of artery              42405
macrophage                              34963
immature innate lymphoid cell           28651
stromal cell                            19564
vein endothelial cell                   17868
Name: count, dtype: int64

In [8]:
heart_obs[["tissue"]].value_counts()

tissue                         
heart left ventricle               1349023
heart right ventricle               698008
interventricular septum             645583
apex of heart                       353385
left cardiac atrium                 219093
heart                               137624
right cardiac atrium                127638
anterior wall of left ventricle      36574
cardiac atrium                       24837
basal zone of heart                  18775
coronary artery                       9734
cardiac ventricle                     9678
Name: count, dtype: int64

# Adding dataset information

## Collection of dataset information

In [9]:
census_datasets = (
    census["census_info"]["datasets"]
    .read(column_names=["collection_name", "dataset_title", "dataset_id", "soma_joinid"])
    .concat()
    .to_pandas()
)
census_datasets = census_datasets.set_index("dataset_id")
census_datasets

Unnamed: 0_level_0,collection_name,dataset_title,soma_joinid
dataset_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2bdd3a2c-2ff4-4314-adf3-8a06b797a33a,Comparative transcriptomics reveals human-spec...,Human: Great apes study,0
f5b0810c-1664-4a62-ad06-be1d9964aa8b,Transcriptomic cytoarchitecture reveals princi...,Dissection: Angular gyrus (AnG),1
e4ddac12-f48f-4455-8e8d-c2a48a683437,Transcriptomic cytoarchitecture reveals princi...,Supercluster: CGE-derived interneurons,2
e2808a6e-e2ea-41b9-b38c-4a08f1677f02,Transcriptomic cytoarchitecture reveals princi...,Dissection: Primary auditory cortex(A1),3
d01c9dff-abd1-4825-bf30-2eb2ba74597e,Transcriptomic cytoarchitecture reveals princi...,Supercluster: Deep layer (non-IT) excitatory n...,4
...,...,...,...
f9ad5649-f372-43e1-a3a8-423383e5a8a2,Molecular characterization of selectively vuln...,Molecular characterization of selectively vuln...,646
456e8b9b-f872-488b-871d-94534090a865,Single-cell atlas of peripheral immune respons...,Single-cell atlas of peripheral immune respons...,647
2adb1f8a-a6b1-4909-8ee8-484814e2d4bf,Construction of a human cell landscape at sing...,Construction of a human cell landscape at sing...,648
e04daea4-4412-45b5-989e-76a9be070a89,A molecular cell atlas of the human lung from ...,"Krasnow Lab Human Lung Cell Atlas, Smart-seq2",649


## Addition of dataset information to our heart data

In [10]:
dataset_cell_counts = pd.DataFrame(heart_obs[["dataset_id"]].value_counts())
dataset_cell_counts = dataset_cell_counts.rename(columns={0: "cell_counts"})
dataset_cell_counts = dataset_cell_counts.merge(census_datasets, on="dataset_id")

dataset_cell_counts

Unnamed: 0_level_0,count,collection_name,dataset_title,soma_joinid
dataset_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
65badd7a-9262-4fd1-9ce2-eb5dc0ca8039,881081,Pathogenic variants damage cell composition an...,DCM/ACM heart cell atlas: All cells,397
d567b692-c374-4628-a508-8008f6778f22,493236,Spatially resolved multiomics of human cardiac...,Combined single cell and single nuclei RNA-Seq...,15
d4e69e01-3ba2-4d6b-a15d-e7048f78f22e,486134,Cells of the adult human heart,All — Cells of the adult human heart,563
f7995301-7551-4e1d-8396-ffe3c9497ace,311418,Pathogenic variants damage cell composition an...,DCM/ACM heart cell atlas: Cardiomyocytes,392
572f3f3e-d3e4-4d13-8e2b-88215e508481,195395,Cells of the adult human heart,Vascular — Cells of the adult human heart,556
1c739a3e-c3f5-49d5-98e0-73975e751201,191795,Spatial multi-omic map of human myocardial inf...,All-snRNA-Spatial multi-omic map of human myoc...,391
1252c5fb-945f-42d6-b1a8-8a3bd864384b,170281,Pathogenic variants damage cell composition an...,DCM/ACM heart cell atlas: Mural cells,398
ed2b673b-0279-454a-998c-3eec361edf54,142816,Pathogenic variants damage cell composition an...,DCM/ACM heart cell atlas: Fibroblasts,393
78fd69d2-75e4-4207-819a-563139f273c6,125289,Cells of the adult human heart,Ventricular cardiomyocytes — Cells of the adul...,560
1062c0f2-2a44-4cf9-a7c8-b5ed58b4728d,115548,Pathogenic variants damage cell composition an...,DCM/ACM heart cell atlas: Endothelial cells,399


# Adding gene information

## Collection of gene information for all Human

In [11]:
all_var = census["census_data"]["homo_sapiens"].ms["RNA"].var.read().concat().to_pandas()
all_var

Unnamed: 0,soma_joinid,feature_id,feature_name,feature_length,nnz,n_measured_obs
0,0,ENSG00000233576,HTR3C2P,1057,69370,19581263
1,1,ENSG00000121410,A1BG,3999,5640476,62641311
2,2,ENSG00000268895,A1BG-AS1,3374,3071864,61946057
3,3,ENSG00000148584,A1CF,9603,734347,58195911
4,4,ENSG00000175899,A2M,6318,7894261,62704378
...,...,...,...,...,...,...
60659,60659,ENSG00000288719,RP4-669P10.21,4252,2826,1248980
60660,60660,ENSG00000288720,RP11-852E15.3,7007,99,1248980
60661,60661,ENSG00000288721,RP5-973N23.5,7765,0,0
60662,60662,ENSG00000288723,RP11-553N16.6,1015,18,1248980


## Collection of gene information specific to our dataset based on 'soma_joinid'

In [12]:
presence_matrix = cellxgene_census.get_presence_matrix(census, "Homo sapiens", "RNA")
presence_matrix = presence_matrix[dataset_cell_counts.soma_joinid, :]
presence_matrix.sum(axis=1).A1genes_measured = presence_matrix.sum(axis=1).A1

In [13]:
genes_measured = presence_matrix.sum(axis=1).A1
dataset_cell_counts["genes_measured"] = genes_measured
dataset_cell_counts

Unnamed: 0_level_0,count,collection_name,dataset_title,soma_joinid,genes_measured
dataset_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
65badd7a-9262-4fd1-9ce2-eb5dc0ca8039,881081,Pathogenic variants damage cell composition an...,DCM/ACM heart cell atlas: All cells,397,30805
d567b692-c374-4628-a508-8008f6778f22,493236,Spatially resolved multiomics of human cardiac...,Combined single cell and single nuclei RNA-Seq...,15,31071
d4e69e01-3ba2-4d6b-a15d-e7048f78f22e,486134,Cells of the adult human heart,All — Cells of the adult human heart,563,31303
f7995301-7551-4e1d-8396-ffe3c9497ace,311418,Pathogenic variants damage cell composition an...,DCM/ACM heart cell atlas: Cardiomyocytes,392,30149
572f3f3e-d3e4-4d13-8e2b-88215e508481,195395,Cells of the adult human heart,Vascular — Cells of the adult human heart,556,30156
1c739a3e-c3f5-49d5-98e0-73975e751201,191795,Spatial multi-omic map of human myocardial inf...,All-snRNA-Spatial multi-omic map of human myoc...,391,29012
1252c5fb-945f-42d6-b1a8-8a3bd864384b,170281,Pathogenic variants damage cell composition an...,DCM/ACM heart cell atlas: Mural cells,398,6445
ed2b673b-0279-454a-998c-3eec361edf54,142816,Pathogenic variants damage cell composition an...,DCM/ACM heart cell atlas: Fibroblasts,393,29329
78fd69d2-75e4-4207-819a-563139f273c6,125289,Cells of the adult human heart,Ventricular cardiomyocytes — Cells of the adul...,560,28956
1062c0f2-2a44-4cf9-a7c8-b5ed58b4728d,115548,Pathogenic variants damage cell composition an...,DCM/ACM heart cell atlas: Endothelial cells,399,28681


# Adding gene metadata

## Genes info from all datasets

In [14]:
var_somaid = np.nonzero(presence_matrix.sum(axis=0).A1 == presence_matrix.shape[0])[0].tolist()

## Filtering genes matching our soma_joinid

In [15]:
heart_var = all_var.query(f"soma_joinid in {var_somaid}")
heart_var

Unnamed: 0,soma_joinid,feature_id,feature_name,feature_length,nnz,n_measured_obs
37,37,ENSG00000165029,ABCA1,11343,9001659,62902272
38,38,ENSG00000154263,ABCA10,8405,11705270,62845653
40,40,ENSG00000144452,ABCA12,9614,976134,60881315
51,51,ENSG00000231749,ABCA9-AS1,2990,2783190,55808577
55,55,ENSG00000005471,ABCB4,7728,1112933,62817959
...,...,...,...,...,...,...
43728,43728,ENSG00000270792,RP11-103J8.1,2623,2109618,55288866
44236,44236,ENSG00000269124,AC007193.10,472,1045514,49989821
44515,44515,ENSG00000229999,SGSM3-AS1,893,3074526,55144540
44616,44616,ENSG00000224905,AP001347.6,1797,9868751,56221168


# Fetching all single-cell human heart data from the Census

In [16]:
heart_sample_ids = heart_obs["soma_joinid"].to_numpy()
heart_gene_ids = heart_var["soma_joinid"].to_numpy()
heart_adata = cellxgene_census.get_anndata(
    census,
    organism="Homo sapiens",
    obs_coords=heart_sample_ids,
    var_coords=heart_gene_ids,
)

heart_adata

AnnData object with n_obs × n_vars = 3629952 × 1035
    obs: 'soma_joinid', 'dataset_id', 'assay', 'assay_ontology_term_id', 'cell_type', 'cell_type_ontology_term_id', 'development_stage', 'development_stage_ontology_term_id', 'disease', 'disease_ontology_term_id', 'donor_id', 'is_primary_data', 'self_reported_ethnicity', 'self_reported_ethnicity_ontology_term_id', 'sex', 'sex_ontology_term_id', 'suspension_type', 'tissue', 'tissue_ontology_term_id', 'tissue_general', 'tissue_general_ontology_term_id', 'raw_sum', 'nnz', 'raw_mean_nnz', 'raw_variance_nnz', 'n_measured_vars'
    var: 'soma_joinid', 'feature_id', 'feature_name', 'feature_length', 'nnz', 'n_measured_obs'

In [17]:
heart_adata.obs

Unnamed: 0,soma_joinid,dataset_id,assay,assay_ontology_term_id,cell_type,cell_type_ontology_term_id,development_stage,development_stage_ontology_term_id,disease,disease_ontology_term_id,...,suspension_type,tissue,tissue_ontology_term_id,tissue_general,tissue_general_ontology_term_id,raw_sum,nnz,raw_mean_nnz,raw_variance_nnz,n_measured_vars
0,2517346,d567b692-c374-4628-a508-8008f6778f22,10x 3' v2,EFO:0009899,endothelial cell,CL:0000115,sixth decade human stage,HsapDv:0000240,normal,PATO:0000461,...,cell,apex of heart,UBERON:0002098,heart,UBERON:0000948,7123.0,2467,2.887313,62.875374,31071
1,2517347,d567b692-c374-4628-a508-8008f6778f22,10x 3' v2,EFO:0009899,mural cell,CL:0008034,sixth decade human stage,HsapDv:0000240,normal,PATO:0000461,...,cell,apex of heart,UBERON:0002098,heart,UBERON:0000948,6091.0,2054,2.965433,282.782536,31071
2,2517348,d567b692-c374-4628-a508-8008f6778f22,10x 3' v2,EFO:0009899,endothelial cell,CL:0000115,sixth decade human stage,HsapDv:0000240,normal,PATO:0000461,...,cell,apex of heart,UBERON:0002098,heart,UBERON:0000948,6248.0,2426,2.575433,44.868741,31071
3,2517349,d567b692-c374-4628-a508-8008f6778f22,10x 3' v2,EFO:0009899,mural cell,CL:0008034,sixth decade human stage,HsapDv:0000240,normal,PATO:0000461,...,cell,apex of heart,UBERON:0002098,heart,UBERON:0000948,5862.0,1819,3.222650,94.875042,31071
4,2517350,d567b692-c374-4628-a508-8008f6778f22,10x 3' v2,EFO:0009899,endothelial cell,CL:0000115,sixth decade human stage,HsapDv:0000240,normal,PATO:0000461,...,cell,apex of heart,UBERON:0002098,heart,UBERON:0000948,4920.0,2330,2.111588,23.759119,31071
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3629947,62887152,2adb1f8a-a6b1-4909-8ee8-484814e2d4bf,microwell-seq,EFO:0030002,endothelial cell,CL:0000115,11th week post-fertilization human stage,HsapDv:0000048,normal,PATO:0000461,...,cell,heart,UBERON:0000948,heart,UBERON:0000948,560.0,402,1.393035,2.174315,26454
3629948,62887153,2adb1f8a-a6b1-4909-8ee8-484814e2d4bf,microwell-seq,EFO:0030002,endothelial cell,CL:0000115,11th week post-fertilization human stage,HsapDv:0000048,normal,PATO:0000461,...,cell,heart,UBERON:0000948,heart,UBERON:0000948,758.0,515,1.471845,2.724400,26454
3629949,62887154,2adb1f8a-a6b1-4909-8ee8-484814e2d4bf,microwell-seq,EFO:0030002,endothelial cell,CL:0000115,11th week post-fertilization human stage,HsapDv:0000048,normal,PATO:0000461,...,cell,heart,UBERON:0000948,heart,UBERON:0000948,568.0,414,1.371981,1.648220,26454
3629950,62887155,2adb1f8a-a6b1-4909-8ee8-484814e2d4bf,microwell-seq,EFO:0030002,endothelial cell,CL:0000115,11th week post-fertilization human stage,HsapDv:0000048,normal,PATO:0000461,...,cell,heart,UBERON:0000948,heart,UBERON:0000948,418.0,300,1.393333,2.312999,26454


In [None]:
heart_adata.obs('suspension_type').unique

In [18]:
heart_adata.var

Unnamed: 0,soma_joinid,feature_id,feature_name,feature_length,nnz,n_measured_obs
0,37,ENSG00000165029,ABCA1,11343,9001659,62902272
1,38,ENSG00000154263,ABCA10,8405,11705270,62845653
2,40,ENSG00000144452,ABCA12,9614,976134,60881315
3,51,ENSG00000231749,ABCA9-AS1,2990,2783190,55808577
4,55,ENSG00000005471,ABCB4,7728,1112933,62817959
...,...,...,...,...,...,...
1030,43728,ENSG00000270792,RP11-103J8.1,2623,2109618,55288866
1031,44236,ENSG00000269124,AC007193.10,472,1045514,49989821
1032,44515,ENSG00000229999,SGSM3-AS1,893,3074526,55144540
1033,44616,ENSG00000224905,AP001347.6,1797,9868751,56221168


In [19]:
heart_adata.var_names = heart_adata.var["feature_name"]

### Closing census

In [20]:
census.close()
del census

## Data saving

In [21]:
sc.write('data/cg_heart_all_tissue.h5ad',heart_adata)