# Notebook to download heart data from 'CellxGene' 

**Developed by** :Srivalli Kolla

**Created on** : 20 June, 2024

**Last modified** : 20 June, 2024

**Würzburg Institute for Systems Immunology & Julius-Maximilian-Universität Würzburg**

# Importing packages

In [2]:
import cellxgene_census
import numpy as np
import pandas as pd
import scanpy as sc
import os


# Data import and overview

## Query to check the census about dataset required

In [3]:
census = cellxgene_census.open_soma()
summary_table = census["census_info"]["summary_cell_counts"].read().concat().to_pandas()

summary_table.query("organism == 'Homo sapiens' & category == 'tissue_general' & label =='heart'")

The "stable" release is currently 2023-12-15. Specify 'census_version="2023-12-15"' in future calls to open_soma() to ensure data consistency.


Unnamed: 0,soma_joinid,organism,category,ontology_term_id,unique_cell_count,total_cell_count,label
1000,1000,Homo sapiens,tissue_general,UBERON:0000948,1776351,3629952,heart


## Loading data from census

In [4]:
heart_obs = (
    census["census_data"]["homo_sapiens"]
    .obs.read(value_filter="tissue_general == 'heart' and is_primary_data == True")
    .concat()
    .to_pandas()
)
heart_obs

Unnamed: 0,soma_joinid,dataset_id,assay,assay_ontology_term_id,cell_type,cell_type_ontology_term_id,development_stage,development_stage_ontology_term_id,disease,disease_ontology_term_id,...,suspension_type,tissue,tissue_ontology_term_id,tissue_general,tissue_general_ontology_term_id,raw_sum,nnz,raw_mean_nnz,raw_variance_nnz,n_measured_vars
0,2517383,d567b692-c374-4628-a508-8008f6778f22,10x 3' v2,EFO:0009899,endothelial cell,CL:0000115,sixth decade human stage,HsapDv:0000240,normal,PATO:0000461,...,cell,apex of heart,UBERON:0002098,heart,UBERON:0000948,1958.0,1073,1.824790,29.730467,31071
1,2517385,d567b692-c374-4628-a508-8008f6778f22,10x 3' v2,EFO:0009899,endothelial cell,CL:0000115,sixth decade human stage,HsapDv:0000240,normal,PATO:0000461,...,cell,apex of heart,UBERON:0002098,heart,UBERON:0000948,1559.0,755,2.064901,29.954668,31071
2,2517393,d567b692-c374-4628-a508-8008f6778f22,10x 3' v2,EFO:0009899,mural cell,CL:0008034,sixth decade human stage,HsapDv:0000240,normal,PATO:0000461,...,cell,apex of heart,UBERON:0002098,heart,UBERON:0000948,1552.0,722,2.149584,146.340978,31071
3,2517400,d567b692-c374-4628-a508-8008f6778f22,10x 3' v2,EFO:0009899,mural cell,CL:0008034,sixth decade human stage,HsapDv:0000240,normal,PATO:0000461,...,cell,apex of heart,UBERON:0002098,heart,UBERON:0000948,1275.0,686,1.858601,25.736181,31071
4,2517401,d567b692-c374-4628-a508-8008f6778f22,10x 3' v2,EFO:0009899,myeloid cell,CL:0000763,sixth decade human stage,HsapDv:0000240,normal,PATO:0000461,...,cell,apex of heart,UBERON:0002098,heart,UBERON:0000948,1472.0,845,1.742012,8.833837,31071
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1776346,62887152,2adb1f8a-a6b1-4909-8ee8-484814e2d4bf,microwell-seq,EFO:0030002,endothelial cell,CL:0000115,11th week post-fertilization human stage,HsapDv:0000048,normal,PATO:0000461,...,cell,heart,UBERON:0000948,heart,UBERON:0000948,560.0,402,1.393035,2.174315,26454
1776347,62887153,2adb1f8a-a6b1-4909-8ee8-484814e2d4bf,microwell-seq,EFO:0030002,endothelial cell,CL:0000115,11th week post-fertilization human stage,HsapDv:0000048,normal,PATO:0000461,...,cell,heart,UBERON:0000948,heart,UBERON:0000948,758.0,515,1.471845,2.724400,26454
1776348,62887154,2adb1f8a-a6b1-4909-8ee8-484814e2d4bf,microwell-seq,EFO:0030002,endothelial cell,CL:0000115,11th week post-fertilization human stage,HsapDv:0000048,normal,PATO:0000461,...,cell,heart,UBERON:0000948,heart,UBERON:0000948,568.0,414,1.371981,1.648220,26454
1776349,62887155,2adb1f8a-a6b1-4909-8ee8-484814e2d4bf,microwell-seq,EFO:0030002,endothelial cell,CL:0000115,11th week post-fertilization human stage,HsapDv:0000048,normal,PATO:0000461,...,cell,heart,UBERON:0000948,heart,UBERON:0000948,418.0,300,1.393333,2.312999,26454


## Data overview

In [5]:
heart_obs['suspension_type'].unique()

array(['cell', 'nucleus'], dtype=object)

In [None]:
heart_obs['assay'].unique()

array(["10x 3' v2", "10x 3' v3", 'Smart-seq2', 'sci-RNA-seq',
       'microwell-seq'], dtype=object)

In [None]:
heart_obs['disease'].unique()

array(['normal', 'myocardial infarction', 'dilated cardiomyopathy',
       'arrhythmogenic right ventricular cardiomyopathy',
       'non-compaction cardiomyopathy'], dtype=object)

In [None]:
heart_obs[["suspension_type"]].value_counts()

suspension_type
nucleus            1566011
cell                210340
Name: count, dtype: int64

In [None]:
heart_obs[["cell_type"]].value_counts().head(20)

cell_type                          
cardiac muscle cell                    284321
regular ventricular cardiac myocyte    208662
mural cell                             179000
fibroblast of cardiac tissue           170070
endothelial cell                       145334
fibroblast                              98697
pericyte                                92277
native cell                             84552
cardiac muscle myoblast                 66335
myeloid cell                            65888
capillary endothelial cell              59326
cardiac endothelial cell                39669
regular atrial cardiac myocyte          33541
immature innate lymphoid cell           28651
smooth muscle cell                      24091
lymphocyte                              21357
endothelial cell of artery              21337
macrophage                              19370
stromal cell                            16413
endothelial cell of vascular tree       11757
Name: count, dtype: int64

In [None]:
heart_obs[["tissue"]].value_counts()

tissue                         
heart left ventricle               695656
heart right ventricle              300993
interventricular septum            296836
apex of heart                      143972
heart                              112532
left cardiac atrium                 99189
right cardiac atrium                55452
anterior wall of left ventricle     36574
basal zone of heart                 18775
cardiac atrium                       8279
coronary artery                      4867
cardiac ventricle                    3226
Name: count, dtype: int64

# Adding dataset information

## Collection of dataset information

In [None]:
census_datasets = (
    census["census_info"]["datasets"]
    .read(column_names=["collection_name", "dataset_title", "dataset_id", "soma_joinid"])
    .concat()
    .to_pandas()
)
census_datasets = census_datasets.set_index("dataset_id")
census_datasets

Unnamed: 0_level_0,collection_name,dataset_title,soma_joinid
dataset_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2bdd3a2c-2ff4-4314-adf3-8a06b797a33a,Comparative transcriptomics reveals human-spec...,Human: Great apes study,0
f5b0810c-1664-4a62-ad06-be1d9964aa8b,Transcriptomic cytoarchitecture reveals princi...,Dissection: Angular gyrus (AnG),1
e4ddac12-f48f-4455-8e8d-c2a48a683437,Transcriptomic cytoarchitecture reveals princi...,Supercluster: CGE-derived interneurons,2
e2808a6e-e2ea-41b9-b38c-4a08f1677f02,Transcriptomic cytoarchitecture reveals princi...,Dissection: Primary auditory cortex(A1),3
d01c9dff-abd1-4825-bf30-2eb2ba74597e,Transcriptomic cytoarchitecture reveals princi...,Supercluster: Deep layer (non-IT) excitatory n...,4
...,...,...,...
f9ad5649-f372-43e1-a3a8-423383e5a8a2,Molecular characterization of selectively vuln...,Molecular characterization of selectively vuln...,646
456e8b9b-f872-488b-871d-94534090a865,Single-cell atlas of peripheral immune respons...,Single-cell atlas of peripheral immune respons...,647
2adb1f8a-a6b1-4909-8ee8-484814e2d4bf,Construction of a human cell landscape at sing...,Construction of a human cell landscape at sing...,648
e04daea4-4412-45b5-989e-76a9be070a89,A molecular cell atlas of the human lung from ...,"Krasnow Lab Human Lung Cell Atlas, Smart-seq2",649


## Addition of dataset information to our heart data

In [None]:
dataset_cell_counts = pd.DataFrame(heart_obs[["dataset_id"]].value_counts())
dataset_cell_counts = dataset_cell_counts.rename(columns={0: "cell_counts"})
dataset_cell_counts = dataset_cell_counts.merge(census_datasets, on="dataset_id")

dataset_cell_counts

Unnamed: 0_level_0,count,collection_name,dataset_title,soma_joinid
dataset_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
65badd7a-9262-4fd1-9ce2-eb5dc0ca8039,665955,Pathogenic variants damage cell composition an...,DCM/ACM heart cell atlas: All cells,397
d4e69e01-3ba2-4d6b-a15d-e7048f78f22e,486134,Cells of the adult human heart,All — Cells of the adult human heart,563
d567b692-c374-4628-a508-8008f6778f22,216378,Spatially resolved multiomics of human cardiac...,Combined single cell and single nuclei RNA-Seq...,15
1c739a3e-c3f5-49d5-98e0-73975e751201,191795,Spatial multi-omic map of human myocardial inf...,All-snRNA-Spatial multi-omic map of human myoc...,391
f7c1c579-2dc0-47e2-ba19-8165c5a0e353,101749,A human cell atlas of fetal gene expression,Survey of human embryonic development,541
4ed927e9-c099-49af-b8ce-a2652d069333,36574,Single-nucleus cross-tissue molecular referenc...,Single-nucleus cross-tissue molecular referenc...,426
5500c673-1610-40a0-86d9-64d987ae50e6,30889,Integrated adult and foetal heart single-cell ...,Integrated adult and foetal hearts,187
f15e263b-6544-46cb-a46e-e33ab7ce8347,19722,Spatial multi-omic map of human myocardial inf...,Ischemia-snRNA-Spatial multi-omic map of human...,390
53d208b0-2cfd-4366-9866-c3c6114081bc,16372,Tabula Sapiens,Tabula Sapiens - All Cells,533
2adb1f8a-a6b1-4909-8ee8-484814e2d4bf,10783,Construction of a human cell landscape at sing...,Construction of a human cell landscape at sing...,648


# Adding gene information

## Collection of gene information for all Human

In [None]:
all_var = census["census_data"]["homo_sapiens"].ms["RNA"].var.read().concat().to_pandas()
all_var

Unnamed: 0,soma_joinid,feature_id,feature_name,feature_length,nnz,n_measured_obs
0,0,ENSG00000233576,HTR3C2P,1057,69370,19581263
1,1,ENSG00000121410,A1BG,3999,5640476,62641311
2,2,ENSG00000268895,A1BG-AS1,3374,3071864,61946057
3,3,ENSG00000148584,A1CF,9603,734347,58195911
4,4,ENSG00000175899,A2M,6318,7894261,62704378
...,...,...,...,...,...,...
60659,60659,ENSG00000288719,RP4-669P10.21,4252,2826,1248980
60660,60660,ENSG00000288720,RP11-852E15.3,7007,99,1248980
60661,60661,ENSG00000288721,RP5-973N23.5,7765,0,0
60662,60662,ENSG00000288723,RP11-553N16.6,1015,18,1248980


## Collection of gene information specific to our dataset based on 'soma_joinid'

In [None]:
presence_matrix = cellxgene_census.get_presence_matrix(census, "Homo sapiens", "RNA")
presence_matrix = presence_matrix[dataset_cell_counts.soma_joinid, :]
presence_matrix.sum(axis=1).A1genes_measured = presence_matrix.sum(axis=1).A1

In [None]:
genes_measured = presence_matrix.sum(axis=1).A1
dataset_cell_counts["genes_measured"] = genes_measured
dataset_cell_counts

Unnamed: 0_level_0,count,collection_name,dataset_title,soma_joinid,genes_measured
dataset_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
65badd7a-9262-4fd1-9ce2-eb5dc0ca8039,665955,Pathogenic variants damage cell composition an...,DCM/ACM heart cell atlas: All cells,397,30805
d4e69e01-3ba2-4d6b-a15d-e7048f78f22e,486134,Cells of the adult human heart,All — Cells of the adult human heart,563,31303
d567b692-c374-4628-a508-8008f6778f22,216378,Spatially resolved multiomics of human cardiac...,Combined single cell and single nuclei RNA-Seq...,15,31071
1c739a3e-c3f5-49d5-98e0-73975e751201,191795,Spatial multi-omic map of human myocardial inf...,All-snRNA-Spatial multi-omic map of human myoc...,391,29012
f7c1c579-2dc0-47e2-ba19-8165c5a0e353,101749,A human cell atlas of fetal gene expression,Survey of human embryonic development,541,44150
4ed927e9-c099-49af-b8ce-a2652d069333,36574,Single-nucleus cross-tissue molecular referenc...,Single-nucleus cross-tissue molecular referenc...,426,29620
5500c673-1610-40a0-86d9-64d987ae50e6,30889,Integrated adult and foetal heart single-cell ...,Integrated adult and foetal hearts,187,27470
f15e263b-6544-46cb-a46e-e33ab7ce8347,19722,Spatial multi-omic map of human myocardial inf...,Ischemia-snRNA-Spatial multi-omic map of human...,390,20726
53d208b0-2cfd-4366-9866-c3c6114081bc,16372,Tabula Sapiens,Tabula Sapiens - All Cells,533,57042
2adb1f8a-a6b1-4909-8ee8-484814e2d4bf,10783,Construction of a human cell landscape at sing...,Construction of a human cell landscape at sing...,648,26454


# Adding gene metadata

## Genes info from all datasets

In [None]:
var_somaid = np.nonzero(presence_matrix.sum(axis=0).A1 == presence_matrix.shape[0])[0].tolist()

## Filtering genes matching our soma_joinid

In [None]:
heart_var = all_var.query(f"soma_joinid in {var_somaid}")
heart_var

Unnamed: 0,soma_joinid,feature_id,feature_name,feature_length,nnz,n_measured_obs
1,1,ENSG00000121410,A1BG,3999,5640476,62641311
2,2,ENSG00000268895,A1BG-AS1,3374,3071864,61946057
4,4,ENSG00000175899,A2M,6318,7894261,62704378
5,5,ENSG00000245105,A2M-AS1,2948,1637794,62086816
6,6,ENSG00000166535,A2ML1,7156,2156616,60911688
...,...,...,...,...,...,...
44701,44701,ENSG00000272948,AP001412.1,714,485957,56447577
44702,44702,ENSG00000273210,AP001437.1,456,157112,55134991
44729,44729,ENSG00000184441,AP001062.7,4354,568847,56408710
44735,44735,ENSG00000272825,LL21NC02-1C16.2,652,80819,54503871


# Fetching all single-cell human heart data from the Census

In [None]:
heart_sample_ids = heart_obs["soma_joinid"].to_numpy()
heart_gene_ids = heart_var["soma_joinid"].to_numpy()
heart_adata = cellxgene_census.get_anndata(
    census,
    organism="Homo sapiens",
    obs_coords=heart_sample_ids,
    var_coords=heart_gene_ids,
)

heart_adata

AnnData object with n_obs × n_vars = 1776351 × 15871
    obs: 'soma_joinid', 'dataset_id', 'assay', 'assay_ontology_term_id', 'cell_type', 'cell_type_ontology_term_id', 'development_stage', 'development_stage_ontology_term_id', 'disease', 'disease_ontology_term_id', 'donor_id', 'is_primary_data', 'self_reported_ethnicity', 'self_reported_ethnicity_ontology_term_id', 'sex', 'sex_ontology_term_id', 'suspension_type', 'tissue', 'tissue_ontology_term_id', 'tissue_general', 'tissue_general_ontology_term_id', 'raw_sum', 'nnz', 'raw_mean_nnz', 'raw_variance_nnz', 'n_measured_vars'
    var: 'soma_joinid', 'feature_id', 'feature_name', 'feature_length', 'nnz', 'n_measured_obs'

In [None]:
heart_adata.obs

Unnamed: 0,soma_joinid,dataset_id,assay,assay_ontology_term_id,cell_type,cell_type_ontology_term_id,development_stage,development_stage_ontology_term_id,disease,disease_ontology_term_id,...,suspension_type,tissue,tissue_ontology_term_id,tissue_general,tissue_general_ontology_term_id,raw_sum,nnz,raw_mean_nnz,raw_variance_nnz,n_measured_vars
0,2517383,d567b692-c374-4628-a508-8008f6778f22,10x 3' v2,EFO:0009899,endothelial cell,CL:0000115,sixth decade human stage,HsapDv:0000240,normal,PATO:0000461,...,cell,apex of heart,UBERON:0002098,heart,UBERON:0000948,1958.0,1073,1.824790,29.730467,31071
1,2517385,d567b692-c374-4628-a508-8008f6778f22,10x 3' v2,EFO:0009899,endothelial cell,CL:0000115,sixth decade human stage,HsapDv:0000240,normal,PATO:0000461,...,cell,apex of heart,UBERON:0002098,heart,UBERON:0000948,1559.0,755,2.064901,29.954668,31071
2,2517393,d567b692-c374-4628-a508-8008f6778f22,10x 3' v2,EFO:0009899,mural cell,CL:0008034,sixth decade human stage,HsapDv:0000240,normal,PATO:0000461,...,cell,apex of heart,UBERON:0002098,heart,UBERON:0000948,1552.0,722,2.149584,146.340978,31071
3,2517400,d567b692-c374-4628-a508-8008f6778f22,10x 3' v2,EFO:0009899,mural cell,CL:0008034,sixth decade human stage,HsapDv:0000240,normal,PATO:0000461,...,cell,apex of heart,UBERON:0002098,heart,UBERON:0000948,1275.0,686,1.858601,25.736181,31071
4,2517401,d567b692-c374-4628-a508-8008f6778f22,10x 3' v2,EFO:0009899,myeloid cell,CL:0000763,sixth decade human stage,HsapDv:0000240,normal,PATO:0000461,...,cell,apex of heart,UBERON:0002098,heart,UBERON:0000948,1472.0,845,1.742012,8.833837,31071
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1776346,62887152,2adb1f8a-a6b1-4909-8ee8-484814e2d4bf,microwell-seq,EFO:0030002,endothelial cell,CL:0000115,11th week post-fertilization human stage,HsapDv:0000048,normal,PATO:0000461,...,cell,heart,UBERON:0000948,heart,UBERON:0000948,560.0,402,1.393035,2.174315,26454
1776347,62887153,2adb1f8a-a6b1-4909-8ee8-484814e2d4bf,microwell-seq,EFO:0030002,endothelial cell,CL:0000115,11th week post-fertilization human stage,HsapDv:0000048,normal,PATO:0000461,...,cell,heart,UBERON:0000948,heart,UBERON:0000948,758.0,515,1.471845,2.724400,26454
1776348,62887154,2adb1f8a-a6b1-4909-8ee8-484814e2d4bf,microwell-seq,EFO:0030002,endothelial cell,CL:0000115,11th week post-fertilization human stage,HsapDv:0000048,normal,PATO:0000461,...,cell,heart,UBERON:0000948,heart,UBERON:0000948,568.0,414,1.371981,1.648220,26454
1776349,62887155,2adb1f8a-a6b1-4909-8ee8-484814e2d4bf,microwell-seq,EFO:0030002,endothelial cell,CL:0000115,11th week post-fertilization human stage,HsapDv:0000048,normal,PATO:0000461,...,cell,heart,UBERON:0000948,heart,UBERON:0000948,418.0,300,1.393333,2.312999,26454


In [None]:
heart_adata.var

Unnamed: 0,soma_joinid,feature_id,feature_name,feature_length,nnz,n_measured_obs
0,1,ENSG00000121410,A1BG,3999,5640476,62641311
1,2,ENSG00000268895,A1BG-AS1,3374,3071864,61946057
2,4,ENSG00000175899,A2M,6318,7894261,62704378
3,5,ENSG00000245105,A2M-AS1,2948,1637794,62086816
4,6,ENSG00000166535,A2ML1,7156,2156616,60911688
...,...,...,...,...,...,...
15866,44701,ENSG00000272948,AP001412.1,714,485957,56447577
15867,44702,ENSG00000273210,AP001437.1,456,157112,55134991
15868,44729,ENSG00000184441,AP001062.7,4354,568847,56408710
15869,44735,ENSG00000272825,LL21NC02-1C16.2,652,80819,54503871


In [None]:
heart_adata.var_names = heart_adata.var["feature_name"]

### Closing census

In [None]:
census.close()
del census

## Data saving

In [None]:
sc.write('data/cg_heart_all.h5ad',heart_adata)