# Standardizing the metadata tables
[Cellxgene Datasets](https://cellxgene.cziscience.com/datasets)<br>

CZ CELLxGENE Discover: A single-cell data platform for scalable exploration, analysis and modeling of aggregated data. CZI Single-Cell Biology, et al. bioRxiv 2023.10.30.563174; doi: https://doi.org/10.1101/2023.10.30.563174
- Developed by: Christian Eger
- Würzburg Institute for Systems Immunology - Faculty of Medicine - Julius-Maximilians-Universität Würzburg
- Created on: 2402423
- Last modified: 240424

## Environment
**yaml path:** conda_environments/cellxgene-env.yml

## Import modules

In [1]:
import sys
sys.path.append('../tools/')
import experiment

## Load datasets

In [17]:
# Instantiate the collection object that every later cell operates on.
# NOTE(review): presumably this reads the dataset ids from 'experiments.csv' and
# loads the matching h5ad files from the given folder — confirm in tools/experiment.py.
# Be aware that the last cell of this notebook writes back to 'experiments.csv'.
experiment_collection = experiment.Experiment_Collection(
    dataset_ids_csv='experiments.csv',
    h5ad_folder='../.data/h5ad_files/',
)

100%|██████████| 79/79 [01:06<00:00,  1.20it/s]


## Inspect metadata

### Standardizing obs columns

#### Inspecting obs columns that aren't shared across all experiments

In [3]:
# Take a copy of the collection's `obs_difference` attribute (the obs columns
# that are not shared across all experiments) and display it.
unique_obs_columns = experiment_collection.obs_difference.copy()
unique_obs_columns

['AVN_P_cell',
 'AVN_P_cell_abundance',
 'AVN_bundle_cell',
 'AVN_bundle_cell_abundance',
 'Adip1',
 'Adip1_abundance',
 'Adip2',
 'Adip2_abundance',
 'Adip3',
 'Adip3_abundance',
 'Adip4',
 'Adip4_abundance',
 'Adipocyte',
 'B',
 'B_abundance',
 'B_plasma',
 'B_plasma_abundance',
 'CD14+Mo',
 'CD14+Mo_abundance',
 'CD16+Mo',
 'CD16+Mo_abundance',
 'CD4+T_Tfh',
 'CD4+T_Tfh_abundance',
 'CD4+T_Th1',
 'CD4+T_Th1_abundance',
 'CD4+T_Th2',
 'CD4+T_Th2_abundance',
 'CD4+T_act',
 'CD4+T_act_abundance',
 'CD4+T_naive',
 'CD4+T_naive_abundance',
 'CD4+T_reg',
 'CD4+T_reg_abundance',
 'CD8+T_cytox',
 'CD8+T_cytox_abundance',
 'CD8+T_em',
 'CD8+T_em_abundance',
 'CD8+T_te',
 'CD8+T_te_abundance',
 'CD8+T_trans',
 'CD8+T_trans_abundance',
 'Cardiomyocyte',
 'Cycling.cells',
 'DC',
 'DC_abundance',
 'EC10_CMC-like',
 'EC10_CMC-like_abundance',
 'EC1_cap',
 'EC1_cap_abundance',
 'EC2_cap',
 'EC2_cap_abundance',
 'EC3_cap',
 'EC3_cap_abundance',
 'EC4_immune',
 'EC4_immune_abundance',
 'EC5_art',
 '

In [4]:
# Drop the columns relating to cell types (e.g. 'Adip1' and its
# 'Adip1_abundance' counterpart) so that only other metadata columns remain.
# Entries are matched as *prefixes* of the column name, so short entries are
# broad: 'B' alone already covers 'B', 'B_abundance' and 'B_plasma'.
# Keep one string per line, each followed by a comma: two adjacent string
# literals without a comma are silently concatenated into a single prefix.
remove_columns_list = [
    'Adip',
    'CD4',
    'B_',
    'CD16',
    'vCM',
    'EC',
    'FB',
    'CD8',
    'CD14',
    'AVN',
    'DC',
    'Cardiomyocyte',
    'Cycling.cells',
    'Endothelial',
    'Fibroblast',
    'ILC',
    'LYVE1',
    'Lymphoid',
    'MAIT',
    'Mast',
    'Meso',
    'MoMP',
    'Myeloid',
    'NC',
    'NK',
    'prop',
    'PC',
    'Pericyte',
    'SMC',
    'T/NK',
    'Neu',
    'SAN',
    'aCM',
    'vSMC',
    'gdT',
    'B',
]

# str.startswith accepts a tuple and returns True if any prefix matches.
cell_type_prefixes = tuple(remove_columns_list)
unique_obs_columns = [
    column for column in unique_obs_columns
    if not column.startswith(cell_type_prefixes)
]
unique_obs_columns

['Publication',
 'age',
 'annotation_JC',
 'annotation_final',
 'cell_type_original',
 'donor_type',
 'ethinic_origin',
 'facility',
 'flushed',
 'in_tissue',
 'log1p_n_genes_by_counts',
 'log1p_total_counts',
 'mt_frac',
 'n_counts',
 'n_genes',
 'n_genes_by_counts',
 'pct_counts_in_top_100_genes',
 'pct_counts_in_top_200_genes',
 'pct_counts_in_top_500_genes',
 'pct_counts_in_top_50_genes',
 'percent.mt',
 'region',
 'region_finest',
 'sample',
 'sangerID',
 'total_counts']

#### Creating dictionary for the purpose of standardizing column names

In [5]:
# 'total_counts' and 'n_counts' appear to be the only pair of differently named
# columns that refer to the same data.
# Layout: {standard_name: [all names that should end up as standard_name]}
standard_obs_columns_dict = dict(n_counts=['n_counts', 'total_counts'])

In [6]:
# Apply the column renaming defined above to the experiments in the collection.
# NOTE(review): the call prints the input dict next to a derived rename mapping
# ({'total_counts': 'n_counts'}), apparently once per experiment — consider
# silencing that debug output in tools/experiment.py.
experiment_collection.standardize_experiments_obs_columns_names(standard_obs_columns_dict)

{'n_counts': ['n_counts', 'total_counts']} {'total_counts': 'n_counts'}
{'n_counts': ['n_counts', 'total_counts']} {'total_counts': 'n_counts'}
{'n_counts': ['n_counts', 'total_counts']} {'total_counts': 'n_counts'}
{'n_counts': ['n_counts', 'total_counts']} {'total_counts': 'n_counts'}
{'n_counts': ['n_counts', 'total_counts']} {'total_counts': 'n_counts'}
{'n_counts': ['n_counts', 'total_counts']} {'total_counts': 'n_counts'}
{'n_counts': ['n_counts', 'total_counts']} {'total_counts': 'n_counts'}
{'n_counts': ['n_counts', 'total_counts']} {'total_counts': 'n_counts'}
{'n_counts': ['n_counts', 'total_counts']} {'total_counts': 'n_counts'}
{'n_counts': ['n_counts', 'total_counts']} {'total_counts': 'n_counts'}
{'n_counts': ['n_counts', 'total_counts']} {'total_counts': 'n_counts'}
{'n_counts': ['n_counts', 'total_counts']} {'total_counts': 'n_counts'}
{'n_counts': ['n_counts', 'total_counts']} {'total_counts': 'n_counts'}
{'n_counts': ['n_counts', 'total_counts']} {'total_counts': 'n_c

### Shared adata.obs columns across experiments

In [7]:
# Copy of the obs columns returned by get_obs_intersection() (per the heading:
# the columns shared across experiments). After the renaming step 'n_counts'
# is part of this list.
shared_obs_columns = experiment_collection.get_obs_intersection().copy()
shared_obs_columns

['assay',
 'assay_ontology_term_id',
 'cell_type',
 'cell_type_ontology_term_id',
 'development_stage',
 'development_stage_ontology_term_id',
 'disease',
 'disease_ontology_term_id',
 'donor_id',
 'is_primary_data',
 'n_counts',
 'observation_joinid',
 'organism',
 'organism_ontology_term_id',
 'self_reported_ethnicity',
 'self_reported_ethnicity_ontology_term_id',
 'sex',
 'sex_ontology_term_id',
 'suspension_type',
 'tissue',
 'tissue_ontology_term_id',
 'tissue_type']

### Unique Values in shared adata.obs columns

In [8]:
# 'observation_joinid' and 'n_counts' are left out because their values are
# not relevant for this overview.
remove_columns = ['observation_joinid', 'n_counts']
shared_obs_columns = list(
    filter(lambda name: name not in remove_columns, shared_obs_columns)
)
# One line per remaining column: its name followed by its unique values.
for column in shared_obs_columns:
    unique_values = experiment_collection.get_obs_unique_values(column)
    print(column, unique_values, '\n')

assay ['Visium Spatial Gene Expression'] 

assay_ontology_term_id ['EFO:0010961'] 

cell_type ['innate lymphoid cell', 'CD14-positive monocyte', 'glial cell', 'smooth muscle myoblast', 'adipocyte', 'unknown', 'cardiac pacemaker cell of sinoatrial node', 'mature NK T cell', 'endothelial cell', 'mast cell', 'CD8-positive, alpha-beta T cell', 'immature innate lymphoid cell', 'smooth muscle cell', 'smooth muscle cell of the pulmonary artery', 'vein endothelial cell', 'mucosal invariant T cell', 'neutrophil', 'endothelial cell of artery', 'CD8-positive, alpha-beta cytotoxic T cell', 'naive thymus-derived CD4-positive, alpha-beta T cell', 'dendritic cell, human', 'T-helper 2 cell', 'endothelial cell of lymphatic vessel', 'CD16-negative, CD56-bright natural killer cell, human', 'mesothelial cell', 'cardiac endothelial cell', 'lymphoid lineage restricted progenitor cell', 'endocardial cell', 'regular ventricular cardiac myocyte', 'pericyte', 'cardiac muscle myoblast', 'CD14-low, CD16-positive 

In [9]:
# Mapping used to collapse raw obs values into coarser, standard labels.
# Type:   dict[str, dict[str, list[str]]]
# Layout: {column_1: {new_column_value_1: [old_column_value_1_1, ..., old_column_value_1_n]}}
#
# Every new label must appear only ONCE per column: a repeated key in a dict
# literal silently replaces the earlier entry (this previously emptied the
# 'Monocyte' mapping).
# NOTE(review): some cell types visible in the unique-value listing (e.g.
# 'unknown', 'cardiac pacemaker cell of sinoatrial node',
# 'smooth muscle cell of the pulmonary artery') have no entry here — confirm
# that leaving them unmapped is intended.
obs_values_dict = {
    'cell_type': {
        'B Cell': ['B cell'],
        'T Cell': [
            'T-helper 2 cell',
            'activated CD4-positive, alpha-beta T cell',
            'mucosal invariant T cell',
            'naive thymus-derived CD4-positive, alpha-beta T cell',
            'effector memory CD8-positive, alpha-beta T cell',
            'CD8-positive, alpha-beta cytotoxic T cell',
            'CD8-positive, alpha-beta T cell',
        ],
        'NK Cell': [
            'CD16-negative, CD56-bright natural killer cell, human',
            'CD16-positive, CD56-dim natural killer cell, human',
        ],
        'NK T Cell': ['mature NK T cell'],
        'Macrophage': ['macrophage'],
        'Endothelial Cell': [
            'cardiac endothelial cell',
            'capillary endothelial cell',
            'endothelial cell of lymphatic vessel',
            'endothelial cell of artery',
            'endothelial cell',
            'vein endothelial cell',
        ],
        'Fibroblast': ['fibroblast of cardiac tissue'],
        'Monocyte': ['monocyte', 'CD14-positive monocyte'],
        'Mesothelial Cell': ['mesothelial cell'],
        'Pericyte': ['pericyte'],
        'Dendritic Cell': ['dendritic cell, human'],
        'Plasma Cell': ['plasma cell'],
        'ILC': ['innate lymphoid cell', 'immature innate lymphoid cell'],
        'Glial Cell': ['glial cell'],
        'Endocardial Cell': ['endocardial cell'],
        'Adipocyte': ['adipocyte of epicardial fat of left ventricle', 'adipocyte'],
        'Smooth Muscle Cell': ['smooth muscle cell'],
        'CLP': ['lymphoid lineage restricted progenitor cell'],
        'Neutrophil': ['neutrophil'],
        'Myocyte': ['regular ventricular cardiac myocyte', 'regular atrial cardiac myocyte'],
        'Myoblast': ['cardiac muscle myoblast', 'smooth muscle myoblast'],
        'Mast Cell': ['mast cell'],
    },
    'development_stage': {
        # Decade terms are ordinal: the n-th decade of life spans the ages
        # 10*(n-1) .. 10*n-1, e.g. 'third decade human stage' means 20-29.
        '0-9': [],
        '10-19': [],
        '20-29': ['third decade human stage'],
        '30-39': ['38-year-old human stage'],
        '40-49': [
            '47-year-old human stage',
            '44-year-old human stage',
            '40-year-old human stage',
            '43-year-old human stage',
            'fifth decade human stage',
        ],
        '50-59': [
            '57-year-old human stage',
            '52-year-old human stage',
            '51-year-old human stage',
            '58-year-old human stage',
            '55-year-old human stage',
            'sixth decade human stage',
        ],
        '60-69': [
            '64-year-old human stage',
            '60-year-old human stage',
            '66-year-old human stage',
            '61-year-old human stage',
            '63-year-old human stage',
            'seventh decade human stage',
        ],
        '70-79': ['74-year-old human stage', 'eighth decade human stage'],
        '80-89': [],
        '90-99': [],
        '100-109': [],
    },
}

In [10]:
# Apply the value mapping to the experiments in the collection.
# Judging by the column listing in the next cell, the results are stored in new
# '<column>_standard' columns while the original columns are kept.
experiment_collection.standardize_experiments_obs_columns_values(obs_values_dict)

In [11]:
# Re-read the shared obs columns after the value standardization; the listing
# now also contains 'cell_type_standard' and 'development_stage_standard'.
shared_obs_columns = experiment_collection.get_obs_intersection().copy()
shared_obs_columns

['assay',
 'assay_ontology_term_id',
 'cell_type',
 'cell_type_ontology_term_id',
 'cell_type_standard',
 'development_stage',
 'development_stage_ontology_term_id',
 'development_stage_standard',
 'disease',
 'disease_ontology_term_id',
 'donor_id',
 'is_primary_data',
 'n_counts',
 'observation_joinid',
 'organism',
 'organism_ontology_term_id',
 'self_reported_ethnicity',
 'self_reported_ethnicity_ontology_term_id',
 'sex',
 'sex_ontology_term_id',
 'suspension_type',
 'tissue',
 'tissue_ontology_term_id',
 'tissue_type']

In [12]:
# Same overview as before the standardization: 'observation_joinid' and
# 'n_counts' are left out because their values are not relevant here.
remove_columns = ['observation_joinid', 'n_counts']
shared_obs_columns = [
    name
    for name in shared_obs_columns
    if not (name in remove_columns)
]
for column in shared_obs_columns:
    values_of_column = experiment_collection.get_obs_unique_values(column)
    print(column, values_of_column, '\n')

assay ['Visium Spatial Gene Expression'] 

assay_ontology_term_id ['EFO:0010961'] 

cell_type ['innate lymphoid cell', 'CD14-positive monocyte', 'glial cell', 'smooth muscle myoblast', 'adipocyte', 'unknown', 'cardiac pacemaker cell of sinoatrial node', 'mature NK T cell', 'endothelial cell', 'mast cell', 'CD8-positive, alpha-beta T cell', 'immature innate lymphoid cell', 'smooth muscle cell', 'smooth muscle cell of the pulmonary artery', 'vein endothelial cell', 'mucosal invariant T cell', 'neutrophil', 'endothelial cell of artery', 'CD8-positive, alpha-beta cytotoxic T cell', 'naive thymus-derived CD4-positive, alpha-beta T cell', 'dendritic cell, human', 'T-helper 2 cell', 'endothelial cell of lymphatic vessel', 'CD16-negative, CD56-bright natural killer cell, human', 'mesothelial cell', 'cardiac endothelial cell', 'lymphoid lineage restricted progenitor cell', 'endocardial cell', 'regular ventricular cardiac myocyte', 'pericyte', 'cardiac muscle myoblast', 'CD14-low, CD16-positive 

In [13]:
# Add one summary column per obs column to the dataset table, holding a single
# value for each experiment.
columns_list = ['tissue', 'disease', 'organism', 'assay']

for column in columns_list:
    # Only the first unique value of each experiment is kept.
    # NOTE(review): assumes each experiment has exactly one value per column and
    # that `experiments` is ordered like the rows of `dataset_df` — confirm.
    experiment_collection.dataset_df[column] = [
        experiment_collection.experiments[position].adata.obs[column].unique()[0]
        for position in range(len(experiment_collection.experiments))
    ]

In [14]:
# Display the dataset table, now including the 'tissue', 'disease', 'organism'
# and 'assay' columns filled in by the previous cell.
experiment_collection.dataset_df

Unnamed: 0_level_0,tissue,disease,organism,assay
dataset_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ffa80f3c-f779-49a2-9a7e-275089ece6e2,apex of heart,normal,Homo sapiens,Visium Spatial Gene Expression
ff9fbe4a-f353-4eff-9e3d-1edfde5179ac,apex of heart,normal,Homo sapiens,Visium Spatial Gene Expression
fdf7002a-97d0-4eea-9700-e7d325f074cb,sinoatrial node,normal,Homo sapiens,Visium Spatial Gene Expression
fca1bb0b-76c9-4422-84b7-c225e2c6a3e4,atrioventricular node,normal,Homo sapiens,Visium Spatial Gene Expression
f5dc5434-cd48-4389-a09c-7189dad6d0b1,sinoatrial node,normal,Homo sapiens,Visium Spatial Gene Expression
...,...,...,...,...
570098e3-bfa4-49d0-ad51-40f79d185577,heart left ventricle,myocardial infarction,Homo sapiens,Visium Spatial Gene Expression
5480ae73-a12c-41ad-b10d-69a671502553,heart left ventricle,myocardial infarction,Homo sapiens,Visium Spatial Gene Expression
1f2871a8-3020-4ac4-8b36-fc09bcbb7885,heart left ventricle,myocardial infarction,Homo sapiens,Visium Spatial Gene Expression
1f196854-14e6-4446-9536-297b1f8d8852,heart left ventricle,myocardial infarction,Homo sapiens,Visium Spatial Gene Expression


In [16]:
# Save the enriched dataset table.
# NOTE(review): this overwrites 'experiments.csv', the same file that is passed
# as `dataset_ids_csv` when the collection is created, so the notebook modifies
# its own input. The execution counts (In [16] here, In [17] on the load cell)
# also show the cells were run out of order — verify with Restart & Run All.
experiment_collection.dataset_df.to_csv('experiments.csv')