## Notebook for the adding of the Kong-2023 dataset to the GCA_Wang_Smillie dataset 

- **Developed by**: Anna Maguza
- **Institute of Computational Biology - Computational Health Centre - Helmholtz Munich**
- 11th April 2023

#### Import required modules

In [29]:
import scanpy as sc
import numpy as np
import anndata as ad
import pandas as pd
import anndata as an

In [14]:
import matplotlib.pyplot as plt
from matplotlib import axes
from matplotlib import pylab

#### Setup Cells

In [17]:
%matplotlib inline

In [18]:
sc.settings.verbosity = 3             # verbosity: errors (0), warnings (1), info (2), hints (3)
sc.logging.print_header()
sc.settings.set_figure_params(dpi = 160, color_map = 'magma_r', dpi_save = 300, vector_friendly = True)

scanpy==1.9.3 anndata==0.8.0 umap==0.5.3 numpy==1.23.5 scipy==1.9.1 pandas==1.3.5 scikit-learn==1.2.2 statsmodels==0.13.5 pynndescent==0.5.8


In [19]:
def X_is_raw(adata):
    return np.array_equal(adata.X.sum(axis=0).astype(int), adata.X.sum(axis=0))

### Upload All Datasets 


In [20]:
# Upload GCA_Smillie_Wang dataset
input = '/Users/anna.maguza/Desktop/Data/Processed_datasets/Healthy_reference/GCA_Smillie_Wang_unprocessed/Reference_map_(Gut_cell_atlas+Smilie+Wang).h5ad'
GCA_adata = sc.read_h5ad(input)

In [25]:
# Upload Kong-2023 dataset
input_Kong = '/Users/anna.maguza/Desktop/Data/Gut_project/Healthy_gut_data/Kong_2023/Raw_anndata/adata_Kong_2023_healthy_with_QC.h5ad'
Kong_adata = sc.read_h5ad(input_Kong)

input_Kong_raw = '/Users/anna.maguza/Desktop/Data/Gut_project/Healthy_gut_data/Kong_2023/Raw_anndata/Kong_2023_raw_anndata.h5ad'
Kong_adata_raw = sc.read_h5ad(input_Kong_raw)

### Datasets Preprocessing

In [26]:
X_is_raw(GCA_adata)

True

In [27]:
X_is_raw(Kong_adata)

False

In [28]:
X_is_raw(Kong_adata_raw)

True

In [30]:
Kong_adata

AnnData object with n_obs × n_vars = 181806 × 27830
    obs: 'cell_type', 'tissue', 'batch', 'biosample_id', 'n_genes', 'n_counts', 'Chem', 'Site', 'Type', 'donor_id', 'Layer', 'Celltype', 'sex', 'species', 'species__ontology_label', 'library_preparation_protocol', 'library_preparation_protocol__ontology_label', 'organ', 'organ__ontology_label', 'disease', 'disease__ontology_label', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_50_genes', 'pct_counts_in_top_100_genes', 'pct_counts_in_top_200_genes', 'pct_counts_in_top_500_genes', 'total_counts_mito', 'pct_counts_mito', 'total_counts_ribo', 'pct_counts_ribo', 'S_score', 'G2M_score', 'doublet_scores', 'predicted_doublets', 'doublet_info', 'percent_chrY', 'XIST-counts'
    var: 'gene_id', 'gene_name', 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts', 'mito', 'ribo', 'highly_variable', 'highly_variable_rank', 'me

In [33]:
# Filter out only healthy samples
Kong_adata_raw = Kong_adata_raw[Kong_adata_raw.obs['disease__ontology_label'] == 'normal']

In [34]:
Kong_adata = an.AnnData(X = Kong_adata_raw.X, obs = Kong_adata.obs, var = Kong_adata.var)

In [35]:
X_is_raw(Kong_adata)

True

In [36]:
# Change 'CO' in Kong_adata.obs['Site'] to 'Colon', 'TI' to 'Terminal Ileum', and 'SB' to 'Small Bowel'
Kong_adata.obs['Site'] = Kong_adata.obs['Site'].replace('CO', 'Colon')
Kong_adata.obs['Site'] = Kong_adata.obs['Site'].replace('TI', 'Terminal Ileum')
Kong_adata.obs['Site'] = Kong_adata.obs['Site'].replace('SB', 'Small Bowel')

In [37]:
# Rename columns in Kong-2023 dataset as in GCA_Smillie_Wang dataset
Kong_adata.obs.rename(columns = {'donor_id': 'Donor_ID'}, inplace = True)
Kong_adata.obs['Study_name'] = 'Kong 2023'
Kong_adata.obs.rename(columns = {'biosample_id': 'Sample_ID'}, inplace = True)
Kong_adata.obs.rename(columns = {'Site': 'Location'}, inplace = True)

GCA_adata.obs.rename(columns = {'10X': 'Library_Preparation_Protocol'}, inplace = True)
Kong_adata.obs.rename(columns = {'library_preparation_protocol__ontology_label': 'Library_Preparation_Protocol'}, inplace = True)

In [38]:
# Delete log1p_n_genes_by_counts, log1p_total_counts, log1p_total_counts, pct_counts_in_top_50_genes, pct_counts_in_top_100_genes, pct_counts_in_top_200_genes, pct_counts_in_top_500_genes, and total_counts columns from Kong_adata.obs
Kong_adata.obs.drop(columns = ['log1p_n_genes_by_counts', 'log1p_total_counts', 'log1p_total_counts', 'pct_counts_in_top_50_genes', 'pct_counts_in_top_100_genes', 'pct_counts_in_top_200_genes', 'pct_counts_in_top_500_genes', 'total_counts'], inplace = True)

In [39]:
# Change Geneder to the same values
Kong_adata.obs['sex'] = Kong_adata.obs['sex'].replace('male', 'Male')
Kong_adata.obs['sex'] = Kong_adata.obs['sex'].replace('female', 'Female')
Kong_adata.obs.rename(columns = {'sex': 'Sex'}, inplace = True)
GCA_adata.obs.rename(columns = {'Gender': 'Sex'}, inplace = True)
GCA_adata.obs['Sex'] = GCA_adata.obs['Sex'].replace('M', 'Male')
GCA_adata.obs['Sex'] = GCA_adata.obs['Sex'].replace('F', 'Female')

In [40]:
Kong_adata.obs['doublet_info'].value_counts()

False    181574
True        232
Name: doublet_info, dtype: int64

In [41]:
# Make QC metrics columns the same
GCA_adata.obs.rename(columns = {'total_counts_mt': 'total_counts_mito'}, inplace = True)
Kong_adata.obs.rename(columns = {'pct_counts_mito': 'percent_mito'}, inplace = True)
Kong_adata.obs.rename(columns = {'pct_counts_ribo': 'percent_ribo'}, inplace = True)

In [42]:
# Uniform the values in the 'Diagnosis' column
GCA_adata.obs['Diagnosis'] = GCA_adata.obs['Diagnosis'].replace('fetal', 'Fetal Healthy')
GCA_adata.obs['Diagnosis'] = GCA_adata.obs['Diagnosis'].replace('Healthy', 'Healthy adult')
GCA_adata.obs['Diagnosis'] = GCA_adata.obs['Diagnosis'].replace('nan', 'Healthy adult')
GCA_adata.obs['Diagnosis'] = GCA_adata.obs['Diagnosis'].replace('Non-inflamed', 'Adult Ulcerative Colitis Non-inflamed')
Kong_adata.obs['disease__ontology_label'] = Kong_adata.obs['disease__ontology_label'].replace('normal', 'Healthy adult')
Kong_adata.obs.rename(columns = {'disease__ontology_label': 'Diagnosis'}, inplace = True)

In [43]:
# Drop unnecessary columns
Kong_adata.obs.drop(columns = ['G2M_score', 'percent_chrY', 'doublet_info', 'XIST-counts', 'S_score'], inplace = True)

In [44]:
Kong_adata.obs.rename(columns = {'Celltype': 'Cell States'}, inplace = True)
Kong_adata.obs.rename(columns = {'cell_type': 'Cell Type'}, inplace = True)
GCA_adata.obs.rename(columns = {'CellType': 'Cell Type'}, inplace = True)

In [45]:
GCA_adata.obs['Age_group'] = GCA_adata.obs['Age_group'].replace('nan', 'Adult')
Kong_adata.obs['Age_group'] = 'Adult'

In [46]:
# Drop unnecessary columns
Kong_adata.obs.drop(columns = ['organ', 'tissue', 'Type', 'library_preparation_protocol', 'disease', 'organ__ontology_label', 'species', 'species__ontology_label'], inplace = True)

In [47]:
# Create a list with the Kong_adata.obs_keys and GCA_adata.obs_keys, and then make a list with the columns that are present in Kong_adata.obs_keys but not in GCA_adata.obs_keys
Kong_adata_keys = list(Kong_adata.obs_keys())
GCA_adata_keys = list(GCA_adata.obs_keys())
Kong_adata_keys_not_in_GCA_adata_keys = list(set(Kong_adata_keys) - set(GCA_adata_keys))

### Uniform the Cell Types

In [48]:
GCA_adata.obs['Cell Type'].value_counts()

Mesenchymal        155794
Epithelial         129929
Neuronal            17594
T cells             17394
Plasma cells        16445
Endothelial         13082
B cells             11381
Myeloid              9820
Stem Cell            2508
Red blood cells      1346
Name: Cell Type, dtype: int64

In [49]:
Kong_adata.obs['Cell Type'].value_counts()

Immune        81599
Epithelial    77638
Stromal       22569
Name: Cell Type, dtype: int64

In [50]:
# Filter only healthy adult samples
GCA_adata_adult = GCA_adata[GCA_adata.obs['Diagnosis'] == 'Healthy adult']

# Make a list of cell states in the Endothelial cell type in GCA_adata
GCA_adata_Endothelial_cell_states = list(GCA_adata_adult.obs.loc[GCA_adata_adult.obs['Cell Type'] == 'Endothelial', 'Cell States'].unique())

In [51]:
# List of specific Cell States to look for
cell_states_list = [
    'Stem cells OLFM4 LGR5',
    'Stem cells OLFM4 PCNA',
    'Stem cells OLFM4 GSTA1',
    'Stem cells OLFM4'
]

# Add 'Stem Cell' as a new category to the 'Cell Type' column
Kong_adata.obs['Cell Type'] = Kong_adata.obs['Cell Type'].cat.add_categories(['Stem Cell'])

# Update 'Cell Type' based on the condition in 'Cell States'
Kong_adata.obs.loc[Kong_adata.obs['Cell States'].isin(cell_states_list), 'Cell Type'] = 'Stem Cell'

In [52]:
# Create a new DataFrame with unique cell states and their corresponding cell types
unique_cell_states = GCA_adata_adult.obs.drop_duplicates(subset=['Cell States', 'Cell Type'])

# Reset index and drop the original index
unique_cell_states.reset_index(drop=True, inplace=True)

# Drop all columns except 'Cell States' and 'Cell Type' from unique_cell_states
unique_cell_states = unique_cell_states[['Cell States', 'Cell Type']]


In [53]:
# List of specific Cell States to look for
cell_states_list = [
    'B cells',
    'B cells AICDA LRMP'
]

# Add 'B cells' as a new category to the 'Cell Type' column
Kong_adata.obs['Cell Type'] = Kong_adata.obs['Cell Type'].cat.add_categories(['B cells'])

# Update 'Cell Type' based on the condition in 'Cell States'
Kong_adata.obs.loc[Kong_adata.obs['Cell States'].isin(cell_states_list), 'Cell Type'] = 'B cells'

In [54]:
# List of specific Cell States to look for
cell_states_list = [
    'T cells CD4 FOSB',
    'T cells CD4 IL17A',
    'T cells CD8',
    'T cells CD8 KLRG1',
    'T cells Naive CD4',
    'T cells OGT',
    'Tregs',
    'NK cells KLRF1 CD3G-',
    'NK-like cells ID3 ENTPD1',
    'ILCs',
    'IELs ID3 ENTPD1',
    'Lymphatics'
]

# Add 'T cells' as a new category to the 'Cell Type' column
Kong_adata.obs['Cell Type'] = Kong_adata.obs['Cell Type'].cat.add_categories(['T cells'])

# Update 'Cell Type' based on the condition in 'Cell States'
Kong_adata.obs.loc[Kong_adata.obs['Cell States'].isin(cell_states_list), 'Cell Type'] = 'T cells'

In [55]:
cell_states_list = [
    'Plasma cells'
]

# Add 'Plasma cells' as a new category to the 'Cell Type' column
Kong_adata.obs['Cell Type'] = Kong_adata.obs['Cell Type'].cat.add_categories(['Plasma cells'])

# Update 'Cell Type' based on the condition in 'Cell States'
Kong_adata.obs.loc[Kong_adata.obs['Cell States'].isin(cell_states_list), 'Cell Type'] = 'Plasma cells'

In [56]:
cell_states_list = [
    'Cycling cells',
    'DC1',
    'DC2 CD1D',
    'DC2 CD1D-',
    'Immune Cycling cells',
    'Macrophages',
    'Macrophages CCL3 CCL4',
    'Macrophages CXCL9 CXCL10',
    'Macrophages LYVE1',
    'Macrophages Metallothionein',
    'Macrophages PLA2G2D',
    'Mast cells',
    'Mature DCs',
    'Monocytes CHI3L1 CYP27A1', 
    'Monocytes HBB',
    'Monocytes S100A8 S100A9'
]

# Add 'Myeloid' as a new category to the 'Cell Type' column
Kong_adata.obs['Cell Type'] = Kong_adata.obs['Cell Type'].cat.add_categories(['Myeloid'])

# Update 'Cell Type' based on the condition in 'Cell States'
Kong_adata.obs.loc[Kong_adata.obs['Cell States'].isin(cell_states_list), 'Cell Type'] = 'Myeloid'

In [57]:
Kong_adata.obs['Cell Type'].value_counts()

Epithelial      61278
Plasma cells    30236
T cells         29649
Stromal         22125
Stem Cell       16360
Myeloid         15767
B cells          6391
Immune              0
Name: Cell Type, dtype: int64

In [58]:
cell_states_list = [
    'Glial cells'
]

# Add 'Neuronal' as a new category to the 'Cell Type' column
Kong_adata.obs['Cell Type'] = Kong_adata.obs['Cell Type'].cat.add_categories(['Neuronal'])

# Update 'Cell Type' based on the condition in 'Cell States'
Kong_adata.obs.loc[Kong_adata.obs['Cell States'].isin(cell_states_list), 'Cell Type'] = 'Neuronal'

In [59]:
cell_states_list = [
    'Endothelial cells CA4 CD36',
    'Endothelial cells CD36',
    'Endothelial cells DARC',
    'Endothelial cells LTC4S SEMA3G'
]

# Add 'Endothelial' as a new category to the 'Cell Type' column
Kong_adata.obs['Cell Type'] = Kong_adata.obs['Cell Type'].cat.add_categories(['Endothelial'])

# Update 'Cell Type' based on the condition in 'Cell States'
Kong_adata.obs.loc[Kong_adata.obs['Cell States'].isin(cell_states_list), 'Cell Type'] = 'Endothelial'

In [60]:
cell_states_list = [
    'Activated fibroblasts CCL19 ADAMADEC1',
    'Fibroblasts ADAMDEC1',
    'Fibroblasts KCNN3 LY6H',
    'Fibroblasts NPY SLITRK6',
    'Fibroblasts SFRP2 SLPI',
    'Fibroblasts SMOC2 PTGIS',
    'Inflammatory fibroblasts IL11 CHI3L1',
    'Pericytes HIGD1B STEAP4',
    'Pericytes RERGL NTRK2',
    'Stromal Cycling cells',
    'Myofibroblasts GREM1 GREM2',
    'Myofibroblasts HHIP NPNT'
]

# Add 'Mesenchymal' as a new category to the 'Cell Type' column
Kong_adata.obs['Cell Type'] = Kong_adata.obs['Cell Type'].cat.add_categories(['Mesenchymal'])

# Update 'Cell Type' based on the condition in 'Cell States'
Kong_adata.obs.loc[Kong_adata.obs['Cell States'].isin(cell_states_list), 'Cell Type'] = 'Mesenchymal'

In [61]:
Kong_adata.obs['Cell Type'].value_counts()

Epithelial      61278
Plasma cells    30236
T cells         29649
Mesenchymal     16863
Stem Cell       16360
Myeloid         15767
B cells          6391
Endothelial      3549
Neuronal         1713
Immune              0
Stromal             0
Name: Cell Type, dtype: int64

### Files concatination

In [62]:
Kong_adata.obs['Cell States Kong'] = Kong_adata.obs['Cell States']
GCA_adata.obs['Cell States GCA'] = GCA_adata.obs['Cell States']

In [63]:
# Merge the two datasets
adata = Kong_adata.concatenate(GCA_adata, batch_categories=['Kong', 'GCA'], index_unique=None)

  [AnnData(sparse.csr_matrix(a.shape), obs=a.obs) for a in all_adatas],


In [64]:
adata.obs_keys

<bound method AnnData.obs_keys of AnnData object with n_obs × n_vars = 557099 × 23616
    obs: 'Cell Type', 'batch', 'Sample_ID', 'n_genes', 'n_counts', 'Chem', 'Location', 'Donor_ID', 'Layer', 'Cell States', 'Gender', 'library_preparation_protocol__ontology_label', 'Diagnosis', 'n_genes_by_counts', 'total_counts_mito', 'percent_mito', 'total_counts_ribo', 'percent_ribo', 'doublet_scores', 'predicted_doublets', 'Study_name', 'Age_group', 'Cell States Kong', 'UniqueCell_ID', 'Age', 'Region code', 'Fraction', 'Cell States GCA'
    var: 'gene_id-Kong', 'gene_name-Kong', 'n_cells_by_counts-Kong', 'mean_counts-Kong', 'log1p_mean_counts-Kong', 'pct_dropout_by_counts-Kong', 'total_counts-Kong', 'log1p_total_counts-Kong', 'mito-Kong', 'ribo-Kong', 'highly_variable-Kong', 'highly_variable_rank-Kong', 'means-Kong', 'variances-Kong', 'variances_norm-Kong'>

In [65]:
adata.obs.index = adata.obs.index.astype(str)
adata.var.index = adata.var.index.astype(str)
adata.obs_names = adata.obs_names.astype(str)
adata.var_names = adata.var_names.astype(str)
adata.obs = adata.obs.astype(str)

In [66]:
# Write adata to file
adata.write('/Users/anna.maguza/Desktop/Data/Processed_datasets/Healthy_reference/GCA_Kong_Smillie_Wang_unprocessed/GCA_Kong_Wang_Smillie_raw.h5ad')