## Notebook for preparing the AnnData objects used for dataset integration

- **Developed by**: Anna Maguza
- **Institute of Computational Biology - Computational Health Centre - Helmholtz Munich**
- v230410

### Import Packages

In [1]:
import scanpy as sc
import pandas as pd
import numpy as np
import os
import anndata as ad

In [2]:
def X_is_raw(adata):
    """Return True when every value stored in ``adata.X`` is a whole number.

    The check is done on the individual values rather than on column sums,
    because fractional values can add up to an integer (0.5 + 0.5) and would
    then be mistaken for raw counts.

    Parameters
    ----------
    adata
        Object exposing an ``X`` attribute that is either a dense array or a
        SciPy sparse matrix.

    Returns
    -------
    bool
        True if all stored values are integers (an empty matrix counts as
        raw), False otherwise, including when NaN or infinite values occur.
    """
    from scipy import sparse  # local import: the notebook's import cell stays untouched

    X = adata.X
    # Implicit zeros of a sparse matrix are integers by definition, so only
    # the explicitly stored entries need to be inspected.
    values = X.tocsr().data if sparse.issparse(X) else np.asarray(X)

    # Boolean and integer dtypes cannot hold fractional values.
    if values.dtype.kind in 'biu':
        return True

    # Walk through the values in chunks so the temporary arrays stay small
    # even for matrices with a very large number of stored entries.
    flat = values.reshape(-1)
    chunk = 1 << 22
    return all(
        bool(np.all(np.mod(flat[start:start + chunk], 1) == 0))
        for start in range(0, flat.size, chunk)
    )

### Datasets Upload

In [3]:
# Load the labelled stem-cell dataset.
# The path variables carry a `_stem` suffix (same scheme as `input_Kong` and
# `input_GCA`) so that the builtin `input()` is no longer shadowed.
input_stem = '/Users/anna.maguza/Desktop/Data/Processed_datasets/Stem_cells_diversity/scNYM/scnym_output_7K.h5ad'
output_stem = '/Users/anna.maguza/Desktop/Data/Processed_datasets/Stem_cells_diversity/scNYM/scnym_output_7K_output.h5ad'
adata_stem = sc.read_h5ad(input_stem)

In [4]:
# Load the Kong 2023 AnnData; the matching output path is defined next to it
input_Kong, output_Kong = (
    '/Users/anna.maguza/Desktop/Data/Gut_project/Healthy_gut_data/Kong_2023/Raw_anndata/Kong_2023_raw_anndata.h5ad',
    '/Users/anna.maguza/Desktop/Data/Gut_project/Healthy_gut_data/Kong_2023/Processed_anndata/Kong_2023_raw_anndata_output.h5ad',
)
adata_Kong = sc.read(input_Kong)

In [5]:
# Load the Gut Cell Atlas reference AnnData; the matching output path is defined next to it
input_GCA, output_GCA = (
    '/Users/anna.maguza/Desktop/Data/Processed_datasets/Healthy_reference/GCA_Smillie_Wang_unprocessed/Reference_map_(Gut_cell_atlas+Smilie+Wang).h5ad',
    '/Users/anna.maguza/Desktop/Data/Processed_datasets/Healthy_reference/output.h5ad',
)
adata_GCA = sc.read_h5ad(input_GCA)

### Explore Datasets

In [6]:
# Run the X_is_raw integer check on the Gut Cell Atlas matrix
X_is_raw(adata_GCA)

True

In [7]:
# Run the X_is_raw integer check on the Kong matrix
X_is_raw(adata_Kong)

True

In [8]:
# Keep only the healthy ('normal') samples.
# Subsetting an AnnData returns a view; take an explicit copy so the later
# in-place edits of `.obs` / `.var` act on a real object instead of
# triggering an implicit copy of the view.
healthy_mask = adata_Kong.obs['disease__ontology_label'] == 'normal'
adata_Kong = adata_Kong[healthy_mask].copy()

In [9]:
# Name the GCA var index 'gene_name'; the index values themselves stay unchanged
adata_GCA.var.index = adata_GCA.var.index.rename('gene_name')

In [10]:
# Spell out the abbreviated sampling sites in the Kong obs table
site_names = (
    ('CO', 'Colon'),
    ('TI', 'Terminal Ileum'),
    ('SB', 'Small Bowel'),
)
site = adata_Kong.obs['Site']
for short_code, full_name in site_names:
    site = site.replace(short_code, full_name)
adata_Kong.obs['Site'] = site

  adata_Kong.obs['Site'] = adata_Kong.obs['Site'].replace('CO', 'Colon')


In [11]:
# Give the Kong obs columns the names shared by both datasets
kong_obs_renames = {
    'donor_id': 'Donor_ID',
    'biosample_id': 'Sample_ID',
    'Site': 'Location',
    'library_preparation_protocol__ontology_label': 'Library_Preparation_Protocol',
}
adata_Kong.obs.rename(columns=kong_obs_renames, inplace=True)

# Tag every Kong cell with its study of origin
adata_Kong.obs['Study_name'] = 'Kong 2023'

# The GCA object stores the library protocol under '10X'
adata_GCA.obs.rename(columns={'10X': 'Library_Preparation_Protocol'}, inplace=True)

In [12]:
# Preserve the current Kong var index as a leading 'gene_id' column,
# then switch the index to the gene names
adata_Kong.var.insert(0, 'gene_id', adata_Kong.var.index)
adata_Kong.var.index = adata_Kong.var['gene_name']

# Deduplicate cell names and gene names in both objects
for dataset in (adata_Kong, adata_GCA):
    dataset.obs_names_make_unique()
for dataset in (adata_Kong, adata_GCA):
    dataset.var_names_make_unique()

In [13]:
# Use the same gender labels ('Male' / 'Female') in both datasets
kong_sex = adata_Kong.obs['sex']
for raw_label, label in (('male', 'Male'), ('female', 'Female')):
    kong_sex = kong_sex.replace(raw_label, label)
adata_Kong.obs['sex'] = kong_sex
adata_Kong.obs.rename(columns={'sex': 'Gender'}, inplace=True)

gca_gender = adata_GCA.obs['Gender']
for raw_label, label in (('M', 'Male'), ('F', 'Female')):
    gca_gender = gca_gender.replace(raw_label, label)
adata_GCA.obs['Gender'] = gca_gender

In [14]:
# Donors labelled 'Female'; every other Kong donor is labelled 'Male'
female_donors = ['101694', '110216', '139073', '152638', '157844', '158160', '199129', 'N8', 'N10', 'N13', 'N18', 'N20', 'N21']

# Compare as strings so the lookup still matches if Donor_ID was loaded as a
# numeric or categorical column; isin() replaces the per-row Python loop.
is_female = adata_Kong.obs['Donor_ID'].astype(str).isin(female_donors)
adata_Kong.obs['Gender'] = np.where(is_female, 'Female', 'Male')

In [15]:
# Bring the 'Diagnosis' labels of both datasets onto one vocabulary.
# The replacements are applied one after another, in this order.
gca_diagnosis = adata_GCA.obs['Diagnosis']
for old_label, new_label in (
    ('fetal', 'Fetal Healthy'),
    ('Healthy', 'Healthy adult'),
    ('nan', 'Healthy adult'),
    ('Non-inflamed', 'Adult Ulcerative Colitis Non-inflamed'),
):
    gca_diagnosis = gca_diagnosis.replace(old_label, new_label)
adata_GCA.obs['Diagnosis'] = gca_diagnosis

# Kong: relabel 'normal' and rename the column to 'Diagnosis'
kong_disease = adata_Kong.obs['disease__ontology_label'].replace('normal', 'Healthy adult')
adata_Kong.obs['disease__ontology_label'] = kong_disease
adata_Kong.obs.rename(columns={'disease__ontology_label': 'Diagnosis'}, inplace=True)

In [16]:
# Kong: 'Celltype' becomes the study-specific cell-state column and
# 'cell_type' the shared 'Cell Type' column
kong_label_columns = {'Celltype': 'Cell States Kong', 'cell_type': 'Cell Type'}
adata_Kong.obs.rename(columns=kong_label_columns, inplace=True)

# GCA: only the spelling of the shared column differs
adata_GCA.obs.rename(columns={'CellType': 'Cell Type'}, inplace=True)

In [17]:
# Label every Kong cell as 'Adult'
adata_Kong.obs['Age_group'] = 'Adult'

# GCA entries stored as the string 'nan' are set to 'Adult' as well
gca_age_group = adata_GCA.obs['Age_group'].replace('nan', 'Adult')
adata_GCA.obs['Age_group'] = gca_age_group

In [18]:
# Kong obs columns that are removed from the table
unused_kong_columns = [
    'organ',
    'tissue',
    'Type',
    'library_preparation_protocol',
    'disease',
    'organ__ontology_label',
    'species',
    'species__ontology_label',
]
adata_Kong.obs.drop(columns=unused_kong_columns, inplace=True)

In [19]:
# Fill missing Sample_ID values in the GCA obs table with a synthetic ID of
# the form '<Donor_ID>_<Age>_<Region code>_<Fraction>'.
gca_obs = adata_GCA.obs

# Cast the ID components to strings first, so the concatenation below cannot
# fail on numeric or missing entries.
for column in ('Donor_ID', 'Age', 'Region code', 'Fraction'):
    gca_obs[column] = gca_obs[column].astype('str')

fallback_sample_id = (
    gca_obs['Donor_ID'] + '_' + gca_obs['Age'] + '_' +
    gca_obs['Region code'] + '_' + gca_obs['Fraction']
)

# A Sample_ID counts as missing when it is a real missing value or the
# literal string 'nan' in any capitalisation. Doing this column-wise avoids a
# Python call per row and does not break on non-string entries.
sample_id = gca_obs['Sample_ID'].astype(object)
is_missing = sample_id.isna() | sample_id.astype(str).str.lower().eq('nan')
adata_GCA.obs['Sample_ID'] = sample_id.where(~is_missing, fallback_sample_id)

### Make Uniform Cell Type names

In [20]:
# Kong cell states that are relabelled as 'Stem Cell'
cell_states_list = [
    'Stem cells OLFM4 LGR5',
    'Stem cells OLFM4 PCNA',
    'Stem cells OLFM4 GSTA1',
    'Stem cells OLFM4'
]

# Register 'Stem Cell' as a category of the categorical 'Cell Type' column
cell_type = adata_Kong.obs['Cell Type'].cat.add_categories(['Stem Cell'])
adata_Kong.obs['Cell Type'] = cell_type

# Relabel the cells whose Kong cell state is in the list
is_stem = adata_Kong.obs['Cell States Kong'].isin(cell_states_list)
adata_Kong.obs.loc[is_stem, 'Cell Type'] = 'Stem Cell'

In [21]:
# Kong cell states that are relabelled as 'B cells'
cell_states_list = [
    'B cells',
    'B cells AICDA LRMP'
]

# Register 'B cells' as a category of the categorical 'Cell Type' column
cell_type = adata_Kong.obs['Cell Type'].cat.add_categories(['B cells'])
adata_Kong.obs['Cell Type'] = cell_type

# Relabel the cells whose Kong cell state is in the list
is_b_cell = adata_Kong.obs['Cell States Kong'].isin(cell_states_list)
adata_Kong.obs.loc[is_b_cell, 'Cell Type'] = 'B cells'

In [22]:
# Kong cell states that are relabelled as 'T cells'
# NOTE(review): 'Lymphatics' is grouped under 'T cells' here; the name suggests
# lymphatic vessels rather than lymphocytes - confirm this is intended.
cell_states_list = [
    'T cells CD4 FOSB',
    'T cells CD4 IL17A',
    'T cells CD8',
    'T cells CD8 KLRG1',
    'T cells Naive CD4',
    'T cells OGT',
    'Tregs',
    'NK cells KLRF1 CD3G-',
    'NK-like cells ID3 ENTPD1',
    'ILCs',
    'IELs ID3 ENTPD1',
    'Lymphatics'
]

# Register 'T cells' as a category of the categorical 'Cell Type' column
cell_type = adata_Kong.obs['Cell Type'].cat.add_categories(['T cells'])
adata_Kong.obs['Cell Type'] = cell_type

# Relabel the cells whose Kong cell state is in the list
is_t_cell = adata_Kong.obs['Cell States Kong'].isin(cell_states_list)
adata_Kong.obs.loc[is_t_cell, 'Cell Type'] = 'T cells'


In [23]:
# Kong cell state that is relabelled as 'Plasma cells'
cell_states_list = [
    'Plasma cells'
]

# Register 'Plasma cells' as a category of the categorical 'Cell Type' column
cell_type = adata_Kong.obs['Cell Type'].cat.add_categories(['Plasma cells'])
adata_Kong.obs['Cell Type'] = cell_type

# Relabel the cells whose Kong cell state is in the list
is_plasma = adata_Kong.obs['Cell States Kong'].isin(cell_states_list)
adata_Kong.obs.loc[is_plasma, 'Cell Type'] = 'Plasma cells'

In [24]:
# Kong cell states that are relabelled as 'Myeloid'
# NOTE(review): the generic 'Cycling cells' state is included here - confirm
# that those cells belong to the myeloid compartment.
cell_states_list = [
    'Cycling cells',
    'DC1',
    'DC2 CD1D',
    'DC2 CD1D-',
    'Immune Cycling cells',
    'Macrophages',
    'Macrophages CCL3 CCL4',
    'Macrophages CXCL9 CXCL10',
    'Macrophages LYVE1',
    'Macrophages Metallothionein',
    'Macrophages PLA2G2D',
    'Mast cells',
    'Mature DCs',
    'Monocytes CHI3L1 CYP27A1',
    'Monocytes HBB',
    'Monocytes S100A8 S100A9'
]

# Register 'Myeloid' as a category of the categorical 'Cell Type' column
cell_type = adata_Kong.obs['Cell Type'].cat.add_categories(['Myeloid'])
adata_Kong.obs['Cell Type'] = cell_type

# Relabel the cells whose Kong cell state is in the list
is_myeloid = adata_Kong.obs['Cell States Kong'].isin(cell_states_list)
adata_Kong.obs.loc[is_myeloid, 'Cell Type'] = 'Myeloid'

In [25]:
# Kong cell state that is relabelled as 'Neuronal'
cell_states_list = [
    'Glial cells'
]

# Register 'Neuronal' as a category of the categorical 'Cell Type' column
cell_type = adata_Kong.obs['Cell Type'].cat.add_categories(['Neuronal'])
adata_Kong.obs['Cell Type'] = cell_type

# Relabel the cells whose Kong cell state is in the list
is_neuronal = adata_Kong.obs['Cell States Kong'].isin(cell_states_list)
adata_Kong.obs.loc[is_neuronal, 'Cell Type'] = 'Neuronal'

In [26]:
# Kong cell states that are relabelled as 'Endothelial'
cell_states_list = [
    'Endothelial cells CA4 CD36',
    'Endothelial cells CD36',
    'Endothelial cells DARC',
    'Endothelial cells LTC4S SEMA3G'
]

# Register 'Endothelial' as a category of the categorical 'Cell Type' column
cell_type = adata_Kong.obs['Cell Type'].cat.add_categories(['Endothelial'])
adata_Kong.obs['Cell Type'] = cell_type

# Relabel the cells whose Kong cell state is in the list
is_endothelial = adata_Kong.obs['Cell States Kong'].isin(cell_states_list)
adata_Kong.obs.loc[is_endothelial, 'Cell Type'] = 'Endothelial'


In [27]:
# Kong cell states that are relabelled as 'Mesenchymal'
cell_states_list = [
    'Activated fibroblasts CCL19 ADAMADEC1',
    'Fibroblasts ADAMDEC1',
    'Fibroblasts KCNN3 LY6H',
    'Fibroblasts NPY SLITRK6',
    'Fibroblasts SFRP2 SLPI',
    'Fibroblasts SMOC2 PTGIS',
    'Inflammatory fibroblasts IL11 CHI3L1',
    'Pericytes HIGD1B STEAP4',
    'Pericytes RERGL NTRK2',
    'Stromal Cycling cells',
    'Myofibroblasts GREM1 GREM2',
    'Myofibroblasts HHIP NPNT'
]

# Register 'Mesenchymal' as a category of the categorical 'Cell Type' column
cell_type = adata_Kong.obs['Cell Type'].cat.add_categories(['Mesenchymal'])
adata_Kong.obs['Cell Type'] = cell_type

# Relabel the cells whose Kong cell state is in the list
is_mesenchymal = adata_Kong.obs['Cell States Kong'].isin(cell_states_list)
adata_Kong.obs.loc[is_mesenchymal, 'Cell Type'] = 'Mesenchymal'

# Show how many cells carry each 'Cell Type' label after the relabelling
adata_Kong.obs['Cell Type'].value_counts()

Epithelial      61278
Plasma cells    30236
T cells         29649
Mesenchymal     16863
Stem Cell       16360
Myeloid         15767
B cells          6391
Endothelial      3549
Neuronal         1713
Immune              0
Stromal             0
Name: Cell Type, dtype: int64

### Dataset Concatenation

In [28]:
# Display the Kong obs table
adata_Kong.obs

Unnamed: 0_level_0,Cell Type,batch,Sample_ID,n_genes,n_counts,Chem,Location,Donor_ID,Layer,Cell States Kong,Gender,Library_Preparation_Protocol,Diagnosis,Study_name,Age_group
cell_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
H197396_N1-TCAGCAATCTTTACGT,Endothelial,0,H197396_N1,2635,10265,v2,Colon,197396,N,Endothelial cells DARC,Male,10x 3' v2,Healthy adult,Kong 2023,Adult
H197396_N1-GTAGGCCTCTTCATGT,Mesenchymal,0,H197396_N1,2559,8477,v2,Colon,197396,N,Myofibroblasts GREM1 GREM2,Male,10x 3' v2,Healthy adult,Kong 2023,Adult
H197396_N1-GAGTCCGTCTTTAGGG,Mesenchymal,0,H197396_N1,2334,7392,v2,Colon,197396,N,Myofibroblasts GREM1 GREM2,Male,10x 3' v2,Healthy adult,Kong 2023,Adult
H197396_N1-TGTGTTTCAACAACCT,Mesenchymal,0,H197396_N1,2320,7314,v2,Colon,197396,N,Myofibroblasts GREM1 GREM2,Male,10x 3' v2,Healthy adult,Kong 2023,Adult
H197396_N1-CATCGGGGTAGCACGA,Mesenchymal,0,H197396_N1,2234,6959,v2,Colon,197396,N,Myofibroblasts GREM1 GREM2,Male,10x 3' v2,Healthy adult,Kong 2023,Adult
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
H180844_N4-GATCGATCATATACGC,Plasma cells,5,H180844_N4,101,228,v2,Terminal Ileum,180844,N,Plasma cells,Male,10x 3' v2,Healthy adult,Kong 2023,Adult
H180844_N4-ACTGCTCAGAAACCTA,Myeloid,5,H180844_N4,221,310,v2,Terminal Ileum,180844,N,Cycling cells,Male,10x 3' v2,Healthy adult,Kong 2023,Adult
H180844_N4-CATCAGACACGGCCAT,Myeloid,5,H180844_N4,204,306,v2,Terminal Ileum,180844,N,Macrophages,Male,10x 3' v2,Healthy adult,Kong 2023,Adult
H180844_N4-TATGCCCCAATGACCT,Myeloid,5,H180844_N4,117,232,v2,Terminal Ileum,180844,N,Mast cells,Male,10x 3' v2,Healthy adult,Kong 2023,Adult


In [29]:
# Remove the QC columns already present in the GCA obs table
gca_qc_columns = ['percent_ribo', 'percent_mito', 'n_counts', 'total_counts_ribo']
adata_GCA.obs.drop(columns=gca_qc_columns, inplace=True)

In [30]:
# Remove further per-cell ID, QC and doublet columns from the GCA obs table
gca_extra_columns = [
    'UniqueCell_ID',
    'n_genes',
    'n_genes_by_counts',
    'total_counts_mt',
    'doublet_scores',
    'predicted_doublets',
]
adata_GCA.obs.drop(columns=gca_extra_columns, inplace=True)

In [31]:
# Display the GCA obs table after the column cleanup
adata_GCA.obs

Unnamed: 0_level_0,Sample_ID,Cell Type,Study_name,Donor_ID,Diagnosis,Age,Region code,Fraction,Gender,Library_Preparation_Protocol,batch,Age_group,Location,Cell States
cell_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
AACACGTTCTTGCATT_Ileum-1_Stem Cell,Ileum-1,Stem Cell,Wang,Wang_Donor_1,Healthy adult,,,,,,,Adult,,Stem_Cells_ext
AACCGCGCATGAAGTA_Ileum-1_Stem Cell,Ileum-1,Stem Cell,Wang,Wang_Donor_1,Healthy adult,,,,,,,Adult,,Stem_Cells_ext
AACTCAGAGCGATCCC_Ileum-1_Stem Cell,Ileum-1,Stem Cell,Wang,Wang_Donor_1,Healthy adult,,,,,,,Adult,,Stem_Cells_ext
AACTCCCTCTCAACTT_Ileum-1_Stem Cell,Ileum-1,Stem Cell,Wang,Wang_Donor_1,Healthy adult,,,,,,,Adult,,Stem_Cells_ext
AACTCTTAGCTTCGCG_Ileum-1_Stem Cell,Ileum-1,Stem Cell,Wang,Wang_Donor_1,Healthy adult,,,,,,,Adult,,Stem_Cells_ext
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
N110.LPA.TCGGGACGTCAACTGT,N110.LPA,Stem Cell,Smilie,N110,Adult Ulcerative Colitis Non-inflamed,,,,,,,Adult,LP,Stem_Cells_ext
N110.LPA.TGAGCATTCCAGTAGT,N110.LPA,Stem Cell,Smilie,N110,Adult Ulcerative Colitis Non-inflamed,,,,,,,Adult,LP,Stem_Cells_ext
N110.LPA.TGGCCAGAGAGGACGG,N110.LPA,Stem Cell,Smilie,N110,Adult Ulcerative Colitis Non-inflamed,,,,,,,Adult,LP,Stem_Cells_ext
N110.LPA.TTCTTAGCAGTCCTTC,N110.LPA,Stem Cell,Smilie,N110,Adult Ulcerative Colitis Non-inflamed,,,,,,,Adult,LP,Stem_Cells_ext


In [32]:
# Display the Kong obs table
adata_Kong.obs

Unnamed: 0_level_0,Cell Type,batch,Sample_ID,n_genes,n_counts,Chem,Location,Donor_ID,Layer,Cell States Kong,Gender,Library_Preparation_Protocol,Diagnosis,Study_name,Age_group
cell_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
H197396_N1-TCAGCAATCTTTACGT,Endothelial,0,H197396_N1,2635,10265,v2,Colon,197396,N,Endothelial cells DARC,Male,10x 3' v2,Healthy adult,Kong 2023,Adult
H197396_N1-GTAGGCCTCTTCATGT,Mesenchymal,0,H197396_N1,2559,8477,v2,Colon,197396,N,Myofibroblasts GREM1 GREM2,Male,10x 3' v2,Healthy adult,Kong 2023,Adult
H197396_N1-GAGTCCGTCTTTAGGG,Mesenchymal,0,H197396_N1,2334,7392,v2,Colon,197396,N,Myofibroblasts GREM1 GREM2,Male,10x 3' v2,Healthy adult,Kong 2023,Adult
H197396_N1-TGTGTTTCAACAACCT,Mesenchymal,0,H197396_N1,2320,7314,v2,Colon,197396,N,Myofibroblasts GREM1 GREM2,Male,10x 3' v2,Healthy adult,Kong 2023,Adult
H197396_N1-CATCGGGGTAGCACGA,Mesenchymal,0,H197396_N1,2234,6959,v2,Colon,197396,N,Myofibroblasts GREM1 GREM2,Male,10x 3' v2,Healthy adult,Kong 2023,Adult
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
H180844_N4-GATCGATCATATACGC,Plasma cells,5,H180844_N4,101,228,v2,Terminal Ileum,180844,N,Plasma cells,Male,10x 3' v2,Healthy adult,Kong 2023,Adult
H180844_N4-ACTGCTCAGAAACCTA,Myeloid,5,H180844_N4,221,310,v2,Terminal Ileum,180844,N,Cycling cells,Male,10x 3' v2,Healthy adult,Kong 2023,Adult
H180844_N4-CATCAGACACGGCCAT,Myeloid,5,H180844_N4,204,306,v2,Terminal Ileum,180844,N,Macrophages,Male,10x 3' v2,Healthy adult,Kong 2023,Adult
H180844_N4-TATGCCCCAATGACCT,Myeloid,5,H180844_N4,117,232,v2,Terminal Ileum,180844,N,Mast cells,Male,10x 3' v2,Healthy adult,Kong 2023,Adult


In [33]:
# Remove the count-based QC columns from the Kong obs table
adata_Kong.obs.drop(columns=['n_counts', 'n_genes'], inplace=True)

In [34]:
# GCA: keep a dataset-specific copy of its 'Cell States' column
gca_states = adata_GCA.obs['Cell States']
adata_GCA.obs['Cell States GCA'] = gca_states.copy()

# Kong: expose its dataset-specific states under the shared 'Cell States' name
kong_states = adata_Kong.obs['Cell States Kong']
adata_Kong.obs['Cell States'] = kong_states.copy()

In [35]:
# Stack the GCA object (labelled 'reference') and the Kong object (labelled
# 'query') into one AnnData; the label is stored in the 'dataset' obs column
# and the cell names are kept as they are (index_unique=None)
adata = adata_GCA.concatenate(
    adata_Kong,
    batch_key='dataset',
    batch_categories=['reference', 'query'],
    index_unique=None,
)

  [AnnData(sparse.csr_matrix(a.shape), obs=a.obs) for a in all_adatas],


In [36]:
# Display the obs table of the concatenated object
adata.obs

Unnamed: 0_level_0,Sample_ID,Cell Type,Study_name,Donor_ID,Diagnosis,Age,Region code,Fraction,Gender,Library_Preparation_Protocol,batch,Age_group,Location,Cell States,Cell States GCA,Chem,Layer,Cell States Kong,dataset
cell_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
AACACGTTCTTGCATT_Ileum-1_Stem Cell,Ileum-1,Stem Cell,Wang,Wang_Donor_1,Healthy adult,,,,,,,Adult,,Stem_Cells_ext,Stem_Cells_ext,,,,reference
AACCGCGCATGAAGTA_Ileum-1_Stem Cell,Ileum-1,Stem Cell,Wang,Wang_Donor_1,Healthy adult,,,,,,,Adult,,Stem_Cells_ext,Stem_Cells_ext,,,,reference
AACTCAGAGCGATCCC_Ileum-1_Stem Cell,Ileum-1,Stem Cell,Wang,Wang_Donor_1,Healthy adult,,,,,,,Adult,,Stem_Cells_ext,Stem_Cells_ext,,,,reference
AACTCCCTCTCAACTT_Ileum-1_Stem Cell,Ileum-1,Stem Cell,Wang,Wang_Donor_1,Healthy adult,,,,,,,Adult,,Stem_Cells_ext,Stem_Cells_ext,,,,reference
AACTCTTAGCTTCGCG_Ileum-1_Stem Cell,Ileum-1,Stem Cell,Wang,Wang_Donor_1,Healthy adult,,,,,,,Adult,,Stem_Cells_ext,Stem_Cells_ext,,,,reference
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
H180844_N4-GATCGATCATATACGC,H180844_N4,Plasma cells,Kong 2023,180844,Healthy adult,,,,Male,10x 3' v2,5,Adult,Terminal Ileum,Plasma cells,,v2,N,Plasma cells,query
H180844_N4-ACTGCTCAGAAACCTA,H180844_N4,Myeloid,Kong 2023,180844,Healthy adult,,,,Male,10x 3' v2,5,Adult,Terminal Ileum,Cycling cells,,v2,N,Cycling cells,query
H180844_N4-CATCAGACACGGCCAT,H180844_N4,Myeloid,Kong 2023,180844,Healthy adult,,,,Male,10x 3' v2,5,Adult,Terminal Ileum,Macrophages,,v2,N,Macrophages,query
H180844_N4-TATGCCCCAATGACCT,H180844_N4,Myeloid,Kong 2023,180844,Healthy adult,,,,Male,10x 3' v2,5,Adult,Terminal Ileum,Mast cells,,v2,N,Mast cells,query


### Add QC values

In [37]:
# General per-cell and per-gene QC metrics with the default settings
sc.pp.calculate_qc_metrics(adata, inplace=True)

# Flag mitochondrial genes (prefix 'MT-') and ribosomal genes (prefixes 'RPS' / 'RPL')
adata.var['mito'] = adata.var_names.str.startswith('MT-')
adata.var['ribo'] = adata.var_names.str.startswith(('RPS', 'RPL'))

# Mitochondrial and ribosomal totals / percentages in a single call, instead of
# recomputing all metrics once per gene group. The flags are set after the
# first call so the resulting var and obs columns keep their original order.
sc.pp.calculate_qc_metrics(adata, qc_vars=['mito', 'ribo'], percent_top=None, log1p=False, inplace=True)

In [39]:
# Remove the log1p and top-N percentage columns from the obs table
extra_qc_columns = [
    'log1p_n_genes_by_counts',
    'log1p_total_counts',
    'pct_counts_in_top_50_genes',
    'pct_counts_in_top_100_genes',
    'pct_counts_in_top_200_genes',
    'pct_counts_in_top_500_genes',
]
adata.obs.drop(columns=extra_qc_columns, inplace=True)

# Display a summary of the combined object
adata

AnnData object with n_obs × n_vars = 557099 × 23616
    obs: 'Sample_ID', 'Cell Type', 'Study_name', 'Donor_ID', 'Diagnosis', 'Age', 'Region code', 'Fraction', 'Gender', 'Library_Preparation_Protocol', 'batch', 'Age_group', 'Location', 'Cell States', 'Cell States GCA', 'Chem', 'Layer', 'Cell States Kong', 'dataset', 'n_genes_by_counts', 'total_counts', 'total_counts_mito', 'pct_counts_mito', 'total_counts_ribo', 'pct_counts_ribo'
    var: 'gene_id-query', 'gene_name-query', 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts', 'mito', 'ribo'

### Add correct stem cells labels

In [42]:
# Expose the obs index of the stem-cell object as a regular 'Cell_ID' column
adata_stem.obs['Cell_ID'] = adata_stem.obs.index

In [46]:
def remove_suffix(cell_id):
    """Return ``cell_id`` without a trailing '-reference' or '-query'.

    Only one suffix is stripped ('-reference' is tried first); an ID ending in
    neither suffix is returned unchanged.
    """
    matched = next(
        (tag for tag in ('-reference', '-query') if cell_id.endswith(tag)),
        '',
    )
    return cell_id[:len(cell_id) - len(matched)]

# Strip the '-reference' / '-query' suffix from every stem-cell ID
stripped_ids = adata_stem.obs['Cell_ID'].map(remove_suffix)
adata_stem.obs['Cell_ID'] = stripped_ids

In [50]:
# Expose the obs index of the combined object as a regular 'Cell_ID' column
adata.obs['Cell_ID'] = adata.obs.index

# Lookup table: cell ID -> scNym annotation taken from adata_stem
cell_id_to_scNym = dict(zip(adata_stem.obs['Cell_ID'], adata_stem.obs['scNym']))

# Overwrite 'Cell States' with the scNym annotation wherever a cell ID is in
# the lookup table and keep the current label otherwise. The column-wise
# map / np.where avoids one Python call per row of adata.obs, and working on
# plain arrays keeps the operation positional (no index alignment involved).
cell_ids = adata.obs['Cell_ID']
has_scNym_label = cell_ids.isin(list(cell_id_to_scNym)).to_numpy()
scNym_labels = cell_ids.map(cell_id_to_scNym).to_numpy(dtype=object)
current_states = adata.obs['Cell States'].astype(object).to_numpy()
adata.obs['Cell States'] = np.where(has_scNym_label, scNym_labels, current_states)

In [53]:
# Write the combined object with all genes to disk
all_genes_path = '/Users/anna.maguza/Desktop/Data/Processed_datasets/Datasets Integration/Initial Datasets/All_cells_all_genes.h5ad'
adata.write_h5ad(all_genes_path)

### Calculate Highly Variable Genes

#### 3000 HVGs

In [54]:
# Keep a copy of the full-gene object; the 5000- and 7000-HVG sections
# restore `adata` from it before subsetting
adata_raw = adata.copy()

In [55]:
# Store a copy of X in the 'counts' layer (read by the HVG call via layer="counts")
adata.layers['counts'] = adata.X.copy()

In [56]:
# Select 3000 highly variable genes (flavor "seurat_v3") from the 'counts'
# layer, with 'Library_Preparation_Protocol' as batch key, and subset adata to them
hvg_settings = dict(
    flavor="seurat_v3",
    n_top_genes=3000,
    layer="counts",
    batch_key="Library_Preparation_Protocol",
    subset=True,
    span=1,
)
sc.pp.highly_variable_genes(adata, **hvg_settings)

In [57]:
# Write the 3000-HVG object to disk
hvg_3000_path = '/Users/anna.maguza/Desktop/Data/Processed_datasets/Datasets Integration/Initial Datasets/All_cells_3000_HVGs.h5ad'
adata.write_h5ad(hvg_3000_path)

#### 5000 HVGs

In [58]:
# Start again from the full-gene copy before selecting 5000 HVGs
adata = adata_raw.copy()

In [59]:
# Store a copy of X in the 'counts' layer (read by the HVG call via layer="counts")
adata.layers['counts'] = adata.X.copy()

In [60]:
# Select 5000 highly variable genes (flavor "seurat_v3") from the 'counts'
# layer, with 'Library_Preparation_Protocol' as batch key, and subset adata to them
hvg_settings = dict(
    flavor="seurat_v3",
    n_top_genes=5000,
    layer="counts",
    batch_key="Library_Preparation_Protocol",
    subset=True,
    span=1,
)
sc.pp.highly_variable_genes(adata, **hvg_settings)

In [61]:
# Write the 5000-HVG object to disk
hvg_5000_path = '/Users/anna.maguza/Desktop/Data/Processed_datasets/Datasets Integration/Initial Datasets/All_cells_5000_HVGs.h5ad'
adata.write_h5ad(hvg_5000_path)

#### 7000 HVGs

In [62]:
# Start again from the full-gene copy before selecting 7000 HVGs
adata = adata_raw.copy()

In [63]:
# Store a copy of X in the 'counts' layer (read by the HVG call via layer="counts")
adata.layers['counts'] = adata.X.copy()

In [66]:
# Select 7000 highly variable genes (flavor "seurat_v3") from the 'counts'
# layer, with 'Library_Preparation_Protocol' as batch key, and subset adata to them
hvg_settings = dict(
    flavor="seurat_v3",
    n_top_genes=7000,
    layer="counts",
    batch_key="Library_Preparation_Protocol",
    subset=True,
    span=1,
)
sc.pp.highly_variable_genes(adata, **hvg_settings)

In [67]:
# Write the 7000-HVG object to disk
hvg_7000_path = '/Users/anna.maguza/Desktop/Data/Processed_datasets/Datasets Integration/Initial Datasets/All_cells_7000_HVGs.h5ad'
adata.write_h5ad(hvg_7000_path)