## Notebook for anndata objects for labels transfer creations

- **Developed by**: Anna Maguza
- **Institute of Computational Biology - Computational Health Centre - Helmholtz Munich**
- 21st April 2023

### Load required modules

In [77]:
import scanpy as sc
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import anndata as an

In [78]:
def X_is_raw(adata):
    return np.array_equal(adata.X.sum(axis=0).astype(int), adata.X.sum(axis=0))

### Read in datasets

In [79]:
# Read reference (Kong-2023 dataset)
input_Kong = '/Users/anna.maguza/Desktop/Data/Gut_project/Healthy_gut_data/Kong_2023/Raw_anndata/Kong_2023_raw_anndata.h5ad'
output_Kong = '/Users/anna.maguza/Desktop/Data/Gut_project/Healthy_gut_data/Kong_2023/Processed_anndata/Kong_2023_raw_anndata_output.h5ad'
reference = sc.read(input_Kong)

In [80]:
# Read query (predicted Stem cells)
input_Stem = '/Users/anna.maguza/Desktop/Data/Processed_datasets/Predicted_stem_cells/Predicted_stem_cells.h5ad'
output_Stem = '/Users/anna.maguza/Desktop/Data/Processed_datasets/Predicted_stem_cells/Predicted_stem_cells_output.h5ad'
query = sc.read(input_Stem)

### Preprocess datasets

In [81]:
X_is_raw(query)

True

In [82]:
X_is_raw(reference)

True

In [83]:
# Filter out only healthy samples
reference = reference[reference.obs['disease__ontology_label'] == 'normal']

In [84]:
# Copy gene_id column in query.var as a first column in query.var with a name 'gene_name'
query.var.insert(0, 'gene_name', query.var.index)
# Make a gene name as index
query.var.index = query.var['gene_name']
# Delete gene_name column
del query.var['gene_name']

In [85]:
# Filter cells that have 'Stem cells' in reference.obs['Celltype']
stem_cells_list = ['Stem cells OLFM4 LGR5', 'Stem cells OLFM4 PCNA', 'Stem cells OLFM4 GSTA1', 'Stem cells OLFM4']
reference = reference[reference.obs['Celltype'].isin(stem_cells_list)]

In [86]:
# Change 'CO' in Kong_adata.obs['Site'] to 'Colon', 'TI' to 'Terminal Ileum', and 'SB' to 'Small Bowel'
reference.obs['Site'] = reference.obs['Site'].replace('CO', 'Colon')
reference.obs['Site'] = reference.obs['Site'].replace('TI', 'Terminal Ileum')
reference.obs['Site'] = reference.obs['Site'].replace('SB', 'Small Bowel')

  reference.obs['Site'] = reference.obs['Site'].replace('CO', 'Colon')


In [87]:
# Rename 'donor_id' column in reference.obs to 'Donor_ID'
reference.obs.rename(columns = {'donor_id': 'Donor_ID'}, inplace = True)
reference.obs['Study_name'] = 'Kong 2023'
reference.obs.rename(columns = {'biosample_id': 'Sample_ID'}, inplace = True)
reference.obs.rename(columns = {'Site': 'Location'}, inplace = True)
#reference.obs.rename(columns = {'cell_type': 'Celltype'}, inplace = True)
query.obs.rename(columns = {'10X': 'Library_Preparation_Protocol'}, inplace = True)
reference.obs.rename(columns = {'library_preparation_protocol__ontology_label': 'Library_Preparation_Protocol'}, inplace = True)

In [88]:
# Make gene_id as a first column in adata.var
reference.var.insert(0, 'gene_id', reference.var.index)
# Make a gene name as index 
reference.var.index = reference.var['gene_name']

reference.obs_names_make_unique()
query.obs_names_make_unique()
reference.var_names_make_unique()
query.var_names_make_unique()

In [89]:
# Change Geneder to the same values
reference.obs['sex'] = reference.obs['sex'].replace('male', 'Male')
reference.obs['sex'] = reference.obs['sex'].replace('female', 'Female')
reference.obs.rename(columns = {'sex': 'Gender'}, inplace = True)
query.obs['Gender'] = query.obs['Gender'].replace('M', 'Male')
query.obs['Gender'] = query.obs['Gender'].replace('F', 'Female')

In [90]:
# Add 'Female' to sex column in adata.obs for donor_id= 101694 , 110216 , 139073 , 152638 , 157844 , 158160 , 199129 , N8 , N10 , N13, N18 , N20 , N21
female_donors = ['101694', '110216', '139073', '152638', '157844', '158160', '199129', 'N8', 'N10', 'N13', 'N18', 'N20', 'N21']

reference.obs['Gender'] = ['Female' if donor in female_donors else 'Male' for donor in reference.obs['Donor_ID']]

In [91]:
# Uniform the values in the 'Diagnosis' column
query.obs['Diagnosis'] = query.obs['Diagnosis'].replace('fetal', 'Fetal Healthy')
query.obs['Diagnosis'] = query.obs['Diagnosis'].replace('Healthy', 'Healthy adult')
query.obs['Diagnosis'] = query.obs['Diagnosis'].replace('nan', 'Healthy adult')
query.obs['Diagnosis'] = query.obs['Diagnosis'].replace('Non-inflamed', 'Adult Ulcerative Colitis Non-inflamed')
reference.obs['disease__ontology_label'] = reference.obs['disease__ontology_label'].replace('normal', 'Healthy adult')
reference.obs.rename(columns = {'disease__ontology_label': 'Diagnosis'}, inplace = True)


In [92]:
reference.obs.rename(columns = {'Celltype': 'Cell States Kong'}, inplace = True)
reference.obs['Cell States'] = 'Stem_Cells_ext'
reference.obs.rename(columns = {'cell_type': 'Cell Type'}, inplace = True)
query.obs.rename(columns = {'CellType': 'Cell Type'}, inplace = True)

In [93]:
query.obs['Age_group'] = query.obs['Age_group'].replace('nan', 'Adult')
reference.obs['Age_group'] = 'Adult'

In [94]:
# Drop unnecessary columns
reference.obs.drop(columns = ['organ', 'tissue', 'Type', 'library_preparation_protocol', 'disease', 'organ__ontology_label', 'species', 'species__ontology_label'], inplace = True)

In [95]:
# Replace the "nan" or "NaN" string in the Sample_ID column
query.obs['Sample_ID'] = query.obs.apply(
    lambda row: row['Sample_ID'] if row['Sample_ID'].lower() != "nan" else
    (row['Donor_ID'] + '_' + str(row['Age']) + '_' + row['Region code'] + '_' + str(row['Fraction'])),
    axis=1
)

query.obs['Donor_ID'] = query.obs['Donor_ID'].astype('str')
query.obs['Age'] = query.obs['Age'].astype('str')
query.obs['Region code'] = query.obs['Region code'].astype('str')
query.obs['Fraction'] = query.obs['Fraction'].astype('str')

query.obs['Sample_ID'] = query.obs['Sample_ID'].where(
    pd.notna(query.obs['Sample_ID']),
    query.obs['Donor_ID'] + '_' + query.obs['Age'].astype(str) + '_' +
    query.obs['Region code'] + '_' + query.obs['Fraction'].astype(str)
)

In [96]:
reference.obs['seed_labels'] = reference.obs['Cell States Kong'].copy()
query.obs['seed_labels'] = 'Unlabeled'

### Concatenate reference and query

In [97]:
# Concatenate reference and query
adata = reference.concatenate(query, batch_key = 'dataset', batch_categories = ['reference', 'query'])

  [AnnData(sparse.csr_matrix(a.shape), obs=a.obs) for a in all_adatas],


In [98]:
del reference, query

In [99]:
X_is_raw(adata)

True

In [100]:
adata.obs['seed_labels'].value_counts()

Unlabeled                 14938
Stem cells OLFM4           7053
Stem cells OLFM4 LGR5      4972
Stem cells OLFM4 PCNA      2931
Stem cells OLFM4 GSTA1     1404
Name: seed_labels, dtype: int64

In [101]:
# Calculate quality control metrics
sc.pp.calculate_qc_metrics(adata, inplace=True)

# Calculate mitochondrial and rybosomal fraction
adata.var['mito'] = adata.var_names.str.startswith(("MT-"))  # annotate the group of ribosomal genes as 'ribo'
sc.pp.calculate_qc_metrics(adata, qc_vars=['mito'], percent_top=None, log1p=False, inplace=True)
adata.var['ribo'] = adata.var_names.str.startswith(("RPS","RPL"))  # annotate the group of ribosomal genes as 'ribo'
sc.pp.calculate_qc_metrics(adata, qc_vars=['ribo'], percent_top=None, log1p=False, inplace=True)

In [102]:
del adata.obs['n_genes'], adata.obs['n_counts'], adata.obs['log1p_n_genes_by_counts'], adata.obs['log1p_total_counts'], adata.obs['pct_counts_in_top_50_genes'], adata.obs['pct_counts_in_top_100_genes'], adata.obs['pct_counts_in_top_200_genes'], adata.obs['pct_counts_in_top_500_genes'], adata.obs['percent_mito'], adata.obs['percent_ribo'], adata.obs['total_counts_mt']

In [103]:
adata.obs_keys

<bound method AnnData.obs_keys of AnnData object with n_obs × n_vars = 31298 × 23616
    obs: 'Cell Type', 'batch', 'Sample_ID', 'Chem', 'Location', 'Donor_ID', 'Layer', 'Cell States Kong', 'Gender', 'Library_Preparation_Protocol', 'Diagnosis', 'Study_name', 'Cell States', 'Age_group', 'seed_labels', 'UniqueCell_ID', 'Age', 'Region code', 'Fraction', 'n_genes_by_counts', 'doublet_scores', 'predicted_doublets', 'total_counts_ribo', 'Cell Label', 'dataset', 'total_counts', 'total_counts_mito', 'pct_counts_mito', 'pct_counts_ribo'
    var: 'gene_id-reference', 'gene_name-reference', 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts', 'mito', 'ribo'>

In [104]:
# Save the adata object
adata.write_h5ad('/Users/anna.maguza/Desktop/Data/Processed_datasets/Stem_cells_diversity/Initial_anndata/predicted_stem_cells_and_Kong_all_genes.h5ad')

### Calculate Highly Variable Genes

#### 3000 HVGs

In [105]:
# Make a copy of the original data
adata_raw = adata.copy()

In [106]:
adata.layers['counts'] = adata.X.copy()

In [107]:
# Calculate 3000 HVGs
sc.pp.highly_variable_genes(
    adata,
    flavor = "seurat_v3",
    n_top_genes = 3000,
    layer = "counts",
    batch_key = "Library_Preparation_Protocol",
    subset = True,
    span = 1
)

In [108]:
# Save the adata object
adata.write_h5ad('/Users/anna.maguza/Desktop/Data/Processed_datasets/Stem_cells_diversity/Initial_anndata/predicted_stem_cells_and_Kong_3K.h5ad')

#### 5000 HVGs

In [109]:
adata = adata_raw.copy()

In [110]:
adata.layers['counts'] = adata.X.copy()

In [111]:
# Calculate 3000 HVGs
sc.pp.highly_variable_genes(
    adata,
    flavor = "seurat_v3",
    n_top_genes = 5000,
    layer = "counts",
    batch_key = "Library_Preparation_Protocol",
    subset = True,
    span = 1
)

In [112]:
# Save the adata object
adata.write_h5ad('/Users/anna.maguza/Desktop/Data/Processed_datasets/Stem_cells_diversity/Initial_anndata/predicted_stem_cells_and_Kong_5K.h5ad')

#### 7000 HVGs

In [113]:
adata = adata_raw.copy()

In [114]:
adata.layers['counts'] = adata.X.copy()

In [115]:
# Calculate 3000 HVGs
sc.pp.highly_variable_genes(
    adata,
    flavor = "seurat_v3",
    n_top_genes = 7000,
    layer = "counts",
    batch_key = "Library_Preparation_Protocol",
    subset = True,
    span = 1
)

In [116]:
# Save the adata object
adata.write_h5ad('/Users/anna.maguza/Desktop/Data/Processed_datasets/Stem_cells_diversity/Initial_anndata/predicted_stem_cells_and_Kong_7K.h5ad')