## Notebook for the Integrated Healthy dataset and Colon Cancer Atlas
### Developed by: Anna Maguza

### Institute of Computational Biology - Computational Health Centre - Helmholtz Munich

### 15 May 2023

#### Load required packages

In [None]:
import numpy as np
import pandas as pd
import scanpy as sc
import anndata as an
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
import scipy as sci

#### Setup Cells

In [None]:
%matplotlib inline

In [None]:
# Scanpy session setup: logging level, version report, plotting defaults.
sc.settings.verbosity = 3  # scale: 0 = errors, 1 = warnings, 2 = info, 3 = hints
sc.logging.print_header()
figure_defaults = {'dpi': 80, 'facecolor': 'white'}
sc.settings.set_figure_params(**figure_defaults)

In [None]:
def X_is_raw(adata):
    """Return True when every value stored in ``adata.X`` is a whole number.

    The check looks at the individual matrix entries rather than at column
    sums, because fractional values can add up to an integer (0.5 + 0.5) and
    would otherwise be reported as raw counts.

    Parameters
    ----------
    adata
        Object exposing the expression matrix as ``.X`` (dense array or
        scipy sparse matrix).

    Returns
    -------
    bool
        True if all entries are integral (an empty matrix counts as True),
        False otherwise, including when NaN values are present.
    """
    from scipy.sparse import issparse

    X = adata.X
    if issparse(X):
        # Implicit zeros are integral by definition, so only the explicitly
        # stored entries need to be inspected.
        if X.format not in ("csr", "csc", "coo"):
            X = X.tocsr()
        values = X.data
    else:
        values = np.asarray(X)
    return bool(np.all(np.mod(values, 1) == 0))

#### Upload Data

In [None]:
# Read the Colon Cancer Atlas AnnData object from its local .h5ad file.
input_path_cancer = (
    '/Users/anna.maguza/Desktop/Data/Gut_project/Human_Colon_Cancer_Atlas/anndata/Colon_cancer_atlas_anndata.h5ad'
)
Cancer_adata = sc.read_h5ad(
    input_path_cancer,
)

In [None]:
# Sanity check with X_is_raw (defined above): does the loaded matrix look like integer counts?
X_is_raw(Cancer_adata)

### Preprocess obs in Cancer data

In [None]:
# List the per-cell annotation columns available in .obs.
Cancer_adata.obs_keys()

In [None]:
# Count cells per disease label to see which values the column contains
# before filtering on it in the next cell.
Cancer_adata.obs['disease__ontology_label'].value_counts()

In [None]:
# Keep only cells labelled as colon adenocarcinoma.
# Indexing an AnnData object returns a view of its parent; .copy() turns the
# subset into an independent object so that the in-place edits of .obs in the
# following cells do not rely on AnnData's implicit view-to-copy conversion.
is_adenocarcinoma = Cancer_adata.obs['disease__ontology_label'] == 'colon adenocarcinoma'
Cancer_adata = Cancer_adata[is_adenocarcinoma, :].copy()

In [None]:
# List the remaining .obs columns. The call parentheses are required: the bare
# attribute only displays the bound method, not the column names.
Cancer_adata.obs_keys()

In [None]:
# Remove per-cell annotation columns that are not carried forward:
# constant study metadata first, then the atlas' precomputed QC statistics.
columns_to_drop = [
    'disease',
    'species',
    'species__ontology_label',
    'organ',
    'library_preparation_protocol',
    'qc_geneCount',
    'qc_logMappedReads',
    'qc_meanReadsPerUmi',
    'qc_totalReads',
    'qc_logUmiCount',
    'qc_bcSwapFraction',
    'qc_geneSatFraction',
    'qc_seqDupEst',
    'qc_umiSatFraction',
    'qc_emptyDropPval',
    'qc_mitoFraction',
]
for column in columns_to_drop:
    # A missing column raises KeyError, exactly as an explicit `del` would.
    del Cancer_adata.obs[column]

In [None]:
# Tag every cell with a constant study label so its origin stays identifiable.
Cancer_adata.obs['Study_name'] = 'Colon_cancer_atlas'

In [None]:
# Bring the cancer atlas column names in line with those used in Healthy_adata.
# Keys that are not present in .obs are silently skipped by DataFrame.rename;
# 'organ' is one of them here because it was deleted in an earlier cell.
obs_column_names = {
    'biosample_id': 'Sample_ID',
    'donor_id': 'Donor_ID',
    'disease__ontology_label': 'Diagnosis',
    'library_preparation_protocol__ontology_label': 'Library_Preparation_Protocol',
    'sex': 'Sex',
    'organ': 'Location',
    'ClusterTop': 'Cell Type',
}
Cancer_adata.obs.rename(columns=obs_column_names, inplace=True)

In [None]:
# Map the atlas' coarse cluster labels (column 'Cell Type', formerly 'ClusterTop')
# onto the label vocabulary of Healthy_adata.obs['Cell Type'].
cell_type_labels = {
    'Epi': 'Epithelial',
    'Plasma': 'Plasma cells',
    'TNKILC': 'T cells',
    'B': 'B cells',
    'Strom': 'Mesenchymal',
    'Mast': 'Myeloid',
}
# Assign the result back instead of calling replace(..., inplace=True) on the
# selected column: the chained in-place form operates on an intermediate Series
# and does not update .obs when pandas Copy-on-Write is active.
Cancer_adata.obs['Cell Type'] = Cancer_adata.obs['Cell Type'].replace(cell_type_labels)

In [None]:
# Harmonise the sex and library-protocol labels with those used in Healthy_adata.obs.
#
# The rename cell above maps 'sex' to 'Sex', and no 'Gender' column is created
# anywhere in this notebook, so looking up obs['Gender'] raised KeyError.
# NOTE(review): if the healthy reference names this column 'Gender', change the
# rename mapping and this lookup together.
sex_labels = {'male': 'Male', 'female': 'Female'}
protocol_labels = {
    "10X 3' v2 sequencing": "10x 3' v2",
    "10X 3' v3 sequencing": "10x 3' v3",
}

# Results are assigned back rather than using replace(..., inplace=True) on the
# selected column, which acts on an intermediate Series and is not guaranteed
# to update .obs (it does not under pandas Copy-on-Write).
Cancer_adata.obs['Sex'] = Cancer_adata.obs['Sex'].replace(sex_labels)
Cancer_adata.obs['Library_Preparation_Protocol'] = (
    Cancer_adata.obs['Library_Preparation_Protocol'].replace(protocol_labels)
)

### Make the QC the same as in the Healthy dataset

In [None]:
# Calculate scanpy's default quality metrics for the cancer dataset.
# With inplace=True the results are stored on the object itself instead of
# being returned.
sc.pp.calculate_qc_metrics(Cancer_adata, inplace=True)

In [None]:
# Preserve the current var index as the first column ('gene_id') ...
current_gene_ids = Cancer_adata.var.index
Cancer_adata.var.insert(loc=0, column='gene_id', value=current_gene_ids)

# ... then index the genes by the values of the 'gene_name' column instead.
gene_names = Cancer_adata.var['gene_name']
Cancer_adata.var.index = gene_names

In [None]:
# Calculate mitochondrial and ribosomal fractions.
# Genes are flagged by name prefix: 'MT-' marks the group stored as 'mito',
# 'RPS' / 'RPL' mark the group stored as 'ribo'.
Cancer_adata.var['mito'] = Cancer_adata.var_names.str.startswith('MT-')
Cancer_adata.var['ribo'] = Cancer_adata.var_names.str.startswith(('RPS', 'RPL'))

# Both gene groups are passed in a single call so the matrix is scanned once;
# this yields the same total_counts_* / pct_counts_* columns as one call per group.
sc.pp.calculate_qc_metrics(
    Cancer_adata,
    qc_vars=['mito', 'ribo'],
    percent_top=None,
    log1p=False,
    inplace=True,
)

In [None]:
# Display the per-cell annotation table, including the QC columns added above.
Cancer_adata.obs

In [None]:
# Drop the log-transformed and top-N-gene QC columns that are not kept.
unused_qc_columns = (
    'log1p_total_counts',
    'pct_counts_in_top_50_genes',
    'log1p_n_genes_by_counts',
    'pct_counts_in_top_100_genes',
    'pct_counts_in_top_200_genes',
    'pct_counts_in_top_500_genes',
)
for qc_column in unused_qc_columns:
    del Cancer_adata.obs[qc_column]

In [None]:
# Write objects
# Saves the preprocessed cancer AnnData to disk; the "Prepare Epithelial cells"
# section below reads this same file back in.
Cancer_adata.write_h5ad('/Users/anna.maguza/Desktop/Data/Processed_datasets/Cancer_dataset_integration/input_files/all_cells/Colon_cancer_atlas_anndata.h5ad')

# Prepare Epithelial cells

In [None]:
# Load the healthy reference and the preprocessed cancer atlas from disk.
# Dedicated path variables are used instead of `input`, which would shadow the
# Python builtin of the same name for the rest of the session.
healthy_input_path = '/Users/anna.maguza/Desktop/Data/Processed_datasets/Cancer_dataset_integration/input_files/all_cells/Healthy_integrated_data_all_genes.h5ad'
healthy_all_counts = sc.read_h5ad(healthy_input_path)
cancer_input_path = '/Users/anna.maguza/Desktop/Data/Processed_datasets/Cancer_dataset_integration/input_files/all_cells/Colon_cancer_atlas_anndata.h5ad'
Cancer_adata = sc.read_h5ad(cancer_input_path)

In [None]:
# Count cells per diagnosis label in the healthy reference; the next cell
# filters on two of these labels.
healthy_all_counts.obs['Diagnosis'].value_counts()

In [None]:
# Exclude fetal and pediatric samples from the healthy reference.
# Both labels are removed with one boolean mask; label spelling and case are
# matched exactly as given.
excluded_diagnoses = ['Fetal Healthy', 'Pediatric healthy']
keep_cell = ~healthy_all_counts.obs['Diagnosis'].isin(excluded_diagnoses)
healthy_all_counts = healthy_all_counts[keep_cell, :]

In [None]:
# Display the per-cell annotations of the filtered healthy reference.
healthy_all_counts.obs

### Unify Cell States in Healthy reference

In [None]:
# Subset only epithelial cells in both datasets.
# Indexing returns views of the parent objects; .copy() materialises them
# because the following cells modify .obs and X in place.
Cancer_adata = Cancer_adata[Cancer_adata.obs['Cell Type'] == 'Epithelial', :].copy()
healthy_all_counts = healthy_all_counts[healthy_all_counts.obs['Cell Type'] == 'Epithelial', :].copy()


In [None]:
# Start the unified annotation from the existing 'Cell States' labels; the
# original column is left in place and the new one is remapped in the next cell.
healthy_all_counts.obs['Unified Cell States'] = healthy_all_counts.obs['Cell States']

In [None]:
# Collapse the fine-grained cell-state labels into one shared label per
# population. Labels not listed here are left unchanged.
unified_cell_states = {
    "Enterocytes TMIGD1 MEP1A": 'Enterocyte',
    'Enterocytes CA1 CA2 CA4-': 'Enterocyte',
    'Enterocytes TMIGD1 MEP1A GSTA1': 'Enterocyte',
    'Stem cells OLFM4': 'Stem cells',
    'Stem cells OLFM4 LGR5': 'Stem cells',
    'Stem_Cells_GCA': 'Stem cells',
    'Stem cells OLFM4 PCNA': 'Stem cells',
    'Stem_Cells_ext': 'Stem cells',
    'Stem cells OLFM4 GSTA1': 'Stem cells',
    'Tuft': 'Tuft cells',
    'Paneth': 'Paneth cells',
    'Goblet cells SPINK4': 'Goblet cells',
    'Goblet cell': 'Goblet cells',
    'Goblet cells MUC2 TFF1-': 'Goblet cells',
    'Goblet cells MUC2 TFF1': 'Goblet cells',
    'BEST2+ Goblet cell': 'Goblet cells',
    'L cells (PYY+)': 'L cells',
    'EC cells (TAC1+)': 'Enterochromaffin cells',
    'EC cells (NPW+)': 'Enterochromaffin cells',
    'EECs': 'Enteroendocrine cells',
    'BEST4+ epithelial': 'Enterocytes BEST4',
}
# Assign the result back instead of calling replace(..., inplace=True) on the
# selected column: the chained in-place form acts on an intermediate Series and
# does not update .obs when pandas Copy-on-Write is active.
healthy_all_counts.obs['Unified Cell States'] = (
    healthy_all_counts.obs['Unified Cell States'].replace(unified_cell_states)
)

### Counts Normalization

In [None]:
sc.pp.log1p(Cancer_adata)
sc.pp.log1p(healthy_all_counts)

### Extract Highly Variable Genes

In [None]:
healthy_all_counts.layers['raw_counts'] = healthy_all_counts.X.copy()

### HVGs selection
# Calculate HVGs for cancer dataset
sc.pp.highly_variable_genes(
    healthy_all_counts,
    flavor = "seurat_v3",
    n_top_genes = 3000,
    layer = "raw_counts",
    batch_key = "Library_Preparation_Protocol",
    subset = True,
    span = 1
)

In [None]:
df = healthy_all_counts.obs['Cell States'].value_counts()

In [None]:
Cancer_adata.layers['raw_counts'] = Cancer_adata.X.copy()

In [None]:
# Extract same HVGs in the cancer dataset as in the healthy dataset

# Make indexes strings so they compare cleanly with the healthy gene names
Cancer_adata.var.index = Cancer_adata.var.index.astype(str)

# Ensure indexes are unique
Cancer_adata.var_names_make_unique()

# Identify common genes, keeping the order of the healthy reference.
# A plain set intersection has no stable order (string hashing differs between
# interpreter sessions), which made the gene order of the saved files vary
# from run to run.
cancer_genes = set(Cancer_adata.var_names)
common_genes = [gene for gene in healthy_all_counts.var_names if gene in cancer_genes]

# Filter genes; indexing both objects with the same ordered list also gives
# them an identical gene order, so no separate re-ordering step is needed.
healthy_all_counts = healthy_all_counts[:, common_genes]
Cancer_adata = Cancer_adata[:, common_genes]

In [None]:
# Save anndata objects
# Both epithelial subsets are written after log1p and restriction to the shared
# highly variable genes.
# NOTE(review): the cancer file name ends in '_3KA' while the healthy one ends
# in '_3K' — confirm the extra 'A' is intended.
Cancer_adata.write('/Users/anna.maguza/Desktop/Data/Processed_datasets/Cancer_dataset_integration/input_files/Epithelial_cells/Epithelial_Colon_cancer_atlas_normalized_3KA.h5ad')
healthy_all_counts.write('/Users/anna.maguza/Desktop/Data/Processed_datasets/Cancer_dataset_integration/input_files/Epithelial_cells/Epithelial_Healthy_anndata_normalized_3K.h5ad')