## Notebook for the Integrated Healthy dataset and Colon Cancer Atlas
### Developed by: Anna Maguza

### Institute of Computational Biology - Computational Health Centre - Hemlholtz Munich

### 15 May 2023

#### Load required packages

In [1]:
import numpy as np
import pandas as pd
import scanpy as sc
import anndata as an
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
import scipy as sci

#### Setup Cells

In [2]:
%matplotlib inline

In [3]:
sc.settings.verbosity = 3             # verbosity: errors (0), warnings (1), info (2), hints (3)
sc.logging.print_header()
sc.settings.set_figure_params(dpi=80, facecolor='white')

  from .autonotebook import tqdm as notebook_tqdm


scanpy==1.9.3 anndata==0.8.0 umap==0.5.3 numpy==1.23.5 scipy==1.9.1 pandas==1.3.5 scikit-learn==1.2.2 statsmodels==0.13.5 pynndescent==0.5.8


In [4]:
def X_is_raw(adata):
    return np.array_equal(adata.X.sum(axis=0).astype(int), adata.X.sum(axis=0))

#### Upload Data

In [27]:
input_path_healthy = '/Users/anna.maguza/Desktop/Data/Processed_datasets/Datasets Integration/Integrated datasets/All_cells_5000_HVGs_scvi_scanvi.h5ad'
Healthy_adata = sc.read_h5ad(input_path_healthy)

In [28]:
X_is_raw(Healthy_adata)

True

In [29]:
input_path_cancer = '/Users/anna.maguza/Desktop/Data/Gut_project/Human_Colon_Cancer_Atlas/anndata/Colon_cancer_atlas_anndata.h5ad'
Cancer_adata = sc.read_h5ad(input_path_cancer)

In [30]:
X_is_raw(Cancer_adata)

False

In [38]:
# Upload unprocessed dataset (before HVGs extraction)
input_path_healthy_all_counts = '/Users/anna.maguza/Desktop/Data/Processed_datasets/Healthy_reference/GCA_Kong_Smillie_Wang_unprocessed/GCA_Kong_Wang_Smillie_raw.h5ad'
healthy_all_counts = sc.read_h5ad(input_path_healthy_all_counts)

In [39]:
X_is_raw(healthy_all_counts)

True

### Preprocess X in Healthy data

In [40]:
healthy_all_counts.X

<557099x23616 sparse matrix of type '<class 'numpy.float32'>'
	with 794816608 stored elements in Compressed Sparse Row format>

In [50]:
Healthy_adata

AnnData object with n_obs × n_vars = 557099 × 5000
    obs: 'Sample_ID', 'Cell Type', 'Study_name', 'Donor_ID', 'Diagnosis', 'Age', 'Region code', 'Fraction', 'Gender', 'Library_Preparation_Protocol', 'batch', 'Age_group', 'Location', 'Cell States', 'Cell States GCA', 'Chem', 'Layer', 'Cell States Kong', 'dataset', 'n_genes_by_counts', 'total_counts', 'total_counts_mito', 'pct_counts_mito', 'total_counts_ribo', 'pct_counts_ribo', 'Cell_ID', '_scvi_batch', '_scvi_labels'
    var: 'gene_id-query', 'gene_name-query', 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts', 'mito', 'ribo', 'highly_variable', 'highly_variable_rank', 'means', 'variances', 'variances_norm', 'highly_variable_nbatches'
    uns: 'Age_group_colors', 'Cell Type_colors', 'Diagnosis_colors', 'Donor_ID_colors', 'Gender_colors', 'Library_Preparation_Protocol_colors', 'Location_colors', 'Study_name_colors', '_scvi_manager_uuid', '_scvi_uuid', 'hvg', 'neighb

In [53]:
healthy_all_counts

AnnData object with n_obs × n_vars = 557099 × 23616
    obs: 'Sample_ID', 'Cell Type', 'Study_name', 'Donor_ID', 'Diagnosis', 'Age', 'Region code', 'Fraction', 'Gender', 'Library_Preparation_Protocol', 'batch', 'Age_group', 'Location', 'Cell States', 'Cell States GCA', 'Chem', 'Layer', 'Cell States Kong', 'dataset', 'n_genes_by_counts', 'total_counts', 'total_counts_mito', 'pct_counts_mito', 'total_counts_ribo', 'pct_counts_ribo', 'Cell_ID', '_scvi_batch', '_scvi_labels'
    var: 'gene_id-Kong', 'gene_name-Kong', 'n_cells_by_counts-Kong', 'mean_counts-Kong', 'log1p_mean_counts-Kong', 'pct_dropout_by_counts-Kong', 'total_counts-Kong', 'log1p_total_counts-Kong', 'mito-Kong', 'ribo-Kong', 'highly_variable-Kong', 'highly_variable_rank-Kong', 'means-Kong', 'variances-Kong', 'variances_norm-Kong'
    uns: 'Age_group_colors', 'Cell Type_colors', 'Diagnosis_colors', 'Donor_ID_colors', 'Gender_colors', 'Library_Preparation_Protocol_colors', 'Location_colors', 'Study_name_colors', '_scvi_manager

In [52]:
# Transfer var and X from unprocessed dataset to processed one
healthy_all_counts.obs = Healthy_adata.obs
healthy_all_counts.obsm = Healthy_adata.obsm
healthy_all_counts.uns = Healthy_adata.uns
healthy_all_counts.obsp = Healthy_adata.obsp

### Preprocess obs in Cancer data

In [54]:
Cancer_adata.obs_keys()

['biosample_id',
 'donor_id',
 'SpecimenType',
 'TissueSource',
 'ProcessingMethod',
 'PatientTypeID',
 'sex',
 'Site',
 'Grade',
 'TumorStage',
 'LymphNodeStatus',
 'MMRStatusTumor',
 'MMRMLH1Tumor',
 'qc_geneCount',
 'qc_logMappedReads',
 'qc_meanReadsPerUmi',
 'qc_totalReads',
 'qc_logUmiCount',
 'qc_bcSwapFraction',
 'qc_geneSatFraction',
 'qc_seqDupEst',
 'qc_umiSatFraction',
 'qc_emptyDropPval',
 'qc_mitoFraction',
 'species',
 'species__ontology_label',
 'disease',
 'disease__ontology_label',
 'organ',
 'organ__ontology_label',
 'library_preparation_protocol',
 'library_preparation_protocol__ontology_label',
 'ClusterFull',
 'ClusterMidway',
 'ClusterTop']

In [55]:
Cancer_adata.obs['disease__ontology_label'].value_counts()

colon adenocarcinoma    258359
normal                  112864
Name: disease__ontology_label, dtype: int64

In [56]:
# Filter only cancer cells
Cancer_adata = Cancer_adata[Cancer_adata.obs['disease__ontology_label'] == 'colon adenocarcinoma', :]

In [57]:
Cancer_adata.obs_keys

<bound method AnnData.obs_keys of View of AnnData object with n_obs × n_vars = 258359 × 43282
    obs: 'biosample_id', 'donor_id', 'SpecimenType', 'TissueSource', 'ProcessingMethod', 'PatientTypeID', 'sex', 'Site', 'Grade', 'TumorStage', 'LymphNodeStatus', 'MMRStatusTumor', 'MMRMLH1Tumor', 'qc_geneCount', 'qc_logMappedReads', 'qc_meanReadsPerUmi', 'qc_totalReads', 'qc_logUmiCount', 'qc_bcSwapFraction', 'qc_geneSatFraction', 'qc_seqDupEst', 'qc_umiSatFraction', 'qc_emptyDropPval', 'qc_mitoFraction', 'species', 'species__ontology_label', 'disease', 'disease__ontology_label', 'organ', 'organ__ontology_label', 'library_preparation_protocol', 'library_preparation_protocol__ontology_label', 'ClusterFull', 'ClusterMidway', 'ClusterTop'
    var: 'gene_name'>

In [58]:
Healthy_adata.obs_keys

<bound method AnnData.obs_keys of AnnData object with n_obs × n_vars = 557099 × 5000
    obs: 'Sample_ID', 'Cell Type', 'Study_name', 'Donor_ID', 'Diagnosis', 'Age', 'Region code', 'Fraction', 'Gender', 'Library_Preparation_Protocol', 'batch', 'Age_group', 'Location', 'Cell States', 'Cell States GCA', 'Chem', 'Layer', 'Cell States Kong', 'dataset', 'n_genes_by_counts', 'total_counts', 'total_counts_mito', 'pct_counts_mito', 'total_counts_ribo', 'pct_counts_ribo', 'Cell_ID', '_scvi_batch', '_scvi_labels'
    var: 'gene_id-query', 'gene_name-query', 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts', 'mito', 'ribo', 'highly_variable', 'highly_variable_rank', 'means', 'variances', 'variances_norm', 'highly_variable_nbatches'
    uns: 'Age_group_colors', 'Cell Type_colors', 'Diagnosis_colors', 'Donor_ID_colors', 'Gender_colors', 'Library_Preparation_Protocol_colors', 'Location_colors', 'Study_name_colors', '_scvi_manager_u

In [59]:
# Delete unnecessary columns
del Cancer_adata.obs['disease'], Cancer_adata.obs['species'], Cancer_adata.obs['species__ontology_label'], Cancer_adata.obs['organ'], Cancer_adata.obs['library_preparation_protocol']
del Cancer_adata.obs['qc_geneCount']
del Cancer_adata.obs['qc_logMappedReads']
del Cancer_adata.obs['qc_meanReadsPerUmi']
del Cancer_adata.obs['qc_totalReads']
del Cancer_adata.obs['qc_logUmiCount']
del Cancer_adata.obs['qc_bcSwapFraction']
del Cancer_adata.obs['qc_geneSatFraction']
del Cancer_adata.obs['qc_seqDupEst']
del Cancer_adata.obs['qc_umiSatFraction']
del Cancer_adata.obs['qc_emptyDropPval']
del Cancer_adata.obs['qc_mitoFraction']

In [60]:
Cancer_adata.obs['Study_name'] = 'Colon_cancer_atlas'

  Cancer_adata.obs['Study_name'] = 'Colon_cancer_atlas'


In [61]:
# Rename columns in Cancer_adata as in Healthy_adata
Cancer_adata.obs.rename(columns={'biosample_id': 'Sample_ID',
                                  'donor_id': 'Donor_ID',
                                  'disease__ontology_label': 'Diagnosis',
                                  'library_preparation_protocol__ontology_label': 'Library_Preparation_Protocol',
                                  'sex': 'Gender',
                                  'organ': 'Location',
                                  'ClusterTop': 'Cell Type'}, inplace=True)

In [62]:
# Rename values in Cancer_adata.obs['ClusterTop'] as in Healthy_adata.obs['Cell Type']
Cancer_adata.obs['Cell Type'].replace({'Epi': 'Epithelial', 
                                        'Plasma': 'Plasma cells',
                                        'TNKILC': 'T cells',
                                        'B': 'B cells',
                                        'Strom': 'Mesenchymal',
                                        'Mast': 'Myeloid'}, inplace=True)

In [18]:
healthy_all_counts.obs['Cell Type'].value_counts()

Epithelial         191207
Mesenchymal        172657
T cells             47043
Plasma cells        46681
Myeloid             25587
Neuronal            19307
Stem Cell           18868
B cells             17772
Endothelial         16631
Red blood cells      1346
Name: Cell Type, dtype: int64

In [63]:
# Rename values in Cancer_adata.obs['ClusterTop'] as in Healthy_adata.obs['Cell Type']
healthy_all_counts.obs['Cell Type'].replace({'Stem Cell': 'Epithelial'}, inplace=True)

### Make the QC the same as in the Healthy dataset

In [64]:
# Calculate quality metrics for cancer dataset
sc.pp.calculate_qc_metrics(Cancer_adata, inplace=True)

In [66]:
# Make gene_id as a first column in adata.var
Cancer_adata.var.insert(0, 'gene_id', Cancer_adata.var.index)
# Make a gene name as index 
Cancer_adata.var.index = Cancer_adata.var['gene_name']

In [68]:
# Calculate mitochondrial fraction
Cancer_adata.var['mito'] = Cancer_adata.var_names.str.startswith(("MT-"))  # annotate the group of ribosomal genes as 'ribo'
sc.pp.calculate_qc_metrics(Cancer_adata, qc_vars=['mito'], percent_top=None, log1p=False, inplace=True)
Cancer_adata.var['ribo'] = Cancer_adata.var_names.str.startswith(("RPS","RPL"))  # annotate the group of ribosomal genes as 'ribo'
sc.pp.calculate_qc_metrics(Cancer_adata, qc_vars=['ribo'], percent_top=None, log1p=False, inplace=True)

In [71]:
Cancer_adata.obs

Unnamed: 0_level_0,Sample_ID,Donor_ID,SpecimenType,TissueSource,ProcessingMethod,PatientTypeID,Gender,Site,Grade,TumorStage,...,ClusterMidway,Cell Type,Study_name,n_genes_by_counts,log1p_n_genes_by_counts,total_counts,total_counts_mito,pct_counts_mito,total_counts_ribo,pct_counts_ribo
cell_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
C103_T_1_1_0_c1_v2_id-AAACCTGCATGCTAGT,C103_T_1_1_0_c1_v2,C103,T,MGH,unsorted,C103_T,male,left,low,notT4,...,EpiT,Epithelial,Colon_cancer_atlas,3098,8.038835,4603.801270,68.937042,1.497394,445.594940,9.678848
C103_T_1_1_0_c1_v2_id-AAACCTGGTAGCCTAT,C103_T_1_1_0_c1_v2,C103,T,MGH,unsorted,C103_T,male,left,low,notT4,...,EpiT,Epithelial,Colon_cancer_atlas,507,6.230481,2108.768555,94.047325,4.459822,371.848663,17.633450
C103_T_1_1_0_c1_v2_id-AAACCTGGTTGTCGCG,C103_T_1_1_0_c1_v2,C103,T,MGH,unsorted,C103_T,male,left,low,notT4,...,EpiT,Epithelial,Colon_cancer_atlas,6688,8.808220,5887.731934,76.699623,1.302702,400.767029,6.806816
C103_T_1_1_0_c1_v2_id-AAACCTGTCATGTGGT,C103_T_1_1_0_c1_v2,C103,T,MGH,unsorted,C103_T,male,left,low,notT4,...,EpiT,Epithelial,Colon_cancer_atlas,3515,8.165079,4969.738281,75.586861,1.520943,429.411682,8.640530
C103_T_1_1_0_c1_v2_id-AAACCTGTCCTTGGTC,C103_T_1_1_0_c1_v2,C103,T,MGH,unsorted,C103_T,male,left,low,notT4,...,EpiT,Epithelial,Colon_cancer_atlas,6316,8.751000,5981.045410,78.871780,1.318696,373.194397,6.239618
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
C173_T_0_0_0_c1_v3_id-TTTGGAGTCATCGGGC,C173_T_0_0_0_c1_v3,C173,T,DFCI,unsorted,C173_T,female,left,high,T4,...,Macro,Myeloid,Colon_cancer_atlas,3172,8.062433,5487.610352,80.156494,1.460681,246.709534,4.495755
C173_T_0_0_0_c1_v3_id-TTTGGAGTCTAGTGTG,C173_T_0_0_0_c1_v3,C173,T,DFCI,unsorted,C173_T,female,left,high,T4,...,EpiT,Epithelial,Colon_cancer_atlas,4377,8.384347,5541.674316,94.976021,1.713851,358.535309,6.469801
C173_T_0_0_0_c1_v3_id-TTTGTTGCAGCAATTC,C173_T_0_0_0_c1_v3,C173,T,DFCI,unsorted,C173_T,female,left,high,T4,...,EpiT,Epithelial,Colon_cancer_atlas,869,6.768493,2635.017578,103.340820,3.921827,67.731804,2.570450
C173_T_0_0_0_c1_v3_id-TTTGTTGGTTCTGAGT,C173_T_0_0_0_c1_v3,C173,T,DFCI,unsorted,C173_T,female,left,high,T4,...,TCD4,T cells,Colon_cancer_atlas,641,6.464588,2512.186279,78.763542,3.135259,368.961121,14.686854


In [70]:
# Delete unnecessary columns
del Cancer_adata.obs['log1p_total_counts'], Cancer_adata.obs['pct_counts_in_top_50_genes'], Cancer_adata.obs['log1p_n_genes_by_counts']
del Cancer_adata.obs['pct_counts_in_top_100_genes'], Cancer_adata.obs['pct_counts_in_top_200_genes'], Cancer_adata.obs['pct_counts_in_top_500_genes']

In [74]:
# Write objects
Cancer_adata.write_h5ad('/Users/anna.maguza/Desktop/Data/Processed_datasets/Cancer_dataset_integration/input_files/all_cells/Colon_cancer_atlas_anndata.h5ad')
healthy_all_counts.write_h5ad('/Users/anna.maguza/Desktop/Data/Processed_datasets/Cancer_dataset_integration/input_files/all_cells/Healthy_integrated_data_all_genes.h5ad')

# Prepare Epithelial cells

### Counts Normalization

In [5]:
# Upload objects
input_cancer = '/Users/anna.maguza/Desktop/Data/Processed_datasets/Cancer_dataset_integration/input_files/all_cells/Colon_cancer_atlas_anndata.h5ad'
Cancer_adata = sc.read_h5ad(input_cancer)

input_healthy = '/Users/anna.maguza/Desktop/Data/Processed_datasets/Cancer_dataset_integration/input_files/all_cells/Healthy_integrated_data_all_genes.h5ad'
Healthy_adata = sc.read_h5ad(input_healthy)

  utils.warn_names_duplicates("var")


In [7]:
# Subset only epitheleal cells
Cancer_adata = Cancer_adata[Cancer_adata.obs['Cell Type'] == 'Epithelial', :]
healthy_all_counts = healthy_all_counts[healthy_all_counts.obs['Cell Type'] == 'Epithelial', :]


In [8]:
sc.pp.log1p(Cancer_adata)
sc.pp.log1p(healthy_all_counts)

  view_to_actual(adata)


In [10]:
# Save anndata objects
Cancer_adata.write('/Users/anna.maguza/Desktop/Data/Processed_datasets/Cancer_dataset_integration/input_files/Epithelial_cells/Epithelial_Colon_cancer_atlas_normalized.h5ad')
healthy_all_counts.write('/Users/anna.maguza/Desktop/Data/Processed_datasets/Cancer_dataset_integration/input_files/Epithelial_cells/Epithelial_Healthy_anndata_normalized.h5ad')

### Extract Highly Variable Genes

In [2]:
input = '/Users/anna.maguza/Desktop/Data/Processed_datasets/Cancer_dataset_integration/input_files/Epithelial_cells/Epithelial_Healthy_anndata_normalized.h5ad'
healthy_adata = sc.read(input)

In [4]:
input = '/Users/anna.maguza/Desktop/Data/Processed_datasets/Cancer_dataset_integration/input_files/Epithelial_cells/Epithelial_Healthy_anndata_normalized.h5ad'
cancer_adata = sc.read(input)

In [3]:
healthy_adata.layers['raw_counts'] = healthy_adata.X.copy()

### HVGs selection
# Calculate HVGs for cancer dataset
sc.pp.highly_variable_genes(
    healthy_adata,
    flavor = "seurat_v3",
    n_top_genes = 5000,
    layer = "raw_counts",
    batch_key = "Library_Preparation_Protocol",
    subset = True,
    span = 1
)



In [5]:
cancer_adata.layers['raw_counts'] = cancer_adata.X.copy()

In [6]:
# Extract same HVGs in the cancer dataset as in the healthy dataset

#Make indexes as string
cancer_adata.var.index = cancer_adata.var.index.astype(str)

# Ensure indexes are unique
cancer_adata.var_names_make_unique()

# Identify common genes
common_genes = list(set(healthy_adata.var_names) & set(cancer_adata.var_names))

# Filter genes
adata_healthy = healthy_adata[:, common_genes]
cancer_adata = cancer_adata[:, common_genes]

#Ensure the same order of the genes
cancer_adata = cancer_adata[:, adata_healthy.var_names]

In [9]:
# Save anndata objects
cancer_adata.write('/Users/anna.maguza/Desktop/Data/Processed_datasets/Cancer_dataset_integration/input_files/Epithelial_cells/Epithelial_Colon_cancer_atlas_normalized_5K.h5ad')
adata_healthy.write('/Users/anna.maguza/Desktop/Data/Processed_datasets/Cancer_dataset_integration/input_files/Epithelial_cells/Epithelial_Healthy_anndata_normalized_5K.h5ad')