## Notebook for anndata object preparation that will be used for the datasets integration

- **Developed by**: Anna Maguza
- **Institute of Computational Biology - Computational Health Centre - Helmholtz Munich**
- 31st May 2023

### Import Packages

In [1]:
import scanpy as sc
import pandas as pd
import numpy as np
import os
import anndata as ad

In [2]:
def X_is_raw(adata):
    return np.array_equal(adata.X.sum(axis=0).astype(int), adata.X.sum(axis=0))

### Datasets Upload

In [44]:
input = '/Users/anna.maguza/Desktop/Data/Processed_datasets/Cancer_dataset_integration/input_files/all_cells/Healthy_integrated_data_all_genes.h5ad'
Healthy_adata = sc.read_h5ad(input)

In [45]:
input_cancer = '/Users/anna.maguza/Desktop/Data/Processed_datasets/Cancer_dataset_integration/output/Epithelial/Joanito/2000_HVGs/scVI/Joanito_predicted_labels_with_scVI_scANVI_2000HVGs.h5ad'
Cancer_adata_predicted = sc.read_h5ad(input_cancer)

In [46]:
input_cancer2 = '/Users/anna.maguza/Desktop/Data/Gut_project/Joanito_cancer/anndata/Joanito_raw_anndata_tumor_cells.h5ad'
Cancer_adata_raw = sc.read(input_cancer2)

### Filter epithelial cells

In [47]:
# Filter out fetal and pediatric samples
Healthy_adata = Healthy_adata[Healthy_adata.obs['Diagnosis'] != 'Fetal Healthy', :]
Healthy_adata = Healthy_adata[Healthy_adata.obs['Diagnosis'] != 'Pediatric healthy', :]

In [48]:
# Subset only epitheleal cells
Healthy_adata = Healthy_adata[Healthy_adata.obs['Cell Type'] == 'Epithelial', :]

In [49]:
Healthy_adata.obs['Unified Cell States'] = Healthy_adata.obs['Cell States']

  Healthy_adata.obs['Unified Cell States'] = Healthy_adata.obs['Cell States']


In [50]:
Healthy_adata.obs['Unified Cell States'].replace({"Enterocytes TMIGD1 MEP1A": 'Enterocyte',
                                               'Enterocytes CA1 CA2 CA4-': 'Enterocyte',
                                               'Enterocytes TMIGD1 MEP1A GSTA1': 'Enterocyte',
                                               'Enterocytes BEST4': 'Enterocyte',
                                               'BEST4+ epithelial': 'Enterocyte',
                                               'Stem_Cells_GCA': 'Stem cells OLFM4',
                                               'Stem_Cells_ext': 'Stem cells OLFM4',
                                               'Tuft': 'Tuft cells',
                                               'Paneth': 'Paneth cells',
                                               'Epithelial Cycling cells': 'TA',
                                               'Goblet cells SPINK4': 'Goblet cells',
                                               'Goblet cell': 'Goblet cells',
                                               'Goblet cells MUC2 TFF1-': 'Goblet cells',
                                               'Goblet cells MUC2 TFF1': 'Goblet cells',
                                               'EC cells (TAC1+)': 'Enterochromaffin cells',
                                               'EECs': 'Enteroendocrine cells',
                                               'K cells (GIP+)': 'Enteroendocrine cells',
                                               'M/X cells (MLN/GHRL+)': 'Enteroendocrine cells',
                                               'Progenitor (NEUROG3+)': 'Enteroendocrine cells',
                                               'D cells (SST+)': 'Enteroendocrine cells',
                                               'I cells (CCK+)': 'Enteroendocrine cells',
                                               'N cells (NTS+)': 'Enteroendocrine cells',
                                               'L cells (PYY+)': 'L cells'}, inplace=True)

### Filter Cancer dataset

In [52]:
# Filter cancer cells 
Cancer_adata_predicted = Cancer_adata_predicted[Cancer_adata_predicted.obs['dataset'] == 'Cancer', :]

In [60]:
Cancer_adata_raw = Cancer_adata_raw[Cancer_adata_raw.obs['Cell Type'] == 'Epithelial', :]

In [14]:
# scArches + scANVI, 2000 HVGs
Cancer_adata_predicted.obs['predictions'].value_counts()

Paneth cells        15707
Colonocyte          12370
Stem cells OLFM4     3247
TA                    846
Enterocyte             11
Name: predictions, dtype: int64

In [53]:
# scVI + scANVI, 2000 HVGs
Cancer_adata_predicted.obs['C_scANVI'].value_counts()

Paneth cells             14716
Colonocyte               11247
Stem cells OLFM4          2163
TA                        1835
Stem cells OLFM4 LGR5     1227
Enterocyte                 993
Name: C_scANVI, dtype: int64

In [25]:
# scArches + scANVI, 3000 HVGs
Cancer_adata_predicted.obs['predictions'].value_counts()

Colonocyte               16185
Paneth cells             11369
TA                        4422
Enterocyte                 127
Stem cells OLFM4            52
Stem cells OLFM4 LGR5       26
Name: predictions, dtype: int64

In [55]:
Cancer_adata_predicted.obs_keys

<bound method AnnData.obs_keys of View of AnnData object with n_obs × n_vars = 32181 × 1808
    obs: 'Sample_ID', 'Cell Type', 'Study_name', 'Donor_ID', 'Diagnosis', 'Age', 'Region code', 'Fraction', 'Gender', 'Library_Preparation_Protocol', 'batch', 'Age_group', 'Location', 'Cell States', 'Cell States GCA', 'Chem', 'Layer', 'Cell States Kong', 'dataset', 'n_genes_by_counts', 'total_counts', 'total_counts_mito', 'pct_counts_mito', 'total_counts_ribo', 'pct_counts_ribo', 'Cell_ID', '_scvi_batch', '_scvi_labels', 'Unified Cell States', 'seed_labels', 'nFeature_RNA', 'dataset_x', 'iCMS', 'msi', 'dataset_y', 'Tumor Stage', 'MSS/MSI', 'Side', 'Group Stage', 'Stage TNM', 'iCMS.transcriptomic', 'iCMS.inferCNV', 'KRAS', 'BRAF', 'TP53', 'APC', 'PIK3CA', 'LymphNode', 'Normal', 'Tumor', 'CMS', 'C_scANVI'
    var: 'feature_types-Cancer', 'genome-Cancer', 'n_cells_by_counts-Cancer', 'mean_counts-Cancer', 'log1p_mean_counts-Cancer', 'pct_dropout_by_counts-Cancer', 'total_counts-Cancer', 'log1p_total

In [57]:
Cancer_adata_predicted.obs

Unnamed: 0_level_0,Sample_ID,Cell Type,Study_name,Donor_ID,Diagnosis,Age,Region code,Fraction,Gender,Library_Preparation_Protocol,...,KRAS,BRAF,TP53,APC,PIK3CA,LymphNode,Normal,Tumor,CMS,C_scANVI
Cell_ID2,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
CRC16_MUX8563_AAAGATGCAGAAGCAC-1,MUX8563,Epithelial,Joanito_cancer,CRC2794,Tumor,,,,Female,,...,wt,wt,mut,mut,wt,0.0,1.0,4.0,CMS2,Paneth cells
CRC16_MUX8563_ACAGCCGGTCTCTTAT-1,MUX8563,Epithelial,Joanito_cancer,CRC2794,Tumor,,,,Female,,...,wt,wt,mut,mut,wt,0.0,1.0,4.0,CMS2,Colonocyte
CRC16_MUX8563_ACATACGGTTACGTCA-1,MUX8563,Epithelial,Joanito_cancer,CRC2794,Tumor,,,,Female,,...,wt,wt,mut,mut,wt,0.0,1.0,4.0,CMS2,Colonocyte
CRC16_MUX8563_ACATGGTGTCCATGAT-1,MUX8563,Epithelial,Joanito_cancer,CRC2794,Tumor,,,,Female,,...,wt,wt,mut,mut,wt,0.0,1.0,4.0,CMS2,Paneth cells
CRC16_MUX8563_ACGAGGACATCTGGTA-1,MUX8563,Epithelial,Joanito_cancer,CRC2794,Tumor,,,,Female,,...,wt,wt,mut,mut,wt,0.0,1.0,4.0,CMS2,Paneth cells
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
KUL5_EXT127_GACTGCGAGTAGCGGT-1,EXT127,Epithelial,Joanito_cancer,SC044,Tumor,,,,Female,,...,wt,wt,wt,wt,,0.0,2.0,3.0,CMS1,Paneth cells
KUL5_EXT127_GTGCATAGTTTGACAC-1,EXT127,Epithelial,Joanito_cancer,SC044,Tumor,,,,Female,,...,wt,wt,wt,wt,,0.0,2.0,3.0,CMS1,Paneth cells
KUL5_EXT127_TATCAGGGTGTGAAAT-1,EXT127,Epithelial,Joanito_cancer,SC044,Tumor,,,,Female,,...,wt,wt,wt,wt,,0.0,2.0,3.0,CMS1,Stem cells OLFM4 LGR5
KUL5_EXT127_TCACAAGAGATCCCGC-1,EXT127,Epithelial,Joanito_cancer,SC044,Tumor,,,,Female,,...,wt,wt,wt,wt,,0.0,2.0,3.0,CMS1,Colonocyte


In [None]:
# Copy indexes from Cancer_adata as a column in obs of Cancer dataset
Cancer_adata_predicted.obs['Cell_ID2'] = Cancer_adata_predicted.obs.index

# Remove part '-Cancer' and '-Healthy' from the Cancer_adata.obs['Cell_ID'] 
Cancer_adata_predicted.obs['Cell_ID2'] = Cancer_adata_predicted.obs['Cell_ID2'].str.replace('-Cancer', '')
Cancer_adata_predicted.obs['Cell_ID2'] = Cancer_adata_predicted.obs['Cell_ID2'].str.replace('-Healthy', '')

# Make 'Cell_ID2' column as index
Cancer_adata_predicted.obs.set_index('Cell_ID2', inplace=True)

del Cancer_adata_predicted.obs['Cell_ID2']

In [61]:
# Merge Cancer_adata_predicted.obs['C_scANVI'] with Cancer_adata_raw.obs by index
Cancer_adata_raw.obs = pd.merge(Cancer_adata_raw.obs, Cancer_adata_predicted.obs['C_scANVI'], left_index=True, right_index=True)

In [63]:
# Rename column 'C_scANVI' to 'predictions'
Cancer_adata_raw.obs.rename(columns={'C_scANVI': 'Unified Cell States'}, inplace=True)

In [82]:
# Make a new column "Library_Preparation_Protocol" in Cancer_adata_raw.obs and if Cancer_adata_raw.obs[dataset_x] == 'SMC' then fill it with "10X 3' v2"
# if Cancer_adata_raw.obs[dataset_x] == 'CRC-SG2' then fill it with "10X 3' v3"
# if Cancer_adata_raw.obs[dataset_x] =='KUL3' then fill it with "10X 3' v2"
# if Cancer_adata_raw.obs[dataset_x] =='KUL5' then fill it with "5'"
# if dCancer_adata_raw.obs[dataset_x] == 'CRC-SG1' then fill it with "5'"

# Create a new column "Library_Preparation_Protocol" in Cancer_adata_raw.obs
Cancer_adata_raw.obs['Library_Preparation_Protocol'] = ""

# Iterate over each row and assign the corresponding protocol value
for idx, row in Cancer_adata_raw.obs.iterrows():
    dataset_x = row['dataset_x']
    if dataset_x == 'SMC':
        Cancer_adata_raw.obs.at[idx, 'Library_Preparation_Protocol'] = "10X 3' v2"
    elif dataset_x == 'CRC-SG2':
        Cancer_adata_raw.obs.at[idx, 'Library_Preparation_Protocol'] = "10X 3' v3"
    elif dataset_x == 'KUL3':
        Cancer_adata_raw.obs.at[idx, 'Library_Preparation_Protocol'] = "10X 3' v2"
    elif dataset_x == 'KUL5':
        Cancer_adata_raw.obs.at[idx, 'Library_Preparation_Protocol'] = "5'"
    elif dataset_x == 'CRC-SG1':
        Cancer_adata_raw.obs.at[idx, 'Library_Preparation_Protocol'] = "5'"

In [83]:
Cancer_adata_raw.obs['Library_Preparation_Protocol'].value_counts()

10X 3' v2    15584
5'            8597
10X 3' v3     8000
Name: Library_Preparation_Protocol, dtype: int64

### Datasets concatenation

In [84]:
# Concatenate reference and query
adata = Healthy_adata.concatenate(Cancer_adata_raw, index_unique = None, batch_key = 'dataset', batch_categories = ['healthy', 'cancer'])

  [AnnData(sparse.csr_matrix(a.shape), obs=a.obs) for a in all_adatas],


### Identify Highly Variable Genes

In [85]:
adata.layers['counts'] = adata.X.copy()

# Calculate 5000 HVGs
sc.pp.highly_variable_genes(
    adata,
    flavor = "seurat_v3",
    n_top_genes = 5000,
    layer = "counts",
    batch_key = "Library_Preparation_Protocol",
    subset = True,
    span = 1
)

In [86]:
# Save the adata object
adata.write_h5ad('/Users/anna.maguza/Desktop/Data/Processed_datasets/Cancer_dataset_integration/input_files/Datasets_integration/All_cells_5000_HVGs.h5ad')