## Notebook for the gene enrichment analysis of Cancer Joanito Epithelial Cells

### Developed by: Anna Maguza

### Institute of Computational Biology - Computational Health Centre - Helmholtz Munich

### 4th June 2023

#### Load required packages

In [2]:
import scanpy as sc
import numpy as np
import pandas as pd
import anndata as ad
import seaborn as sns
import numpy as np

#### Setup Cells


In [3]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [4]:
# Logging level: errors (0), warnings (1), info (2), hints (3).
sc.settings.verbosity = 3
# Print the versions of scanpy and its core dependencies (kept in the output for provenance).
sc.logging.print_header()
# Default figure styling for the plots in this notebook.
sc.set_figure_params(dpi=80, facecolor="white")

  from .autonotebook import tqdm as notebook_tqdm


scanpy==1.9.3 anndata==0.8.0 umap==0.5.3 numpy==1.23.5 scipy==1.9.1 pandas==1.3.5 scikit-learn==1.2.2 statsmodels==0.13.5 pynndescent==0.5.8


#### Upload Cancer Data

In [None]:
# Location of the Joanito epithelial-cell AnnData file (predicted annotations, after scVI).
# The variable is deliberately not called `input`, which would shadow the built-in
# `input()` for the rest of the kernel session.
# NOTE(review): absolute local path — consider a configurable data directory.
cancer_h5ad_path = '/Users/anna.maguza/Desktop/Data/Processed_datasets/Cancer_dataset_integration/Predicted_cancer_labels/Joanito/Joanito_epithelial_cells_with_predicted_annotations_after_scVI.h5ad'
adata = sc.read_h5ad(cancer_h5ad_path)

In [5]:
# Display the matrix stored in `adata.raw` (its repr reports shape, dtype and sparsity).
adata.raw.X

<191909x18525 sparse matrix of type '<class 'numpy.float32'>'
	with 266670546 stored elements in Compressed Sparse Row format>

In [6]:
# Build a standalone AnnData object from the contents of `adata.raw`;
# the scoring and plotting cells below operate on `adata_raw`.
adata_raw = adata.raw.to_adata()

In [7]:
# Show the summary (dimensions and annotation fields) of the converted object.
adata_raw

AnnData object with n_obs × n_vars = 191909 × 18525
    obs: 'Sample_ID', 'Cell Type', 'Study_name', 'Donor_ID', 'Diagnosis', 'Age', 'Region code', 'Fraction', 'Sex', 'Library_Preparation_Protocol', 'batch', 'Age_group', 'Location', 'Cell States', 'Cell States GCA', 'Chem', 'Layer', 'Cell States Kong', 'dataset', 'n_genes_by_counts', 'total_counts', 'total_counts_mito', 'pct_counts_mito', 'total_counts_ribo', 'pct_counts_ribo', 'Cell_ID', '_scvi_batch', '_scvi_labels', 'Unified Cell States', 'doublet_scores', 'predicted_doublets', 'doublet_info', 'nFeature_RNA', 'sample.origin', 'dataset_x', 'iCMS', 'msi', 'dataset_y', 'Tumor Stage', 'MSS/MSI', 'Side', 'Group Stage', 'Stage TNM', 'iCMS.transcriptomic', 'iCMS.inferCNV', 'KRAS', 'BRAF', 'TP53', 'APC', 'PIK3CA', 'LymphNode', 'Normal', 'Tumor', 'CMS', 'Sample origin'
    var: 'feature_types-Colorectal cancer', 'genome-Colorectal cancer', 'gene_id-Kong-Healthy gut', 'gene_name-Kong-Healthy gut', 'n_cells_by_counts-Kong-Healthy gut', 'mean_c

### Prepare Gene Sets

In [5]:
# Gene symbols used to score the Krebs (TCA) cycle signature.
Krebs_cycle_genes = [
    "ACO2", "CS", "FH", "MDH1", "OGDH", "PDHA1", "PDHA2", "SDHC",
    "SUCLG1", "ACLY", "ACO1", "DLAT", "DLD", "DLST", "IDH1", "IDH2",
    "IDH3A", "IDH3B", "IDH3G", "MDH2", "MPC1", "OGDHL", "PC", "PCK1",
    "PCK2", "PDHB", "SDHA", "SDHB", "SDHD", "SUCLA2", "SUCLG2",
]

In [6]:
# Gene symbols used to score the mitochondrial signature.
# The mtDNA-encoded entries use the HGNC-approved symbols (MT-CO1/2/3, MT-CYB);
# the previous spellings (MT-COX1/2/3, MT-CYTB) are not gene symbols and would be
# dropped by `sc.tl.score_genes` as "not in var_names".
mitochondrial_genes = [
    # Mitochondrial ribosomal proteins, large subunit (MRPL*)
    "MRPL1", "MRPL2", "MRPL3", "MRPL4", "MRPL9", "MRPL10", "MRPL11", "MRPL12", "MRPL13", "MRPL14", "MRPL15",
    "MRPL16", "MRPL17", "MRPL18", "MRPL19", "MRPL20", "MRPL21", "MRPL22", "MRPL23", "MRPL24", "MRPL27", "MRPL28",
    "MRPL30", "MRPL32", "MRPL33", "MRPL34", "MRPL35", "MRPL36", "MRPL37", "MRPL38", "MRPL39", "MRPL40", "MRPL41",
    "MRPL42", "MRPL43", "MRPL44", "MRPL45", "MRPL46", "MRPL47", "MRPL48", "MRPL49", "MRPL50", "MRPL51", "MRPL52",
    "MRPL53", "MRPL54", "MRPL55", "MRPL57",
    # Mitochondrial ribosomal proteins, small subunit (MRPS*, DAP3)
    "MRPS2", "MRPS5", "MRPS6", "MRPS7", "MRPS9", "MRPS10", "MRPS11", "MRPS12", "MRPS14", "MRPS15", "MRPS16", "MRPS17",
    "MRPS18A", "MRPS18B", "MRPS18C", "MRPS21", "MRPS22", "MRPS23", "MRPS24", "MRPS25", "MRPS26", "MRPS27", "MRPS28", "MRPS30",
    "MRPS31", "MRPS33", "MRPS34", "MRPS35", "MRPS36", "DAP3",
    # mtDNA-encoded genes (HGNC-approved symbols)
    "MT-ND1", "MT-ND2", "MT-CO1", "MT-CO2", "MT-ATP8", "MT-ATP6", "MT-CO3", "MT-ND3", "MT-ND4L", "MT-ND4", "MT-ND5", "MT-ND6", "MT-CYB",
    # Nuclear-encoded mitochondrial genes.
    # 'PARK2' is the previous symbol of 'PRKN'; both are listed so the gene is matched
    # whichever annotation version the dataset uses (the absent one is ignored by score_genes).
    "POLG", "POLG2", "SOD2", "MFN1", "MFN2", "OPA1", "PINK1", "PARK7", "PARK2", "PRKN",
    "SDHA", "SDHB", "SDHC", "SDHD",
    "NDUFS1", "NDUFS2", "NDUFS3", "NDUFS4", "NDUFS7", "NDUFS8", "NDUFA1", "NDUFA2", "NDUFA9",
]

In [7]:
# Gene symbols used to score the glycolysis signature.
glycolysis_genes = [
    "ALDOA", "BPGM", "ENO1", "ENO2", "GAPDH", "GPI",
    "HK1", "HK2", "HKDC1", "PFKL", "PFKM", "PGAM1",
    "PGAM2", "PGAM4", "PGK1", "PKLR", "PKM", "TPI1",
]

In [8]:
# Gene symbols used to score the gluconeogenesis signature.
gluconeogenesis = [
    "ALDOA", "ALDOB", "ALDOC", "ENO1", "ENO2", "ENO3", "FBP1", "FBP2",
    "GAPDH", "GAPDHS", "GOT1", "GOT2", "GPI", "MDH1", "MDH2", "PC",
    "PCK1", "PCK2", "PFKFB1", "PGAM1", "PGAM2", "PGK1", "PRKACA", "PRKACB",
    "PRKACG", "SLC25A1", "SLC25A10", "SLC25A11", "SLC25A12", "SLC25A13", "TPI1",
]

In [9]:
# Gene symbols used to score the lipogenesis signature.
# The 'NEWENTRY' placeholder that came with the source gene set is not a gene and has
# been removed.
# NOTE(review): the MIR* entries are microRNA genes — confirm they are present in
# `var_names`, otherwise `sc.tl.score_genes` ignores them.
lipogenesis = ['ACACA', 'ACBD3', 'AGPAT2', 'ANGPTL4', 'APOB', 'AR', 'BRCA1', 'CD68', 'CHUK', 'EIF2AK3', 'ENHO',
               'EPAS1', 'FASN', 'FTO', 'GHRL', 'HK2', 'HNF4A', 'HOXB13', 'IDH1', 'IDH2', 'IGF1', 'INSIG1', 'LPIN1',
               'MIR1-1', 'MIR185', 'MIR206', 'MIR29A', 'MIR342', 'MIR613', 'MLXIPL', 'MORC2', 'MRC1', 'MTOR', 'NAMPT',
               'NOS3', 'NR0B1', 'NR1H2', 'NR1H3', 'OLR1', 'PFKFB2', 'PIK3CA', 'PPARD', 'PPARG', 'PPARGC1A',
               'PRKACA', 'PRL', 'RPS6', 'SCD', 'SHBG', 'SMARCD3', 'SREBF1', 'TRA2B', 'TRIB1', 'TUT1']

In [10]:
# Gene symbols used to score the LINE-1 related signature.
# Removed from the original list: the 'NEWENTRY' placeholder (not a gene) and the
# second, duplicated 'PIWIL2' entry.
# NOTE(review): 'ORF1', 'ORF2' and 'APOBEC3' do not look like gene symbols, and
# 'EIF2C2' is a previous symbol of 'AGO2' — confirm which of these exist in
# `var_names`; entries that are absent are ignored by `sc.tl.score_genes`.
line1 = ['AICDA', 'APC', 'APOBEC3C', 'ATM', 'CDKN2A', 'EDA', 'ERCC1', 'ERCC4', 'ETS1', 'EVC', 'EVC2', 'GSTM1',
         'H19', 'HLA-G', 'IGF2', 'IGF2BP3', 'IGFBP7', 'L1RE1', 'L1TD1', 'LINC01587', 'MAL', 'MECP2', 'MET', 'MGMT',
         'MLH1', 'MTHFR', 'MTRR', 'PADI4', 'PEG3', 'PIWIL2', 'PROM1', 'SAMHD1', 'SLIT2', 'STK32B', 'TERT',
         'TINF2',
         'ORF1', 'ORF2', 'APOBEC3', 'TREX1', 'MOV10', 'PIWIL1', 'PIWIL3', 'PIWIL4', 'DICER1', 'AGO2', 'EIF2C2']



In [11]:
# Small cytokine-signalling gene sets (ligand plus receptor genes) used for scoring.
IL10 = [
    "IL10",
    "IL10RA",
    "IL10RB",
]
IL23 = [
    "IL23A",
    "IL12B",
    "IL12RB1",
    "IL23R",
]
TNFa = [
    "TNF",
    "TNFRSF1A",
    "TNFRSF1B",
]

### Cancer Dataset

In [None]:
# Score every cell for the Krebs-cycle gene set and show the score on the UMAP.
# The score is written to its own obs column (the same name the combined-dataset
# section uses) instead of the default 'score', which each scoring cell would
# otherwise overwrite — re-running cells out of order could then plot the wrong set.
sc.tl.score_genes(adata_raw, Krebs_cycle_genes, score_name="Krebs_cycle_genes")
sc.set_figure_params(figsize=(10, 10), dpi=200)
sc.pl.umap(adata_raw, color=['Krebs_cycle_genes', 'Study_name', 'Unified Cell States'], color_map="magma", size=7, frameon=False)

In [None]:
# Score every cell for the mitochondrial gene set and show the score on the UMAP.
# The score is written to its own obs column (the same name the combined-dataset
# section uses) instead of the default 'score', which each scoring cell would
# otherwise overwrite — re-running cells out of order could then plot the wrong set.
sc.tl.score_genes(adata_raw, mitochondrial_genes, score_name="Mitochondrial_genes")
sc.set_figure_params(figsize=(10, 10), dpi=200)
sc.pl.umap(adata_raw, color=['Mitochondrial_genes', 'Study_name', 'Unified Cell States'], color_map="magma", size=7, frameon=False)

In [None]:
# Score every cell for the glycolysis gene set and show the score on the UMAP.
# The score is written to its own obs column (the same name the combined-dataset
# section uses) instead of the default 'score', which each scoring cell would
# otherwise overwrite — re-running cells out of order could then plot the wrong set.
sc.tl.score_genes(adata_raw, glycolysis_genes, score_name="Glycolysis_genes")
sc.set_figure_params(figsize=(10, 10), dpi=200)
sc.pl.umap(adata_raw, color=['Glycolysis_genes', 'Study_name', 'Unified Cell States'], color_map="magma", size=7, frameon=False)

### Upload Healthy and Cancer Dataset

In [12]:
# Location of the integrated cancer (Joanito) + healthy AnnData file.
# The variable is deliberately not called `input`, which would shadow the built-in
# `input()` for the rest of the kernel session.
# NOTE(review): absolute local path — consider a configurable data directory.
integrated_h5ad_path = '/Users/anna.maguza/Desktop/Data/Processed_datasets/Cancer_dataset_integration/input_files/Datasets_integration/Integrated_cancer_Joanito_and_Healthy_datasets_7000_output.h5ad'
adata = sc.read_h5ad(integrated_h5ad_path)

In [13]:
# Display the matrix stored in `adata.raw` (its repr reports shape, dtype and sparsity).
adata.raw.X

<191909x18525 sparse matrix of type '<class 'numpy.float32'>'
	with 266670546 stored elements in Compressed Sparse Row format>

In [14]:
# Build a standalone AnnData object from the contents of `adata.raw`;
# the scoring and plotting cells below operate on `adata_raw`.
adata_raw = adata.raw.to_adata()

In [15]:
# Show the summary (dimensions and annotation fields) of the converted object.
adata_raw

AnnData object with n_obs × n_vars = 191909 × 18525
    obs: 'Sample_ID', 'Cell Type', 'Study_name', 'Donor_ID', 'Diagnosis', 'Age', 'Region code', 'Fraction', 'Sex', 'Library_Preparation_Protocol', 'batch', 'Age_group', 'Location', 'Cell States', 'Cell States GCA', 'Chem', 'Layer', 'Cell States Kong', 'dataset', 'n_genes_by_counts', 'total_counts', 'total_counts_mito', 'pct_counts_mito', 'total_counts_ribo', 'pct_counts_ribo', 'Cell_ID', '_scvi_batch', '_scvi_labels', 'Unified Cell States', 'doublet_scores', 'predicted_doublets', 'doublet_info', 'nFeature_RNA', 'sample.origin', 'dataset_x', 'iCMS', 'msi', 'dataset_y', 'Tumor Stage', 'MSS/MSI', 'Side', 'Group Stage', 'Stage TNM', 'iCMS.transcriptomic', 'iCMS.inferCNV', 'KRAS', 'BRAF', 'TP53', 'APC', 'PIK3CA', 'LymphNode', 'Normal', 'Tumor', 'CMS', 'Sample origin'
    var: 'feature_types-Colorectal cancer', 'genome-Colorectal cancer', 'gene_id-Kong-Healthy gut', 'gene_name-Kong-Healthy gut', 'n_cells_by_counts-Kong-Healthy gut', 'mean_c

In [16]:
# Per-cell score for the Krebs-cycle gene set, added to `adata_raw.obs`.
sc.tl.score_genes(
    adata_raw,
    gene_list=Krebs_cycle_genes,
    score_name="Krebs_cycle_genes",
)

computing score 'Krebs_cycle_genes'
    finished: added
    'Krebs_cycle_genes', score of gene set (adata.obs).
    499 total control genes are used. (0:00:04)


In [17]:
# Per-cell score for the mitochondrial gene set, added to `adata_raw.obs`.
sc.tl.score_genes(
    adata_raw,
    gene_list=mitochondrial_genes,
    score_name="Mitochondrial_genes",
)

computing score 'Mitochondrial_genes'
    finished: added
    'Mitochondrial_genes', score of gene set (adata.obs).
    593 total control genes are used. (0:00:03)


In [18]:
# Per-cell score for the glycolysis gene set, added to `adata_raw.obs`.
sc.tl.score_genes(
    adata_raw,
    gene_list=glycolysis_genes,
    score_name="Glycolysis_genes",
)

computing score 'Glycolysis_genes'
    finished: added
    'Glycolysis_genes', score of gene set (adata.obs).
    448 total control genes are used. (0:00:03)


In [19]:
# Per-cell score for the gluconeogenesis gene set, added to `adata_raw.obs`.
sc.tl.score_genes(
    adata_raw,
    gene_list=gluconeogenesis,
    score_name="Gluconeogenesis_genes",
)

computing score 'Gluconeogenesis_genes'
    finished: added
    'Gluconeogenesis_genes', score of gene set (adata.obs).
    698 total control genes are used. (0:00:03)


In [20]:
# Per-cell score for the lipogenesis gene set, added to `adata_raw.obs`.
sc.tl.score_genes(
    adata_raw,
    gene_list=lipogenesis,
    score_name="Lipogenesis_genes",
)

computing score 'Lipogenesis_genes'
    finished: added
    'Lipogenesis_genes', score of gene set (adata.obs).
    898 total control genes are used. (0:00:03)


In [21]:
# Per-cell score for the LINE-1 related gene set, added to `adata_raw.obs`.
sc.tl.score_genes(
    adata_raw,
    gene_list=line1,
    score_name="LINE1_genes",
)

computing score 'LINE1_genes'
    finished: added
    'LINE1_genes', score of gene set (adata.obs).
    946 total control genes are used. (0:00:03)


In [22]:
# Per-cell score for the IL-10 gene set, added to `adata_raw.obs`.
sc.tl.score_genes(
    adata_raw,
    gene_list=IL10,
    score_name="IL-10",
)

computing score 'IL-10'
    finished: added
    'IL-10', score of gene set (adata.obs).
    150 total control genes are used. (0:00:02)


In [23]:
# Per-cell score for the IL-23 gene set, added to `adata_raw.obs`.
sc.tl.score_genes(
    adata_raw,
    gene_list=IL23,
    score_name="IL-23",
)

computing score 'IL-23'
    finished: added
    'IL-23', score of gene set (adata.obs).
    200 total control genes are used. (0:00:02)


In [24]:
# Per-cell score for the TNF-alpha gene set, added to `adata_raw.obs`.
sc.tl.score_genes(
    adata_raw,
    gene_list=TNFa,
    score_name="TNFa",
)

computing score 'TNFa'
    finished: added
    'TNFa', score of gene set (adata.obs).
    150 total control genes are used. (0:00:02)


In [None]:
# Figure size and resolution for the UMAP panels.
sc.set_figure_params(dpi=200, figsize=(10, 10))
# One UMAP panel per gene-set score, followed by the study and cell-state annotations.
sc.pl.umap(
    adata_raw,
    color=[
        'Krebs_cycle_genes',
        'Mitochondrial_genes',
        'Glycolysis_genes',
        'Gluconeogenesis_genes',
        'Lipogenesis_genes',
        'LINE1_genes',
        'IL-10',
        'IL-23',
        'TNFa',
        'Study_name',
        'Unified Cell States',
    ],
    color_map="magma",
    size=7,
    frameon=False,
)

#### Upload Healthy Data

In [None]:
# Location of the healthy epithelial-cell AnnData file (predicted annotations, after scVI).
# The variable is deliberately not called `input`, which would shadow the built-in
# `input()` for the rest of the kernel session.
# NOTE(review): absolute local path — consider a configurable data directory.
healthy_h5ad_path = '/Users/anna.maguza/Desktop/Data/Processed_datasets/Cancer_dataset_integration/Healthy_epithelial_scVI/Healthy_epithelial_cells_with_predicted_annotations_after_scVI.h5ad'
adata = sc.read_h5ad(healthy_h5ad_path)

In [None]:
# Display the matrix stored in `adata.raw` (its repr reports shape, dtype and sparsity).
adata.raw.X

In [None]:
# Build a standalone AnnData object from the contents of `adata.raw`;
# the scoring and plotting cells below operate on `adata_raw`.
adata_raw = adata.raw.to_adata()

In [None]:
# Score every cell for the Krebs-cycle gene set and show the score on the UMAP.
# The score is written to its own obs column (the same name the combined-dataset
# section uses) instead of the default 'score', which each scoring cell would
# otherwise overwrite — re-running cells out of order could then plot the wrong set.
sc.tl.score_genes(adata_raw, Krebs_cycle_genes, score_name="Krebs_cycle_genes")
sc.set_figure_params(figsize=(10, 10), dpi=200)
sc.pl.umap(adata_raw, color=['Krebs_cycle_genes', 'Study_name', 'Unified Cell States'], color_map="magma", size=7, frameon=False)

In [None]:
# Score every cell for the mitochondrial gene set and show the score on the UMAP.
# The score is written to its own obs column (the same name the combined-dataset
# section uses) instead of the default 'score', which each scoring cell would
# otherwise overwrite — re-running cells out of order could then plot the wrong set.
sc.tl.score_genes(adata_raw, mitochondrial_genes, score_name="Mitochondrial_genes")
sc.set_figure_params(figsize=(10, 10), dpi=200)
sc.pl.umap(adata_raw, color=['Mitochondrial_genes', 'Study_name', 'Unified Cell States'], color_map="magma", size=7, frameon=False)

In [None]:
# Score every cell for the glycolysis gene set and show the score on the UMAP.
# The score is written to its own obs column (the same name the combined-dataset
# section uses) instead of the default 'score', which each scoring cell would
# otherwise overwrite — re-running cells out of order could then plot the wrong set.
sc.tl.score_genes(adata_raw, glycolysis_genes, score_name="Glycolysis_genes")
sc.set_figure_params(figsize=(10, 10), dpi=200)
sc.pl.umap(adata_raw, color=['Glycolysis_genes', 'Study_name', 'Unified Cell States'], color_map="magma", size=7, frameon=False)