In [1]:
import os
import pandas as pd
from deside.utility import check_dir, sorted_cell_types
from deside.decon_cf import DeSide
from deside.utility.read_file import read_gene_set
import deside

2024-02-01 18:30:44.457118: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


### Training a model from scratch
- No GPU was used for this example
- MacOS (this example): 6-Core Intel Core i5, 32 GB memory
- A larger training set needs more memory (we trained `DeSide` on dataset `D1+D2` using a computing server).

In [2]:
# report the version of the imported `deside` package
deside.__version__

'1.2.1'

In [3]:
# input datasets are looked up under dataset_dir
dataset_dir = './datasets/'

# everything produced by this example is written under result_dir;
# create the output directory up front
result_dir = './results/E2'
check_dir(result_dir)

#### Input file (training set)
- `simu_bulk_exp_Mixed_N100K_D1.h5ad`: Dataset D1 contains the synthesized bulk gene expression profiles (GEPs) after filtering. Download link: https://doi.org/10.6084/m9.figshare.23047391.v2

In [4]:
# training-set name -> path of the .h5ad file that holds it
training_set2file_path = dict(
    D1='./datasets/simulated_bulk_cell_dataset/D1/simu_bulk_exp_Mixed_N100K_D1.h5ad',
)
training_set2file_path

{'D1': './datasets/simulated_bulk_cell_dataset/D1/simu_bulk_exp_Mixed_N100K_D1.h5ad'}

In [5]:
# coarse cell type -> the subtypes grouped under it
cell_type2subtypes = {
    'B Cells': ['Non-plasma B cells', 'Plasma B cells'],
    'CD4 T': ['CD4 T'],
    'CD8 T': ['CD8 T (GZMK high)', 'CD8 T effector'],
    'DC': ['DC'],
    'Endothelial Cells': ['Endothelial Cells'],
    'Cancer Cells': ['Cancer Cells'],
    'Fibroblasts': ['CAFs', 'Myofibroblasts'],
    'Macrophages': ['Macrophages'],
    'Mast Cells': ['Mast Cells'],
    'NK': ['NK'],
    'Neutrophils': ['Neutrophils'],
    'Double-neg-like T': ['Double-neg-like T'],
    'Monocytes': ['Monocytes'],
}

# keep every subtype listed above, in the order defined by `sorted_cell_types`
all_cell_types = [
    cell_type for cell_type in sorted_cell_types
    if any(cell_type in subtypes for subtypes in cell_type2subtypes.values())
]
all_cell_types  # all cell types that DeSide can predict

['Plasma B cells',
 'Non-plasma B cells',
 'CD4 T',
 'CD8 T effector',
 'CD8 T (GZMK high)',
 'Double-neg-like T',
 'Cancer Cells',
 'DC',
 'Endothelial Cells',
 'CAFs',
 'Myofibroblasts',
 'Macrophages',
 'Mast Cells',
 'NK',
 'Neutrophils',
 'Monocytes']

#### Hyper-parameters and gene sets 

In [6]:
# hyper-parameters of the DNN model
deside_parameters = dict(
    # NOTE(review): presumably (units per layer, dropout rate per layer) - confirm against the DeSide docs
    architecture=([200, 2000, 2000, 2000, 50], [0, 0, 0, 0.2, 0]),
    architecture_for_pathway_network=([50, 500, 500, 500, 50], [0, 0, 0, 0, 0]),
    # loss = alpha*mae + (1-alpha)*rmse, where mae is the mean absolute error
    loss_function_alpha=0.5,
    # batch_normalization / layer_normalization / None
    normalization='layer_normalization',
    # 1 means to add a normalization layer: input | the first hidden layer | ... | output
    # (one more entry than the number of hidden layers)
    normalization_layer=[0, 0, 1, 1, 1, 0],
    # use an independent pathway network
    pathway_network=True,
    # sigmoid / softmax
    last_layer_activation='sigmoid',
    learning_rate=1e-4,
    batch_size=128,
)


# read two gene sets (KEGG and Reactome) as the pathway mask
all_pathway_files = [
    os.path.join(dataset_dir, 'gene_set', gmt_name)
    for gmt_name in ('c2.cp.kegg.v2023.1.Hs.symbols.gmt',
                     'c2.cp.reactome.v2023.1.Hs.symbols.gmt')
]
gene_set_file_path1, gene_set_file_path2 = all_pathway_files
pathway_mask = read_gene_set(all_pathway_files)  # genes by pathways

# filtered gene list (gene-level filtering, filtered by correlation coefficients and quantiles);
# it is only loaded when the first training set is D1 and stays None for other datasets
filtered_gene_list = None
if list(training_set2file_path)[0] == 'D1':
    filtered_gene_file_path = os.path.join(
        dataset_dir,
        'simulated_bulk_cell_dataset/D1/gene_list_filtered_by_high_corr_gene_and_quantile_range.csv',
    )
    # the gene names are taken from the first column, which is read as the index
    filtered_gene_list = pd.read_csv(filtered_gene_file_path, index_col=0).index.to_list()

# input gene list type for pathway profiles
input_gene_list = 'filtered_genes'

In [7]:
# remove cancer cells during the training process
# (this flag is passed to `train_model` as `remove_cancer_cell` below)
remove_cancer_cell = True

#### Training

In [8]:
# set the result directory used to save the DeSide model, and the log file location
log_file_path = os.path.join(result_dir, 'deside_running_log.txt')
model_dir = os.path.join(result_dir, 'DeSide_model')
deside_obj = DeSide(model_dir=model_dir, log_file_path=log_file_path)

# train DeSide
# - training_set_file_path is a list; multiple datasets will be combined together
deside_obj.train_model(
    training_set_file_path=[training_set2file_path['D1']],
    hyper_params=deside_parameters,
    cell_types=all_cell_types,
    scaling_by_constant=True,
    scaling_by_sample=False,
    remove_cancer_cell=remove_cancer_cell,
    n_patience=100,
    n_epoch=3000,
    verbose=0,
    pathway_mask=pathway_mask,
    method_adding_pathway='add_to_end',
    filtered_gene_list=filtered_gene_list,
    input_gene_list=input_gene_list,
)


---->>> Start to training model... <<<----
Thu Feb  1 18:30:56 2024

---->>> Start to reading training set... <<<----
Thu Feb  1 18:30:56 2024
x shape: (100000, 9028) ./datasets/simulated_bulk_cell_dataset/D1/simu_bulk_exp_Mixed_N100K_D1.h5ad
x head:                      A1BG     A2M  A4GALT  AADAT  AAGAB  AAMDC   AAMP   AARD  \
s_segment_0_1008_0  6.632  10.282   5.517  3.008  4.778  6.432  6.503  1.676   
s_segment_0_101_0   5.945   9.280   4.701  2.224  4.836  6.211  6.906  2.307   
s_segment_0_1041_0  5.720  10.101   5.148  1.814  4.291  6.338  5.983  0.663   
s_segment_0_1065_0  5.850   4.258   3.562  4.564  5.534  6.669  6.729  2.840   
s_segment_0_1078_0  6.212   9.284   4.278  1.711  4.282  5.880  6.577  1.516   

                     AASS   AATF  ...  ZPBP2   ZPR1  ZRANB1  ZSCAN18  ZSWIM4  \
s_segment_0_1008_0  5.298  5.653  ...  0.000  3.044   4.522    5.748   3.381   
s_segment_0_101_0   3.888  6.014  ...  0.000  5.127   5.263    5.357   3.816   
s_segment_0_1041_0  4.435  

2024-02-01 18:35:29.197636: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


   The following loss function will be used: 0.5 * mae + 0.5 * rmse
Model: "DeSide"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 gep (InputLayer)               [(None, 9028)]       0           []                               
                                                                                                  
 dense (Dense)                  (None, 200)          1805800     ['gep[0][0]']                    
                                                                                                  
 pathway_profile (InputLayer)   [(None, 1840)]       0           []                               
                                                                                                  
 dense_1 (Dense)                (None, 2000)         400000      ['dense[0][0]']                  
                         

In [9]:
from IPython.display import Image

# Show the loss curve (loss.png) saved in the model directory.
# `filename=` is used instead of `url=`: with `url=` IPython only emits an <img> tag
# pointing at the local path, so the figure is not stored in the notebook and is
# missing when the notebook is viewed elsewhere. `filename=` embeds the image data.
# The path is derived from `model_dir` rather than repeated as a hard-coded string.
Image(filename=os.path.join(model_dir, 'loss.png'), width=1200)

In [10]:
# list the contents of the result directory (shell escape; uses the external `tree` command)
!tree results/E2/

[01;34mresults/E2/[0m
├── [01;34mDeSide_model[0m
│   ├── [00mcelltypes.txt[0m
│   ├── [00mgenes.txt[0m
│   ├── [00mgenes_for_gep.txt[0m
│   ├── [00mgenes_for_pathway_profile.txt[0m
│   ├── [00mhistory_reg.csv[0m
│   ├── [00mkey_params.txt[0m
│   ├── [00mloss.png[0m
│   └── [00mmodel_DeSide.h5[0m
└── [00mdeside_running_log.txt[0m

2 directories, 9 files


#### Output files
- celltypes.txt                : Cell types included in the training set (without Cancer Cells)
- genes_for_gep.txt            : Gene list included in the training set and used in gene expression profiles (GEPs)
- genes_for_pathway_profile.txt: Gene list used in Pathway profiles
- history_reg.csv              : The history of recorded loss values during the training process
- key_params.txt               : Key parameters of the model
- loss.png                     : The figure depicting loss values over epochs
- model_DeSide.h5              : Saved model after training
- deside_running_log.txt       : Log file