In [1]:
import os
import pandas as pd
from IPython.display import Image
from deside.utility import check_dir
from deside.decon_cf import DeSide
from deside.plot import plot_predicted_result
from deside.utility.read_file import read_gene_set
import deside

2024-02-01 17:34:44.771791: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


### Using pre-trained model
#### Input files
- `LUAD_TPM.csv`: Gene expression profiles of LUAD (lung adenocarcinoma) from TCGA, in Transcripts Per Million (TPM) format.
- `DeSide_model`: The folder containing the pre-trained model.

In [2]:
# Display the installed DeSide package version; as the last expression of the
# cell, its value is echoed as the cell output.
deside.__version__

'1.2.1'

In [3]:
# Root folder that holds the input datasets used in this notebook.
dataset_dir = './datasets/'

# Bulk gene expression profiles (GEPs) in TPM format.
bulk_tpm_file_path = os.path.join(dataset_dir, 'TCGA/tpm/LUAD/LUAD_TPM.csv')

# The first CSV column (gene names) becomes the row index, so the table is
# laid out as genes (rows) by samples (columns).
bulk_tpm = pd.read_csv(filepath_or_buffer=bulk_tpm_file_path, index_col=0)

# Report the table dimensions, then preview the first two rows.
print(bulk_tpm.shape)
bulk_tpm.iloc[:2]

(19712, 515)


Unnamed: 0_level_0,TCGA-55-8508-01A,TCGA-67-3771-01A,TCGA-55-A4DG-01A,TCGA-91-7771-01A,TCGA-91-6849-01A,TCGA-64-5781-01A,TCGA-44-6146-01B,TCGA-97-7552-01A,TCGA-80-5608-01A,TCGA-91-6829-01A,...,TCGA-55-A4DF-01A,TCGA-67-3773-01A,TCGA-55-7573-01A,TCGA-50-5068-01A,TCGA-49-AARN-01A,TCGA-78-7150-01A,TCGA-MP-A4TA-01A,TCGA-55-7907-01A,TCGA-55-5899-01A,TCGA-55-7574-01A
gene_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
TRIM71,0.015,0.041,0.491,0.155,0.016,0.011,1.366,1.079,0.2,0.483,...,0.094,0.0,2.909,0.024,0.155,0.009,0.088,0.053,0.011,0.688
CCR4,2.516,5.237,8.719,12.704,7.546,1.258,12.888,11.499,0.893,1.354,...,6.534,7.799,8.997,7.397,1.229,0.884,2.706,5.959,4.158,22.56


In [4]:
# Folder that receives the outputs of this example.
result_dir = './results/E1'

# Create the output directory.
# NOTE(review): check_dir comes from deside.utility; presumably it creates the
# folder when it is missing -- confirm.
check_dir(result_dir)

# Destination CSV for the predicted cell proportions.
y_pred_file_path = os.path.join(result_dir, 'y_pred.csv')

In [5]:
# Hyper-parameters of the DNN model.
# Each architecture is a pair of equal-length lists.
# NOTE(review): presumably (units per layer, dropout rate per layer) -- confirm
# against the DeSide documentation.
main_network_architecture = ([200, 2000, 2000, 2000, 50], [0.05, 0.05, 0.05, 0.2, 0])
pathway_network_architecture = ([50, 500, 500, 500, 50], [0, 0, 0, 0, 0])

deside_parameters = dict(
    architecture=main_network_architecture,
    architecture_for_pathway_network=pathway_network_architecture,
    # loss = alpha*mae + (1-alpha)*rmse, where mae means mean absolute error
    loss_function_alpha=0.5,
    # one of: batch_normalization / layer_normalization / None
    normalization='layer_normalization',
    # 1 means to add a normalization layer at that position:
    # input | the first hidden layer | ... | output
    # (1 more entry than the number of hidden layers)
    normalization_layer=[0, 0, 1, 1, 1, 1],
    # use an independent pathway network
    pathway_network=True,
    # one of: sigmoid / softmax
    last_layer_activation='sigmoid',
    learning_rate=1e-4,
    batch_size=128,
)

# Read two gene sets (KEGG and Reactome .gmt files) as the pathway mask.
gene_set_dir = os.path.join(dataset_dir, 'gene_set')
gene_set_file_path1 = os.path.join(gene_set_dir, 'c2.cp.kegg.v2023.1.Hs.symbols.gmt')
gene_set_file_path2 = os.path.join(gene_set_dir, 'c2.cp.reactome.v2023.1.Hs.symbols.gmt')
all_pathway_files = [gene_set_file_path1, gene_set_file_path2]
pathway_mask = read_gene_set(all_pathway_files)  # genes by pathways

In [6]:
# Load the pre-trained DeSide model from its folder.
model_dir = './DeSide_model/'
deside_model = DeSide(model_dir=model_dir)

# Options for predicting with the pre-trained model:
# - transpose=True, because the bulk TPM file is provided as genes by samples
#   (rows by columns)
# - scaling_by_constant was used in the manuscript, whereas Scaden used
#   scaling_by_sample
prediction_options = dict(
    input_file=bulk_tpm_file_path,
    output_file_path=y_pred_file_path,
    exp_type='TPM',
    transpose=True,
    scaling_by_sample=False,
    scaling_by_constant=True,
    hyper_params=deside_parameters,
    pathway_mask=pathway_mask,
)
deside_model.predict(**prediction_options)

   Start to predict cell fractions by pre-trained model...
   9028 common genes will be used, 10684 genes will be removed.
   9028 genes will be used to construct the pathway profiles.
common genes between training set and pathway mask: 5462
genes only in training set: 3566
x shape: (515, 10868)
   10868 common genes will be used, 0 genes will be removed.
   > 10868 genes included in pre-trained model and will be used for prediction.
   The shape of X is: (515, 10868), (n_sample, n_gene)


2024-02-01 17:35:25.658341: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


   Pre-trained model loaded from ./DeSide_model/model_DeSide.h5.
   Model prediction done.


In [7]:
# Load the predicted cell proportions written by the prediction step; the
# first CSV column (sample ids) becomes the row index.
y_pred = pd.read_csv(filepath_or_buffer=y_pred_file_path, index_col=0)

# Report the table dimensions, then preview the first two samples.
print(y_pred.shape)
y_pred.iloc[:2]

(515, 17)


Unnamed: 0,Plasma B cells,Non-plasma B cells,CD4 T,CD8 T effector,CD8 T (GZMK high),Double-neg-like T,DC,Endothelial Cells,CAFs,Myofibroblasts,Macrophages,Mast Cells,NK,Neutrophils,Monocytes,1-others,Cancer Cells
TCGA-55-8508-01A,0.138,0.014,0.019,0.003,0.001,0.0,0.015,0.035,0.1,0.018,0.052,0.001,0.005,0.019,0.0,0.581,0.581
TCGA-67-3771-01A,0.05,0.005,0.016,0.002,0.017,0.001,0.014,0.04,0.079,0.023,0.164,0.002,0.0,0.011,0.001,0.575,0.575


In [8]:
# Plot the predicted cell proportions; figures are saved under result_dir.
plot_options = dict(
    cell_frac_result_fp=y_pred_file_path,
    bulk_exp_fp=bulk_tpm_file_path,
    cancer_type='LUAD',
    model_name='DeSide',
    result_dir=result_dir,
    font_scale=2,
)
plot_predicted_result(**plot_options)

   Cell types:  CAFs, CD4 T, CD8 T (GZMK high), CD8 T effector, Cancer Cells, DC, Double-neg-like T, Endothelial Cells, Macrophages, Mast Cells, Monocytes, Myofibroblasts, NK, Neutrophils, Non-plasma B cells, Plasma B cells


In [9]:
# Show the figure of predicted cell proportions saved in the result directory.
# `filename=` embeds the PNG data in the notebook, so the figure stays visible
# when the notebook is shared or rendered elsewhere; `url=` would only store a
# link to the local file. The path is built from result_dir so it follows the
# output folder configured above instead of a second hard-coded copy.
Image(filename=os.path.join(result_dir, 'pred_cell_prop_before_decon.png'), width=1200)

In [10]:
# List the files in the result directory via a shell escape
# (requires the `tree` command-line tool to be installed).
!tree results/E1/

[01;34mresults/E1/[0m
├── [00mCD8A_vs_predicted_CD8 T_proportion.png[0m
├── [00mpred_cell_prop_before_decon.png[0m
└── [00my_pred.csv[0m

1 directory, 3 files


#### Output files
- `CD8A_vs_predicted_CD8 T_proportion.png`: The figure depicting the predicted CD8 T cell proportions against the expression values of the marker gene CD8A.
- `pred_cell_prop_before_decon.png`: The figure depicting the predicted cell proportions for all cell types.
- `y_pred.csv`: The file containing the predicted cell proportions.
