In [2]:
import os
import pickle as pkl

import anndata as ad

from src.evaluator.MLP_baseline_evaluator import cross_validation_models
from src.evaluator.evaluator_utils import l2_loss


Read the AnnData object and the drug-split information

In [3]:
# Run configuration shared by every baseline cell in this notebook.

# .h5ad file read with ad.read_h5ad into `adata`.
ADATA_PATH = "./data/sciplex/sciplex_full_v4.h5ad"
# Pickle file holding the drug splits passed to cross_validation_models.
DRUG_SPLIT = "./data/sciplex/prnet_drug_splits.pkl"
# Name of the drug representation, forwarded as `drug_rep_name`
# ('sm_morgan_emb' is one of the adata.obs columns).
DRUG_ENCODING_NAME = "sm_morgan_emb"
# Forwarded as `drug_emb_size`; presumably the length of one drug embedding
# vector -- TODO confirm against the stored embeddings.
DRUG_ENCODING_SIZE = 1024
# Loss function constant (l2_loss from src.evaluator.evaluator_utils).
LOSS_FUNCTION = l2_loss
# Forwarded as `n_trials`; presumably the number of tuning trials per study
# -- TODO confirm in cross_validation_models.
N_TRIALS = 10
# Forwarded as `scheduler_mode`; presumably the LR scheduler mode, where 'min'
# means a lower monitored value is better -- TODO confirm.
SCHEDULER_MODE = 'min'

In [4]:
# Load the drug split definitions from the pickle file.
# NOTE: unpickling runs code embedded in the file, so only trusted split
# files should be used here.
with open(DRUG_SPLIT, "rb") as split_file:
    drug_splits = pkl.load(split_file)

# Read the full dataset from the .h5ad file.
adata = ad.read_h5ad(ADATA_PATH)

In [5]:
# Show the AnnData summary (dimensions plus the available obs/var/uns/obsm keys).
adata

AnnData object with n_obs × n_vars = 571696 × 17376
    obs: 'cell_type', 'dose', 'dose_character', 'dose_pattern', 'g1s_score', 'g2m_score', 'pathway', 'pathway_level_1', 'pathway_level_2', 'product_dose', 'product_name', 'proliferation_index', 'replicate', 'size_factor', 'target', 'vehicle', 'n_genes', 'SMILES', 'pubchem_id', 'sm_coati_emb', 'sm_morgan_emb', 'match_index', 'mask_50', 'mask_100', 'mask_250'
    var: 'id', 'num_cells_expressed-0-0', 'num_cells_expressed-1-0', 'num_cells_expressed-1', 'n_cells'
    uns: 'gene_names_1000', 'gene_names_2000', 'gene_names_3500', 'gene_names_500', 'gene_names_5000', 'gene_names_7500'
    obsm: 'X_1000_hvg', 'X_2000_hvg', 'X_3500_hvg', 'X_5000_hvg', 'X_500_hvg', 'X_7500_hvg', 'X_uce'

In [7]:
# Baseline MLP run on the 500-gene set with the configured drug encoding.
# Input and output both use the 'X_500_hvg' matrix (listed under adata.obsm).
results_path_500 = "./results/res_baseline_500_Morgan_full.pkl"
# Create the output directory up front so a missing folder fails fast
# instead of after the long-running cross-validation has finished.
os.makedirs(os.path.dirname(results_path_500), exist_ok=True)

performance_500_hvg_Morgan_full = cross_validation_models(
    drug_splits=drug_splits,
    # Use the config-cell constant so changing LOSS_FUNCTION affects this run.
    loss_function=LOSS_FUNCTION,
    adata=adata,
    input_name='X_500_hvg',
    input_dim=500,
    output_name='X_500_hvg',
    output_dim=500,
    drug_rep_name=DRUG_ENCODING_NAME,
    drug_emb_size=DRUG_ENCODING_SIZE,
    n_trials=N_TRIALS,
    gene_names_key='gene_names_500',
    scheduler_mode=SCHEDULER_MODE,
    run_name="MLP_X_500_hvg_morgan_full",
)

# Persist the returned performance object for later analysis.
with open(results_path_500, "wb") as f:
    pkl.dump(performance_500_hvg_Morgan_full, f)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 571696/571696 [02:32<00:00, 3741.03it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 571696/571696 [01:06<00:00, 8542.47it/s]
[I 2025-04-15 00:13:21,111] A new study created in RDB with name: MLP_X_500_hvg_morgan_full_fold0
[I 2025-04-15 00:15:39,353] Trial 0 finished with value: 0.036771318690579716 and parameters: {'lr': 8.186643054938779e-06, 'weight_decay': 2.8060432637202357e-06, 'scheduler_factor': 0.1010957428151801, 'scheduler_patience': 15, 'batch_size': 32, 'dropout': 0.13171951875712506, 'hidden_dims': 2048}. Best is trial 0 with value: 0.036771318690579716.
[I 2025-04-15 00:18:27,952] Trial 1 finish

In [8]:
# Baseline MLP run on the 1000-gene set with the configured drug encoding.
# Input and output both use the 'X_1000_hvg' matrix (listed under adata.obsm).
results_path_1000 = "./results/res_baseline_1000_Morgan_full.pkl"
# Create the output directory up front so a missing folder fails fast
# instead of after the long-running cross-validation has finished.
os.makedirs(os.path.dirname(results_path_1000), exist_ok=True)

performance_1000_hvg_Morgan_full = cross_validation_models(
    drug_splits=drug_splits,
    # Use the config-cell constant so changing LOSS_FUNCTION affects this run.
    loss_function=LOSS_FUNCTION,
    adata=adata,
    input_name='X_1000_hvg',
    input_dim=1000,
    output_name='X_1000_hvg',
    output_dim=1000,
    drug_rep_name=DRUG_ENCODING_NAME,
    drug_emb_size=DRUG_ENCODING_SIZE,
    n_trials=N_TRIALS,
    gene_names_key='gene_names_1000',
    scheduler_mode=SCHEDULER_MODE,
    run_name="MLP_X_1000_hvg_morgan_full",
)

# Persist the returned performance object for later analysis.
with open(results_path_1000, "wb") as f:
    pkl.dump(performance_1000_hvg_Morgan_full, f)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 571696/571696 [02:41<00:00, 3530.15it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 571696/571696 [01:10<00:00, 8156.74it/s]
[I 2025-04-15 02:52:03,532] A new study created in RDB with name: MLP_X_1000_hvg_morgan_full_fold0
[I 2025-04-15 02:53:36,050] Trial 0 finished with value: 0.028033601706881663 and parameters: {'lr': 2.303679203342532e-05, 'weight_decay': 0.0022680779494138052, 'scheduler_factor': 0.1676687252186233, 'scheduler_patience': 1, 'batch_size': 64, 'dropout': 0.14874473221538054, 'hidden_dims': 128}. Best is trial 0 with value: 0.028033601706881663.
[I 2025-04-15 03:05:39,963] Trial 1 finished

In [9]:
# Baseline MLP run on the 2000-gene set with the configured drug encoding.
# Input and output both use the 'X_2000_hvg' matrix (listed under adata.obsm).
results_path_2000 = "./results/res_baseline_2000_Morgan_full.pkl"
# Create the output directory up front so a missing folder fails fast
# instead of after the long-running cross-validation has finished.
os.makedirs(os.path.dirname(results_path_2000), exist_ok=True)

performance_2000_hvg_Morgan_full = cross_validation_models(
    drug_splits=drug_splits,
    # Use the config-cell constant so changing LOSS_FUNCTION affects this run.
    loss_function=LOSS_FUNCTION,
    adata=adata,
    input_name='X_2000_hvg',
    input_dim=2000,
    output_name='X_2000_hvg',
    output_dim=2000,
    drug_rep_name=DRUG_ENCODING_NAME,
    drug_emb_size=DRUG_ENCODING_SIZE,
    n_trials=N_TRIALS,
    gene_names_key='gene_names_2000',
    scheduler_mode=SCHEDULER_MODE,
    run_name="MLP_X_2000_hvg_morgan_full",
)

# Persist the returned performance object for later analysis.
with open(results_path_2000, "wb") as f:
    pkl.dump(performance_2000_hvg_Morgan_full, f)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 571696/571696 [02:40<00:00, 3564.89it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 571696/571696 [01:09<00:00, 8273.43it/s]
[I 2025-04-15 07:19:52,571] A new study created in RDB with name: MLP_X_2000_hvg_morgan_full_fold0
[I 2025-04-15 07:28:43,437] Trial 0 finished with value: 0.02330996214630001 and parameters: {'lr': 6.990936516363361e-05, 'weight_decay': 6.626064789967078e-05, 'scheduler_factor': 0.3083067624654283, 'scheduler_patience': 9, 'batch_size': 16, 'dropout': 0.23680716479736813, 'hidden_dims': 2048}. Best is trial 0 with value: 0.02330996214630001.
[I 2025-04-15 07:32:02,362] Trial 1 finished 

In [11]:
# Baseline MLP run on the 3500-gene set with the configured drug encoding.
# Input and output both use the 'X_3500_hvg' matrix (listed under adata.obsm).
results_path_3500 = "./results/res_baseline_3500_Morgan_full.pkl"
# Create the output directory up front so a missing folder fails fast
# instead of after the long-running cross-validation has finished.
os.makedirs(os.path.dirname(results_path_3500), exist_ok=True)

performance_3500_hvg_Morgan_full = cross_validation_models(
    drug_splits=drug_splits,
    # Use the config-cell constant so changing LOSS_FUNCTION affects this run.
    loss_function=LOSS_FUNCTION,
    adata=adata,
    input_name='X_3500_hvg',
    input_dim=3500,
    output_name='X_3500_hvg',
    output_dim=3500,
    drug_rep_name=DRUG_ENCODING_NAME,
    drug_emb_size=DRUG_ENCODING_SIZE,
    n_trials=N_TRIALS,
    gene_names_key='gene_names_3500',
    scheduler_mode=SCHEDULER_MODE,
    run_name="MLP_X_3500_hvg_morgan_full",
)

# Persist the returned performance object for later analysis.
with open(results_path_3500, "wb") as f:
    pkl.dump(performance_3500_hvg_Morgan_full, f)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 571696/571696 [02:46<00:00, 3432.22it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 571696/571696 [01:10<00:00, 8146.13it/s]
[I 2025-04-15 10:33:40,244] A new study created in RDB with name: MLP_X_3500_hvg_morgan_full_fold0
[I 2025-04-15 10:42:17,765] Trial 0 finished with value: 0.023722121034895544 and parameters: {'lr': 2.3743412301542105e-06, 'weight_decay': 3.22443904679699e-05, 'scheduler_factor': 0.38752741457012063, 'scheduler_patience': 10, 'batch_size': 64, 'dropout': 0.25293517039002467, 'hidden_dims': 512}. Best is trial 0 with value: 0.023722121034895544.
[I 2025-04-15 10:43:37,717] Trial 1 finish

In [12]:
# Baseline MLP run on the 5000-gene set with the configured drug encoding.
# Input and output both use the 'X_5000_hvg' matrix (listed under adata.obsm).
results_path_5000 = "./results/res_baseline_5000_Morgan_full.pkl"
# Create the output directory up front so a missing folder fails fast
# instead of after the long-running cross-validation has finished.
os.makedirs(os.path.dirname(results_path_5000), exist_ok=True)

performance_5000_hvg_Morgan_full = cross_validation_models(
    drug_splits=drug_splits,
    # Use the config-cell constant so changing LOSS_FUNCTION affects this run.
    loss_function=LOSS_FUNCTION,
    adata=adata,
    input_name='X_5000_hvg',
    input_dim=5000,
    output_name='X_5000_hvg',
    output_dim=5000,
    drug_rep_name=DRUG_ENCODING_NAME,
    drug_emb_size=DRUG_ENCODING_SIZE,
    n_trials=N_TRIALS,
    gene_names_key='gene_names_5000',
    scheduler_mode=SCHEDULER_MODE,
    run_name="MLP_X_5000_hvg_morgan_full",
)

# Persist the returned performance object for later analysis.
with open(results_path_5000, "wb") as f:
    pkl.dump(performance_5000_hvg_Morgan_full, f)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 571696/571696 [02:43<00:00, 3499.17it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 571696/571696 [01:12<00:00, 7852.46it/s]
[I 2025-04-15 20:42:17,349] A new study created in RDB with name: MLP_X_5000_hvg_morgan_full_fold0
[I 2025-04-15 20:47:27,787] Trial 0 finished with value: 0.023706042554963645 and parameters: {'lr': 2.053648089888123e-06, 'weight_decay': 0.001160069920536261, 'scheduler_factor': 0.13718890448359067, 'scheduler_patience': 17, 'batch_size': 32, 'dropout': 0.23569098526127813, 'hidden_dims': 1024}. Best is trial 0 with value: 0.023706042554963645.
[I 2025-04-15 20:49:48,144] Trial 1 finish

In [6]:
# Baseline MLP run on the 7500-gene set with the configured drug encoding.
# Input and output both use the 'X_7500_hvg' matrix (listed under adata.obsm).
# NOTE(review): the saved output of this cell shows the fold-0 study being
# reused ("Using an existing study ...") with trials continuing at 10, so this
# run appears to extend an earlier search -- confirm the results are
# comparable with the other gene-set sizes.
results_path_7500 = "./results/res_baseline_7500_Morgan_full.pkl"
# Create the output directory up front so a missing folder fails fast
# instead of after the long-running cross-validation has finished.
os.makedirs(os.path.dirname(results_path_7500), exist_ok=True)

performance_7500_hvg_Morgan_full = cross_validation_models(
    drug_splits=drug_splits,
    # Use the config-cell constant so changing LOSS_FUNCTION affects this run.
    loss_function=LOSS_FUNCTION,
    adata=adata,
    input_name='X_7500_hvg',
    input_dim=7500,
    output_name='X_7500_hvg',
    output_dim=7500,
    drug_rep_name=DRUG_ENCODING_NAME,
    drug_emb_size=DRUG_ENCODING_SIZE,
    n_trials=N_TRIALS,
    gene_names_key='gene_names_7500',
    scheduler_mode=SCHEDULER_MODE,
    run_name="MLP_X_7500_hvg_morgan_full",
)

# Persist the returned performance object for later analysis.
with open(results_path_7500, "wb") as f:
    pkl.dump(performance_7500_hvg_Morgan_full, f)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 571696/571696 [02:40<00:00, 3567.74it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 571696/571696 [01:09<00:00, 8173.66it/s]
[I 2025-04-16 10:40:50,573] Using an existing study with name 'MLP_X_7500_hvg_morgan_full_fold0' instead of creating a new one.
[I 2025-04-16 10:40:54,795] Trial 10 pruned. 
[I 2025-04-16 10:41:57,146] Trial 11 finished with value: 0.02441699795829515 and parameters: {'lr': 3.5928284268258196e-05, 'weight_decay': 0.0003633814006259648, 'scheduler_factor': 0.23509167667463873, 'scheduler_patience': 14, 'batch_size': 32, 'dropout': 0.2839710830241251, 'hidden_dims': 64}. Best is trial 0 wi