## Hyperparameter tuning via Optuna

### Being a binary model this notebook will be limited to predicting one class 1 or 0, yes or no.
### Here I will be predicting if a cell received a treatment or not

In [1]:
import pathlib
import sys

In [2]:
import numpy as np
import optuna
import pandas as pd
import toml
import torch
from sklearn import preprocessing

In [3]:
sys.path.append("../..")

In [4]:
from MLP_utils.parameters import Parameters
from MLP_utils.utils import (
    Dataset_formatter,
    data_split,
    extract_best_trial_params,
    objective_model_optimizer,
    parameter_set,
    plot_metric_vs_epoch,
    results_output,
    test_optimized_model,
    train_optimized_model,
    un_nest,
)
from sklearn.model_selection import train_test_split

In [5]:
sys.path.append("../../..")
from utils.utils import df_stats

## Papermill is used for executing notebooks in the CLI with multiple parameters
Here the `injected-parameters` cell is used to inject parameters into the notebook via papermill.
This enables multiple notebooks to be executed with different parameters, preventing to manually update parameters or have multiple copies of the notebook.

Parameters

In [6]:
CELL_TYPE = "SHSY5Y"
MODEL_NAME = "MultiClass_MLP"

In [7]:
ml_configs_file = pathlib.Path("../../MLP_utils/multi_class_config.toml").resolve(
    strict=True
)
ml_configs = toml.load(ml_configs_file)
params = Parameters()
mlp_params = parameter_set(params, ml_configs)

overwrite params via command line arguments from papermill

In [8]:
mlp_params.CELL_TYPE = CELL_TYPE
mlp_params.MODEL_NAME = MODEL_NAME
mlp_params.MODEL_NAME = MODEL_NAME
MODEL_TYPE = mlp_params.MODEL_TYPE
HYPERPARAMETER_BATCH_SIZE = mlp_params.HYPERPARAMETER_BATCH_SIZE

Import Data
set data file path under pathlib path for multi-system use

In [9]:
file_path = pathlib.Path(
    f"../../../data/{mlp_params.CELL_TYPE}_preprocessed_sc_norm.parquet"
).resolve(strict=True)

In [10]:
df1 = pd.read_parquet(file_path)

In [11]:
if params.MODEL_NAME == "MultiClass_MLP_h202_remove":
    # drop H2O2_100.000_uM_DMSO_0.025_% and H2O2_100.000_nM_DMSO_0.025_% while keeping the index order
    df1 = df1[
        ~df1["oneb_Metadata_Treatment_Dose_Inhibitor_Dose"].isin(
            ["H2O2_100.000_uM_DMSO_0.025_%", "H2O2_100.000_nM_DMSO_0.025_%"]
        )
    ]

get paths for toml files

In [12]:
ground_truth_file_path = pathlib.Path(f"../../MLP_utils/ground_truth.toml").resolve(
    strict=True
)
treatment_splits_file_path = pathlib.Path(f"../../MLP_utils/splits.toml").resolve(
    strict=True
)
# read toml files
ground_truth = toml.load(ground_truth_file_path)
treatment_splits = toml.load(treatment_splits_file_path)

get information from toml files

In [13]:
apoptosis_groups_list = ground_truth["Apoptosis"]["apoptosis_groups_list"]
pyroptosis_groups_list = ground_truth["Pyroptosis"]["pyroptosis_groups_list"]
healthy_groups_list = ground_truth["Healthy"]["healthy_groups_list"]
test_split_100 = treatment_splits["splits"]["data_splits_100"]
test_split_75 = treatment_splits["splits"]["data_splits_75"]

In [14]:
np.random.seed(0)
if mlp_params.DATA_SUBSET_OPTION == "True":
    df1 = df1.groupby("oneb_Metadata_Treatment_Dose_Inhibitor_Dose").apply(
        lambda x: x.sample(n=mlp_params.DATA_SUBSET_NUMBER, random_state=0)
    )
    print("Data Subset Is On")
    print(f"Data is subset to {mlp_params.DATA_SUBSET_NUMBER} per treatment group")
    print(df1.shape)
    df1.reset_index(drop=True, inplace=True)
else:
    print("Data Subset Is Off")

Data Subset Is Off


add apoptosis, pyroptosis and healthy columns to dataframe

In [15]:
df1["apoptosis"] = df1.apply(
    lambda row: row["oneb_Metadata_Treatment_Dose_Inhibitor_Dose"]
    in apoptosis_groups_list,
    axis=1,
)
df1["pyroptosis"] = df1.apply(
    lambda row: row["oneb_Metadata_Treatment_Dose_Inhibitor_Dose"]
    in pyroptosis_groups_list,
    axis=1,
)
df1["healthy"] = df1.apply(
    lambda row: row["oneb_Metadata_Treatment_Dose_Inhibitor_Dose"]
    in healthy_groups_list,
    axis=1,
)

merge apoptosis, pyroptosis, and healthy columns into one column

In [16]:
df1["labels"] = df1.apply(
    lambda row: "apoptosis"
    if row["apoptosis"]
    else "pyroptosis"
    if row["pyroptosis"]
    else "healthy",
    axis=1,
)
# drop apoptosis, pyroptosis, and healthy columns
df1.drop(columns=["apoptosis", "pyroptosis", "healthy"], inplace=True)

### Split said data

randomly select wells to hold out for testing one per treatment group
stratified by treatment group

In [17]:
np.random.seed(seed=0)
wells_to_hold = (
    df1.groupby("oneb_Metadata_Treatment_Dose_Inhibitor_Dose")
    .agg(np.random.choice)["Metadata_Well"]
    .to_list()
)
df_holdout = df1[df1["Metadata_Well"].isin(wells_to_hold)]
df = df1[~df1["Metadata_Well"].isin(wells_to_hold)]

In [18]:
print("Wells held out for testing:", df_holdout["Metadata_Well"].unique())
print(
    "Wells to use for training, validation, and testing", df1["Metadata_Well"].unique()
)

Wells held out for testing: ['F14' 'D15' 'G17' 'H17' 'D18' 'E18' 'B19' 'G19' 'D20' 'F20' 'B21' 'C21'
 'E21' 'F22' 'H22' 'C23' 'J13' 'K13' 'L13' 'I14' 'L14' 'N14' 'O15' 'J16'
 'N16' 'O17' 'I18' 'L18' 'M18' 'O18' 'I20' 'K20' 'L21' 'M21' 'N22' 'O22'
 'J23' 'K23']
Wells to use for training, validation, and testing ['B13' 'C13' 'D13' 'E13' 'F13' 'G13' 'H13' 'B14' 'C14' 'D14' 'E14' 'F14'
 'G14' 'H14' 'B15' 'C15' 'D15' 'E15' 'F15' 'G15' 'H15' 'B16' 'C16' 'D16'
 'E16' 'F16' 'G16' 'H16' 'B17' 'C17' 'D17' 'E17' 'F17' 'G17' 'H17' 'B18'
 'C18' 'D18' 'E18' 'F18' 'G18' 'H18' 'B19' 'C19' 'D19' 'E19' 'F19' 'G19'
 'H19' 'B20' 'C20' 'D20' 'E20' 'F20' 'G20' 'H20' 'B21' 'C21' 'D21' 'E21'
 'F21' 'G21' 'H21' 'B22' 'C22' 'D22' 'E22' 'F22' 'G22' 'H22' 'B23' 'C23'
 'D23' 'E23' 'F23' 'G23' 'H23' 'I13' 'J13' 'K13' 'L13' 'M13' 'N13' 'O13'
 'I14' 'J14' 'K14' 'L14' 'M14' 'N14' 'O14' 'I15' 'J15' 'K15' 'L15' 'M15'
 'N15' 'O15' 'I16' 'J16' 'K16' 'L16' 'M16' 'N16' 'O16' 'I17' 'J17' 'K17'
 'L17' 'M17' 'N17' 'O17' 'I18' 

variable test and train set splits
100% test set
subset the following treatments for test set

In [19]:
test_set_all = df[
    df["oneb_Metadata_Treatment_Dose_Inhibitor_Dose"].isin(test_split_100)
]
# 75% test set and 25% train set
test_set_75 = df[df["oneb_Metadata_Treatment_Dose_Inhibitor_Dose"].isin(test_split_75)]

In [20]:
test_100_and_75 = test_split_100 + test_split_75

50% test set and 50% train set
get all treatments that are not in the_test_set_all and the test_set_75

In [21]:
test_set_50 = df[
    ~df["oneb_Metadata_Treatment_Dose_Inhibitor_Dose"].isin(test_100_and_75)
]

In [22]:
print(test_set_all.shape, test_set_75.shape, test_set_50.shape)

(22551, 1277) (23407, 1277) (401043, 1277)


get the train test splits from each group
100% test set

In [23]:
test_set_all

Unnamed: 0,Metadata_cell_type,Metadata_Well,Metadata_number_of_singlecells,Metadata_incubation inducer (h),Metadata_inhibitor,Metadata_inhibitor_concentration,Metadata_inhibitor_concentration_unit,Metadata_inducer1,Metadata_inducer1_concentration,Metadata_inducer1_concentration_unit,...,Nuclei_Texture_SumVariance_CorrGasdermin_3_01_256,Nuclei_Texture_SumVariance_CorrMito_3_03_256,Nuclei_Texture_SumVariance_CorrPM_3_03_256,Metadata_Treatment,Metadata_Dose,oneb_Metadata_Treatment_Dose_Inhibitor_Dose,twob_Metadata_Treatment_Dose_Inhibitor_Dose,threeb_Metadata_Treatment_Dose_Inhibitor_Dose,fourb_Metadata_Treatment_Dose_Inhibitor_Dose,labels
93196,SH-SY5Y,D16,4344,6,DMSO,0.025,%,LPS,1.000,µg_per_ml,...,-0.018145,0.196866,-0.027737,LPS,1.000_µg_per_ml,LPS_1.000_ug_per_ml_DMSO_0.025_%,LPS_DMSO_0.025__1.000_µg_per_ml,LPS__1.000_µg_per_ml__µg_per_ml_DMSO_0.025,LPS__1.000_µg_per_ml__µg_per_ml_DMSO__0.025,pyroptosis
93197,SH-SY5Y,D16,4344,6,DMSO,0.025,%,LPS,1.000,µg_per_ml,...,-0.062049,-0.113588,-0.045885,LPS,1.000_µg_per_ml,LPS_1.000_ug_per_ml_DMSO_0.025_%,LPS_DMSO_0.025__1.000_µg_per_ml,LPS__1.000_µg_per_ml__µg_per_ml_DMSO_0.025,LPS__1.000_µg_per_ml__µg_per_ml_DMSO__0.025,pyroptosis
93198,SH-SY5Y,D16,4344,6,DMSO,0.025,%,LPS,1.000,µg_per_ml,...,-0.071411,-0.128236,-0.065858,LPS,1.000_µg_per_ml,LPS_1.000_ug_per_ml_DMSO_0.025_%,LPS_DMSO_0.025__1.000_µg_per_ml,LPS__1.000_µg_per_ml__µg_per_ml_DMSO_0.025,LPS__1.000_µg_per_ml__µg_per_ml_DMSO__0.025,pyroptosis
93199,SH-SY5Y,D16,4344,6,DMSO,0.025,%,LPS,1.000,µg_per_ml,...,-0.066394,-0.131972,-0.063472,LPS,1.000_µg_per_ml,LPS_1.000_ug_per_ml_DMSO_0.025_%,LPS_DMSO_0.025__1.000_µg_per_ml,LPS__1.000_µg_per_ml__µg_per_ml_DMSO_0.025,LPS__1.000_µg_per_ml__µg_per_ml_DMSO__0.025,pyroptosis
93200,SH-SY5Y,D16,4344,6,DMSO,0.025,%,LPS,1.000,µg_per_ml,...,0.091016,0.315537,0.071223,LPS,1.000_µg_per_ml,LPS_1.000_ug_per_ml_DMSO_0.025_%,LPS_DMSO_0.025__1.000_µg_per_ml,LPS__1.000_µg_per_ml__µg_per_ml_DMSO_0.025,LPS__1.000_µg_per_ml__µg_per_ml_DMSO__0.025,pyroptosis
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
509986,SH-SY5Y,L20,3262,6,DMSO,0.025,%,LPS,1.000,µg_per_ml,...,0.478660,0.083223,-0.039358,LPS_Nigericin,1.000_µg_per_ml_3.000_µM,LPS_Nigericin_1.000_ug_per_ml_3.000_uM_DMSO_0....,LPS_Nigericin_DMSO_0.025__1.000_µg_per_ml_3.00...,LPS_Nigericin__1.000_µg_per_ml_3.000_µM__µg_pe...,LPS_Nigericin__1.000_µg_per_ml_3.000_µM__µg_pe...,pyroptosis
509987,SH-SY5Y,L20,3262,6,DMSO,0.025,%,LPS,1.000,µg_per_ml,...,-0.056714,0.088700,-0.054869,LPS_Nigericin,1.000_µg_per_ml_3.000_µM,LPS_Nigericin_1.000_ug_per_ml_3.000_uM_DMSO_0....,LPS_Nigericin_DMSO_0.025__1.000_µg_per_ml_3.00...,LPS_Nigericin__1.000_µg_per_ml_3.000_µM__µg_pe...,LPS_Nigericin__1.000_µg_per_ml_3.000_µM__µg_pe...,pyroptosis
509988,SH-SY5Y,L20,3262,6,DMSO,0.025,%,LPS,1.000,µg_per_ml,...,0.068162,9.070408,0.006835,LPS_Nigericin,1.000_µg_per_ml_3.000_µM,LPS_Nigericin_1.000_ug_per_ml_3.000_uM_DMSO_0....,LPS_Nigericin_DMSO_0.025__1.000_µg_per_ml_3.00...,LPS_Nigericin__1.000_µg_per_ml_3.000_µM__µg_pe...,LPS_Nigericin__1.000_µg_per_ml_3.000_µM__µg_pe...,pyroptosis
509989,SH-SY5Y,L20,3262,6,DMSO,0.025,%,LPS,1.000,µg_per_ml,...,-0.063567,0.115440,-0.031562,LPS_Nigericin,1.000_µg_per_ml_3.000_µM,LPS_Nigericin_1.000_ug_per_ml_3.000_uM_DMSO_0....,LPS_Nigericin_DMSO_0.025__1.000_µg_per_ml_3.00...,LPS_Nigericin__1.000_µg_per_ml_3.000_µM__µg_pe...,LPS_Nigericin__1.000_µg_per_ml_3.000_µM__µg_pe...,pyroptosis


75% test set and 25% train set

In [24]:
test_ratio = 0.75
training_data_set_75, testing_data_set_75 = train_test_split(
    test_set_75,
    test_size=test_ratio,
    stratify=test_set_75["oneb_Metadata_Treatment_Dose_Inhibitor_Dose"],
    random_state=0,
)

50% test set and 50% train set

In [25]:
test_ratio = 0.5
training_data_set_50, testing_data_set_50 = train_test_split(
    test_set_50,
    test_size=test_ratio,
    stratify=test_set_50["oneb_Metadata_Treatment_Dose_Inhibitor_Dose"],
    random_state=0,
)

verify that the correct splits have been made
100% test set

In [26]:
print(f"Shape for the 100% test set: {test_set_all.shape}\n")

Shape for the 100% test set: (22551, 1277)



75% test set and 25% train set

In [27]:
print(
    f"Shape for the 75% test set: {training_data_set_75.shape};\nShape for the 75% train set: {testing_data_set_75.shape}\n"
)

Shape for the 75% test set: (5851, 1277);
Shape for the 75% train set: (17556, 1277)



50% test set and 50% train set

In [28]:
print(
    f"Shape for the 50% test set: {training_data_set_50.shape};\nShape for the 50% train set: {testing_data_set_50.shape}"
)

Shape for the 50% test set: (200521, 1277);
Shape for the 50% train set: (200522, 1277)


In [29]:
print(f"Shape for the holdout set: {df_holdout.shape}")

Shape for the holdout set: (150901, 1277)


combine all testing sets together while preserving the index

In [30]:
testing_data_set = pd.concat(
    [test_set_all, testing_data_set_75, testing_data_set_50], axis=0
)
testing_data_set = testing_data_set.sort_index()
testing_data_set

Unnamed: 0,Metadata_cell_type,Metadata_Well,Metadata_number_of_singlecells,Metadata_incubation inducer (h),Metadata_inhibitor,Metadata_inhibitor_concentration,Metadata_inhibitor_concentration_unit,Metadata_inducer1,Metadata_inducer1_concentration,Metadata_inducer1_concentration_unit,...,Nuclei_Texture_SumVariance_CorrGasdermin_3_01_256,Nuclei_Texture_SumVariance_CorrMito_3_03_256,Nuclei_Texture_SumVariance_CorrPM_3_03_256,Metadata_Treatment,Metadata_Dose,oneb_Metadata_Treatment_Dose_Inhibitor_Dose,twob_Metadata_Treatment_Dose_Inhibitor_Dose,threeb_Metadata_Treatment_Dose_Inhibitor_Dose,fourb_Metadata_Treatment_Dose_Inhibitor_Dose,labels
0,SH-SY5Y,B13,3765,6,Media_ctr,,,media_ctr,0.0,,...,0.024721,0.307472,0.092086,media_ctr,0.0_None,media_ctr_0.0_0_Media_ctr_0.0_0,media_ctr_Media_ctr_nan__0.0_None,media_ctr__0.0_None__None_Media_ctr_nan,media_ctr__0.0_None__None_Media_ctr__nan,healthy
6,SH-SY5Y,B13,3765,6,Media_ctr,,,media_ctr,0.0,,...,0.154886,0.537797,0.521756,media_ctr,0.0_None,media_ctr_0.0_0_Media_ctr_0.0_0,media_ctr_Media_ctr_nan__0.0_None,media_ctr__0.0_None__None_Media_ctr_nan,media_ctr__0.0_None__None_Media_ctr__nan,healthy
7,SH-SY5Y,B13,3765,6,Media_ctr,,,media_ctr,0.0,,...,-0.060584,-0.069607,-0.029195,media_ctr,0.0_None,media_ctr_0.0_0_Media_ctr_0.0_0,media_ctr_Media_ctr_nan__0.0_None,media_ctr__0.0_None__None_Media_ctr_nan,media_ctr__0.0_None__None_Media_ctr__nan,healthy
8,SH-SY5Y,B13,3765,6,Media_ctr,,,media_ctr,0.0,,...,-0.052383,0.006078,-0.060355,media_ctr,0.0_None,media_ctr_0.0_0_Media_ctr_0.0_0,media_ctr_Media_ctr_nan__0.0_None,media_ctr__0.0_None__None_Media_ctr_nan,media_ctr__0.0_None__None_Media_ctr__nan,healthy
9,SH-SY5Y,B13,3765,6,Media_ctr,,,media_ctr,0.0,,...,-0.067368,-0.121140,-0.057246,media_ctr,0.0_None,media_ctr_0.0_0_Media_ctr_0.0_0,media_ctr_Media_ctr_nan__0.0_None,media_ctr__0.0_None__None_Media_ctr_nan,media_ctr__0.0_None__None_Media_ctr__nan,healthy
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
597895,SH-SY5Y,O23,3555,6,Media_ctr,,,media_ctr,0.0,,...,-0.066174,-0.038651,-0.043476,media_ctr,0.0_None,media_ctr_0.0_0_Media_ctr_0.0_0,media_ctr_Media_ctr_nan__0.0_None,media_ctr__0.0_None__None_Media_ctr_nan,media_ctr__0.0_None__None_Media_ctr__nan,healthy
597896,SH-SY5Y,O23,3555,6,Media_ctr,,,media_ctr,0.0,,...,-0.053823,-0.062802,0.018814,media_ctr,0.0_None,media_ctr_0.0_0_Media_ctr_0.0_0,media_ctr_Media_ctr_nan__0.0_None,media_ctr__0.0_None__None_Media_ctr_nan,media_ctr__0.0_None__None_Media_ctr__nan,healthy
597898,SH-SY5Y,O23,3555,6,Media_ctr,,,media_ctr,0.0,,...,-0.008375,-0.105033,-0.053609,media_ctr,0.0_None,media_ctr_0.0_0_Media_ctr_0.0_0,media_ctr_Media_ctr_nan__0.0_None,media_ctr__0.0_None__None_Media_ctr_nan,media_ctr__0.0_None__None_Media_ctr__nan,healthy
597899,SH-SY5Y,O23,3555,6,Media_ctr,,,media_ctr,0.0,,...,-0.049508,-0.085213,-0.032405,media_ctr,0.0_None,media_ctr_0.0_0_Media_ctr_0.0_0,media_ctr_Media_ctr_nan__0.0_None,media_ctr__0.0_None__None_Media_ctr_nan,media_ctr__0.0_None__None_Media_ctr__nan,healthy


combine all training sets together while preserving the index

In [31]:
training_data_set = pd.concat([training_data_set_75, training_data_set_50], axis=0)
training_data_set = training_data_set.sort_index()
training_data_set

Unnamed: 0,Metadata_cell_type,Metadata_Well,Metadata_number_of_singlecells,Metadata_incubation inducer (h),Metadata_inhibitor,Metadata_inhibitor_concentration,Metadata_inhibitor_concentration_unit,Metadata_inducer1,Metadata_inducer1_concentration,Metadata_inducer1_concentration_unit,...,Nuclei_Texture_SumVariance_CorrGasdermin_3_01_256,Nuclei_Texture_SumVariance_CorrMito_3_03_256,Nuclei_Texture_SumVariance_CorrPM_3_03_256,Metadata_Treatment,Metadata_Dose,oneb_Metadata_Treatment_Dose_Inhibitor_Dose,twob_Metadata_Treatment_Dose_Inhibitor_Dose,threeb_Metadata_Treatment_Dose_Inhibitor_Dose,fourb_Metadata_Treatment_Dose_Inhibitor_Dose,labels
1,SH-SY5Y,B13,3765,6,Media_ctr,,,media_ctr,0.0,,...,-0.041156,1.443262,0.009843,media_ctr,0.0_None,media_ctr_0.0_0_Media_ctr_0.0_0,media_ctr_Media_ctr_nan__0.0_None,media_ctr__0.0_None__None_Media_ctr_nan,media_ctr__0.0_None__None_Media_ctr__nan,healthy
2,SH-SY5Y,B13,3765,6,Media_ctr,,,media_ctr,0.0,,...,-0.044386,-0.020445,0.000848,media_ctr,0.0_None,media_ctr_0.0_0_Media_ctr_0.0_0,media_ctr_Media_ctr_nan__0.0_None,media_ctr__0.0_None__None_Media_ctr_nan,media_ctr__0.0_None__None_Media_ctr__nan,healthy
3,SH-SY5Y,B13,3765,6,Media_ctr,,,media_ctr,0.0,,...,-0.058328,-0.009632,-0.005811,media_ctr,0.0_None,media_ctr_0.0_0_Media_ctr_0.0_0,media_ctr_Media_ctr_nan__0.0_None,media_ctr__0.0_None__None_Media_ctr_nan,media_ctr__0.0_None__None_Media_ctr__nan,healthy
4,SH-SY5Y,B13,3765,6,Media_ctr,,,media_ctr,0.0,,...,-0.035518,-0.038205,0.017690,media_ctr,0.0_None,media_ctr_0.0_0_Media_ctr_0.0_0,media_ctr_Media_ctr_nan__0.0_None,media_ctr__0.0_None__None_Media_ctr_nan,media_ctr__0.0_None__None_Media_ctr__nan,healthy
5,SH-SY5Y,B13,3765,6,Media_ctr,,,media_ctr,0.0,,...,-0.065943,-0.045045,-0.051132,media_ctr,0.0_None,media_ctr_0.0_0_Media_ctr_0.0_0,media_ctr_Media_ctr_nan__0.0_None,media_ctr__0.0_None__None_Media_ctr_nan,media_ctr__0.0_None__None_Media_ctr__nan,healthy
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
597888,SH-SY5Y,O23,3555,6,Media_ctr,,,media_ctr,0.0,,...,-0.059705,-0.116467,-0.065225,media_ctr,0.0_None,media_ctr_0.0_0_Media_ctr_0.0_0,media_ctr_Media_ctr_nan__0.0_None,media_ctr__0.0_None__None_Media_ctr_nan,media_ctr__0.0_None__None_Media_ctr__nan,healthy
597890,SH-SY5Y,O23,3555,6,Media_ctr,,,media_ctr,0.0,,...,-0.063489,0.014499,-0.055562,media_ctr,0.0_None,media_ctr_0.0_0_Media_ctr_0.0_0,media_ctr_Media_ctr_nan__0.0_None,media_ctr__0.0_None__None_Media_ctr_nan,media_ctr__0.0_None__None_Media_ctr__nan,healthy
597894,SH-SY5Y,O23,3555,6,Media_ctr,,,media_ctr,0.0,,...,-0.067707,-0.088724,-0.052677,media_ctr,0.0_None,media_ctr_0.0_0_Media_ctr_0.0_0,media_ctr_Media_ctr_nan__0.0_None,media_ctr__0.0_None__None_Media_ctr_nan,media_ctr__0.0_None__None_Media_ctr__nan,healthy
597897,SH-SY5Y,O23,3555,6,Media_ctr,,,media_ctr,0.0,,...,-0.066972,-0.112742,-0.029118,media_ctr,0.0_None,media_ctr_0.0_0_Media_ctr_0.0_0,media_ctr_Media_ctr_nan__0.0_None,media_ctr__0.0_None__None_Media_ctr_nan,media_ctr__0.0_None__None_Media_ctr__nan,healthy


In [32]:
training_data_set, val_data_set = train_test_split(
    training_data_set,
    test_size=0.20,
    stratify=training_data_set["oneb_Metadata_Treatment_Dose_Inhibitor_Dose"],
)
print(
    f"""
    Testing set length: {len(testing_data_set)}\n
    Training set length: {len(training_data_set)}\n
    Validation set length: {len(val_data_set)}\n
    Holdout set length: {len(df_holdout)}"""
)


    Testing set length: 240629

    Training set length: 165097

    Validation set length: 41275

    Holdout set length: 150901


In [33]:
# train
# downsample healthy and pyroptosis to match number of apoptosis
# to balance classes
df_healthy_train = training_data_set[training_data_set["labels"] == "healthy"]
df_pyroptosis_train = training_data_set[training_data_set["labels"] == "pyroptosis"]
df_apoptosis_train = training_data_set[training_data_set["labels"] == "apoptosis"]
print(df_healthy_train.shape, df_pyroptosis_train.shape, df_apoptosis_train.shape)

(93463, 1277) (63662, 1277) (7972, 1277)


In [34]:
# downsample healthy and pyroptosis to match number of apoptosis
df_healthy_train = df_healthy_train.sample(
    n=df_apoptosis_train.shape[0], random_state=0
)
df_pyroptosis_train = df_pyroptosis_train.sample(
    n=df_apoptosis_train.shape[0], random_state=0
)
print(df_healthy_train.shape, df_pyroptosis_train.shape, df_apoptosis_train.shape)
training_data_set = pd.concat(
    [df_healthy_train, df_pyroptosis_train, df_apoptosis_train]
)
# show that the df was downsampled and recombined correctly
assert (df_healthy_train + df_pyroptosis_train + df_apoptosis_train).shape[
    0
] == training_data_set.shape[0]

(7972, 1277) (7972, 1277) (7972, 1277)


In [35]:
# validation
# downsample healthy and pyroptosis to match number of apoptosis
# to balance classes
df_healthy_val = val_data_set[val_data_set["labels"] == "healthy"]
df_pyroptosis_val = val_data_set[val_data_set["labels"] == "pyroptosis"]
df_apoptosis_val = val_data_set[val_data_set["labels"] == "apoptosis"]
print(df_healthy_val.shape, df_pyroptosis_val.shape, df_apoptosis_val.shape)

(23367, 1277) (15915, 1277) (1993, 1277)


In [36]:
# downsample healthy and pyroptosis to match number of apoptosis
df_healthy_val = df_healthy_val.sample(n=df_apoptosis_val.shape[0], random_state=0)
df_pyroptosis_val = df_pyroptosis_val.sample(
    n=df_apoptosis_val.shape[0], random_state=0
)
print(df_healthy_val.shape, df_pyroptosis_val.shape, df_apoptosis_val.shape)
val_data_set = pd.concat([df_healthy_val, df_pyroptosis_val, df_apoptosis_val])
# show that the df was downsampled and recombined correctly
assert (df_healthy_val + df_pyroptosis_val + df_apoptosis_val).shape[
    0
] == val_data_set.shape[0]

(1993, 1277) (1993, 1277) (1993, 1277)


In [37]:
# test
# downsample healthy and pyroptosis to match number of apoptosis
# to balance classes
df_healthy_test = testing_data_set[testing_data_set["labels"] == "healthy"]
df_pyroptosis_test = testing_data_set[testing_data_set["labels"] == "pyroptosis"]
df_apoptosis_test = testing_data_set[testing_data_set["labels"] == "apoptosis"]
print(df_healthy_test.shape, df_pyroptosis_test.shape, df_apoptosis_test.shape)

(116830, 1277) (113834, 1277) (9965, 1277)


In [38]:
# downsample healthy and pyroptosis to match number of apoptosis
df_healthy_test = df_healthy_test.sample(n=df_apoptosis_test.shape[0], random_state=0)
df_pyroptosis_test = df_pyroptosis_test.sample(
    n=df_apoptosis_test.shape[0], random_state=0
)
print(df_healthy_test.shape, df_pyroptosis_test.shape, df_apoptosis_test.shape)
testing_data_set = pd.concat([df_healthy_test, df_pyroptosis_test, df_apoptosis_test])
# show that the df was downsampled and recombined correctly
assert (df_healthy_test + df_pyroptosis_test + df_apoptosis_test).shape[
    0
] == testing_data_set.shape[0]

(9965, 1277) (9965, 1277) (9965, 1277)


In [39]:
print(len(training_data_set), len(val_data_set), len(testing_data_set), len(df_holdout))

23916 5979 29895 150901


get the indexes for the training and testing sets

In [40]:
training_data_set_index = training_data_set.index
val_data_set_index = val_data_set.index
testing_data_set_index = testing_data_set.index
df_holdout_index = df_holdout.index

In [41]:
print(
    training_data_set_index.shape,
    val_data_set_index.shape,
    testing_data_set_index.shape,
    df_holdout_index.shape,
)
print(
    training_data_set_index.shape[0]
    + val_data_set_index.shape[0]
    + testing_data_set_index.shape[0]
    + df_holdout_index.shape[0]
)

(23916,) (5979,) (29895,) (150901,)
210691


create pandas dataframe with all indexes and their respective labels, stratified by phenotypic class

In [42]:
index_data = []
for index in training_data_set_index:
    index_data.append({"labeled_data_index": index, "label": "train"})
for index in val_data_set_index:
    index_data.append({"labeled_data_index": index, "label": "val"})
for index in testing_data_set_index:
    index_data.append({"labeled_data_index": index, "label": "test"})
for index in df_holdout_index:
    index_data.append({"labeled_data_index": index, "label": "holdout"})

make index data a dataframe and sort it by labeled data index

In [43]:
index_data = pd.DataFrame(index_data)
index_data

Unnamed: 0,labeled_data_index,label
0,31546,train
1,270284,train
2,298547,train
3,276518,train
4,63938,train
...,...,...
210686,584185,holdout
210687,584186,holdout
210688,584187,holdout
210689,584188,holdout


In [44]:
save_path = pathlib.Path(f"../indexes/{CELL_TYPE}/multi_class/")

In [45]:
print(save_path)
# create save path if it doesn't exist
save_path.mkdir(parents=True, exist_ok=True)

../indexes/SHSY5Y/multi_class


save indexes as tsv file

In [46]:
index_data.to_csv(
    f"{save_path}/{params.MODEL_NAME}_data_split_indexes.tsv", sep="\t", index=False
)

#### Set up Data to be compatible with model

##### Classification Models:
Comment out code if using regression

Code snippet for metadata extraction by Jenna Tomkinson

In [47]:
df_metadata = list(df.columns[df.columns.str.contains("Metadata")])

define which columns are data and which are descriptive

In [48]:
df_descriptive = df1[df_metadata]
df_values = df1.drop(columns=df_metadata)

get the class weights for the loss function to account for class imbalance
get the number of samples in each class

In [49]:
targets, counts = np.unique(df1["labels"], return_counts=True)
print(targets, counts)
total_counts = np.sum(counts)
# get the class weights
class_weights = []
class_targets = []
for class_name in enumerate(targets):
    class_targets.append(class_name[0])
for count in enumerate(counts):
    class_weights.append(1 - (count[1] / total_counts))
print(class_targets, class_weights)
# write the class weights to a file for use in the model
class_weights_file = pathlib.Path(f"../class_weights/{CELL_TYPE}/multi_class/")
class_weights_file.mkdir(parents=True, exist_ok=True)
with open(f"{class_weights_file}/class_weights.txt", "w") as filehandle:
    for listitem in class_weights:
        filehandle.write("%s\n" % listitem)

['apoptosis' 'healthy' 'pyroptosis'] [ 26978 310701 260223]
[0, 1, 2] [0.954878893196544, 0.4803479499984947, 0.5647731568049614]


Creating label encoder

In [50]:
le = preprocessing.LabelEncoder()
df_values["new_labels"] = le.fit_transform(df_values["labels"])
# get mini dataframe that contains the decoder
decoder = df_values[["labels", "new_labels"]].drop_duplicates()
# split into X and Y where Y are the predictive column and x are the observable data
df_values_X = df_values.drop(
    ["new_labels", "labels"],
    axis=1,
)
df_values_Y = df_values["new_labels"]
df_values_Y.head()
df_values_Y.unique()

array([1, 0, 2])

#### Split Data - All Models can proceed through this point

split into train and test sets from indexes previously defined

In [51]:
X_train = df_values_X.loc[training_data_set_index]
X_val = df_values_X.loc[val_data_set_index]
X_test = df_values_X.loc[testing_data_set_index]
X_holdout = df_values_X.loc[df_holdout_index]

In [52]:
Y_train = df_values_Y.loc[training_data_set_index]
Y_val = df_values_Y.loc[val_data_set_index]
Y_test = df_values_Y.loc[testing_data_set_index]
Y_holdout = df_values_Y.loc[df_holdout_index]

produce data objects for train, val and test datasets

In [53]:
train_data = Dataset_formatter(
    torch.FloatTensor(X_train.values), torch.FloatTensor(Y_train.values)
)
val_data = Dataset_formatter(
    torch.FloatTensor(X_val.values), torch.FloatTensor(Y_val.values)
)
test_data = Dataset_formatter(
    torch.FloatTensor(X_test.values), torch.FloatTensor(Y_test.values)
)

In [54]:
mlp_params.IN_FEATURES = X_train.shape[1]
print("Number of in features: ", mlp_params.IN_FEATURES)
if mlp_params.MODEL_TYPE == "Regression":
    mlp_params.OUT_FEATURES = 1
else:
    mlp_params.OUT_FEATURES = len(df_values["labels"].unique())

Number of in features:  1251


In [55]:
print("Number of out features: ", mlp_params.OUT_FEATURES)

Number of out features:  3


In [56]:
if mlp_params.OUT_FEATURES > 2:
    mlp_params.MODEL_TYPE = "Multi_Class"
elif mlp_params.OUT_FEATURES == 2:
    mlp_params.OUT_FEATURES = mlp_params.OUT_FEATURES - 1
    mlp_params.MODEL_TYPE = "Binary_Classification"
elif mlp_params.OUT_FEATURES == 1:
    mlp_params.MODEL_TYPE = "Regression"
else:
    pass
print(mlp_params.MODEL_TYPE)

Multi_Class


convert data class into a dataloader to be compatible with pytorch

In [57]:
train_loader = torch.utils.data.DataLoader(
    dataset=train_data, batch_size=mlp_params.HYPERPARAMETER_BATCH_SIZE
)
valid_loader = torch.utils.data.DataLoader(
    dataset=val_data, batch_size=mlp_params.HYPERPARAMETER_BATCH_SIZE
)

check device

In [58]:
print(mlp_params.DEVICE)

cuda


no accuracy function must be loss for regression

In [59]:
if mlp_params.MODEL_TYPE == "Regression":
    mlp_params.METRIC = "loss"
    pass

wrap the objective function inside of a lambda function to pass args...

In [60]:
objective_lambda_func = lambda trial: objective_model_optimizer(
    train_loader,
    valid_loader,
    trial=trial,
    params=params,
    metric=mlp_params.METRIC,
    return_info=False,
    class_weights=class_weights,
)

Study is the object for model optimization

In [61]:
study = optuna.create_study(direction=f"{mlp_params.DIRECTION}")
# Here I apply the optimize function of the study to the objective function
# This optimizes each parameter specified to be optimized from the defined search space
study.optimize(objective_lambda_func, n_trials=mlp_params.N_TRIALS)
# Prints out the best trial's optimized parameters
objective_model_optimizer(
    train_loader,
    valid_loader,
    trial=study.best_trial,
    params=params,
    metric=mlp_params.METRIC,
    return_info=True,
    class_weights=class_weights,
)

[I 2023-11-16 09:10:41,093] A new study created in memory with name: no-name-b84787d5-62e5-4db2-91de-93b7f44ee844
[I 2023-11-16 09:11:06,571] Trial 0 finished with value: 1.0992593228816985 and parameters: {'n_layers': 12, 'n_units_l0': 23, 'dropout_0': 0.1759045437113675, 'n_units_l1': 39, 'dropout_1': 0.14064959840452881, 'n_units_l2': 37, 'dropout_2': 0.32689625791974547, 'n_units_l3': 40, 'dropout_3': 0.22006379080758587, 'n_units_l4': 17, 'dropout_4': 0.16673163083593018, 'n_units_l5': 4, 'dropout_5': 0.1914350194418878, 'n_units_l6': 13, 'dropout_6': 0.18146399750848285, 'n_units_l7': 29, 'dropout_7': 0.2136654303442552, 'n_units_l8': 41, 'dropout_8': 0.3988883375297919, 'n_units_l9': 32, 'dropout_9': 0.33487725996354106, 'n_units_l10': 10, 'dropout_10': 0.19862178762521937, 'n_units_l11': 35, 'dropout_11': 0.31909002631861916, 'learning_rate': 0.026867922925865526, 'optimizer': 'Adam'}. Best is trial 0 with value: 1.0992593228816985.
[I 2023-11-16 09:11:27,253] Trial 1 finished 

Validation Accuracy: 67.99147014550927
Validation Loss: 0.68889163300395
Training Accuracy: 72.51007693594246
Training Loss: 0.6113392092846334


(67.99147014550927, 0.68889163300395, 72.51007693594246, 0.6113392092846334)

create graph directory for this model

In [62]:
graph_path = pathlib.Path(
    f"../../figures/{mlp_params.MODEL_TYPE}/{mlp_params.MODEL_NAME}/{mlp_params.CELL_TYPE}/hyperparameter_optimization"
)

In [63]:
pathlib.Path(graph_path).mkdir(parents=True, exist_ok=True)
fig = optuna.visualization.plot_optimization_history(study)

In [64]:
graph_path = f"{graph_path}/plot_optimization_history_graph"

In [65]:
fig.write_image(pathlib.Path(f"{graph_path}.png"))
fig.show()

create graph directory for this model

In [66]:
graph_path = pathlib.Path(
    f"../../figures/{mlp_params.MODEL_TYPE}/{mlp_params.MODEL_NAME}/{mlp_params.CELL_TYPE}/hyperparameter_optimization"
).resolve(strict=True)

In [67]:
pathlib.Path(graph_path).mkdir(parents=True, exist_ok=True)
fig = optuna.visualization.plot_intermediate_values(study)

In [68]:
graph_path = f"{graph_path}/plot_intermediate_values_graph"

In [69]:
fig.write_image(pathlib.Path(f"{graph_path}.png"))
fig.show()

In [70]:
param_dict = extract_best_trial_params(
    study.best_params, params, model_name=mlp_params.MODEL_NAME
)