## Hyperparameter tuning via Optuna

### Being a binary model this notebook will be limited to predicting one class 1 or 0, yes or no.
### Here I will be predicting if a cell received a treatment or not

In [1]:
import pathlib
import sys

import numpy as np
import optuna
import pandas as pd
import pyarrow.parquet as pq
import toml
import torch
from sklearn import preprocessing

sys.path.append("../..")

from MLP_utils.parameters import Parameters
from MLP_utils.utils import (
    Dataset_formatter,
    data_split,
    extract_best_trial_params,
    objective_model_optimizer,
    parameter_set,
    plot_metric_vs_epoch,
    results_output,
    test_optimized_model,
    train_optimized_model,
    un_nest,
)
from sklearn.model_selection import train_test_split

sys.path.append("../../..")
from utils.utils import df_stats

## Papermill is used for executing notebooks in the CLI with multiple parameters
Here the `injected-parameters` cell is used to inject parameters into the notebook via papermill.
This enables multiple notebooks to be executed with different parameters, preventing to manually update parameters or have multiple copies of the notebook.

In [2]:
# Parameters
CELL_TYPE = "SHSY5Y"
MODEL_NAME = "MultiClass_MLP"

In [3]:
ml_configs_file = pathlib.Path("../../MLP_utils/multi_class_config.toml").resolve(
    strict=True
)
ml_configs = toml.load(ml_configs_file)
params = Parameters()
mlp_params = parameter_set(params, ml_configs)

# overwrite params via command line arguments from papermill
mlp_params.CELL_TYPE = CELL_TYPE
mlp_params.MODEL_NAME = MODEL_NAME
mlp_params.MODEL_NAME = MODEL_NAME
MODEL_TYPE = mlp_params.MODEL_TYPE
HYPERPARAMETER_BATCH_SIZE = mlp_params.HYPERPARAMETER_BATCH_SIZE

In [4]:
# Import Data
# set data file path under pathlib path for multi-system use

file_path = pathlib.Path(
    f"../../../data/{mlp_params.CELL_TYPE}_preprocessed_sc_norm.parquet"
).resolve(strict=True)

df1 = pd.read_parquet(file_path)

In [5]:
# df1['oneb_Metadata_Treatment_Dose_Inhibitor_Dose'].unique()
# # drop H2O2_100.000_uM_DMSO_0.025_% and H2O2_100.000_nM_DMSO_0.025_%
# df1 = df1[df1['oneb_Metadata_Treatment_Dose_Inhibitor_Dose'] != 'H2O2_100.000_uM_DMSO_0.025_%']
# df1 = df1[df1['oneb_Metadata_Treatment_Dose_Inhibitor_Dose'] != 'H2O2_100.000_nM_DMSO_0.025_%']

In [6]:
# get paths for toml files
ground_truth_file_path = pathlib.Path(f"../../MLP_utils/ground_truth.toml").resolve(
    strict=True
)
treatment_splits_file_path = pathlib.Path(f"../../MLP_utils/splits.toml").resolve(
    strict=True
)
# read toml files
ground_truth = toml.load(ground_truth_file_path)
treatment_splits = toml.load(treatment_splits_file_path)

In [7]:
# get information from toml files
apoptosis_groups_list = ground_truth["Apoptosis"]["apoptosis_groups_list"]
pyroptosis_groups_list = ground_truth["Pyroptosis"]["pyroptosis_groups_list"]
healthy_groups_list = ground_truth["Healthy"]["healthy_groups_list"]
test_split_100 = treatment_splits["splits"]["data_splits_100"]
test_split_75 = treatment_splits["splits"]["data_splits_75"]

In [8]:
np.random.seed(0)
if mlp_params.DATA_SUBSET_OPTION == "True":
    df1 = df1.groupby("oneb_Metadata_Treatment_Dose_Inhibitor_Dose").apply(
        lambda x: x.sample(n=mlp_params.DATA_SUBSET_NUMBER, random_state=0)
    )
    print("Data Subset Is On")
    print(f"Data is subset to {mlp_params.DATA_SUBSET_NUMBER} per treatment group")
    print(df1.shape)
    df1.reset_index(drop=True, inplace=True)
else:
    print("Data Subset Is Off")

Data Subset Is Off


In [9]:
# add apoptosis, pyroptosis and healthy columns to dataframe
df1["apoptosis"] = df1.apply(
    lambda row: row["oneb_Metadata_Treatment_Dose_Inhibitor_Dose"]
    in apoptosis_groups_list,
    axis=1,
)
df1["pyroptosis"] = df1.apply(
    lambda row: row["oneb_Metadata_Treatment_Dose_Inhibitor_Dose"]
    in pyroptosis_groups_list,
    axis=1,
)
df1["healthy"] = df1.apply(
    lambda row: row["oneb_Metadata_Treatment_Dose_Inhibitor_Dose"]
    in healthy_groups_list,
    axis=1,
)

# merge apoptosis, pyroptosis, and healthy columns into one column
df1["labels"] = df1.apply(
    lambda row: "apoptosis"
    if row["apoptosis"]
    else "pyroptosis"
    if row["pyroptosis"]
    else "healthy",
    axis=1,
)
# drop apoptosis, pyroptosis, and healthy columns
df1.drop(columns=["apoptosis", "pyroptosis", "healthy"], inplace=True)

### Split said data

In [10]:
# randomly select wells to hold out for testing one per treatment group
# stratified by treatment group
np.random.seed(seed=0)
wells_to_hold = (
    df1.groupby("oneb_Metadata_Treatment_Dose_Inhibitor_Dose")
    .agg(np.random.choice)["Metadata_Well"]
    .to_list()
)
df_holdout = df1[df1["Metadata_Well"].isin(wells_to_hold)]
df = df1[~df1["Metadata_Well"].isin(wells_to_hold)]


print("Wells held out for testing:", df_holdout["Metadata_Well"].unique())
print(
    "Wells to use for training, validation, and testing", df1["Metadata_Well"].unique()
)

Wells held out for testing: ['F14' 'D15' 'G17' 'H17' 'D18' 'E18' 'B19' 'G19' 'D20' 'F20' 'B21' 'C21'
 'E21' 'F22' 'H22' 'C23' 'J13' 'K13' 'L13' 'I14' 'L14' 'N14' 'O15' 'J16'
 'N16' 'O17' 'I18' 'L18' 'M18' 'O18' 'I20' 'K20' 'L21' 'M21' 'N22' 'O22'
 'J23' 'K23']
Wells to use for training, validation, and testing ['B13' 'C13' 'D13' 'E13' 'F13' 'G13' 'H13' 'B14' 'C14' 'D14' 'E14' 'F14'
 'G14' 'H14' 'B15' 'C15' 'D15' 'E15' 'F15' 'G15' 'H15' 'B16' 'C16' 'D16'
 'E16' 'F16' 'G16' 'H16' 'B17' 'C17' 'D17' 'E17' 'F17' 'G17' 'H17' 'B18'
 'C18' 'D18' 'E18' 'F18' 'G18' 'H18' 'B19' 'C19' 'D19' 'E19' 'F19' 'G19'
 'H19' 'B20' 'C20' 'D20' 'E20' 'F20' 'G20' 'H20' 'B21' 'C21' 'D21' 'E21'
 'F21' 'G21' 'H21' 'B22' 'C22' 'D22' 'E22' 'F22' 'G22' 'H22' 'B23' 'C23'
 'D23' 'E23' 'F23' 'G23' 'H23' 'I13' 'J13' 'K13' 'L13' 'M13' 'N13' 'O13'
 'I14' 'J14' 'K14' 'L14' 'M14' 'N14' 'O14' 'I15' 'J15' 'K15' 'L15' 'M15'
 'N15' 'O15' 'I16' 'J16' 'K16' 'L16' 'M16' 'N16' 'O16' 'I17' 'J17' 'K17'
 'L17' 'M17' 'N17' 'O17' 'I18' 

In [11]:
# variable test and train set splits
# 100% test set
# subset the following treatments for test set
test_set_all = df[
    df["oneb_Metadata_Treatment_Dose_Inhibitor_Dose"].isin(test_split_100)
]
# 75% test set and 25% train set
test_set_75 = df[df["oneb_Metadata_Treatment_Dose_Inhibitor_Dose"].isin(test_split_75)]

test_100_and_75 = test_split_100 + test_split_75

# 50% test set and 50% train set
# get all treatments that are not in the_test_set_all and the test_set_75
test_set_50 = df[
    ~df["oneb_Metadata_Treatment_Dose_Inhibitor_Dose"].isin(test_100_and_75)
]

print(test_set_all.shape, test_set_75.shape, test_set_50.shape)

(22551, 1277) (23407, 1277) (401043, 1277)


In [12]:
# get the train test splits from each group
# 100% test set
test_set_all

# 75% test set and 25% train set
test_ratio = 0.75
training_data_set_75, testing_data_set_75 = train_test_split(
    test_set_75,
    test_size=test_ratio,
    stratify=test_set_75["oneb_Metadata_Treatment_Dose_Inhibitor_Dose"],
    random_state=0,
)

# 50% test set and 50% train set
test_ratio = 0.5
training_data_set_50, testing_data_set_50 = train_test_split(
    test_set_50,
    test_size=test_ratio,
    stratify=test_set_50["oneb_Metadata_Treatment_Dose_Inhibitor_Dose"],
    random_state=0,
)

# verify that the correct splits have been made
# 100% test set
print(f"Shape for the 100% test set: {test_set_all.shape}\n")

# 75% test set and 25% train set
print(
    f"Shape for the 75% test set: {training_data_set_75.shape};\nShape for the 75% train set: {testing_data_set_75.shape}\n"
)

# 50% test set and 50% train set
print(
    f"Shape for the 50% test set: {training_data_set_50.shape};\nShape for the 50% train set: {testing_data_set_50.shape}"
)

print(f"Shape for the holdout set: {df_holdout.shape}")

Shape for the 100% test set: (22551, 1277)

Shape for the 75% test set: (5851, 1277);
Shape for the 75% train set: (17556, 1277)

Shape for the 50% test set: (200521, 1277);
Shape for the 50% train set: (200522, 1277)
Shape for the holdout set: (150901, 1277)


In [13]:
# combine all testing sets together while preserving the index
testing_data_set = pd.concat(
    [test_set_all, testing_data_set_75, testing_data_set_50], axis=0
)
testing_data_set = testing_data_set.sort_index()
testing_data_set

# combine all training sets together while preserving the index
training_data_set = pd.concat([training_data_set_75, training_data_set_50], axis=0)
training_data_set = training_data_set.sort_index()
training_data_set

training_data_set, val_data_set = train_test_split(
    training_data_set,
    test_size=0.20,
    stratify=training_data_set["oneb_Metadata_Treatment_Dose_Inhibitor_Dose"],
)
print(
    f"""
    Testing set length: {len(testing_data_set)}\n
    Training set length: {len(training_data_set)}\n
    Validation set length: {len(val_data_set)}\n
    Holdout set length: {len(df_holdout)}"""
)


    Testing set length: 240629

    Training set length: 165097

    Validation set length: 41275

    Holdout set length: 150901


In [14]:
# # train
# # downsample healthy and pyroptosis to match number of apoptosis
# # to balance classes
# df_healthy_train = training_data_set[training_data_set["labels"] == "healthy"]
# df_pyroptosis_train = training_data_set[training_data_set["labels"] == "pyroptosis"]
# df_apoptosis_train = training_data_set[training_data_set["labels"] == "apoptosis"]
# print(df_healthy_train.shape, df_pyroptosis_train.shape, df_apoptosis_train.shape)

# # downsample healthy and pyroptosis to match number of apoptosis
# df_healthy_train = df_healthy_train.sample(n=df_apoptosis_train.shape[0], random_state=0)
# df_pyroptosis_train = df_pyroptosis_train.sample(n=df_apoptosis_train.shape[0], random_state=0)
# print(df_healthy_train.shape, df_pyroptosis_train.shape, df_apoptosis_train.shape)
# training_data_set = pd.concat([df_healthy_train, df_pyroptosis_train, df_apoptosis_train])
# # show that the df was downsampled and recombined correctly
# assert (df_healthy_train + df_pyroptosis_train + df_apoptosis_train).shape[0] == training_data_set.shape[0]


# # validation
# # downsample healthy and pyroptosis to match number of apoptosis
# # to balance classes
# df_healthy_val = val_data_set[val_data_set["labels"] == "healthy"]
# df_pyroptosis_val = val_data_set[val_data_set["labels"] == "pyroptosis"]
# df_apoptosis_val = val_data_set[val_data_set["labels"] == "apoptosis"]
# print(df_healthy_val.shape, df_pyroptosis_val.shape, df_apoptosis_val.shape)

# # downsample healthy and pyroptosis to match number of apoptosis
# df_healthy_val = df_healthy_val.sample(n=df_apoptosis_val.shape[0], random_state=0)
# df_pyroptosis_val = df_pyroptosis_val.sample(n=df_apoptosis_val.shape[0], random_state=0)
# print(df_healthy_val.shape, df_pyroptosis_val.shape, df_apoptosis_val.shape)
# val_data_set = pd.concat([df_healthy_val, df_pyroptosis_val, df_apoptosis_val])
# # show that the df was downsampled and recombined correctly
# assert (df_healthy_val + df_pyroptosis_val + df_apoptosis_val).shape[0] == val_data_set.shape[0]


# # test
# # downsample healthy and pyroptosis to match number of apoptosis
# # to balance classes
# df_healthy_test = testing_data_set[testing_data_set["labels"] == "healthy"]
# df_pyroptosis_test = testing_data_set[testing_data_set["labels"] == "pyroptosis"]
# df_apoptosis_test = testing_data_set[testing_data_set["labels"] == "apoptosis"]
# print(df_healthy_test.shape, df_pyroptosis_test.shape, df_apoptosis_test.shape)

# # downsample healthy and pyroptosis to match number of apoptosis
# df_healthy_test = df_healthy_test.sample(n=df_apoptosis_test.shape[0], random_state=0)
# df_pyroptosis_test = df_pyroptosis_test.sample(n=df_apoptosis_test.shape[0], random_state=0)
# print(df_healthy_test.shape, df_pyroptosis_test.shape, df_apoptosis_test.shape)
# testing_data_set = pd.concat([df_healthy_test, df_pyroptosis_test, df_apoptosis_test])
# # show that the df was downsampled and recombined correctly
# assert (df_healthy_test + df_pyroptosis_test + df_apoptosis_test).shape[0] == testing_data_set.shape[0]


# print(len(training_data_set), len(val_data_set), len(testing_data_set), len(df_holdout))

In [15]:
# get the indexes for the training and testing sets

training_data_set_index = training_data_set.index
val_data_set_index = val_data_set.index
testing_data_set_index = testing_data_set.index
df_holdout_index = df_holdout.index

In [16]:
print(
    training_data_set_index.shape,
    val_data_set_index.shape,
    testing_data_set_index.shape,
    df_holdout_index.shape,
)
print(
    training_data_set_index.shape[0]
    + val_data_set_index.shape[0]
    + testing_data_set_index.shape[0]
    + df_holdout_index.shape[0]
)

(165097,) (41275,) (240629,) (150901,)
597902


In [17]:
# create pandas dataframe with all indexes and their respective labels, stratified by phenotypic class
index_data = []
for index in training_data_set_index:
    index_data.append({"labeled_data_index": index, "label": "train"})
for index in val_data_set_index:
    index_data.append({"labeled_data_index": index, "label": "val"})
for index in testing_data_set_index:
    index_data.append({"labeled_data_index": index, "label": "test"})
for index in df_holdout_index:
    index_data.append({"labeled_data_index": index, "label": "holdout"})

# make index data a dataframe and sort it by labeled data index
index_data = pd.DataFrame(index_data)
index_data

Unnamed: 0,labeled_data_index,label
0,379883,train
1,6688,train
2,293841,train
3,117560,train
4,213514,train
...,...,...
597897,584185,holdout
597898,584186,holdout
597899,584187,holdout
597900,584188,holdout


In [18]:
save_path = pathlib.Path(f"../indexes/{CELL_TYPE}/multi_class/")

print(save_path)
# create save path if it doesn't exist
save_path.mkdir(parents=True, exist_ok=True)

../indexes/SHSY5Y/multi_class


In [19]:
# save indexes as tsv file
index_data.to_csv(
    f"{save_path}/multi_class_data_split_indexes.tsv", sep="\t", index=False
)

#### Set up Data to be compatible with model

##### Classification Models:
Comment out code if using regression

In [20]:
# Code snippet for metadata extraction by Jenna Tomkinson
df_metadata = list(df.columns[df.columns.str.contains("Metadata")])

# define which columns are data and which are descriptive
df_descriptive = df1[df_metadata]
df_values = df1.drop(columns=df_metadata)

In [21]:
# get the class weights for the loss function to account for class imbalance
# get the number of samples in each class
targets, counts = np.unique(df1["labels"], return_counts=True)
print(targets, counts)
total_counts = np.sum(counts)
# get the class weights
class_weights = []
class_targets = []
for class_name in enumerate(targets):
    class_targets.append(class_name[0])
for count in enumerate(counts):
    class_weights.append(1 - (count[1] / total_counts))
print(class_targets, class_weights)
# write the class weights to a file for use in the model
class_weights_file = pathlib.Path(f"../class_weights/{CELL_TYPE}/multi_class/")
class_weights_file.mkdir(parents=True, exist_ok=True)
with open(f"{class_weights_file}/class_weights.txt", "w") as filehandle:
    for listitem in class_weights:
        filehandle.write("%s\n" % listitem)

['apoptosis' 'healthy' 'pyroptosis'] [ 26978 310701 260223]
[0, 1, 2] [0.954878893196544, 0.4803479499984947, 0.5647731568049614]


In [22]:
# Creating label encoder
le = preprocessing.LabelEncoder()
df_values["new_labels"] = le.fit_transform(df_values["labels"])
# get mini dataframe that contains the decoder
decoder = df_values[["labels", "new_labels"]].drop_duplicates()
# split into X and Y where Y are the predictive column and x are the observable data
df_values_X = df_values.drop(
    ["new_labels", "labels"],
    axis=1,
)
df_values_Y = df_values["new_labels"]
df_values_Y.head()
df_values_Y.unique()

array([1, 0, 2])

#### Split Data - All Models can proceed through this point

In [23]:
# split into train and test sets from indexes previously defined

X_train = df_values_X.loc[training_data_set_index]
X_val = df_values_X.loc[val_data_set_index]
X_test = df_values_X.loc[testing_data_set_index]
X_holdout = df_values_X.loc[df_holdout_index]

Y_train = df_values_Y.loc[training_data_set_index]
Y_val = df_values_Y.loc[val_data_set_index]
Y_test = df_values_Y.loc[testing_data_set_index]
Y_holdout = df_values_Y.loc[df_holdout_index]

In [24]:
# produce data objects for train, val and test datasets
train_data = Dataset_formatter(
    torch.FloatTensor(X_train.values), torch.FloatTensor(Y_train.values)
)
val_data = Dataset_formatter(
    torch.FloatTensor(X_val.values), torch.FloatTensor(Y_val.values)
)
test_data = Dataset_formatter(
    torch.FloatTensor(X_test.values), torch.FloatTensor(Y_test.values)
)

In [25]:
mlp_params.IN_FEATURES = X_train.shape[1]
print("Number of in features: ", mlp_params.IN_FEATURES)
if mlp_params.MODEL_TYPE == "Regression":
    mlp_params.OUT_FEATURES = 1
else:
    mlp_params.OUT_FEATURES = len(df_values["labels"].unique())

print("Number of out features: ", mlp_params.OUT_FEATURES)

if mlp_params.OUT_FEATURES > 2:
    mlp_params.MODEL_TYPE = "Multi_Class"
elif mlp_params.OUT_FEATURES == 2:
    mlp_params.OUT_FEATURES = mlp_params.OUT_FEATURES - 1
    mlp_params.MODEL_TYPE = "Binary_Classification"
elif mlp_params.OUT_FEATURES == 1:
    mlp_params.MODEL_TYPE = "Regression"
else:
    pass
print(mlp_params.MODEL_TYPE)

Number of in features:  1251
Number of out features:  3
Multi_Class


In [26]:
# convert data class into a dataloader to be compatible with pytorch
train_loader = torch.utils.data.DataLoader(
    dataset=train_data, batch_size=mlp_params.HYPERPARAMETER_BATCH_SIZE
)
valid_loader = torch.utils.data.DataLoader(
    dataset=val_data, batch_size=mlp_params.HYPERPARAMETER_BATCH_SIZE
)

In [27]:
# check device
print(mlp_params.DEVICE)

cuda


In [28]:
# no accuracy function must be loss for regression
if mlp_params.MODEL_TYPE == "Regression":
    mlp_params.METRIC = "loss"
    pass


# wrap the objective function inside of a lambda function to pass args...
objective_lambda_func = lambda trial: objective_model_optimizer(
    train_loader,
    valid_loader,
    trial=trial,
    params=params,
    metric=mlp_params.METRIC,
    return_info=False,
    class_weights=class_weights,
)


# Study is the object for model optimization
study = optuna.create_study(direction=f"{mlp_params.DIRECTION}")
# Here I apply the optimize function of the study to the objective function
# This optimizes each parameter specified to be optimized from the defined search space
study.optimize(objective_lambda_func, n_trials=mlp_params.N_TRIALS)
# Prints out the best trial's optimized parameters
objective_model_optimizer(
    train_loader,
    valid_loader,
    trial=study.best_trial,
    params=params,
    metric=mlp_params.METRIC,
    return_info=True,
    class_weights=class_weights,
)

[I 2023-11-10 17:03:22,470] A new study created in memory with name: no-name-152ce70a-1866-402c-96cb-c43fd872a6d2
[I 2023-11-10 17:06:23,490] Trial 0 finished with value: 0.9215212746319318 and parameters: {'n_layers': 12, 'n_units_l0': 6, 'dropout_0': 0.3651412039596035, 'n_units_l1': 4, 'dropout_1': 0.34612179595996984, 'n_units_l2': 11, 'dropout_2': 0.11278235890099686, 'n_units_l3': 11, 'dropout_3': 0.22882293307128915, 'n_units_l4': 35, 'dropout_4': 0.31414503163502216, 'n_units_l5': 8, 'dropout_5': 0.21291532639956667, 'n_units_l6': 8, 'dropout_6': 0.34873452246348047, 'n_units_l7': 40, 'dropout_7': 0.37070983322446116, 'n_units_l8': 3, 'dropout_8': 0.2845253572620299, 'n_units_l9': 8, 'dropout_9': 0.30463806907204133, 'n_units_l10': 45, 'dropout_10': 0.36857541686305995, 'n_units_l11': 43, 'dropout_11': 0.15442638066221182, 'learning_rate': 0.05763613001026088, 'optimizer': 'Adam'}. Best is trial 0 with value: 0.9215212746319318.
[I 2023-11-10 17:09:02,808] Trial 1 finished with

Validation Accuracy: 75.30619018776498
Validation Loss: 0.5413799398002169
Training Accuracy: 77.46340636110894
Training Loss: 0.49060751598373237


(75.30619018776498, 0.5413799398002169, 77.46340636110894, 0.49060751598373237)

In [29]:
# create graph directory for this model
graph_path = pathlib.Path(
    f"../../figures/{mlp_params.MODEL_TYPE}/{mlp_params.MODEL_NAME}/{mlp_params.CELL_TYPE}/hyperparameter_optimization"
)

pathlib.Path(graph_path).mkdir(parents=True, exist_ok=True)
fig = optuna.visualization.plot_optimization_history(study)


graph_path = f"{graph_path}/plot_optimization_history_graph"

fig.write_image(pathlib.Path(f"{graph_path}.png"))
fig.show()

In [30]:
# create graph directory for this model
graph_path = pathlib.Path(
    f"../../figures/{mlp_params.MODEL_TYPE}/{mlp_params.MODEL_NAME}/{mlp_params.CELL_TYPE}/hyperparameter_optimization"
).resolve(strict=True)

pathlib.Path(graph_path).mkdir(parents=True, exist_ok=True)
fig = optuna.visualization.plot_intermediate_values(study)

graph_path = f"{graph_path}/plot_intermediate_values_graph"

fig.write_image(pathlib.Path(f"{graph_path}.png"))
fig.show()

In [31]:
param_dict = extract_best_trial_params(
    study.best_params, params, model_name=mlp_params.MODEL_NAME
)