## Hyperparameter tuning via Optuna

### Being a binary model this notebook will be limited to predicting one class 1 or 0, yes or no.
### Here I will be predicting if a cell received a treatment or not

In [1]:
import pathlib
import sys

import numpy as np
import optuna
import pandas as pd
import pyarrow.parquet as pq
import toml
import torch
from sklearn import preprocessing

sys.path.append("../..")

from MLP_utils.parameters import Parameters
from MLP_utils.utils import (
    Dataset_formatter,
    data_split,
    extract_best_trial_params,
    objective_model_optimizer,
    parameter_set,
    plot_metric_vs_epoch,
    results_output,
    test_optimized_model,
    train_optimized_model,
    un_nest,
)

sys.path.append("../../..")
from utils.utils import df_stats

#### Set up Data to be compatible with model

##### Regression Model Data Wrangling and Set Up
comment out if not using regression

In [2]:
# Parameters
CELL_TYPE = "PBMC"
CONTROL_NAME = "DMSO_0.100_DMSO_0.025"
# TREATMENT_NAME = "LPS_100.000_DMSO_0.025"
TREATMENT_NAME = "LPS_100.000_DMSO_0.025"
MODEL_NAME = "DMSO_0.025_vs_LPS_100"

In [3]:
ml_configs_file = pathlib.Path("../../MLP_utils/regression_config.toml").resolve(
    strict=True
)
ml_configs = toml.load(ml_configs_file)
params = Parameters()
mlp_params = parameter_set(params, ml_configs)

# overwrite params via command line arguments from papermill
mlp_params.CELL_TYPE = CELL_TYPE
mlp_params.MODEL_NAME = MODEL_NAME
mlp_params.CONTROL_NAME = CONTROL_NAME
mlp_params.TREATMENT_NAME = TREATMENT_NAME
mlp_params.SHUFFLE = False

In [4]:
# Import Data
# set data file path under pathlib path for multi-system use

# Commented out for now, using a different data set to trobleshoot
# file_path = pathlib.Path(
#     f"../../../data/{mlp_params.CELL_TYPE}_preprocessed_sc_norm.parquet"
# ).resolve(strict=True)

file_path = pathlib.Path(
    "../../../data/PBMC_subset_sc_norm_DMSO_0.100_DMSO_0.025_LPS_100.000_DMSO_0.025.parquet"
)

# set path for nomic data
nomic_df_path = pathlib.Path(
    f"../../../2.Nomic_nELISA_Analysis/Data/clean/Plate2/nELISA_plate_430420_{mlp_params.CELL_TYPE}_cleanup4correlation.csv"
).resolve(strict=True)

df = pq.read_table(file_path).to_pandas()
nomic_df = pd.read_csv(nomic_df_path)

In [5]:
# change the nomic df to standard scaler
# select the columns that contain "NSU"
nomic_df_scaled = nomic_df.filter(regex="NSU")
# standardize the nomic data
# scaler = preprocessing.StandardScaler()
# nomic_df_scaled = pd.DataFrame(scaler.fit_transform(nomic_df_scaled), columns=nomic_df_scaled.columns)
# add the nomic data metadata back
nomic_df_scaled[
    [
        "Metadata_position_x",
        "oneb_Metadata_Treatment_Dose_Inhibitor_Dose",
        "fourb_Metadata_Treatment_Dose_Inhibitor_Dose",
    ]
] = nomic_df[
    [
        "Metadata_position_x",
        "oneb_Metadata_Treatment_Dose_Inhibitor_Dose",
        "fourb_Metadata_Treatment_Dose_Inhibitor_Dose",
    ]
]
nomic_df = nomic_df_scaled.copy()
del nomic_df_scaled

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nomic_df_scaled[[
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nomic_df_scaled[[


In [6]:
print(df.shape)
df = pd.merge(
    df,
    nomic_df,
    left_on=[
        "Metadata_Well",
        "oneb_Metadata_Treatment_Dose_Inhibitor_Dose",
        "fourb_Metadata_Treatment_Dose_Inhibitor_Dose",
    ],
    right_on=[
        "Metadata_position_x",
        "oneb_Metadata_Treatment_Dose_Inhibitor_Dose",
        "fourb_Metadata_Treatment_Dose_Inhibitor_Dose",
    ],
).drop(["Metadata_position_x"], axis=1)
print(nomic_df.shape)
print(df.shape)

(374565, 1270)
(154, 190)
(374565, 1457)


In [7]:
# Code snippet for metadata extraction by Jenna Tomkinson
df_metadata = list(df.columns[df.columns.str.contains("Metadata")])

# define which columns are data and which are descriptive
df_descriptive = df[df_metadata]
df_values = df.drop(columns=df_metadata)

In [8]:
df_values[
    ["Metadata_Well", "oneb_Metadata_Treatment_Dose_Inhibitor_Dose"]
] = df_descriptive[["Metadata_Well", "oneb_Metadata_Treatment_Dose_Inhibitor_Dose"]]
df = (
    df_values.groupby(["Metadata_Well", "oneb_Metadata_Treatment_Dose_Inhibitor_Dose"])
    .median()
    .reset_index()
)

In [9]:
# filter the oneb_Metadata_Treatment_Dose_Inhibitor_Dose column to only include the treatment and control via loc
df = df.loc[
    df["oneb_Metadata_Treatment_Dose_Inhibitor_Dose"].isin(
        [mlp_params.TREATMENT_NAME, mlp_params.CONTROL_NAME]
    )
]


print("Selected Catagories are:")
print(df["oneb_Metadata_Treatment_Dose_Inhibitor_Dose"].unique())
df_stats(df)

if mlp_params.DATA_SUBSET_OPTION == "True":
    df = df.sample(n=mlp_params.DATA_SUBSET_NUMBER)
    print("Data Subset Is On")
    print(f"Data is subset to {mlp_params.DATA_SUBSET_NUMBER}")
else:
    print("Data Subset Is Off")

Selected Catagories are:
['DMSO_0.100_DMSO_0.025' 'LPS_100.000_DMSO_0.025']
The dimensions of the data are: (12, 1434)
Number of total missing values across all columns: 0
Data Subset Is Off


In [10]:
np.random.seed(seed=0)
# get random wells from each treatment group to hold out
wells_to_hold = (
    df.groupby("oneb_Metadata_Treatment_Dose_Inhibitor_Dose")
    .agg(np.random.choice)["Metadata_Well"]
    .to_list()
)
df_holdout = df[df["Metadata_Well"].isin(wells_to_hold)]
df = df[~df["Metadata_Well"].isin(wells_to_hold)]


print("Wells held out for testing:", df_holdout["Metadata_Well"].unique())
print(
    "Wells to use for training, validation, and testing", df["Metadata_Well"].unique()
)

Wells held out for testing: ['I06' 'J09']
Wells to use for training, validation, and testing ['B06' 'B07' 'C06' 'C07' 'I07' 'J02' 'J03' 'J06' 'J07' 'J08']


In [11]:
# Code snippet for metadata extraction by Jenna Tomkinson
df_metadata = list(df.columns[df.columns.str.contains("Metadata")])

# define which columns are data and which are descriptive
df_descriptive = df[df_metadata]
df_values = df.drop(columns=df_metadata)

In [12]:
# get all columns that contain NSU in the name
df_values_Y = df_values[df_values.columns[df_values.columns.str.contains("NSU")]]
df_values_X = df_values.drop(columns=df_values_Y.columns)
# drop all columns except for IL1B and TNFa
col = ["IL-1 beta [NSU]"]
df_values_Y = df_values_Y[col]
df_values_Y["Metadata_Well"] = df_descriptive["Metadata_Well"]
print(df_values.shape)
print(df_values_X.shape)
print(df_values_Y.shape)

(10, 1432)
(10, 1245)
(10, 2)


#### Split Data - All Models can proceed through this point

In [13]:
X_train, X_test, X_val, Y_train_well, Y_test_well, Y_val_well = data_split(
    X_vals=df_values_X,
    y_vals=df_values_Y,
    train_proportion=0.8,
    val_proportion=0.1,
    test_proportion=0.1,
    seed=0,
    params=mlp_params,
)

In [14]:
Y_train = Y_train_well.drop(columns=["Metadata_Well"])
Y_test = Y_test_well.drop(columns=["Metadata_Well"])
Y_val = Y_val_well.drop(columns=["Metadata_Well"])

In [15]:
# produce data objects for train, val and test datasets
train_data = Dataset_formatter(
    torch.FloatTensor(X_train.values), torch.FloatTensor(Y_train.values)
)
val_data = Dataset_formatter(
    torch.FloatTensor(X_val.values), torch.FloatTensor(Y_val.values)
)
test_data = Dataset_formatter(
    torch.FloatTensor(X_test.values), torch.FloatTensor(Y_test.values)
)

In [16]:
mlp_params.IN_FEATURES = X_train.shape[1]
print("Number of in features: ", mlp_params.IN_FEATURES)
if mlp_params.MODEL_TYPE == "Regression":
    mlp_params.OUT_FEATURES = Y_train.shape[1]
else:
    mlp_params.OUT_FEATURES = len(
        df_values["oneb_Metadata_Treatment_Dose_Inhibitor_Dose"].unique()
    )

print("Number of out features: ", mlp_params.OUT_FEATURES)

Number of in features:  1245
Number of out features:  1


In [17]:
# convert data class into a dataloader to be compatible with pytorch
train_loader = torch.utils.data.DataLoader(
    dataset=train_data, batch_size=mlp_params.BATCH_SIZE
)
valid_loader = torch.utils.data.DataLoader(
    dataset=val_data, batch_size=mlp_params.BATCH_SIZE
)
test_loader = torch.utils.data.DataLoader(dataset=test_data, batch_size=1)

In [18]:
df_values_X.shape
df_values_Y.shape

(10, 2)

In [19]:
# no accuracy function must be loss for regression
if mlp_params.MODEL_TYPE == "Regression":
    mlp_params.METRIC = "loss"
    pass

sampler = optuna.samplers.TPESampler(seed=0)


# wrap the objective function inside of a lambda function to pass args...
objective_lambda_func = lambda trial: objective_model_optimizer(
    train_loader,
    valid_loader,
    trial=trial,
    params=mlp_params,
    metric=mlp_params.METRIC,
    return_info=False,
)


# Study is the object for model optimization
study = optuna.create_study(direction=f"{mlp_params.DIRECTION}", sampler=sampler)
# Here I apply the optimize function of the study to the objective function
# This optimizes each parameter specified to be optimized from the defined search space
study.optimize(objective_lambda_func, n_trials=mlp_params.N_TRIALS)
# Prints out the best trial's optimized parameters
objective_model_optimizer(
    train_loader,
    valid_loader,
    trial=study.best_trial,
    params=mlp_params,
    metric=mlp_params.METRIC,
    return_info=True,
)

[I 2023-08-30 11:10:07,769] A new study created in memory with name: no-name-0fd5867f-4a7b-4548-903f-96a8465e5c29
  return F.mse_loss(input, target, reduction=self.reduction)
[I 2023-08-30 11:10:09,245] Trial 0 finished with value: 0.1788467042706907 and parameters: {'n_layers': 3, 'n_units_l0': 4, 'dropout_0': 0.4616580256429863, 'n_units_l1': 4, 'dropout_1': 0.3541928796033428, 'n_units_l2': 4, 'dropout_2': 0.3625523267576155, 'learning_rate': 0.8025967829738639, 'optimizer': 'Adam'}. Best is trial 0 with value: 0.1788467042706907.
[I 2023-08-30 11:10:09,588] Trial 1 finished with value: 0.03480209435180086 and parameters: {'n_layers': 3, 'n_units_l0': 4, 'dropout_0': 0.6553579829755966, 'n_units_l1': 2, 'dropout_1': 0.15227757982092444, 'n_units_l2': 2, 'dropout_2': 0.5995719073287628, 'learning_rate': 0.700343294287356, 'optimizer': 'RMSprop'}. Best is trial 1 with value: 0.03480209435180086.
[I 2023-08-30 11:10:09,929] Trial 2 finished with value: 0.001757647072419104 and paramete

Validation Loss: 4.686077056248905e-05
Training Loss: 0.6312310308410792


(4.686077056248905e-05, 0.6312310308410792)

In [20]:
fig = optuna.visualization.plot_optimization_history(study)
graph_path = pathlib.Path(f"../../figures/{params.MODEL_TYPE}/{params.MODEL_NAME}/")
# if path doesn't exist, make path with pathlib
graph_path.mkdir(parents=True, exist_ok=True)

graph_path = f"../../figures/{params.MODEL_TYPE}/{params.MODEL_NAME}/plot_optimization_history_graph"
fig.write_image(pathlib.Path(f"{graph_path}.png"))
fig.show()

In [21]:
fig = optuna.visualization.plot_intermediate_values(study)
graph_path = pathlib.Path(f"../../figures/{params.MODEL_TYPE}/{params.MODEL_NAME}/")
# if path doesn't exist, make path with pathlib
graph_path.mkdir(parents=True, exist_ok=True)

graph_path = f"../../figures/{params.MODEL_TYPE}/{params.MODEL_NAME}/plot_intermediate_values_graph"
fig.write_image(pathlib.Path(f"{graph_path}.png"))
fig.show()

In [22]:
param_dict = extract_best_trial_params(
    study.best_params, params, model_name=params.MODEL_NAME
)