In [1]:
from arg_utils import is_notebook, get_cfg
cfg = get_cfg()
# choices
classical_models = ["linear", "xgb", "rf"]
nn_models = ["mlp", "rnn", "transformer", "clip", "gpt"]
# override variables to experiment in notebook
if is_notebook():    
    cfg["target_name"] = "ICP_Vital"   # ICP_Vital" , long_icp_hypertension_2
    cfg["db_name"] = "UKE"  # "UKE", "MIMIC"
    cfg["minutes"] = 60
    cfg["model_type"] = "rnn"
    
    # do experiments on:fill_type, target_nan_quantile, train_noise_std, 
    #  min_len(increase from 20 to higher), grad_clip_val (at 1 so far), weight_decay (at 0.2 so far)
    
    cfg["fill_type"] = "median" # "pat_mean", "median", "pat_ema" "pat_ema_mask"
    cfg["norm_method"] = None # z, or none

    
    cfg["bs"] = 32 # 8 best for rnn, 32 for GPT
    cfg["max_len"] = 128
    cfg["min_len"] = 128
    cfg["target_nan_quantile"] = 0.9999
    cfg["block_size"] = 0

    # classical model args
    cfg["flat_block_size"] = 8
    # general args
    cfg["max_epochs"] = 20
    cfg["use_nan_embed"] = False
    cfg["weight_decay"] = 0.2
    
    
    # rnn params
    cfg["hidden_size"] = 2048
    cfg["rnn_type"] = "gru"
    
    # transformer params
    cfg["mode"] = "train_mlp_norm"  # "adapters", "train_mlp_norm",  "train_norm", "freeze" (does not train)
    
    cfg["gpu"] = 1
    
    
# overrides and calculated default vals
if cfg["lr"] is None:
    model_type = cfg["model_type"]
    if model_type == "clip":
        cfg["lr"] = 0.001
    elif model_type == "gpt":
        # bs 8 and gpt2 take 9.8GB with max seq len of 512
        # bs 16 with max seq len of 256
        # bs 32 with max seq len 128 only 7.4GB, good performance and fast - 6.9 if mlp_norm
        # bs 64 with len 128 and mlp_norm = 10.9GB. 9.4GB for freeze
        cfg["lr"] = 0.00005
    else:
        cfg["lr"] = 0.0001  # 0.01 works kind of for nan_embed
        
#cfg["val_check_interval"] = int(cfg["val_check_interval"] * (32 / cfg["batch_size"]))
    
import pytorch_lightning as pl
pl.utilities.seed.seed_everything(seed=cfg["seed"], workers=False)
locals().update(cfg)

Global seed set to 2


In [None]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu)
# disable wandb printing
os.environ['WANDB_SILENT'] = "true"


In [None]:
import os
        
import torch
import pytorch_lightning as pl
import pandas as pd
import numpy as np
from tqdm import tqdm
import seaborn as sns
import sklearn
import matplotlib.pyplot as plt

from data_utils import SeqDataModule

import logging
import pytorch_lightning
logging.getLogger("pytorch_lightning").setLevel(logging.ERROR)
pytorch_lightning.utilities.distributed.log.setLevel(logging.ERROR)

In [None]:
# load df
path = f"data/DB_{db_name}_{minutes}_final_df.pkl"
df = pd.read_pickle(path)

if "Bili_BGA" in df:
    print("Drop Bili")
    df = df.drop(columns=["Bili_BGA"])

In [None]:
import sklearn
import copy

from train_utils import train_model
from data_utils import SeqDataModule
from eval_utils import get_all_dfs

def obj_hebo(*args, **kwargs):
    return np.array([obj(*args, **kwargs)])
    

def obj(df, cfg, opt_df, num_seeds=3, split="val"):
    # put in op_df
    cfg = copy.deepcopy(cfg)
    if isinstance(opt_df, pd.DataFrame):
        opt_dict = opt_df.iloc[0].to_dict()
    else:
        opt_dict = opt_df
    if "bs" in opt_dict:
        opt_dict["bs"] = int(opt_dict["bs"])
    cfg.update(opt_dict)
    cfg = scale_hyperparameters(cfg)
    if cfg["fill_type"] == "none":
        cfg["use_nan_embed"] = True
    # calculate metrics for number of seeds
    if num_seeds is None:
        metric = train_and_eval_model(df, cfg, split=split)
    else:
        metrics = []
        for seed in range(num_seeds):
            cfg["seed"] = seed
            metrics.append(train_and_eval_model(df, cfg, split=split))
        metric = np.mean(metrics)
    return metric


def scale_hyperparameters(opt_df):
    if "bs" in opt_df:
        opt_df["bs"] = 2 ** opt_df["bs"]
    if "n_estimators" in opt_df:
        opt_df["n_estimators"] *= 10

    if "train_noise_std" in opt_df:
        opt_df["train_noise_std"] = opt_df["train_noise_std"] * 0.1

    if "weight_decay" in opt_df:
        opt_df["weight_decay"] = opt_df["weight_decay"] * 0.1

    if "grad_clip_val" in opt_df:
        opt_df["grad_clip_val"] = opt_df["grad_clip_val"] * 0.1

    return opt_df


def train_and_eval_model(df, cfg, split):
    dm, models, trainers = setup_dm_and_train(df, cfg)
    metric = eval_model(dm, models, trainers, cfg, split)
    return metric

def setup_dm_and_train(df, cfg):
    # create datamodule with dataloaders
    dm = SeqDataModule(df, cfg["db_name"],
                       target_name=cfg["target_name"],
                       random_starts=cfg["random_starts"], 
                       min_len=cfg["min_len"], 
                       max_len=cfg["max_len"],
                       train_noise_std=cfg["train_noise_std"], 
                       batch_size=cfg["bs"], 
                       fill_type=cfg["fill_type"], 
                       flat_block_size=cfg["flat_block_size"],
                       target_nan_quantile=cfg["target_nan_quantile"],
                       block_size=cfg["block_size"],
                       )
    dm.setup()
    # train model on datamodule
    models, trainers = train_model(cfg["model_type"], [dm], cfg, verbose=False)
    return dm, models, trainers

def eval_model(dm, models, trainers, cfg, split):
    # make preds on val set
    pred_df = get_all_dfs(models, trainers, cfg["model_type"], dm.regression, dl_type=split, dl=None, calc_new_norm_stats=False)
    
    # calc target metrics
    pred_targets = pred_df["targets"].dropna()
    preds = pred_df["preds"][~pred_df["targets"].isna()]
    if dm.regression:
        score = sklearn.metrics.r2_score(pred_targets, preds)
    else:
        score = sklearn.metrics.roc_auc_score(pred_targets, preds)
    metric = 1 - score
        
    return np.array([metric])

In [None]:
import time

import pandas as pd
import numpy  as np
from hebo.design_space.design_space import DesignSpace
from hebo.optimizers.hebo import HEBO

if cfg["model_type"] in ["rnn", "gpt", "mlp"]:
    space = DesignSpace().parse([{'name': 'lr', 'type' : 'num', 'lb' : 0.00001, 'ub' : 0.1},
                                 {'name': 'bs', 'type' : 'int', 'lb' : 2, 'ub' : 5},  # 2 ** bs
                                 
                                 {'name': 'fill_type', 'type' : 'cat', 'categories' : ['median', 'none']},
                                 
                                 #{'name': 'min_len', 'type' : 'int', 'lb': 2, 'ub':128},
                                 #{'name': 'max_len', 'type' : 'int', 'lb': 64, 'ub':512},
                                 
                                 {'name': 'train_noise_std', 'type' : 'int', 'lb' : 0, 'ub' : 2},
                                 {'name': 'weight_decay', 'type' : 'int', 'lb' : 0, 'ub' : 4},
                                 {'name': 'grad_clip_val', 'type' : 'int', 'lb' : 0, 'ub' : 5},
                                 #{'name': 'norm_method', 'type' : 'cat', 'categories' : ["z", None]},
                                ])
    cfg["fill_type"] = "none"
    opt = HEBO(space)
elif cfg["model_type"] == "xgb":
    space = DesignSpace().parse([{'name': 'lr', 'type' : 'num', 'lb' : 0.00005, 'ub' : 0.5},
                                 {'name': 'n_estimators', 'type' : 'int', 'lb' : 1, 'ub' : 20},  # multiplied by 10
                                 {'name': 'max_depth', 'type' : 'int', 'lb' : 2, 'ub' : 10},
                                 {'name': 'subsample', 'type' : 'num', 'lb' : 0.5, 'ub' : 0.99},
                                 {'name': 'colsample_bytree', 'type' : 'num', 'lb' : 0.5, 'ub' : 0.99},
                                 {'name': 'gamma', 'type' : 'num', 'lb' : 0.01, 'ub' : 5.0},
                                 {'name': 'min_child_weight', 'type' : 'num', 'lb' : 0.01, 'ub' : 5},
                                 
                                 {'name': 'fill_type', 'type' : 'cat', 'categories' : ['median', 'none']},
                                 {'name': 'flat_block_size', 'type' : 'int', 'lb' : 1, 'ub' : 4}
                                ])
    #cfg["flat_block_size"] = 8


tune_hebo = cfg["tune_hebo"]

if tune_hebo:
    opt = HEBO(space, rand_sample=0,
            model_name="gpy")#"rf")#"gpy")

    opt_steps = cfg["opt_steps"]

    cfg["verbose"] = False

    for i in range(opt_steps):
        rec = opt.suggest()
        print(i)
        print(list(zip(rec.columns, rec.values[0])))
        start_time = time.time()
        opt.observe(rec, obj_hebo(df, cfg, rec))
        print("Opt time: ", time.time() - start_time)
        min_idx = np.argmin(opt.y)
        print("Current score:", 1 - opt.y[-1][0])
        print(f'After {i} iterations, best obj is {1 - opt.y[min_idx][0]:.4f}')
        print()

    opt_df = opt.X
    opt_df["y"] = opt.y
    opt_df["y"].plot()
    opt_df["score"] = 1 - opt_df["y"]

    plt.show()
    opt_df.plot.scatter(x="lr", y="score")
    plt.show()

    # create a folder to save the results
    import os
    import datetime
    # create folder name according to the database name, minutes, model type and date
    folder_name = f"hebo_tunings/{cfg['db_name']}_{cfg['minutes']}/{cfg['model_type']}_{datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}"
    os.makedirs(folder_name, exist_ok=True)
    # save the results
    opt_df.to_csv(f"{folder_name}/results.csv", index=False)


    # one-hot encode
    input_df = opt.X.drop(columns=["y"])
    if "score" in input_df:
        input_df = input_df.drop(columns=["score"])

    if "fill_type" in input_df:
        one_hot_fill = pd.get_dummies(opt.X.fill_type, prefix='fill')
        input_df = pd.concat([input_df, one_hot_fill], axis=1).drop(columns=["fill_type"]).astype(float)
    else:
        input_df = input_df.astype(float)

    # feat importance using rf
    rf = sklearn.ensemble.RandomForestRegressor(100)
    rf.fit(input_df, 1 - opt.y)
    pd.Series(data=rf.feature_importances_, index=input_df.columns).sort_values()

    import shap
    expl = shap.TreeExplainer(rf, data=input_df, model_output='raw', 
                            feature_perturbation='interventional')
    shap_vals = expl.shap_values(input_df, check_additivity=False)
    shap.summary_plot(shap_vals, input_df.astype(float))
    plt.show()

    mean_shap_vals = np.abs(shap_vals).mean(axis=0)
    mean_shap_vals /= mean_shap_vals.sum()
    pd.Series(data=mean_shap_vals, index=input_df.columns).sort_values().plot.bar()
    plt.show()

In [None]:
# OPTUNA

import optuna
import pandas as pd


def objective(trial):
    # Invoke suggest methods of a Trial object to generate hyperparameters.
    if cfg["model_type"] in ["rnn", "gpt", "mlp"]:
        rec = {'lr': trial.suggest_float("lr", 0.00005, 0.1),
               #'min_len': trial.suggest_int("min_len", 2, 128),
               #'train_noise_std': trial.suggest_float("train_noise_std", 0.001, 0.2),
               #'weight_decay': trial.suggest_float("weight_decay", 0.001, 0.4),
               #'grad_clip_val': trial.suggest_float("grad_clip_val", 0.1, 5.0),   
               #'train_noise_std': trial.suggest_int("train_noise_std", 0, 2),
               'weight_decay': trial.suggest_int("weight_decay", 0, 4),
               'grad_clip_val': trial.suggest_int("grad_clip_val", 0, 5),               
               #'fill_type': trial.suggest_categorical("fill_type", ["median", "none"]),
            }

        if cfg["model_type"] == "gpt":
            rec["bs"] = trial.suggest_int("bs", 2, 5)
        else:
            rec["bs"] = trial.suggest_int("bs", 2, 5)
        #cfg["fill_type"] = "none"
    elif cfg["model_type"] == "xgb":
        rec = {'lr': trial.suggest_float("lr", 0.00005, 0.5),
           'n_estimators': trial.suggest_int("n_estimators", 1, 20),
           'max_depth': trial.suggest_int("max_depth", 2, 10),
           'subsample': trial.suggest_float("subsample", 0.5, 0.99),
           'colsample_bytree': trial.suggest_float("colsample_bytree", 0.5, 0.99),
           'gamma': trial.suggest_float("gamma", 0.01, 5.0),
           'min_child_weight': trial.suggest_float("min_child_weight", 0.01, 5.0),
           'fill_type': trial.suggest_categorical("fill_type", ["median", "none"]),
           'flat_block_size': trial.suggest_int("flat_block_size", 1, 4),
          }
    
    for key in rec:
        rec[key] = [rec[key]]
    rec = pd.DataFrame(rec)
    
    error = obj(df, cfg, rec)
    return error  


study = optuna.create_study()  # Create a new study.
study.optimize(objective, n_trials=cfg["opt_steps"])  # Invoke optimization of the objective function.












In [None]:
# create a folder to save the results
import os
import datetime
# create folder name according to the database name, minutes, model type and date
folder_name = f"tunings/{cfg['db_name']}_{cfg['minutes']}/{cfg['model_type']}_{datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}"
os.makedirs(folder_name, exist_ok=True)

# create a dataframe with the results
tune_result_df = pd.DataFrame(study.trials_dataframe())
tune_result_df["score"] = 1 - tune_result_df["value"]
tune_result_df.to_csv(f"{folder_name}/results.csv")

def save_plot(name):
    plt.tight_layout()
    plt.savefig(f"{folder_name}/{name}.png")
    plt.close()

optuna.visualization.plot_slice(study)
save_plot("slice")
optuna.visualization.plot_param_importances(study)
save_plot("param_importances")
optuna.visualization.plot_optimization_history(study)
save_plot("optimization_history")
if cfg["model_type"] in ["rnn", "gpt", "mlp"]:
    optuna.visualization.plot_contour(study, params=["lr", "bs", "weight_decay", "grad_clip_val"]),
    save_plot("contour")
    optuna.visualization.plot_intermediate_values(study),
    save_plot("intermediate_values")
else:
    optuna.visualization.plot_contour(study, params=["lr", "n_estimators", "max_depth", "subsample", "colsample_bytree", "gamma", "min_child_weight"]),
    save_plot("contour")
    optuna.visualization.plot_intermediate_values(study),
    save_plot("intermediate_values")


In [None]:
tune_result_df



Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_bs,params_grad_clip_val,params_lr,params_weight_decay,state,score
0,0,0.488807,2022-03-11 14:49:08.823950,2022-03-11 14:53:09.510202,0 days 00:04:00.686252,4,1,0.080372,4,COMPLETE,0.511193
1,1,0.481927,2022-03-11 14:53:09.510349,2022-03-11 14:57:07.848430,0 days 00:03:58.338081,4,4,0.028529,1,COMPLETE,0.518073
2,2,0.444031,2022-03-11 14:57:07.848561,2022-03-11 15:00:12.246134,0 days 00:03:04.397573,5,2,0.030603,1,COMPLETE,0.555969


In [None]:
# get the best hyperparameters
best_trial = study.best_trial
best_params = best_trial.params


# train the model with the best hyperparameters and test it on test split
def train_and_test(df, cfg, best_params, num_seeds=5):
    cfg = copy.deepcopy(cfg)
    # put the best hyperparameters in the config
    cfg.update(best_params)
    cfg = scale_hyperparameters(cfg)
    if cfg["fill_type"] == "none":
        cfg["use_nan_embed"] = True

    print(cfg)

    val_scores = []
    test_scores = []
    for seed in tqdm(range(num_seeds), desc="Training models with best parameters"):
        cfg["seed"] = seed

        dm, models, trainers = setup_dm_and_train(df, cfg)
        val_metric = eval_model(dm, models, trainers, cfg, "val")
        test_metric = eval_model(dm, models, trainers, cfg, "test")
        val_score = 1 - val_metric
        test_score = 1 - test_metric
        val_scores.append(val_score)
        test_scores.append(test_score)
    return val_scores, test_scores
val_scores, test_scores = train_and_test(df, cfg, best_params, num_seeds=5)

# store best params and scores in a dataframe
df = pd.DataFrame(best_params, index=[0])
df["val_score_mean"] = np.mean(val_scores)
df["val_score_std"] = np.std(val_scores)
df["test_score_mean"] = np.mean(test_scores)
df["test_score_std"] = np.std(test_scores)
df.to_csv(f"{folder_name}/best_params.csv")

# save cfg
import json
with open(f"{folder_name}/cfg.json", "w+") as f:
    json.dump(cfg, f)


{'model_type': 'rnn', 'target_name': 'ICP_Vital', 'db_name': 'UKE', 'minutes': 60, 'seed': 2, 'features': None, 'fill_type': 'none', 'target_nan_quantile': 0.9999, 'block_size': 0, 'random_starts': True, 'train_noise_std': 0.001, 'bs': 32, 'min_len': 128, 'max_len': 128, 'max_epochs': 20, 'lr': 0.03060312028749815, 'use_nan_embed': True, 'weight_decay': 0.1, 'grad_clip_val': 0.2, 'val_check_interval': None, 'max_steps': -1, 'use_macro_loss': False, 'use_pos_weight': True, 'use_huber': False, 'dropout': 0.1, 'hidden_size': 2048, 'use_static': False, 'rnn_layers': 1, 'rnn_type': 'gru', 'mode': 'train_mlp_norm', 'clip_name': 'ViT-B/16', 'gpt_name': 'gpt2', 'flat_block_size': 8, 'alpha': 1, 'l1_ratio': 0.5, 'n_estimators': 500, 'max_depth': 6, 'min_child_weight': None, 'gamma': 0.0, 'subsample': 1.0, 'colsample_bytree': 1.0, 'tree_method': 'gpu_hist', 'norm_method': None, 'gpu': 1}


Training models with best parameters:   0%|          | 0/5 [00:00<?, ?it/s]




Training models with best parameters:  20%|██        | 1/5 [01:03<04:12, 63.09s/it]




Training models with best parameters:  40%|████      | 2/5 [02:15<03:26, 68.76s/it]




Training models with best parameters:  60%|██████    | 3/5 [03:38<02:30, 75.13s/it]




Training models with best parameters:  80%|████████  | 4/5 [04:42<01:10, 70.59s/it]




Training models with best parameters: 100%|██████████| 5/5 [05:45<00:00, 69.02s/it]


AttributeError: 'dict' object has no attribute 'to_csv'

In [None]:
# store best params and scores in a dataframe
df = pd.DataFrame(best_params, index=[0])
df["val_score_mean"] = np.mean(val_scores)
df["val_score_std"] = np.std(val_scores)
df["test_score_mean"] = np.mean(test_scores)
df["test_score_std"] = np.std(test_scores)
df.to_csv(f"{folder_name}/best_params.csv")

# save cfg
import json
with open(f"{folder_name}/cfg.json", "w+") as f:
    json.dump(cfg, f)


In [None]:
df

Unnamed: 0,lr,weight_decay,grad_clip_val,bs,val_score_mean,val_score_std,test_score_mean,test_score_std
0,0.030603,1,2,5,0.55132,0.002324,0.616117,0.02301
