# Tuning Script

In [11]:
# Per-script settings. These differ between tuning scripts and are interpolated
# into the data folder and the result / model file names used further down.

# Model key: chooses the Colab install branch and appears in output file names.
MODEL = "cbc"
# Name of the sub-folder of data/curated/modelling/ that the folds are read from.
PREPROCESSING = "smote_normalise"
# Prefix of the tuning-result CSV and saved-model names.
# NOTE(review): the tuner class imported in this notebook is JiaoCheng_10CV,
# not a "yangzhoub" one - confirm this label (and the files it names) is intended.
TUNER = "yangzhoub"
# Metric handed to the tuner as the one to optimise.
OPTIMISED_METRIC = "f1"

---

In [12]:
# Detect whether this notebook is running on Google Colab.
#
# COLAB_ENVIRONMENT ends up True only when `google.colab` can be imported AND
# Google Drive was mounted at /content/drive; otherwise it stays False and the
# notebook falls back to local relative paths.

COLAB_ENVIRONMENT = False

try:
    from google.colab import drive
    drive.mount('/content/drive')
    COLAB_ENVIRONMENT = True
except ImportError:
    # google.colab is not installed: the ordinary "not on Colab" case.
    pass
except Exception as colab_error:
    # Still best-effort (no crash), but do not hide the reason, and do not
    # catch KeyboardInterrupt / SystemExit the way a bare `except:` would.
    print(f'Colab setup failed; falling back to local paths: {colab_error!r}')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [13]:
import sys
import os
import pickle

if COLAB_ENVIRONMENT:
    home_directory = './drive/MyDrive/LAB/COMP90089__GroupWork__Py/' # my home directory is stored in ./LAB of google drive
    if MODEL == 'ebc':
        !pip install interpret==0.5.0
    elif MODEL == 'cbc':
        !pip install catboost
    elif MODEL in ['tf', 'mlp']:
        !pip install PyTorch2Sklearn
    !pip install JXAutoML
else:
    home_directory = '../../'

from JXAutoML.JiaoCheng_10CV import JiaoCheng_10CV as Tuner

import pandas as pd



In [14]:
# Settings shared by every tuning script.

# Number of train/val fold files loaded below (10-fold validation was chosen).
FOLDS = 10

# NOTE(review): not referenced in the visible cells; the model's random_state
# is set separately in the non-tunable hyperparameters - confirm it is used.
SEED = 42

# Task type passed to the tuner when the model is registered.
TASK_TYPE = "Classification"

# When True, the feature-importance ordering is handed to the tuner so the
# feature set is tuned like a hyperparameter.
TUNE_FEATURES_AS_HYPERPARAMETERS = True

---

In [15]:
# model to use to tune
from catboost import CatBoostClassifier as clf

# Candidate values for each tuneable hyperparameter.
# Key order is kept as-is because the tuner's combination listing follows it.
parameter_choices = dict(
    n_estimators=(25, 50, 100, 200),
    max_depth=(2, 4, 8, 16),
    subsample=(0.4, 0.55, 0.7, 0.85, 1),
    colsample_bylevel=(0.4, 0.55, 0.7, 0.85, 1),
    reg_lambda=(0.01, 0.1, 1),
    learning_rate=(0.001, 0.01, 0.1, 1),
)

# Constructor arguments that are fixed for every run (changed from the model's
# defaults but never tuned).
non_tunable_hyperparameters_dict = dict(
    random_state=19260817,
    verbose=False,
    max_bin=64,
)

# Order in which JiaoCheng tunes the hyperparameters, one at a time.
tuning_order = [
    'features',
    'max_depth',
    'colsample_bylevel',
    'subsample',
    'n_estimators',
    'reg_lambda',
    'learning_rate',
]

# Starting value of every hyperparameter for JiaoCheng's first tuning iteration.
# NOTE(review): 'features' appears to be an index into the feature-combination
# choices rather than a raw hyperparameter value - confirm against JXAutoML.
default_hyperparameter_values = dict(
    features=71,
    n_estimators=50,
    max_depth=4,
    subsample=0.7,
    colsample_bylevel=0.7,
    learning_rate=0.01,
    reg_lambda=0.01,
)

---

Read in and Prepare Data

In [16]:
# Load every fold's train/validation CSV and split each into features (X) and
# target (y). Entry k of each list corresponds to fold k.

train_x_list, train_y_list = [], []
val_x_list, val_y_list = [], []

# Columns removed to form X: the target itself plus the three id columns.
non_feature_columns = ['mortality_status', 'subject_id', 'hadm_id', 'stay_id']

for fold_index in range(FOLDS):
    train_data = pd.read_csv(
        f'{home_directory}data/curated/modelling/{PREPROCESSING}/train_{fold_index}.csv')
    val_data = pd.read_csv(
        f'{home_directory}data/curated/modelling/{PREPROCESSING}/val_{fold_index}.csv')

    # Same X/y split for both partitions of this fold.
    fold_partitions = (
        (train_data, train_x_list, train_y_list),
        (val_data, val_x_list, val_y_list),
    )
    for frame, x_list, y_list in fold_partitions:
        x_list.append(frame.drop(columns=non_feature_columns))
        y_list.append(frame['mortality_status'])

In [17]:
# Load the saved feature-importance ordering for this preprocessing variant.
# The file name suggests it was derived from an XGBoost model; it is later
# handed to the tuner when features are tuned as a hyperparameter.

ordering_path = f'{home_directory}models/xgb_feature_importance_ordering_{PREPROCESSING}.pickle'

with open(ordering_path, 'rb') as ordering_file:
    feature_importance_ordering = pickle.load(ordering_file)

Set parameters

In [18]:
# Build and configure the tuner. Each step is followed by a '---' separator so
# the tuner's own status messages are easy to match to the step that emitted them.

# Results CSV: read from (to resume earlier tuning) and written to (to save progress).
tuning_result_path = f'{home_directory}models/tuning/{TUNER}_{MODEL}_{PREPROCESSING}.csv'

# initialisation
tuner = Tuner()

print('---')

# define what model we are tuning and which metric to optimise
tuner.read_in_model(clf, TASK_TYPE, optimised_metric=OPTIMISED_METRIC)

print('---')

# read in the per-fold data for training and validation
tuner.read_in_data(train_x_list, train_y_list, val_x_list, val_y_list)

print('---')

# set what hp values to tune
tuner.set_hyperparameters(parameter_choices)
# WARNING: this may take a while if no. tuneable hyperparameters are large

print('---')

# set up hp values that need to be changed from default but NOT to be tuned
tuner.set_non_tuneable_hyperparameters(non_tunable_hyperparameters_dict)

print('---')

# set up feature importance ordering
# NOTE(review): tuning_order and default_hyperparameter_values both contain
# 'features', so turning this flag off likely needs those adjusted too - confirm.
if TUNE_FEATURES_AS_HYPERPARAMETERS:
    tuner.set_features(feature_importance_ordering)
    # WARNING: this may take a while if no. tuneable hyperparameters are large

print('---')

# set up the order of hyperparameters when iteratively tuning using JiaoCheng
tuner.set_tuning_order(tuning_order)

print('---')

# set up the default hp values for first iteration of tuning JiaoCheng
tuner.set_hyperparameter_default_values(default_hyperparameter_values)

print('---')

# Try to read in previous results to continue tuning. A failure here (typically
# no results file yet) is not fatal, but the reason is reported so a genuine
# problem is not mistaken for "nothing to resume". Catching Exception, rather
# than using a bare except, lets KeyboardInterrupt / SystemExit propagate.
try:
    tuner.read_in_tuning_result_df(tuning_result_path)
except Exception as read_error:
    print(f'No previous tuning data read in ({read_error!r})')

print('---')

# set up where to save the tuning result csv
tuner.set_tuning_result_saving_address(tuning_result_path)

print('---')

# set up where to save the current best model
tuner.set_best_model_saving_address(f'{home_directory}models/tmp_models/{TUNER}_{MODEL}_{PREPROCESSING}')

JiaoCheng Initialised
---
Successfully read in model <class 'catboost.core.CatBoostClassifier'>, which is a Classification model optimising for f1
---
Read in Train X data list
Read in Train y data list
Read in Val X data list
Read in Val y data list
---
Successfully recorded hyperparameter choices
---
Successfully recorded non_tuneable_hyperparameter choices
---
Successfully recorded tuneable feature combination choices and updated relevant internal structures
---
---
---
Successfully read in tuning result of 91 rows, for 91.0 combos
---
Successfully set tuning output address
---
Successfully set best model output address


In [19]:
# ignore warnings

import warnings
warnings.filterwarnings("ignore")

In [20]:
# Begin tuning, or continue a previous run: results loaded earlier via
# read_in_tuning_result_df show up in the log as "Already Trained and Tested"
# (presumably skipped rather than retrained - confirm against JXAutoML).
# Progress goes to the result CSV / best-model addresses configured above.

tuner.tune()


Default combo: [1, 1, 2, 2, 0, 1, 71] 


ROUND 1

Round 1 
Hyperparameter: features (index: 6) 

As new Best Combo (1, 1, 2, 2, 0, 1, 0) was read in, best_clf is set to None
Already Trained and Tested combination (val score of 0.3244):
            {'n_estimators': 50, 'max_depth': 4, 'subsample': 0.7, 'colsample_bylevel': 0.7, 'reg_lambda': 0.01, 'learning_rate': 0.01, 'features': 0}
            Current best combo (with val score 0.3244):
                    {'n_estimators': 50, 'max_depth': 4, 'subsample': 0.7, 'colsample_bylevel': 0.7, 'reg_lambda': 0.01, 'learning_rate': 0.01, 'features': 0} 
        Has trained 91 of 345600 combinations so far
Already Trained and Tested combination (val score of 0.1321):
            {'n_estimators': 50, 'max_depth': 4, 'subsample': 0.7, 'colsample_bylevel': 0.7, 'reg_lambda': 0.01, 'learning_rate': 0.01, 'features': 1}
            Current best combo (with val score 0.3244):
                    {'n_estimators': 50, 'max_depth': 4, 'subsample': 0.7,