# Tuning Script

In [1]:
# Per-script settings. Each value below is interpolated into the data,
# feature-ordering, tuning-result and model file names used further down,
# so changing one redirects this notebook to a different set of artefacts.

MODEL = "cbc"                        # also decides which package is pip-installed on Colab
FEATURE_IMPORTANCE_ORDERING = "xgb"  # source of the pickled feature-importance ordering
PREPROCESSING = "Baseline"
TUNER = "jiaochengb"
FEATURE_ENGINEERING = "bow"          # selects the train/val/test *_x.npz matrices

---

In [2]:
# Detect whether this notebook is running on Google Colab.
#
# COLAB_ENVIRONMENT ends up True only when `google.colab` can be imported AND
# Google Drive is mounted at /content/drive; in every other case it stays
# False and the later cells fall back to the local relative paths.

COLAB_ENVIRONMENT = False

try:
    from google.colab import drive
except ImportError:
    # google.colab is not importable, so this is not a Colab runtime.
    pass
else:
    try:
        drive.mount('/content/drive')
        COLAB_ENVIRONMENT = True
    except Exception as mount_error:
        # Stay best-effort (do not crash the notebook), but report the failure
        # instead of silently hiding it. Unlike a bare `except:`, this does not
        # swallow KeyboardInterrupt / SystemExit.
        print(f'Google Drive could not be mounted; using local paths instead ({mount_error})')

Mounted at /content/drive


In [3]:
import sys
import os
import pickle
from scipy.sparse import load_npz

if COLAB_ENVIRONMENT:
    py_file_location = "./drive/MyDrive/LAB/COMP90051-A1__Groupwork__Py/PrivatePackages" # my private packages are stored here
    home_directory = './drive/MyDrive/LAB/COMP90051-A1__Groupwork__Py/' # my home directory is stored in ./LAB of google drive
    if MODEL == 'ebc':
        !pip install interpret==0.5.0
    elif MODEL == 'cbc':
        !pip install catboost
else:
    py_file_location = '../../PrivatePackages'
    home_directory = '../../'

sys.path.append(os.path.abspath(py_file_location))

from Tuners import JiaoChengB

import pandas as pd

Collecting catboost
  Downloading catboost-1.2.3-cp310-cp310-manylinux2014_x86_64.whl (98.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.5/98.5 MB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: catboost
Successfully installed catboost-1.2.3


In [4]:
## Settings shared by every tuning script

# The problem being solved is a classification task; this value is handed to
# the tuner together with the model class.
TASK_TYPE = "Classification"

# Feature selection is tuned like any other hyperparameter; when True the
# feature-importance ordering is registered with the tuner.
TUNE_FEATURES_AS_HYPERPARAMETERS = True

# Common seed value.
# NOTE(review): not referenced by any cell visible in this notebook — confirm where it is used.
SEED = 2608

---

In [5]:
# model to use to tune
from catboost import CatBoostClassifier as clf

# Search space: the candidate values tried for each tunable hyperparameter.
# Key order is kept exactly as written.
parameter_choices = dict(
    n_estimators=(25, 50, 100, 200),
    max_depth=(2, 4, 8, 16),
    subsample=(0.4, 0.55, 0.7, 0.85, 1),
    colsample_bylevel=(0.4, 0.55, 0.7, 0.85, 1),
    reg_lambda=(0.01, 0.1, 1),
    learning_rate=(0.001, 0.01, 0.1, 1),
)

# Fixed (non-tuned) parameters, registered with the tuner separately from the search space.
# NOTE(review): random_state is hard-coded here and differs from the SEED
# constant (2608) defined with the shared parameters — confirm which is intended.
non_tunable_hyperparameters_dict = dict(
    random_state=19260817,
    verbose=False,
    max_bin=64,
)

# Order in which the hyperparameters are tuned (JiaoCheng specific).
tuning_order = [
    'features',
    'max_depth',
    'colsample_bylevel',
    'subsample',
    'n_estimators',
    'reg_lambda',
    'learning_rate',
]

# Starting value of each hyperparameter for the first tuning iteration
# (JiaoCheng specific).
default_hyperparameter_values = dict(
    features=0,
    n_estimators=50,
    max_depth=4,
    subsample=0.7,
    colsample_bylevel=0.7,
    learning_rate=0.01,
    reg_lambda=0.01,
)

---

Read in and Prepare Data

In [6]:
def _load_pickle(path):
    """Return the object stored in the pickle file at `path`."""
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Feature matrices: one sparse .npz file per split, chosen by FEATURE_ENGINEERING.
train_x = load_npz(home_directory + f'data/curated/baseline/train_{FEATURE_ENGINEERING}_x.npz')
val_x = load_npz(home_directory + f'data/curated/baseline/val_{FEATURE_ENGINEERING}_x.npz')
test_x = load_npz(home_directory + f'data/curated/baseline/test_{FEATURE_ENGINEERING}_x.npz')

# Targets: one pickle per split.
train_y = _load_pickle(home_directory + 'data/curated/baseline/train_y.pkl')
val_y = _load_pickle(home_directory + 'data/curated/baseline/val_y.pkl')
test_y = _load_pickle(home_directory + 'data/curated/baseline/test_y.pkl')

In [7]:
# Load the pickled feature-importance ordering that matches this script's
# ordering source, feature engineering and preprocessing settings.

ordering_path = f'{home_directory}models/{FEATURE_IMPORTANCE_ORDERING}_{FEATURE_ENGINEERING}_feature_importance_ordering_{PREPROCESSING}.pickle'

with open(ordering_path, 'rb') as ordering_file:
    feature_importance_ordering = pickle.load(ordering_file)

Set parameters

In [8]:
# Configure the JiaoChengB tuner step by step. A '---' separator is printed
# between steps so the status message of each call is easy to tell apart.

# initialisation
tuner = JiaoChengB.JiaoChengB()

print('---')

# define what model we are tuning
tuner.read_in_model(clf, TASK_TYPE)

print('---')

# read in the data for training and validation
tuner.read_in_data(train_x, train_y, val_x, val_y, test_x, test_y)

print('---')

# set what hp values to tune
tuner.set_hyperparameters(parameter_choices)
# WARNING: this may take a while if no. tuneable hyperparameters are large

print('---')

# set up hp values that need to be changed from default but NOT to be tuned
tuner.set_non_tuneable_hyperparameters(non_tunable_hyperparameters_dict)

print('---')

# set up feature importance ordering

if TUNE_FEATURES_AS_HYPERPARAMETERS:
    tuner.set_features(feature_importance_ordering)
    # WARNING: this may take a while if no. tuneable hyperparameters are large

print('---')

# set up the order of hyperparameters when iteratively tuning using JiaoCheng
tuner.set_tuning_order(tuning_order)

print('---')

# set up the default hp values for first iteration of tuning JiaoCheng
tuner.set_hyperparameter_default_values(default_hyperparameter_values)

print('---')

# Previous results are read from, and new results are saved to, the same CSV,
# so the address is built once and shared by both steps.
tuning_result_address = f'{home_directory}models/tuning/{TUNER}_{MODEL}_{FEATURE_IMPORTANCE_ORDERING}_{FEATURE_ENGINEERING}_{PREPROCESSING}.csv'

try: # try to read in previous results to continue tuning ...
    tuner.read_in_tuning_result_df(tuning_result_address)
except Exception as read_error:
    # Still best-effort: tuning simply starts without earlier results. The
    # reason is printed because the same file is set as the save target just
    # below, so a failure other than "file does not exist yet" deserves a look
    # before tuning starts.
    print('No previous tuning data read in')
    print(f'    reason: {type(read_error).__name__}: {read_error}')

print('---')

# set up where to save the tuning result csv
tuner.set_tuning_result_saving_address(tuning_result_address)

print('---')

# set up where to save the current best model
tuner.set_best_model_saving_address(f'{home_directory}models/tmp_models/{TUNER}_{MODEL}_{FEATURE_IMPORTANCE_ORDERING}_{FEATURE_ENGINEERING}_{PREPROCESSING}')

JiaoCheng Initialised
---
Successfully read in model <class 'catboost.core.CatBoostClassifier'>, which is a Classification model
---
Read in Train X data
Read in Train y data
Read in Val X data
Read in Val y data
Read in Test X data
Read in Test y data
---
Successfully recorded hyperparameter choices
---
Successfully recorded non_tuneable_hyperparameter choices
---
Successfully recorded tuneable feature combination choices and updated relevant internal structures
---
---
---
Successfully read in tuning result of 573 rows, for 211.0 combos
---
Successfully set tuning output address
---
Successfully set best model output address


In [9]:
# ignore warnings

import warnings
warnings.filterwarnings("ignore")

In [None]:
# begin tuning ... (or continue training)
# Runs the tuner configured in the cells above.
# NOTE(review): in the recorded output, combinations found in the read-in
# results are reported as "Already Trained and Tested" — presumably they are
# reused rather than retrained; confirm in JiaoChengB.

tuner.tune()


FEATURE_INDEX: 0 (features) ROUND 1

Round 1 
Hyperparameter: features (index: 6) 

As new Best Combo (1, 1, 2, 2, 0, 1, 0) was read in, best_clf is set to None
Already Trained and Tested combination (1, 1, 2, 2, 0, 1, 0), which had val score of 0.7933
        Current best combo: (1, 1, 2, 2, 0, 1, 0) with val score 0.7933. 
        Has trained 573 of 542400 combinations so far
Already Trained and Tested combination (1, 1, 2, 2, 0, 1, 1), which had val score of 0.7922
        Current best combo: (1, 1, 2, 2, 0, 1, 0) with val score 0.7933. 
        Has trained 573 of 542400 combinations so far
As new Best Combo (1, 1, 2, 2, 0, 1, 2) was read in, best_clf is set to None
Already Trained and Tested combination (1, 1, 2, 2, 0, 1, 2), which had val score of 0.7937
        Current best combo: (1, 1, 2, 2, 0, 1, 2) with val score 0.7937. 
        Has trained 573 of 542400 combinations so far
Already Trained and Tested combination (1, 1, 2, 2, 0, 1, 3), which had val score of 0.7926
        C