# Tuning Script

In [1]:
# Per-script settings: change these for every tuning script.
# Each tag is interpolated into the names of the files this notebook reads and writes.

TUNER = 'jiaochengb-balaccu'          # tuner tag
MODEL = 'lgbc'                        # tag of the model being tuned
FEATURE_ENGINEERING = 'tfidf'         # feature-engineering tag (selects the *_x.npz matrices)
FEATURE_IMPORTANCE_ORDERING = 'xgb'   # tag of the feature-importance ordering that is loaded
PREPROCESSING = 'Baseline'            # preprocessing tag

---

In [2]:
# Detect whether this notebook is running on Google Colab.
# COLAB_ENVIRONMENT ends up True only when `google.colab` is importable AND
# Google Drive was mounted successfully; otherwise it stays False.

COLAB_ENVIRONMENT = False

try:
    from google.colab import drive
except ImportError:
    # `google.colab` is not importable, so this is not a Colab runtime.
    pass
else:
    try:
        drive.mount('/content/drive')
        COLAB_ENVIRONMENT = True
    except Exception as mount_error:
        # Stay best-effort (fall back to the local layout), but report why the
        # mount failed instead of hiding it; Ctrl-C is no longer swallowed.
        print(f'Google Drive could not be mounted, using local paths: {mount_error!r}')

In [3]:
import sys
import os
import pickle
from scipy.sparse import load_npz

if COLAB_ENVIRONMENT:
    py_file_location = "./drive/MyDrive/LAB/COMP90051-A1__Groupwork__Py/PrivatePackages" # my private packages are stored here
    home_directory = './drive/MyDrive/LAB/COMP90051-A1__Groupwork__Py/' # my home directory is stored in ./LAB of google drive
    if MODEL == 'ebc':
        !pip install interpret==0.5.0
    elif MODEL == 'cbc':
        !pip install catboost
else:
    py_file_location = '../../PrivatePackages'
    home_directory = '../../'

sys.path.append(os.path.abspath(py_file_location))

from Tuners import JiaoChengB_BalAccu

import pandas as pd

ImportError: cannot import name 'JiaoChengB_BalAccu' from 'Tuners' (unknown location)

In [None]:
## Settings that stay identical across all tuning scripts

TUNE_FEATURES_AS_HYPERPARAMETERS = True  # the feature subset is tuned like any other hyperparameter

TASK_TYPE = 'Classification'  # this is a classification task

SEED = 2608  # shared random seed

---

In [None]:
# model to use to tune
from lightgbm import LGBMClassifier as clf

# what values to try for each hyperparameter
# NOTE(review): LightGBM only applies `subsample` when `subsample_freq` > 0 and the
# default is 0, so as configured the `subsample` grid probably has no effect on the
# fitted models. Left unchanged so earlier saved tuning results stay comparable; confirm.
parameter_choices = {
    'n_estimators': (25, 50, 100, 200, 400, 800),
    'max_depth': (3, 6, 12, 24, 48, 96, 192),
    'subsample': (0.4, 0.55, 0.7, 0.85, 1),
    'colsample_bytree': (0.4, 0.55, 0.7, 0.85, 1),
    'learning_rate': (0.00000001, 0.000001, 0.0001, 0.01, 0.1),
    'reg_alpha': (0.0001, 0.001, 0.01, 0.1, 1, 10)
}

# what values to set non-tuneable parameters/hyperparameters
non_tunable_hyperparameters_dict = {
    'random_state': 19260817,
    'n_jobs': -1,
    # LightGBM has no `warnings` parameter (the old `'warnings': 'none'` entry was
    # reported as an unknown parameter and silenced nothing); `verbose=-1` is the
    # documented way to suppress LightGBM's own log/warning output.
    'verbose': -1,
}

# order in which the tuner works through the hyperparameters
# ('features' is the feature-subset pseudo-hyperparameter)
tuning_order = [
    'features',
    'max_depth',
    'subsample',
    'colsample_bytree',
    'n_estimators',
    'reg_alpha',
    'learning_rate',
]

# starting value for every entry of `tuning_order`; each model hyperparameter
# default is one of the candidates listed in `parameter_choices`
default_hyperparameter_values = {
    'n_estimators': 100,
    'max_depth': 12,
    'subsample': 0.7,
    'colsample_bytree': 0.7,
    'reg_alpha': 0.01,
    'learning_rate': 0.01,
    'features': 0,
}

---

Read in and Prepare Data

In [None]:
# Every curated baseline split sits in the same folder under the project root.
curated_dir = home_directory + 'data/curated/baseline/'


def _read_pickle(path):
    """Unpickle and return the object stored at `path`."""
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Sparse feature matrices for the selected feature-engineering variant, one per split.
train_x, val_x, test_x = (
    load_npz(f'{curated_dir}{split}_{FEATURE_ENGINEERING}_x.npz')
    for split in ('train', 'val', 'test')
)

# The pickled `*_y` objects that go with each split.
train_y, val_y, test_y = (
    _read_pickle(f'{curated_dir}{split}_y.pkl')
    for split in ('train', 'val', 'test')
)

In [None]:
# Load the pickled feature-importance ordering selected by the naming tags

ordering_path = (
    f'{home_directory}models/'
    f'{FEATURE_IMPORTANCE_ORDERING}_{FEATURE_ENGINEERING}_feature_importance_ordering_{PREPROCESSING}.pickle'
)
with open(ordering_path, 'rb') as ordering_file:
    feature_importance_ordering = pickle.load(ordering_file)

Set parameters

In [None]:
# Configure the tuner step by step; '---' is printed after each step as a separator.

# Base name shared by every artefact of this run.
run_name = f'{TUNER}_{MODEL}_{FEATURE_IMPORTANCE_ORDERING}_{FEATURE_ENGINEERING}_{PREPROCESSING}'
# Earlier results are resumed from, and new results saved to, the same CSV file.
tuning_result_path = f'{home_directory}models/tuning/{run_name}.csv'
best_model_path = f'{home_directory}models/tmp_models/{run_name}'

# initialisation
tuner = JiaoChengB_BalAccu.JiaoChengB()

print('---')

# define what model we are tuning
tuner.read_in_model(clf, TASK_TYPE)

print('---')

# read in the data for training and validation
tuner.read_in_data(train_x, train_y, val_x, val_y, test_x, test_y)

print('---')

# set what hp values to tune
tuner.set_hyperparameters(parameter_choices)
# WARNING: this may take a while if no. tuneable hyperparameters are large

print('---')

# set up hp values that need to be changed from default but NOT to be tuned
tuner.set_non_tuneable_hyperparameters(non_tunable_hyperparameters_dict)

print('---')

# set up feature importance ordering

if TUNE_FEATURES_AS_HYPERPARAMETERS:
    tuner.set_features(feature_importance_ordering)
    # WARNING: this may take a while if no. tuneable hyperparameters are large

print('---')

# set up the order of hyperparameters when iteratively tuning using JiaoCheng
tuner.set_tuning_order(tuning_order)

print('---')

# set up the default hp values for first iteration of tuning JiaoCheng
tuner.set_hyperparameter_default_values(default_hyperparameter_values)

print('---')

try:  # try to read in previous results to continue tuning ...
    tuner.read_in_tuning_result_df(tuning_result_path)
except Exception as read_error:
    # Resuming is best-effort, so tuning simply starts from scratch when the
    # read fails, but the reason is reported instead of being silently dropped
    # (a missing file and a malformed CSV used to look the same).
    print('No previous tuning data read in')
    print(f'    reason: {read_error!r}')

print('---')

# set up where to save the tuning result csv
tuner.set_tuning_result_saving_address(tuning_result_path)

print('---')

# set up where to save the current best model
tuner.set_best_model_saving_address(best_model_path)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
max_depth                                                                     48.0
subsample                                                                      0.7
colsample_bytree                                                               0.7
learning_rate                                                                  0.1
reg_alpha                                                                      0.1
features                         [3414, 29667, 887, 14524, 6603, 67869, 15936, ...
feature combo ningxiang score                                             0.486439
Train accu                                                                0.994048
Val accu                                                                  0.827407
Test accu                                                                 0.826296
Train balanced_accu                                                            NaN
Val balanced_accu     

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [None]:
# ignore warnings

import warnings
warnings.filterwarnings("ignore")

In [None]:
# begin tuning ... (or continue training)
# Runs the tuner that was configured in the earlier cells.
# NOTE(review): the output saved with this notebook ends with
# "TypeError: Cannot index by location index with a non-integer key" right after
# the "Best combo after this hyperparameter" line. It appears to be raised inside
# tuner.tune(); confirm and fix in the tuner package.

tuner.tune()

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Already Trained and Tested combination (5, 4, 2, 2, 4, 3, 89), which had val score of 0.7548
        Current best combo: (5, 4, 2, 2, 4, 3, 92) with val score 0.7612. 
        Has trained 366 of 2929500 combinations so far
Already Trained and Tested combination (5, 4, 2, 2, 4, 3, 90), which had val score of 0.7561
        Current best combo: (5, 4, 2, 2, 4, 3, 92) with val score 0.7612. 
        Has trained 366 of 2929500 combinations so far
Already Trained and Tested combination (5, 4, 2, 2, 4, 3, 91), which had val score of 0.7526
        Current best combo: (5, 4, 2, 2, 4, 3, 92) with val score 0.7612. 
        Has trained 366 of 2929500 combinations so far
Already Trained and Tested combination (5, 4, 2, 2, 4, 3, 92), which had val score of 0.7612
        Current best combo: (5, 4, 2, 2, 4, 3, 92) with val score 0.7612. 
        Has trained 366 of 2929500 combinations so far

Best combo after this hyperparameter: (5, 

TypeError: Cannot index by location index with a non-integer key

In [None]:
# Best validation balanced accuracy found so far.
# The result rows printed earlier in this notebook label the column
# 'Val balanced_accu', so that name is preferred; the original
# 'Val balanced_accuracy' is kept as a fallback in case the tuner uses it.
tuning_results = tuner.tuning_result
val_score_column = 'Val balanced_accu' if 'Val balanced_accu' in tuning_results else 'Val balanced_accuracy'
tuning_results[val_score_column].max()