# Tuning Script

In [1]:
# Parameters subject to change in every script - involved in naming

MODEL = 'lgbc'
FEATURE_IMPORTANCE_ORDERING = 'xgb'
PREPROCESSING = 'Baseline'
TUNER = 'jiaochengb-balaccu'
FEATURE_ENGINEERING = 'tfidf'
OVERSAMPLE = 'oversample_domain2'

---

In [2]:
# detect whether this is a google environment

COLAB_ENVIRONMENT = False

try:
    from google.colab import drive
    drive.mount('/content/drive')
    COLAB_ENVIRONMENT = True
except:
    pass

In [3]:
import sys
import os
import pickle
from scipy.sparse import load_npz

if COLAB_ENVIRONMENT:
    py_file_location = "./drive/MyDrive/LAB/COMP90051-A1__Groupwork__Py/PrivatePackages" # my private packages are stored here
    home_directory = './drive/MyDrive/LAB/COMP90051-A1__Groupwork__Py/' # my home directory is stored in ./LAB of google drive
    if MODEL == 'ebc':
        !pip install interpret==0.5.0
    elif MODEL == 'cbc':
        !pip install catboost
else:
    py_file_location = '../../PrivatePackages'
    home_directory = '../../'

sys.path.append(os.path.abspath(py_file_location))

from Tuners import JiaoChengB_BalAccu

import pandas as pd

In [4]:
## Parameters that are same for every script

SEED = 2608

TASK_TYPE = 'Classification' # this task is a classification task

TUNE_FEATURES_AS_HYPERPARAMETERS = False # we have decided to tune features as hp

---

In [13]:
# model to use to tune
from lightgbm import LGBMClassifier as clf

# what values to try for each hyperparameter
parameter_choices = {
    'n_estimators': (25, 50, 100, 200, 400, 800),
    'max_depth': (3, 6, 12, 24, 48, 96, 192),
    'subsample': (0.4, 0.55, 0.7, 0.85, 1),
    'colsample_bytree': (0.4, 0.55, 0.7, 0.85, 1),
    'learning_rate': (0.00000001, 0.000001, 0.0001, 0.01, 0.1),
    'reg_alpha': (0.0001, 0.001, 0.01, 0.1, 1, 10)
}

# what values to set non-tuneable parameters/hyperparameters
non_tunable_hyperparameters_dict = {
                                    'random_state': 19260817,
                                    'n_jobs': -1,
                                    'warnings': 'none'
                                    }

tuning_order = [
    # 'features',
    'max_depth',
    'subsample',
    'colsample_bytree',
    'n_estimators',
    'reg_alpha',
    'learning_rate',
]

default_hyperparameter_values = {'n_estimators': 100, 'max_depth': 12, 'subsample': 0.7, 'colsample_bytree': 0.7, 'reg_alpha': 0.01, 'learning_rate': 0.01, 'features': 0}
default_hyperparameter_values = {'n_estimators': 100, 'max_depth': 12, 'subsample': 0.7, 'colsample_bytree': 0.7, 'reg_alpha': 0.01, 'learning_rate': 0.01}

---

Read in and Prepare Data

In [6]:
train_x = load_npz(home_directory + f'data/curated/baseline/train_{FEATURE_ENGINEERING}_{OVERSAMPLE}_x.npz')
val_x = load_npz(home_directory + f'data/curated/baseline/val_{FEATURE_ENGINEERING}_x.npz')
test_x = load_npz(home_directory + f'data/curated/baseline/test_{FEATURE_ENGINEERING}_x.npz')

with open(home_directory + f'data/curated/baseline/train_{OVERSAMPLE}_y.pkl', 'rb') as f:
    train_y = pickle.load(f)
with open(home_directory + 'data/curated/baseline/val_y.pkl', 'rb') as f:
    val_y = pickle.load(f)
with open(home_directory + 'data/curated/baseline/test_y.pkl', 'rb') as f:
    test_y = pickle.load(f)

In [8]:
# Import feature importance ordering

with open(f'{home_directory}models/{FEATURE_IMPORTANCE_ORDERING}_{FEATURE_ENGINEERING}_feature_importance_ordering_{OVERSAMPLE}.pickle', 'rb') as f:
    feature_importance_ordering = pickle.load(f)

Set parameters

In [14]:
# initialisation
tuner = JiaoChengB_BalAccu.JiaoChengB()

print('---')

# define what model we are tuning
tuner.read_in_model(clf, TASK_TYPE)

print('---')

# read in the data for training and validation
tuner.read_in_data(train_x, train_y, val_x, val_y, test_x, test_y)

print('---')

# set what hp values to tune
tuner.set_hyperparameters(parameter_choices)
# WARNING: this may take a while if no. tuneable hyperparameters are large

print('---')

# set up hp values that need to be changed from default but NOT to be tuned
tuner.set_non_tuneable_hyperparameters(non_tunable_hyperparameters_dict)

print('---')

# set up feature importance ordering

if TUNE_FEATURES_AS_HYPERPARAMETERS:
  tuner.set_features(feature_importance_ordering)
  # WARNING: this may take a while if no. tuneable hyperparameters are large

print('---')

# set up the order of hyperparameters when iteratively tuning using JiaoCheng
tuner.set_tuning_order(tuning_order)

print('---')

# set up the default hp values for first iteration of tuning JiaoCheng
tuner.set_hyperparameter_default_values(default_hyperparameter_values)

print('---')

try: # try to read in previous results to continue tuning ...
  tuner.read_in_tuning_result_df(f'{home_directory}models/tuning/{TUNER}_{MODEL}_{FEATURE_IMPORTANCE_ORDERING}_{FEATURE_ENGINEERING}_{PREPROCESSING}.csv')
except:
  print('No previous tuning data read in')

print('---')

# set up where to save the tuning result csv
tuner.set_tuning_result_saving_address(f'{home_directory}models/tuning_results/{TUNER}_{MODEL}_{FEATURE_IMPORTANCE_ORDERING}_{FEATURE_ENGINEERING}_{OVERSAMPLE}_{PREPROCESSING}.csv')

print('---')

# set up where to save the current best model
tuner.set_best_model_saving_address(f'{home_directory}models/tmp_models/{TUNER}_{MODEL}_{FEATURE_IMPORTANCE_ORDERING}_{FEATURE_ENGINEERING}_{OVERSAMPLE}_{PREPROCESSING}')

JiaoCheng Initialised
---
Successfully read in model <class 'lightgbm.sklearn.LGBMClassifier'>, which is a Classification model
---
Read in Train X data
Read in Train y data
Read in Val X data
Read in Val y data
Read in Test X data
Read in Test y data
---
Successfully recorded hyperparameter choices
---
Successfully recorded non_tuneable_hyperparameter choices
---
---
---
---
No previous tuning data read in
---
Successfully set tuning output address
---
Successfully set best model output address


In [15]:
# ignore warnings

import warnings
warnings.filterwarnings("ignore")

In [16]:
# begin tuning ... (or continue training)

tuner.tune()


Default combo: [2, 2, 2, 2, 3, 2] 


FEATURE_INDEX: 0 (max_depth) ROUND 1

Round 1 
Hyperparameter: max_depth (index: 1) 

[LightGBM] [Info] Number of positive: 2800, number of negative: 9800
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.110584 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 354416
[LightGBM] [Info] Number of data points in the train set: 12600, number of used features: 6996
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.222222 -> initscore=-1.252763
[LightGBM] [Info] Start training from score -1.252763
Trained and Tested combination 1 of 31500: (2, 0, 2, 2, 3, 2), taking 1.52 seconds to get val score of 0.5017
        Current best combo: (2, 0, 2, 2, 3, 2) with val score 0.5017
[LightGBM] [Info] Number of positive: 2800, number of negative: 9800
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhe

In [17]:
max_index = tuner.tuning_result['Test balanced_accu'].idxmax()
tuner.tuning_result.loc[max_index]

n_estimators                800.0
max_depth                    24.0
subsample                     0.7
colsample_bytree             0.85
learning_rate                 0.1
reg_alpha                     0.1
Train accu                    1.0
Val accu                 0.861481
Test accu                0.871481
Train balanced_accu           1.0
Val balanced_accu        0.757976
Test balanced_accu       0.775119
Train f1                      1.0
Val f1                   0.854568
Test f1                  0.865551
Train precision               1.0
Val precision            0.854242
Test precision           0.865506
Train recall                  1.0
Val recall               0.861481
Test recall              0.871481
Time                        18.98
random_state           19260817.0
n_jobs                       -1.0
Train AP                      1.0
Val AP                    0.52145
Test AP                  0.551635
Train AUC                     1.0
Val AUC                  0.757976
Test AUC      