# Tuning Script

In [1]:
# Per-script settings. Every copy of this notebook edits these values; they are
# interpolated into the names of the files this notebook reads and writes.

MODEL = "gnb"                        # model tag (also selects optional installs on Colab)
FEATURE_IMPORTANCE_ORDERING = "xgb"  # which pickled feature importance ordering to load
PREPROCESSING = "Baseline"           # preprocessing tag
TUNER = "jiaochengb-balaccu"         # tuner tag
FEATURE_ENGINEERING = "tfidf"        # selects the {train,val,test}_<tag>_x.npz matrices

---

In [2]:
# Detect whether this notebook is running on Google Colab.
# If the Colab helper can be imported and Google Drive can be mounted, the flag
# is switched on; otherwise the notebook falls back to local relative paths.

COLAB_ENVIRONMENT = False

try:
    from google.colab import drive
    drive.mount('/content/drive')
    COLAB_ENVIRONMENT = True
except Exception:
    # Best-effort: any import/mount failure means "not on Colab".
    # `Exception` (rather than a bare `except:`) lets KeyboardInterrupt and
    # SystemExit propagate, so an interrupted mount is not silently ignored.
    pass

Mounted at /content/drive


In [3]:
import sys
import os
import pickle
from scipy.sparse import load_npz

if COLAB_ENVIRONMENT:
    py_file_location = "./drive/MyDrive/LAB/COMP90051-A1__Groupwork__Py/PrivatePackages" # my private packages are stored here
    home_directory = './drive/MyDrive/LAB/COMP90051-A1__Groupwork__Py/' # my home directory is stored in ./LAB of google drive
    if MODEL == 'ebc':
        !pip install interpret==0.5.0
    elif MODEL == 'cbc':
        !pip install catboost
else:
    py_file_location = '../../PrivatePackages'
    home_directory = '../../'

sys.path.append(os.path.abspath(py_file_location))

from Tuners import JiaoChengB_BalAccu

import pandas as pd

In [4]:
## Settings shared by every tuning script (not part of the file naming)

# random seed
SEED = 2608

# kind of task; handed to the tuner together with the model class
TASK_TYPE = "Classification"

# when True, the feature importance ordering is registered with the tuner so
# that feature combinations are tuned alongside the hyperparameters
TUNE_FEATURES_AS_HYPERPARAMETERS = True

---

In [5]:
# model to use to tune
from xgboost import XGBClassifier as clf

# Candidate values for each tunable hyperparameter.
# NOTE(review): MODEL is 'gnb' and `var_smoothing` looks like the GaussianNB
# hyperparameter, yet `clf` is imported from xgboost (XGBClassifier) in this
# cell - confirm that the intended model class is being tuned.
parameter_choices = {
    'var_smoothing': (1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 10, 100),
}

# Hyperparameters that are fixed to a non-default value and never tuned
# (none for this script).
non_tunable_hyperparameters_dict = dict()

# Order in which the tuner walks through the tunable dimensions.
tuning_order = ['features', 'var_smoothing']

# Starting value of every dimension for the first tuning iteration.
default_hyperparameter_values = {
    'var_smoothing': 1,
    'features': 0,
}

---

Read in and Prepare Data

In [6]:
# Load the baseline train / validation / test splits: the feature matrices are
# stored as scipy sparse .npz files, the labels as pickles.
train_x, val_x, test_x = (
    load_npz(home_directory + f'data/curated/baseline/{split_name}_{FEATURE_ENGINEERING}_x.npz')
    for split_name in ('train', 'val', 'test')
)


def _read_labels(split_name):
    """Unpickle and return the label object stored for one baseline split."""
    with open(home_directory + f'data/curated/baseline/{split_name}_y.pkl', 'rb') as label_file:
        return pickle.load(label_file)


train_y = _read_labels('train')
val_y = _read_labels('val')
test_y = _read_labels('test')

In [7]:
# Load the pickled feature importance ordering that matches this script's
# ordering model, feature engineering and preprocessing tags.

ordering_path = (
    f'{home_directory}models/'
    f'{FEATURE_IMPORTANCE_ORDERING}_{FEATURE_ENGINEERING}'
    f'_feature_importance_ordering_{PREPROCESSING}.pickle'
)
with open(ordering_path, 'rb') as ordering_file:
    feature_importance_ordering = pickle.load(ordering_file)

Set parameters

In [8]:
# Configure the JiaoChengB tuner step by step: model, data, search space,
# tuning order, defaults, optional resume from earlier results, and output
# locations. A '---' separator is printed between steps so the messages the
# tuner prints for each step are easy to tell apart.

# The tuning-result CSV is used both to resume from and to save to, so its
# address is built once and shared.
tuning_result_address = f'{home_directory}models/tuning/{TUNER}_{MODEL}_{FEATURE_IMPORTANCE_ORDERING}_{FEATURE_ENGINEERING}_{PREPROCESSING}.csv'
best_model_address = f'{home_directory}models/tmp_models/{TUNER}_{MODEL}_{FEATURE_IMPORTANCE_ORDERING}_{FEATURE_ENGINEERING}_{PREPROCESSING}'

# initialisation
tuner = JiaoChengB_BalAccu.JiaoChengB()

print('---')

# define what model we are tuning
tuner.read_in_model(clf, TASK_TYPE)

print('---')

# read in the data for training and validation
tuner.read_in_data(train_x, train_y, val_x, val_y, test_x, test_y)

print('---')

# set what hp values to tune
tuner.set_hyperparameters(parameter_choices)
# WARNING: this may take a while if no. tuneable hyperparameters are large

print('---')

# set up hp values that need to be changed from default but NOT to be tuned
tuner.set_non_tuneable_hyperparameters(non_tunable_hyperparameters_dict)

print('---')

# set up feature importance ordering

if TUNE_FEATURES_AS_HYPERPARAMETERS:
    tuner.set_features(feature_importance_ordering)
    # WARNING: this may take a while if no. tuneable hyperparameters are large

print('---')

# set up the order of hyperparameters when iteratively tuning using JiaoCheng
tuner.set_tuning_order(tuning_order)

print('---')

# set up the default hp values for first iteration of tuning JiaoCheng
tuner.set_hyperparameter_default_values(default_hyperparameter_values)

print('---')

try: # try to read in previous results to continue tuning ...
    tuner.read_in_tuning_result_df(tuning_result_address)
except Exception as read_error:
    # Starting without earlier results is an expected situation, so this stays
    # best-effort. The reason is reported so that an unexpected failure is not
    # mistaken for "no previous file"; KeyboardInterrupt/SystemExit are no
    # longer swallowed as they were by the bare `except:`.
    print('No previous tuning data read in')
    print(f'Reason: {type(read_error).__name__}: {read_error}')

print('---')

# set up where to save the tuning result csv
tuner.set_tuning_result_saving_address(tuning_result_address)

print('---')

# set up where to save the current best model
tuner.set_best_model_saving_address(best_model_address)

JiaoCheng Initialised
---
Successfully read in model <class 'xgboost.sklearn.XGBClassifier'>, which is a Classification model
---
Read in Train X data
Read in Train y data
Read in Val X data
Read in Val y data
Read in Test X data
Read in Test y data
---
Successfully recorded hyperparameter choices
---
Successfully recorded non_tuneable_hyperparameter choices
---
Successfully recorded tuneable feature combination choices and updated relevant internal structures
---
---
---
Error message: ('3414', '29667', '887', '14524', '6603', '67869', '15936', '63974', '66808', '3426')
Error Importing this Row: (0, var_smoothing                                                                  1.0
features                         [3414, 29667, 887, 14524, 6603, 67869, 15936, ...
feature combo ningxiang score                                             0.054853
Train accu                                                                0.820476
Val accu                                                    

In [9]:
# Silence every warning for the rest of the session so that the tuning log
# below stays readable.
# NOTE(review): this is a blanket filter - it also hides warnings raised by the
# model library during fitting; consider narrowing it by category or module.

import warnings

warnings.filterwarnings(action="ignore")

Error message: ('3414', '29667', '887', '14524', '6603', '67869', '15936', '63974', '66808', '3426', '66664', '29750', '41191', '55340', '12469', '67858', '66575', '65072', '16037', '1740', '13507', '2688', '2215', '52187', '23378', '6233', '10403', '37744', '14712', '32284', '1674', '45854', '3216', '2359', '30255', '34377', '8995', '5290', '15598', '6860', '3403', '46916', '17969', '13664', '32363', '36120', '40128', '36203', '51794', '55739', '3907', '15310', '11612', '9426', '567', '44892', '22090', '30337', '18791', '24916', '31782', '50216', '50215', '35702', '7824', '11463', '42186', '52031', '55970', '49708', '6092', '16198', '40946', '11229', '50463', '33791', '13877', '21818', '67106', '24711', '62981', '45306', '19094', '57360', '10927', '48377', '4654', '8387', '13044', '5192', '31387', '2181', '38937', '6276', '42062', '36443', '21494', '4522', '16137', '775', '67394', '8778', '68147', '1396', '3995', '27108', '67483', '599', '2325', '11358', '5027', '1275', '60996', '200'

In [10]:
# begin tuning ... (or continue training)
# Runs the tuner configured in the previous cells and prints one progress line
# per trained combination.
# NOTE(review): presumably resumes from any tuning results read in earlier and
# writes to the configured saving addresses - confirm against JiaoChengB.tune.

tuner.tune()


FEATURE_INDEX: 0 (features) ROUND 1

Round 1 
Hyperparameter: features (index: 1) 

Trained and Tested combination 1 of 744: (5, 0), taking 0.64 seconds to get val score of 0.5983
        Current best combo: (5, 0) with val score 0.5983
Trained and Tested combination 2 of 744: (5, 1), taking 1.78 seconds to get val score of 0.6061
        Current best combo: (5, 1) with val score 0.6061
Trained and Tested combination 3 of 744: (5, 2), taking 0.99 seconds to get val score of 0.6235
        Current best combo: (5, 2) with val score 0.6235
Trained and Tested combination 4 of 744: (5, 3), taking 0.76 seconds to get val score of 0.6189
        Current best combo: (5, 2) with val score 0.6235
Trained and Tested combination 5 of 744: (5, 4), taking 0.91 seconds to get val score of 0.6258
        Current best combo: (5, 4) with val score 0.6258
Trained and Tested combination 6 of 744: (5, 5), taking 1.01 seconds to get val score of 0.634
        Current best combo: (5, 5) with val score 0.634

TypeError: Cannot index by location index with a non-integer key

In [25]:
# Report the test balanced accuracy of the row that achieved the best
# validation balanced accuracy.
# `idxmax()` returns an index *label*, so the row has to be fetched with the
# label-based `.loc`. Passing the label to the positional `.iloc` raises
# "Cannot index by location index with a non-integer key" for a non-integer
# index and picks the wrong row for a non-default integer index.
max_val_id = tuner.tuning_result['Val balanced_accuracy'].idxmax()
tuner.tuning_result.loc[max_val_id, 'Test balanced_accuracy']

0.719405