# Tuning Script

In [1]:
# Parameters subject to change in every script - involved in naming

MODEL = 'svc(rbf)'
FEATURE_IMPORTANCE_ORDERING = 'xgb'
PREPROCESSING = 'Baseline'
TUNER = 'jiaochengb-balaccu'
FEATURE_ENGINEERING = 'tfidf'

---

In [2]:
# detect whether this is a google environment

COLAB_ENVIRONMENT = False

try:
    from google.colab import drive
    drive.mount('/content/drive')
    COLAB_ENVIRONMENT = True
except:
    pass

Mounted at /content/drive


In [3]:
import sys
import os
import pickle
from scipy.sparse import load_npz

if COLAB_ENVIRONMENT:
    py_file_location = "./drive/MyDrive/LAB/COMP90051-A1__Groupwork__Py/PrivatePackages" # my private packages are stored here
    home_directory = './drive/MyDrive/LAB/COMP90051-A1__Groupwork__Py/' # my home directory is stored in ./LAB of google drive
    if MODEL == 'ebc':
        !pip install interpret==0.5.0
    elif MODEL == 'cbc':
        !pip install catboost
else:
    py_file_location = '../../PrivatePackages'
    home_directory = '../../'

sys.path.append(os.path.abspath(py_file_location))

from Tuners import JiaoChengB_BalAccu

import pandas as pd

In [4]:
## Parameters that are same for every script

SEED = 2608

TASK_TYPE = 'Classification' # this task is a classification task

TUNE_FEATURES_AS_HYPERPARAMETERS = False # we have decided to tune features as hp

---

In [5]:
# model to use to tune
from sklearn.svm import SVC as clf

# what values to try for each hyperparameter
parameter_choices = {
    'C': (0.0001, 0.001, 0.01, 0.1, 1, 10),
    'tol': (0.0001, 0.001, 0.01, 0.1, 1, 10),
    'max_iter': (50, 100, 200, 400, 800, 1600),
    'gamma': ('scale', 'auto')
}

# what values to set non-tuneable parameters/hyperparameters
non_tunable_hyperparameters_dict = {
                                    'verbose': False,
                                    'decision_function_shape': 'ovr',
                                    'random_state': 42,
                                    'kernel': 'poly'
                                    }

tuning_order = [
    'C',
    'max_iter',
    'gamma',
    'tol',
]

default_hyperparameter_values = {'C': 1, 'tol': 0.001, 'max_iter': 800, 'gamma': 'auto'}

---

Read in and Prepare Data

In [6]:
train_x = load_npz(home_directory + f'data/curated/baseline/train_{FEATURE_ENGINEERING}_x.npz')
val_x = load_npz(home_directory + f'data/curated/baseline/val_{FEATURE_ENGINEERING}_x.npz')
test_x = load_npz(home_directory + f'data/curated/baseline/test_{FEATURE_ENGINEERING}_x.npz')

with open(home_directory + 'data/curated/baseline/train_y.pkl', 'rb') as f:
    train_y = pickle.load(f)
with open(home_directory + 'data/curated/baseline/val_y.pkl', 'rb') as f:
    val_y = pickle.load(f)
with open(home_directory + 'data/curated/baseline/test_y.pkl', 'rb') as f:
    test_y = pickle.load(f)

In [7]:
# Import feature importance ordering

with open(f'{home_directory}models/{FEATURE_IMPORTANCE_ORDERING}_{FEATURE_ENGINEERING}_feature_importance_ordering_{PREPROCESSING}.pickle', 'rb') as f:
    feature_importance_ordering = pickle.load(f)

Set parameters

In [8]:
# initialisation
tuner = JiaoChengB_BalAccu.JiaoChengB()

print('---')

# define what model we are tuning
tuner.read_in_model(clf, TASK_TYPE)

print('---')

# read in the data for training and validation
tuner.read_in_data(train_x, train_y, val_x, val_y, test_x, test_y)

print('---')

# set what hp values to tune
tuner.set_hyperparameters(parameter_choices)
# WARNING: this may take a while if no. tuneable hyperparameters are large

print('---')

# set up hp values that need to be changed from default but NOT to be tuned
tuner.set_non_tuneable_hyperparameters(non_tunable_hyperparameters_dict)

print('---')

# set up feature importance ordering

if TUNE_FEATURES_AS_HYPERPARAMETERS:
  tuner.set_features(feature_importance_ordering)
  # WARNING: this may take a while if no. tuneable hyperparameters are large

print('---')

# set up the order of hyperparameters when iteratively tuning using JiaoCheng
tuner.set_tuning_order(tuning_order)

print('---')

# set up the default hp values for first iteration of tuning JiaoCheng
tuner.set_hyperparameter_default_values(default_hyperparameter_values)

print('---')

try: # try to read in previous results to continue tuning ...
  tuner.read_in_tuning_result_df(f'{home_directory}models/tuning/{TUNER}_{MODEL}_{FEATURE_IMPORTANCE_ORDERING}_{FEATURE_ENGINEERING}_{PREPROCESSING}.csv')
except:
  print('No previous tuning data read in')

print('---')

# set up where to save the tuning result csv
tuner.set_tuning_result_saving_address(f'{home_directory}models/tuning/{TUNER}_{MODEL}_{FEATURE_IMPORTANCE_ORDERING}_{FEATURE_ENGINEERING}_{PREPROCESSING}.csv')

print('---')

# set up where to save the current best model
tuner.set_best_model_saving_address(f'{home_directory}models/tmp_models/{TUNER}_{MODEL}_{FEATURE_IMPORTANCE_ORDERING}_{FEATURE_ENGINEERING}_{PREPROCESSING}')

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
feature combo ningxiang score                                             0.054853
Train accu                                                                0.274286
Val accu                                                                  0.272593
Test accu                                                                 0.272222
Train balanced_accu                                                            NaN
Val balanced_accu                                                              NaN
Test balanced_accu                                                             NaN
Train f1                                                                   0.18934
Val f1                                                                    0.189135
Test f1                                                                   0.184935
Train precision                                                            0.76324
Val precision         

In [9]:
# ignore warnings

import warnings
warnings.filterwarnings("ignore")

In [10]:
# begin tuning ... (or continue training)

tuner.tune()

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
        Current best combo: (4, 0, 1, 4, 0, 6) with val score 0.5055
Trained and Tested combination 36 of 361584: (4, 0, 1, 4, 0, 35), taking 3.5 seconds to get val score of 0.4946
        Current best combo: (4, 0, 1, 4, 0, 6) with val score 0.5055
Trained and Tested combination 37 of 361584: (4, 0, 1, 4, 0, 36), taking 4.42 seconds to get val score of 0.4705
        Current best combo: (4, 0, 1, 4, 0, 6) with val score 0.5055
Trained and Tested combination 38 of 361584: (4, 0, 1, 4, 0, 37), taking 3.8 seconds to get val score of 0.4852
        Current best combo: (4, 0, 1, 4, 0, 6) with val score 0.5055
Trained and Tested combination 39 of 361584: (4, 0, 1, 4, 0, 38), taking 4.06 seconds to get val score of 0.5115
        Current best combo: (4, 0, 1, 4, 0, 38) with val score 0.5115
Trained and Tested combination 40 of 361584: (4, 0, 1, 4, 0, 39), taking 4.83 seconds to get val score of 0.4958
        Current best combo

TypeError: Cannot index by location index with a non-integer key

In [11]:
max_val_id = tuner.tuning_result['Val balanced_accuracy'].idxmax()
tuner.tuning_result.iloc[max_val_id]['Test balanced_accuracy']

0.720476