# Tuning Script

In [2]:
# Per-script settings -- change these for every new tuning script.
# They are interpolated into the dataset paths and into the file names of the
# tuning-result CSV and the saved best model.

EMBEDDING_MODEL = 'GanymedeNil_text2vec-large-chinese'  # sub-folder of data/curated to read from
EXPERIMENT_TYPE = 'title&pyq'                           # suffix of the train/val/test dataset files
MODEL = 'eb'                                            # short tag of the model being tuned
TUNER = 'yangzhoub'                                     # short tag of the tuner being used

In [5]:
pip install pandas

Defaulting to user installation because normal site-packages is not writeable
Collecting pandas
  Downloading pandas-2.2.2-cp39-cp39-macosx_10_9_x86_64.whl (12.6 MB)
[K     |████████████████████████████████| 12.6 MB 450 kB/s eta 0:00:01  | 11.5 MB 450 kB/s eta 0:00:03
[?25hCollecting tzdata>=2022.7
  Downloading tzdata-2024.1-py2.py3-none-any.whl (345 kB)
[K     |████████████████████████████████| 345 kB 8.6 MB/s eta 0:00:01
[?25hCollecting pytz>=2020.1
  Downloading pytz-2024.1-py2.py3-none-any.whl (505 kB)
[K     |████████████████████████████████| 505 kB 22.1 MB/s eta 0:00:01
Collecting numpy>=1.22.4
  Downloading numpy-2.0.1-cp39-cp39-macosx_10_9_x86_64.whl (21.2 MB)
[K     |████████████████████████████████| 21.2 MB 72 kB/s s eta 0:00:01   |█▋                              | 1.0 MB 1.7 MB/s eta 0:00:1214.6 MB 15.6 MB/s eta 0:00:010:01
Installing collected packages: tzdata, pytz, numpy, pandas
Successfully installed numpy-2.0.1 pandas-2.2.2 pytz-2024.1 tzdata-2024.1
You should co

---

In [3]:
# Detect whether this notebook is running on Google Colab.
# COLAB_ENVIRONMENT ends up True only if google.colab can be imported AND
# Google Drive was mounted successfully at /content/drive.

COLAB_ENVIRONMENT = False

try:
    from google.colab import drive
except ImportError:
    # google.colab is not importable -> not on Colab, keep the local defaults
    pass
else:
    try:
        drive.mount('/content/drive')
    except Exception as mount_error:
        # Stay best-effort (fall back to the local layout) but say why, instead of
        # silently swallowing the failure as the old bare `except: pass` did.
        print(f'Google Drive could not be mounted ({mount_error!r}); using local paths instead')
    else:
        COLAB_ENVIRONMENT = True

In [4]:
import sys
import os
import pickle

if COLAB_ENVIRONMENT:
    py_file_location = "./drive/MyDrive/LAB/" # my private packages are stored here
    home_directory = './drive/MyDrive/LAB/' # my home directory is stored in ./LAB of google drive
    if MODEL == 'ebr':
        !pip install interpret==0.5.0
    elif MODEL == 'cbr':
        !pip install catboost
else:
    py_file_location = '../../../PrivatePackages'
    home_directory = '../../../'

sys.path.append(os.path.abspath(py_file_location))

import YangZhouB

import pandas as pd

ModuleNotFoundError: No module named 'pandas'

In [None]:
## Parameters that are same for every script
SEED = 19260817 # seed value; the same number is used as the model's random_state

TASK_TYPE = 'Regression' # this task is a regression task (a regressor is tuned below)

TUNE_FEATURES_AS_HYPERPARAMETERS = False # False: features are NOT tuned as hyperparameters in this script

---

In [None]:
# model to use to tune
from interpret.glassbox import ExplainableBoostingRegressor as clf

# Candidate values for each hyperparameter that the tuner is allowed to try.
# Keys are hyperparameter names of the model; each value is a tuple of options.
# NOTE(review): the 6 in max_bins breaks the otherwise x4 progression
# (4, 16, 64, ...) -- confirm it is intended and not a typo for 16.
parameter_choices = dict(
    max_bins=(4, 6, 64, 256, 1024, 4096),
    min_samples_leaf=(2, 4, 8, 16, 32),
    interactions=(2, 4, 8),
    max_leaves=(100, 1000, 10000),
    learning_rate=(0.001, 0.01, 0.1, 1),
    max_rounds=(25, 50, 100, 200),
)

# what values to set non-tuneable parameters/hyperparameters
# (fixed for every model the tuner builds; they are not part of the search)
non_tunable_hyperparameters_dict = {
    'random_state': SEED,  # reuse the notebook-wide seed instead of repeating the literal
    'n_jobs': -1,
}

---

## Read in and Prepare Data

In [None]:
# Import the train / val / test datasets and split each into features (X) and target (y)

def _read_split(split_name):
    """Read one curated dataset split and separate the target column.

    Parameters
    ----------
    split_name : str
        Prefix of the dataset file: 'train', 'val' or 'test'.

    Returns
    -------
    tuple
        ``(data, X, y)`` where ``data`` is the full DataFrame read from the CSV,
        ``X`` is ``data`` without the 'label' column and ``y`` is the 'label' column.
    """
    # os.path.join avoids the doubled '/' the old f-string produced, because
    # home_directory already ends with a slash.
    csv_path = os.path.join(
        home_directory, 'data', 'curated', EMBEDDING_MODEL,
        f'{split_name}_dataset_{EXPERIMENT_TYPE}.csv',
    )
    data = pd.read_csv(csv_path)
    return data, data.drop('label', axis=1), data['label']


train_data, train_X, train_y = _read_split('train')
val_data, val_X, val_y = _read_split('val')
test_data, test_X, test_y = _read_split('test')

In [None]:
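# # Import feature importance ordering
# NOTE: this cell is disabled. `feature_importance_ordering` is only needed when
# TUNE_FEATURES_AS_HYPERPARAMETERS is True. Before re-enabling it, make sure
# FEATURE_IMPORTANCE_ORDERING and PREPROCESSING are defined (they are not set in the cells shown here).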

# with open(f'{home_directory}models/{FEATURE_IMPORTANCE_ORDERING}_feature_importance_ordering_{PREPROCESSING}.pickle', 'rb') as f:
#     feature_importance_ordering = pickle.load(f)

## Set parameters

In [None]:
# Configure the tuner step by step. A '---' line is printed between steps so the
# tuner's own status messages are visually separated.

# The tuning-result CSV is both resumed from and saved to, so build its path once
# instead of repeating the f-string.
tuning_result_path = f'{home_directory}models/tuning/{TUNER}_{MODEL}_{EXPERIMENT_TYPE}_{EMBEDDING_MODEL}.csv'
best_model_path = f'{home_directory}models/tmp_models/{TUNER}_{MODEL}_{EXPERIMENT_TYPE}_{EMBEDDING_MODEL}'

# initialisation
tuner = YangZhouB.YangZhouB()

print('---')

# define what model we are tuning
tuner.read_in_model(clf, TASK_TYPE)

print('---')

# read in the data for training and validation
tuner.read_in_data(train_X, train_y, val_X, val_y, test_X, test_y)

print('---')

# set what hp values to tune
tuner.set_hyperparameters(parameter_choices)
# WARNING: this may take a while if no. tuneable hyperparameters are large

print('---')

# set up hp values that need to be changed from default but NOT to be tuned
tuner.set_non_tuneable_hyperparameters(non_tunable_hyperparameters_dict)

print('---')

# set up feature importance ordering

if TUNE_FEATURES_AS_HYPERPARAMETERS:
    # feature_importance_ordering must already have been loaded by an earlier cell
    tuner.set_features(feature_importance_ordering)
    # WARNING: this may take a while if no. tuneable hyperparameters are large

print('---')

try: # try to read in previous results to continue tuning ...
    tuner.read_in_tuning_result_df(tuning_result_path)
except Exception as read_error:
    # Catch Exception instead of using a bare `except:` so KeyboardInterrupt and
    # SystemExit still propagate, and report why nothing was loaded so a real
    # problem (e.g. a corrupt CSV) is not mistaken for "no previous run".
    print('No previous tuning data read in')
    print(f'  reason: {type(read_error).__name__}: {read_error}')

print('---')

# set up where to save the tuning result csv
tuner.set_tuning_result_saving_address(tuning_result_path)

print('---')

# set up where to save the current best model
tuner.set_best_model_saving_address(best_model_path)

YangZhouB Initialised
---
Successfully read in model <class 'interpret.glassbox._ebm._ebm.ExplainableBoostingRegressor'>, which is a Regression model
---
Read in Train X data
Read in Train y data
Read in Val X data
Read in Val y data
Read in Test X data
Read in Test y data
---
Successfully recorded hyperparameter choices
---
Successfully recorded non_tuneable_hyperparameter choices
---
---
No previous tuning data read in
---
Successfully set tuning output address
---
Successfully set best model output address


In [None]:
# ignore warnings

import warnings
warnings.filterwarnings("ignore")

In [None]:
# Start the tuning run on the tuner configured in the set-up cell.
# NOTE(review): if a previous tuning-result CSV was read in during set-up, this
# presumably resumes from those results rather than starting over -- confirm in YangZhouB.

tuner.tune()

NameError: name 'tuner' is not defined