# Tuning Script

In [1]:
# Parameters subject to change in every script - involved in naming

MODEL = 'svc(rbf)'
FEATURE_IMPORTANCE_ORDERING = 'None'
PREPROCESSING = 'TextFeatures'
TUNER = 'jiaochengb-balaccu'
FEATURE_ENGINEERING = 'Manual'

---

In [2]:
# detect whether this is a google environment

COLAB_ENVIRONMENT = False

try:
    from google.colab import drive
    drive.mount('/content/drive')
    COLAB_ENVIRONMENT = True
except:
    pass

In [3]:
import sys
import os
import pickle
from scipy.sparse import load_npz

if COLAB_ENVIRONMENT:
    py_file_location = "./drive/MyDrive/LAB/COMP90051-A1__Groupwork__Py/PrivatePackages" # my private packages are stored here
    home_directory = './drive/MyDrive/LAB/COMP90051-A1__Groupwork__Py/' # my home directory is stored in ./LAB of google drive
    if MODEL == 'ebc':
        !pip install interpret==0.5.0
    elif MODEL == 'cbc':
        !pip install catboost
else:
    py_file_location = '../../PrivatePackages'
    home_directory = '../../'

sys.path.append(os.path.abspath(py_file_location))

from Tuners import JiaoChengB_BalAccu

import pandas as pd

In [4]:
## Parameters that are same for every script

SEED = 2608

TASK_TYPE = 'Classification' # this task is a classification task

TUNE_FEATURES_AS_HYPERPARAMETERS = False # we have decided to tune features as hp

---

In [5]:
# model to use to tune
from sklearn.svm import SVC as clf

# what values to try for each hyperparameter
parameter_choices = {
    'C': (0.0001, 0.001, 0.01, 0.1, 1, 10),
    'tol': (0.0001, 0.001, 0.01, 0.1, 1, 10),
    'max_iter': (50, 100, 200, 400, 800, 1600),
    'gamma': ('scale', 'auto')
}

# what values to set non-tuneable parameters/hyperparameters
non_tunable_hyperparameters_dict = {
                                    'verbose': False,
                                    'decision_function_shape': 'ovr',
                                    'random_state': 42,
                                    'kernel': 'poly'
                                    }

tuning_order = [
    'C',
    'max_iter',
    'gamma',
    'tol',
]

default_hyperparameter_values = {'C': 1, 'tol': 0.001, 'max_iter': 800, 'gamma': 'auto'}

---

Read in and Prepare Data

In [6]:
data1 = pd.read_csv('../../data/curated/comp90051-2024s1-project-1/data1_additional.csv')
data2 = pd.read_csv('../../data/curated/comp90051-2024s1-project-1/data2_additional.csv')
future = pd.read_csv('../../data/curated/comp90051-2024s1-project-1/data1_additional.csv')

In [7]:
from sklearn.model_selection import train_test_split

train_ix_1, val_test_ix_1 = train_test_split(range(len(data1)), test_size=0.3, random_state=SEED, stratify = data1['label'])
train_ix_2, val_test_ix_2 = train_test_split(range(len(data2)), test_size=0.3, random_state=SEED, stratify = data2['label'])
val_ix_1, test_ix_1 = train_test_split(val_test_ix_1, test_size=0.5, random_state=SEED, stratify = [data1.iloc[i]['label'] for i in val_test_ix_1])
val_ix_2, test_ix_2 = train_test_split(val_test_ix_2, test_size=0.5, random_state=SEED, stratify = [data2.iloc[i]['label'] for i in val_test_ix_2])

# split data according to the index from train_test_split
train_data_1 = data1.iloc[train_ix_1]
val_data_1 = data1.iloc[val_ix_1]
test_data_1 = data1.iloc[test_ix_1]
train_data_2 = data2.iloc[train_ix_2]
val_data_2 = data2.iloc[val_ix_2]
test_data_2 = data2.iloc[test_ix_2]

# combine the data
train_data = pd.concat([train_data_1, train_data_2])
val_data = pd.concat([val_data_1, val_data_2])
test_data = pd.concat([test_data_1, test_data_2])

In [8]:
train_x = train_data[['perplexity', 'burstiness', 'length', 'domain', 'unique_word_ratio']]
train_y = train_data['label']
val_x = val_data[['perplexity', 'burstiness', 'length', 'domain', 'unique_word_ratio']]
val_y = val_data['label']
test_x = test_data[['perplexity', 'burstiness', 'length', 'domain', 'unique_word_ratio']]
test_y = test_data['label']

Set parameters

In [9]:
# initialisation
tuner = JiaoChengB_BalAccu.JiaoChengB()

print('---')

# define what model we are tuning
tuner.read_in_model(clf, TASK_TYPE)

print('---')

# read in the data for training and validation
tuner.read_in_data(train_x, train_y, val_x, val_y, test_x, test_y)

print('---')

# set what hp values to tune
tuner.set_hyperparameters(parameter_choices)
# WARNING: this may take a while if no. tuneable hyperparameters are large

print('---')

# set up hp values that need to be changed from default but NOT to be tuned
tuner.set_non_tuneable_hyperparameters(non_tunable_hyperparameters_dict)

print('---')

# set up feature importance ordering

if TUNE_FEATURES_AS_HYPERPARAMETERS:
  tuner.set_features(feature_importance_ordering)
  # WARNING: this may take a while if no. tuneable hyperparameters are large

print('---')

# set up the order of hyperparameters when iteratively tuning using JiaoCheng
tuner.set_tuning_order(tuning_order)

print('---')

# set up the default hp values for first iteration of tuning JiaoCheng
tuner.set_hyperparameter_default_values(default_hyperparameter_values)

print('---')

try: # try to read in previous results to continue tuning ...
  tuner.read_in_tuning_result_df(f'{home_directory}models/tuning/{TUNER}_{MODEL}_{FEATURE_IMPORTANCE_ORDERING}_{FEATURE_ENGINEERING}_{PREPROCESSING}.csv')
except:
  print('No previous tuning data read in')

print('---')

# set up where to save the tuning result csv
tuner.set_tuning_result_saving_address(f'{home_directory}models/tuning/{TUNER}_{MODEL}_{FEATURE_IMPORTANCE_ORDERING}_{FEATURE_ENGINEERING}_{PREPROCESSING}.csv')

print('---')

# set up where to save the current best model
tuner.set_best_model_saving_address(f'{home_directory}models/tmp_models/{TUNER}_{MODEL}_{FEATURE_IMPORTANCE_ORDERING}_{FEATURE_ENGINEERING}_{PREPROCESSING}')

JiaoCheng Initialised
---
Successfully read in model <class 'sklearn.svm._classes.SVC'>, which is a Classification model
---
Read in Train X data
Read in Train y data
Read in Val X data
Read in Val y data
Read in Test X data
Read in Test y data
---
Successfully recorded hyperparameter choices
---
Successfully recorded non_tuneable_hyperparameter choices
---
---
---
---
Error message: nan
Error Importing this Row: (32, C                                 NaN
tol                               NaN
max_iter                          NaN
gamma                             NaN
Train accu                   0.777857
Val accu                     0.778148
Test accu                    0.777778
Train balanced_accu          0.500561
Val balanced_accu            0.500833
Test balanced_accu                0.5
Train f1                     0.681193
Val f1                       0.681437
Test f1                      0.680556
Train precision              0.732073
Val precision                0.827385
Test pre

In [10]:
# ignore warnings

import warnings
warnings.filterwarnings("ignore")

In [11]:
# begin tuning ... (or continue training)

tuner.tune()


Default combo: [4, 1, 4, 0] 


FEATURE_INDEX: 0 (C) ROUND 1

Round 1 
Hyperparameter: C (index: 0) 

As new Best Combo (0, 1, 4, 0) was read in, best_clf is set to None
Already Trained and Tested combination (0, 1, 4, 0), which had val score of 0.4467
        Current best combo: (0, 1, 4, 0) with val score 0.4467. 
        Has trained 32 of 432 combinations so far
As new Best Combo (1, 1, 4, 0) was read in, best_clf is set to None
Already Trained and Tested combination (1, 1, 4, 0), which had val score of 0.5375
        Current best combo: (1, 1, 4, 0) with val score 0.5375. 
        Has trained 32 of 432 combinations so far
Already Trained and Tested combination (2, 1, 4, 0), which had val score of 0.446
        Current best combo: (1, 1, 4, 0) with val score 0.5375. 
        Has trained 32 of 432 combinations so far
As new Best Combo (3, 1, 4, 0) was read in, best_clf is set to None
Already Trained and Tested combination (3, 1, 4, 0), which had val score of 0.5704
        Current be

In [12]:
from sklearn.metrics import balanced_accuracy_score
from sklearn.model_selection import train_test_split
import json

with open('../../models/tmp_models/jiaochengb-balaccu_svc(rbf)_xgb_tfidf_Baseline.pickle', 'rb') as f:
    clf = pickle.load(f)
# with open('../models/xgb_tfidf_feature_importance_ordering_Baseline.pickle', 'rb') as f:
#     feature_importance_ordering = pickle.load(f)
    

data1 = []
with open('../../data/raw/comp90051-2024s1-project-1/domain1_train_data.json', 'r') as f:
    for line in f:
        data1.append(json.loads(line))

data2 = []
with open('../../data/raw/comp90051-2024s1-project-1/domain2_train_data.json', 'r') as f:
    for line in f:
        data2.append(json.loads(line))

data_test = []
with open('../../data/raw/comp90051-2024s1-project-1/test_data.json', 'r') as f:
    for line in f:
        data_test.append(json.loads(line))

future_tfidf = load_npz('../../data/curated/baseline/future_tfidf_x.npz')

# create domain labels for data
for i in range(len(data1)):
    data1[i]['domain'] = 1
for i in range(len(data2)):
    data2[i]['domain'] = 2

label1 = [instance['label'] for instance in data1]
label2 = [instance['label'] for instance in data2]

train_ix_1, val_test_ix_1 = train_test_split(range(len(data1)), test_size=0.3, random_state=SEED, stratify = label1)
train_ix_2, val_test_ix_2 = train_test_split(range(len(data2)), test_size=0.3, random_state=SEED, stratify = label2)
val_ix_1, test_ix_1 = train_test_split(val_test_ix_1, test_size=0.5, random_state=SEED, stratify = [data1[i]['label'] for i in val_test_ix_1])
val_ix_2, test_ix_2 = train_test_split(val_test_ix_2, test_size=0.5, random_state=SEED, stratify = [data2[i]['label'] for i in val_test_ix_2])

# split data according to the index from train_test_split
train_data_1 = [data1[i] for i in train_ix_1]
val_data_1 = [data1[i] for i in val_ix_1]
test_data_1 = [data1[i] for i in test_ix_1]
train_data_2 = [data2[i] for i in train_ix_2]
val_data_2 = [data2[i] for i in val_ix_2]
test_data_2 = [data2[i] for i in test_ix_2]

# combine the data
train_data = train_data_1 + train_data_2
val_data = val_data_1 + val_data_2
test_data = test_data_1 + test_data_2

train_label = [instance['label'] for instance in train_data]
val_label = [instance['label'] for instance in val_data]
test_label = [instance['label'] for instance in test_data]


train_domain = [instance['domain'] for instance in train_data]
val_domain = [instance['domain'] for instance in val_data]
test_domain = [instance['domain'] for instance in test_data]

val_pred = clf.predict(val_x)
test_pred = clf.predict(test_x)

print(balanced_accuracy_score(val_y, val_pred), balanced_accuracy_score(test_y, test_pred))

print()

val_dom1_pred = [val_pred[i] for i in range(len(val_pred)) if val_domain[i] == 1]
val_dom2_pred = [val_pred[i] for i in range(len(val_pred)) if val_domain[i] == 2]
val_dom1_true = [val_label[i] for i in range(len(val_label)) if val_domain[i] == 1]
val_dom2_true = [val_label[i] for i in range(len(val_label)) if val_domain[i] == 2]

test_dom1_pred = [test_pred[i] for i in range(len(test_pred)) if test_domain[i] == 1]
test_dom2_pred = [test_pred[i] for i in range(len(test_pred)) if test_domain[i] == 2]
test_dom1_true = [test_label[i] for i in range(len(test_label)) if test_domain[i] == 1]
test_dom2_true = [test_label[i] for i in range(len(test_label)) if test_domain[i] == 2]


print(balanced_accuracy_score(val_dom1_true, val_dom1_pred), balanced_accuracy_score(val_dom2_true, val_dom2_pred), \
      (balanced_accuracy_score(val_dom1_true, val_dom1_pred) + balanced_accuracy_score(val_dom2_true, val_dom2_pred)) / 2, balanced_accuracy_score(val_label, val_pred))
print(balanced_accuracy_score(test_dom1_true, test_dom1_pred), balanced_accuracy_score(test_dom2_true, test_dom2_pred), \
        (balanced_accuracy_score(test_dom1_true, test_dom1_pred) + balanced_accuracy_score(test_dom2_true, test_dom2_pred)) / 2, balanced_accuracy_score(test_label, test_pred))

future_predict = clf.predict(future_tfidf)
predictions = pd.DataFrame({'id': range(len(future_predict)), 'class': future_predict})
predictions.to_csv('../..//predictions/jiaochengb-balaccu_svm_tfidf_Baseline.csv', index=False)

ValueError: X has 5 features, but SVC is expecting 68213 features as input.