# In [97]:
import pandas as pd
import numpy as np
import statistics as s
import copy
import time
from itertools import combinations
import pickle

from scipy.spatial.distance import cdist
from scipy.stats import t
from scipy import stats

from sklearn.metrics import r2_score, mean_absolute_percentage_error, mean_squared_error
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, balanced_accuracy_score

# In [91]:
class YangZhou:

    def __init__(self):
        """ Initialise class """
        self._initialise_objects()

        print('YangZhou Initialised')



    def _initialise_objects(self):
        """ Helper to initialise objects """
        self.train_x = None
        self.train_y = None
        self.val_x = None
        self.val_y = None
        self.test_x = None
        self.test_y = None
        self.tuning_result = None
        self.model = None
        self.parameter_choices = None
        self.hyperparameters = None
        self.checked = None
        self.result = None
        self.tuning_result_saving_address = None
        self.object_saving_address = None
        self._up_to = 0
        self._seed = 19260817
        self.best_score = -np.inf
        self.best_combo = None
        self.best_clf = None
        self.clf_type = None
        self.combos = None
        self.n_items = None
        self.outmost_layer = None
        self._core = None
        self._relative_combos = None
        self._both_combos = None
        self._dealt_with = None
        self._pos_neg_combos = None
        self._abs_max = None
        self._new_combos = None

        self.regression_extra_output_columns = ['Train r2', 'Val r2', 'Test r2', 
            'Train RMSE', 'Val RMSE', 'Test RMSE', 'Train MAPE', 'Val MAPE', 'Test MAPE', 'Time']
        self.classification_extra_output_columns = ['Train accu', 'Val accu', 'Test accu', 
            'Train balanced_accu', 'Val balanced_accu', 'Test balanced_accu', 'Train f1', 'Val f1', 'Test f1', 
            'Train precision', 'Val precision', 'Test precision', 'Train recall', 'Val recall', 'Test recall', 'Time']



    def read_in_data(self, train_x, train_y, val_x, val_y, test_x, test_y):
        """ Reads in train validate test data for tuning """

        self.train_x = train_x
        print("Read in Train X data")

        self.train_y = train_y
        print("Read in Train x data")

        self.val_x = val_x
        print("Read in Val X data")

        self.val_y = val_y
        print("Read in Val y data")

        self.test_x = test_x
        print("Read in Test X data")

        self.test_y = test_y
        print("Read in Test y data")



    def set_hyperparameters(self, parameter_choices):
        """ Input hyperparameter choices """

        self.parameter_choices = parameter_choices

        self.hyperparameters = list(parameter_choices.keys())

        # automatically calculate how many different values in each hyperparameter
        self.n_items = [len(parameter_choices[key]) for key in self.hyperparameters]
        self.num_hyperparameters = {hyperparameter:len(parameter_choices[hyperparameter]) for hyperparameter in self.hyperparameters}

        # automatically calculate all combinations and setup checked and result arrays and tuning result dataframe
        # self._get_combinations() ## TODO: discard
        self._get_checked_and_result_array()
        self._setup_tuning_result_df()

        print("Successfully recorded hyperparameter choices")



    def set_tuning_result_saving_address(self, address):
        """ Read in where to save tuning object """

        self.tuning_result_saving_address = address
        print('Successfully set tuning output address')



    def _set_object_saving_address(self, address):
        """ Read in where to save the YangZhou object """

        self.object_saving_address = address
        print('Successfully set object output address')



    def _get_checked_and_result_array(self):
        """ Helper to set up checked and result array """

        self.checked = np.zeros(shape=self.n_items)
        self.result = np.zeros(shape=self.n_items)
        self.checked_core = np.zeros(shape=self.n_items)
        self.been_best = np.zeros(shape=self.n_items)
        self.been_cruised = np.zeros(shape=self.n_items)



    def _setup_tuning_result_df(self):
        """ Helper to set up tuning result dataframe """

        tune_result_columns = copy.deepcopy(self.hyperparameters)

        # Different set of metric columns for different types of models
        if self.clf_type == 'Classification':
            tune_result_columns.extend(self.classification_extra_output_columns)
        elif self.clf_type == 'Regression':
            tune_result_columns.extend(self.regression_extra_output_columns)

        self.tuning_result = pd.DataFrame({col:list() for col in tune_result_columns})



    def read_in_model(self, model, type):
        """ Reads in underlying model object for tuning, and also read in what type of model it is """

        assert type == 'Classification' or type == 'Regression' # check

        # record
        self.model = model
        self.clf_type = type 

        print(f'Successfully read in model {self.model}, which is a {self.clf_type} model')



    def _save_tuning_result(self):
        """ Helper to export tuning result csv """

        self.tuning_result.to_csv(self.tuning_result_saving_address, index=False)

    

    def read_in_tuning_result_df(self, address): 
        """ Read in tuning result csv and read data into checked and result arrays """

        if self.parameter_choices is None:
            print("Missing parameter_choices to build parameter_value_map_index, please run set_hyperparameters() first")

        if self.clf_type is None:
            print('Missing clf_type. Please run .read_in_model() first.')

        self.tuning_result = pd.read_csv(address)
        print(f"Successfully read in tuning result of {len(self.tuning_result)} rows")

        self._create_parameter_value_map_index()

        # read DataFrame data into internal governing DataFrames of YangZhou
        for row in self.tuning_result.iterrows():
    
            combo = tuple([self.parameter_value_map_index[hyperparam][row[1][hyperparam]] for hyperparam in self.hyperparameters])
            
            self.checked[combo] = 1
            
            if self.clf_type == 'Regression':
                self.result[combo] = row[1]['Val r2']
            elif self.clf_type == 'Classification':
                self.result[combo] = row[1]['Val accu']
        
            # update best score stats
            if self.result[combo] > self.best_score: 
                self.best_score = self.result[combo]
                self.best_clf = None
                print(f"As new Best Combo {combo} is read in, best_clf is set to None")
                self.best_combo = combo


    
    def _create_parameter_value_map_index(self):
        """ Helper to create parameter-value index map """

        self.parameter_value_map_index = dict()
        for key in self.parameter_choices.keys():
            tmp = dict()
            for i in range(len(self.parameter_choices[key])):
                tmp[self.parameter_choices[key][i]] = i
            self.parameter_value_map_index[key] = tmp



    def export_yangzhou(self, address):
        """ Export yangzhou object """

        self._set_object_saving_address(address)

        # copy object and set big objects to None
        object_save = copy.deepcopy(self)
        
        object_save.train_x = None
        object_save.train_y = None
        object_save.val_x = None
        object_save.val_y = None
        object_save.test_x = None
        object_save.test_y = None
        object_save._up_to = 0

        # Export
        with open(f'{self.object_saving_address}.pickle', 'wb') as f:
            pickle.dump(object_save, f)

        print(f'Successfully exported YangZhou object as {self.object_saving_address}')



    def view_best_combo_and_score(self):
        """ View best combination and its validation score """
        
        print(f'(Current) Best combo: {self.best_combo} with val score {self.best_score}')



    def _get_core(self):
        """ Helper to calculate core """
        self._core = [int(i/2) for i in self.n_items]



    def get_cruise_combinations(self):

        self.get_cruise_indices_values()
        self.generate_cruise_combinations()



    def get_cruise_indices_values(self):
        """ get cruise indices values of each dimension which serves as building blocks for cruise combinations """

        self.cruise_indices = dict()
        for hyperparameter in self.hyperparameters:
            self.cruise_indices[hyperparameter] = self.get_cruise_indices_1d(d_val = self.num_hyperparameters[hyperparameter], max_jump = 5)

        ##TODO: Can toggle with get_cruise_indices!!!! should add parameter to do so!!
        self.cruise_indices_values = list(self.cruise_indices.values())



    def get_cruise_indices_1d(self, d_val, max_jump = 5): ## TODO: max_jump
        """ Returns the appropriate cruise indices based on the number of values in dimension. Second argument controls maximum split size, defaulted to 5 """

        assert type(d_val) is int and type(max_jump) is int, "Error: type of input(s) is not int"
        assert d_val >= 2, "Error: argument 1 (number of values in this dimension) must be >= 2"
        assert max_jump >= 1, "Error: max_jump must be >= 1"

        gap = d_val - 1
        split = ((gap-1)//max_jump)

        jump = self.find_gaps(split, gap)

        cruise_indices_1d = self.find_cruise_indices_1d(jump)

        return cruise_indices_1d



    def find_gaps(self, split, gap):
        """ Split gap into split+1 jumps that are as even as possible; the larger jumps come first """

        # no cut point: the whole gap is one jump
        if split <= 0:
            return [gap]

        pieces = split + 1
        base, leftover = divmod(gap, pieces)

        # the first `leftover` jumps take one extra step each
        return [base + 1 if position < leftover else base for position in range(pieces)]



    def find_cruise_indices_1d(self, jump):
        """ Turn the jump sizes into the actual cruise indices: 0 followed by the running totals """

        position = 0
        indices = [position]

        for step in jump:
            position += step
            indices.append(position)

        return indices



    def generate_cruise_combinations(self):
        
        self.cruise_combinations = [[]]
        for i in range(len(self.cruise_indices_values)):

            tmp = copy.deepcopy(self.cruise_combinations)
            self.cruise_combinations = list()

            for x in tmp:

                for k in self.cruise_indices_values[i]:
                    y = copy.deepcopy(x)
                    
                    y.append(k)

                    self.cruise_combinations.append(y)



    def sort_cruise_coordinates(self, max_combo):
        """ sort the cruise coordinates based on distance from current max"""


        edist = list(cdist([max_combo], self.cruise_coordinates).flatten())
        ordered_cruise_coordinates = [(self.cruise_coordinates[i], edist[i]) for i in range(len(self.cruise_coordinates))]

        ordered_cruise_coordinates.sort(reverse=True, key=lambda x: x[1])

        sorted_cruise_coordinates = [ordered_cruise_coordinates[i][0] for i in range(len(ordered_cruise_coordinates))]

        return sorted_cruise_coordinates

    

    def get_max_surrounding_mean_sd(self):

        best_combo_surrounding_combos = self.get_surrounding_coordinates(self.best_combo)
        best_combo_surrounding_scores = [self.best_score]
        for combo in best_combo_surrounding_combos:
            best_combo_surrounding_scores.append(self.result[tuple(combo)])

        max_surrounding_sd = s.stdev(best_combo_surrounding_scores)

        return max_surrounding_sd



    def cruise_warning_threshold(max_accuracy, max_surrounding_sd, max_surrounding_n):
        """ max - halfwidth """

        # TO ADJUST: 0.95 to 0.975; 
        # TO ADJUST: max change to mean value
        qt = t.ppf(0.95, max_surrounding_n-1) # One sided test
        halfwidth = max_surrounding_sd * qt * 1/np.sqrt(max_surrounding_n)

        return max_accuracy - halfwidth



    def CruiseSystem(self):

        print(f"BEGIN CRUISING: ROUND {self.been_cruisedrestarts}\n") # TODO

        # get cruise coordinates in sorted order (furthest away from current max)
        sorted_cruise_coordinates = self.sort_cruise_coordinates(self.best_combo)
            # 理论：如果往少train，则应该randomise，或者就近来train (更可能得早点把score刷高，不触发)

        # calculate warning threshold
        max_surrounding_sd = self.get_max_surrounding_mean_sd()


        warning_threshold = self.cruise_warning_threshold(self.best_score, max_surrounding_sd, len(self.surrounding_vectors)-1)

        # check each cruise coordinate
        for cruise_combo in sorted_cruise_coordinates:

            # only search if it hasn't been cruised before (if has then is not an artifect of significance)
            if not self.been_cruised[tuple(cruise_combo)]:
                
                self.been_cruised[tuple(cruise_combo)] == 1

                # if above warning threshold, then stop cruise and restart guide
                if self.result[tuple(cruise_combo)] >= warning_threshold:
                   
                    print(f"YANGZHOU CRUISE SUSPECTED DUE TO SUSPICIOUS CASE")
                    print(f"YANGZHOU CRUISE STAGE {self.restarts} ENDED, RESTARTING GUIDANCE\n")
                    
                    self._core = cruise_combo
                    return

        # if reach here then all cruise indicies checked. can safely say end cruise
        print(f"YANGZHOU CRUISE STAGE {self.restarts} ENDED\n")
        print(f"YANGZHOU CRUISE SYSTEM SHUTDOWN\n\n")
        
        self._core = []
        self.cruising = False
        return



    def _get_core(self):
        """ Helper to calculate core """
        self._core = [int(i/2) for i in self.n_items]



    def get_surrounding_vectors(self, core):
        """ Build the VECTORS (every entry -1, 0 or 1) that move a core to each COORDINATE of the 3^d block around it """

        offsets = (-1, 0, 1)

        # start with the one-dimensional moves and add one dimension per pass
        vectors = [[offset] for offset in offsets]

        for _ in range(len(core) - 1):
            vectors = [vector + [offset] for vector in vectors for offset in offsets]

        return vectors



    def get_surrounding_coordinates(self, core):
        """ Use surrounding VECTORS to find surrounding COORDINATES """

        assert len(self.surrounding_vectors) > 0
        assert len(self.surrounding_vectors[0]) == len(core)

        surrounding_coords = list()
        for i in range(len(self.surrounding_vectors)):
            new_coord = self.new_coordinates(core, self.surrounding_vectors[i])
            if new_coord is not False:
                surrounding_coords.append(new_coord)

        return surrounding_coords



    def new_coordinates(self, core, vector):
        """ Get particular COORDINATE using a move in direction of VECTOR from particular CORE """

        assert len(core) == len(vector)

        new_coord = list()
        for i in range(len(vector)):
            val = core[i] + vector[i]
            if val >= self.n_items[i] or val < 0:
                return False
            new_coord.append(val)

        return new_coord



    def find_horizontal(self, surrounding_coordinates, core):
        """ Build the treatment and null blocks of every 'Horizontal' move (one step along a single dimension)

        Returns three parallel lists: for each dimension and each step (-1, then +1) the
        coordinates whose entry in that dimension equals the moved value (treatment), the
        coordinates whose entry equals the core's value (null), and the move vector itself.
        """

        treatment, null, direction = [], [], []

        for axis, core_value in enumerate(core):

            for step in (-1, 1):
                moved_value = core_value + step

                treatment.append([coord for coord in surrounding_coordinates if coord[axis] == moved_value])
                null.append([coord for coord in surrounding_coordinates if coord[axis] == core_value])

                # unit move along this axis only
                direction.append([step if other == axis else 0 for other in range(len(core))])

        return treatment, null, direction



    def pick_x(self, i, core):
        """ Return every way of choosing i positions out of the positions of core (used to build diagonal treatments) """

        positions = range(len(core))

        return list(combinations(positions, i))



    def get_indices(self, core):
        """ Get combinations of index to be used to find diagonal treatments: part of special algorithm """

        indices = list()
        for i in range(len(core)):
            for obj in self.pick_x(i, core):
                indices.append(obj)

        return indices



    def find_diagonal(self, core, indices):
        """ Find the treatment and nulls block from a 'Diagonal' vector move (effectively interaction of all params) """

        treatment = list()
        null = list()

        diagonals = self.get_diagonals()

        for diagonal in diagonals:
            treatment.append(self.get_diagonal_treatment(core, diagonal, indices))

            null.append(self.get_diagonal_null(core, diagonal))

        return treatment, null, diagonals



    def get_diagonals(self):
        """ Find all the diagonal vectors """

        return [obj for obj in self.surrounding_vectors if 0 not in obj]



    def get_diagonal_treatment(self, core, diagonal, indices):
        """ Find all diagonal treatments of this diagonal direction - any vector that has from 1 to d elements in same
        direction diagonal, and all other vector positions 0 """
        treatment = self.get_surrounding_coordinates(core, self.get_diag_treatment_vectors(indices, diagonal))

        return treatment



    def get_diag_treatment_vectors(self, indices, diagonal):
        """ Build one vector per entry of indices: it copies diagonal at the listed positions and is 0 everywhere else """

        dimension = len(diagonal)
        vectors = []

        for chosen_positions in indices:
            vector = [0] * dimension

            for position in chosen_positions:
                vector[position] = diagonal[position]

            vectors.append(vector)

        return vectors



    def get_diagonal_null(self, core, diagonal):
        """ Find all diagonal nulls of this diagonal direction - any vector that is orthogonal to the current direction """

        null = list()
        for surrounding_vector in self.surrounding_vectors:
            if np.dot(surrounding_vector, diagonal) == 0:
                new_coord = self.new_coordinates(core, surrounding_vector)
                if new_coord is not False:
                    null.append(new_coord)

        return null



    def get_blocks(self, core, surrounding_coordinates, indices):
        """ Get all blocks' treatments and nulls (in respective lists) (both horizontal and diagonal) """

        treatment = list()
        null = list()
        direction = list()

        hor_treatment, hor_null, hor_dir = self.find_horizontal(surrounding_coordinates, core)

        diag_treatment, diag_null, vert_dir = self.find_diagonal(core, indices)

        treatment.extend(hor_treatment)
        treatment.extend(diag_treatment)

        null.extend(hor_null)
        null.extend(diag_null)

        direction.extend(hor_dir)
        direction.extend(vert_dir)

        return treatment, null, direction



    def get_treat_or_null_tune_scores(self, treat_or_null):
        """ Return as the relevant scores as a list to be used for t_test """

        treat_or_null_score = dict()

        for combo in treat_or_null:
            treat_or_null_score[tuple(combo)] = self.result[tuple(combo)]

        return treat_or_null_score



    def dict_arg_max(self, dic):
        """ Return the key with the largest value; on ties the key that comes first wins """

        largest = -np.inf

        for candidate, value in dic.items():
            # strict comparison keeps the earliest key among equal values
            if value > largest:
                largest, max_key = value, candidate

        return max_key



    def find_new_core(self, treatment, null, direction):
        """ Only positive mean and < 0.05 """

        assert len(treatment) == len(null)
        assert len(treatment) == len(direction)

        new_cores = list()

        for i in range(len(treatment)): # TODO: also record the p values, so we only accept those with lowerst p value - but may need to adjust bottom's 'checked_core'
            if len(treatment[i]) <= 1 or len(null[i]) <= 1:
                continue

            bool_inc = np.mean(list(self.get_treat_or_null_tune_scores(treatment[i]).values())) > np.mean(
                list(self.get_treat_or_null_tune_scores(null[i]).values()))
            
            if bool_inc:

                p_val = stats.ttest_ind(list(self.get_treat_or_null_tune_scores(treatment[i]).values()),
                                    list(self.get_treat_or_null_tune_scores(null[i]).values()),
                                    equal_var=False).pvalue #TODO: True or False #尝试改one sided？

                # print(direction[i])
                # print(bool_inc, '\t', p_val)

                if p_val < 0.05:
                    
                    max_combo_of_treatment = self.dict_arg_max(self.get_treat_or_null_tune_scores(treatment[i]))

                    if not self.checked_core[max_combo_of_treatment]:
                        new_cores.append(max_combo_of_treatment)
                        self.checked_core[max_combo_of_treatment] = 1

        # print('\n')

        return new_cores



    def get_new_cores(self, core):

        # if (should be rare) case where core has been a core before, then skip. For prevention of infinite loops
        # 2 means actual checked core, 1 means appended to checked core list but not checked
        if self.checked_core[tuple(core)] == 2:
            print('Prev checked:', core, '!\n')
            return
        else:
            self.checked_core[tuple(core)] = 2
            

        # prepare data for welch test
        surrounding_coordinates = self.get_surrounding_coordinates(core)

        indices = self.get_indices(core)

        # put coordinates into treatments and nulls
        treatment, null, direction = self.get_blocks(core, surrounding_coordinates, indices)

        # actually tune the surrounding coordinates
        for combo in surrounding_coordinates:
            
            if self.checked[tuple(combo)] == 0:
                self._train_and_test_combo(combo)

        # perform welch test and return surrounding coordinates that should be used as new core
        new_cores = self.find_new_core(treatment, null, direction)

        return new_cores  



    def GuidanceSystem(self, core):

        if self.restarts == 0:
            print("BEGIN INITIAL GUIDANCE\n")
        else:
            print("RESTART GUIDANCE: ROUND", self.restarts, '\n')

        print('ROUND', self.restarts, 'ITERATION: ', 0, '\n')

        # first get a surrounding 3^d tuned
        new_cores = self.get_new_cores(core)

        round = 1
        while new_cores: # while new cores are being added
            print('ROUND', self.restarts, "ITERATION: ", round, "\n") #TODO
            round += 1

            print('New cores:', new_cores, '\n')
            old_new_cores = copy.deepcopy(new_cores)
            new_cores = list()

            # for each of the new cores, 'recursively' tune and grab new cores; but each Iteration doesn't end until all cores of current round has been checked
            for new_core in old_new_cores:
                
                new_new_cores = self.get_new_cores(new_core)
                for new_new_core in new_new_cores:
                    if self.checked_core[tuple(new_new_core)] == 0:
                        new_cores.append(new_new_core)
                        self.checked_core[tuple(new_new_core)] = 1

        # for current max, get 3^d block. if new max happens to be found, continue to do 3^d block until no new max is found
        # just a cheap way to flesh out the max (the goal of YangZhou)

        while self.been_best[tuple(self.best_combo)] == 0:

            self.been_best[tuple(self.best_combo)] = 1
            #add surrounding find!! ##functionalise
            surrounding_coordinates = self.get_surrounding_coordinates(self.best_combo)
            for combo in surrounding_coordinates:
                
                if self.checked[tuple(combo)] == 0:
                    self._train_and_test_combo(combo)
                    self.checked[combo] = 1

        # print information of this round # TODO: comment
        if self.restarts == 0:
            print(f"\nYANGZHOU INITIAL GUIDE STAGE ENDED\n")
        else:
            print(f"\nYANGZHOU GUIDE STAGE {self.restarts} ENDED\n")


        if len(self.hyperparameters) == 2:
            print('Score: \n', self.result.round(4), '\n')
            print('Checked Boxes: \n', self.checked.round(4), '\n')

        print('Max Accuracy From This Guidance Round: \n', self.best_score)
        print('Max Combo From This Guidance Round: \n', self.best_combo)

        print('% Combos Checked Thus Far:', int(sum(self.checked)), 'out of', np.prod(self.n_items), 'which is', f'{np.mean(self.checked).round(8)*100}%')

        # print('Best Combo Found Thus Far?', max_accuracy == max(synthetic_data))
        # print('Accuracy Diff Between Max Combo and Max Found Thus Far:', max(synthetic_data)-max_accuracy)
        print("\n")



    def tune(self):
        """ Run the whole search.

        1. Train the cruise combinations and the initial core (the middle of the grid).
        2. Run an initial guidance stage from the best combination found so far.
        3. Alternate cruise and guidance stages until a cruise stage ends without
           finding a suspicious combination.
        4. Run final guidance stages around the best combo until the best score stops improving.

        Progress and a final summary are printed; the outcome is left in best_combo,
        best_score, result and checked.
        """

        print("YANGZHOU SYSTEM ACTIVATED\n\n") ##TODO

        self._get_core()
        self.get_cruise_combinations() #TODO: should append core too, also potential problem for there not to be enough cores to go around

        first_round_combinations = copy.deepcopy(self.cruise_combinations)
        first_round_combinations.append(self._core)

        #TODO: use type B to randomise first round

        #TODO: Message; this should be possible to functionalise
        # First, tune all cruise combinations and initial core
        for combo in first_round_combinations:

            if not self.checked[tuple(combo)]:

                self._train_and_test_combo(combo)

            else:
                #TODO: Message
                print(f'Already Trained and Tested combination {self._up_to}')

        print('Initial core:', self._core, '\n') #TODO: Message

        # continue from the best combination of the first round
        # (was self.best_core, an attribute that does not exist)
        self._core = self.best_combo
        self.surrounding_vectors = self.get_surrounding_vectors(self._core)

        print("YANGZHOU GUIDE SYSTEM ACTIVATED\n") #TODO

        # Initial Round of Guidance
        # GuidanceSystem reads self.restarts, so it has to exist before the first stage
        self.restarts = 0
        self.GuidanceSystem(self._core)  # was self.core, an attribute that does not exist

        # Recursively Cruise and restart Guide if find a combo that is within halfwidth of max
        print("YANGZHOU CRUISE SYSTEM ACTIVATED\n")
        self.cruising = True
        self.restarts = 1
        while self.cruising:
            self.CruiseSystem()

            # CruiseSystem switches cruising off when nothing suspicious was found
            if self.cruising:
                self.GuidanceSystem(self._core)
                self.restarts += 1

        # Final extensive search around maxes.
        print("YANGZHOU FINAL GUIDANCE ACTIVATED\n")
        old_max_accuracy = self.best_score
        self.restarts = 'FINAL'

        self.GuidanceSystem(self.best_combo)

        # keep going for as long as a final stage still improves the best score
        # (was self.max_score, an attribute that does not exist)
        while self.best_score - old_max_accuracy > 0:
            old_max_accuracy = self.best_score
            self.GuidanceSystem(self.best_combo)


        # Display final information
        print("YANGZHOU FINAL GUIDANCE STAGE ENDED")
        print("YANGZHOU MISSION ACCOMPLISHED\n")

        if len(self.hyperparameters) == 2:
            print('Final Found: \n', self.result.round(4), '\n')
            print('Final Checked Boxes: \n', self.checked.round(4), '\n')
            print('Final Checked Cores: \n', self.checked_core.round(4), '\n')

        print('Max Accuracy: \n', self.best_score)
        print('Max Combo: \n', self.best_combo)

        # ndarray.sum() adds up all cells; the builtin sum() only reduces the first axis
        print('% Combos Checked:', int(self.checked.sum()), 'out of', np.prod(self.n_items), 'which is', f'{np.mean(self.checked).round(8)*100}%')

        # print('Best Combo Found?', max_accuracy == max(synthetic_data))
        # print('Accuracy Diff Between Max Combo and Max Found:', max(synthetic_data)-max_accuracy)

    


    def _train_and_test_combo(self, combo):
        """ Helper to train and test each combination as part of tune()

        combo holds one index per hyperparameter (in the order of self.hyperparameters). The
        model is built with the corresponding values, fitted on the training data and scored
        on the train, validation and test data. One row is added to the tuning result (which
        is then saved), the best score/model/combo are updated from the validation score
        (r2 for Regression, accuracy for Classification) and the combination is recorded in
        the checked and result arrays.

        Raises ValueError if clf_type is neither 'Regression' nor 'Classification'.
        """

        combo = tuple(combo)

        if self.clf_type == 'Regression':
            # (column suffix, metric function); the first entry is the score used for tuning
            metrics = (
                ('r2', r2_score),
                ('RMSE', lambda y_true, y_pred: np.sqrt(mean_squared_error(y_true, y_pred))),
                ('MAPE', mean_absolute_percentage_error),
            )
        elif self.clf_type == 'Classification':
            # accuracy is computed with accuracy_score on the predictions for all three splits;
            # clf.score(y, predictions) is not a valid call, score() expects features and labels
            metrics = (
                ('accu', accuracy_score),
                ('balanced_accu', balanced_accuracy_score),
                ('f1', lambda y_true, y_pred: f1_score(y_true, y_pred, average='weighted')),
                ('precision', lambda y_true, y_pred: precision_score(y_true, y_pred, average='weighted')),
                ('recall', lambda y_true, y_pred: recall_score(y_true, y_pred, average='weighted')),
            )
        else:
            raise ValueError("clf_type must be 'Regression' or 'Classification'; run read_in_model() first")

        params = {name: self.parameter_choices[name][index] for name, index in zip(self.hyperparameters, combo)}

        # initialise object
        clf = self.model(**params)

        # get time and fit
        start = time.time()
        clf.fit(self.train_x, self.train_y)
        end = time.time()
        time_used = end-start

        # get predicted labels/values for three datasets
        splits = (
            ('Train', self.train_y, clf.predict(self.train_x)),
            ('Val', self.val_y, clf.predict(self.val_x)),
            ('Test', self.test_y, clf.predict(self.test_x)),
        )

        # build the output row in a separate dict (the constructor arguments stay untouched);
        # every value is wrapped in a list so that sequence-valued hyperparameters fit in one cell
        df_building_dict = {name: [value] for name, value in params.items()}

        val_score = None
        for metric_name, metric in metrics:
            for split_name, y_true, y_pred in splits:
                score = metric(y_true, y_pred)

                # column names follow the '<split> <metric>' names declared for the result table
                df_building_dict[f'{split_name} {metric_name}'] = [np.round(score, 4)]

                if val_score is None and split_name == 'Val':
                    val_score = score

        df_building_dict['Time'] = [np.round(time_used, 2)]

        tmp = pd.DataFrame(df_building_dict)

        # DataFrame.append no longer exists in pandas 2; pd.concat does the same job
        if self.tuning_result is None or self.tuning_result.empty:
            self.tuning_result = tmp
        else:
            self.tuning_result = pd.concat([self.tuning_result, tmp], ignore_index=True)
        self._save_tuning_result()

        # update best score stats
        if val_score > self.best_score:
            self.best_score = val_score
            self.best_clf = clf
            self.best_combo = combo

        # update internal governing arrays
        self.checked[combo] = 1
        self.result[combo] = val_score

        # count this combination so that the progress message is meaningful
        self._up_to += 1

        print(f'''Trained and Tested combination {self._up_to}, taking {np.round(time_used, 2)} seconds
        Current best combo: {self.best_combo} with val score {self.best_score}''')

# In [92]:
# Candidate values for every hyperparameter to tune, in the form that
# YangZhou.set_hyperparameters() takes: one entry per hyperparameter, the key being the
# keyword argument handed to the model constructor and the value the choices to try.
# NOTE(review): the names look like XGBoost parameters - confirm against the model in use.
parameter_choices = {
    'gamma': (0, 0.0001, 0.01, 1, 100),
    'subsample': (0.25, 0.5, 0.75),
    'colsample_bytree': (0.25, 0.5, 0.75),
    'max_depth': (5, 10, 25, 50, 100),
    'eta': (0.15, 0.3, 0.45, 0.6, 0.75, 0.9)

}

# In [93]:
# Create the tuner; the constructor prints 'YangZhou Initialised'
yangzhou = YangZhou()

# Output: YangZhou Initialised
