In [1]:
DATA_TYPE = 'RS'
LABEL = '3'

In [2]:
import pandas as pd
import copy
import time
import numpy as np
import random
import pickle

from sklearn.metrics import r2_score, mean_absolute_percentage_error, mean_squared_error
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, balanced_accuracy_score





class JiXi:



    def __init__(self):
        """ Initialise class """
        self._initialise_objects()

        print('JiXi Initialised')



    def _initialise_objects(self):
        """ Helper to initialise objects """

        self.train_x = None
        self.train_y = None
        self.val_x = None
        self.val_y = None
        self.test_x = None
        self.test_y = None
        self.tuning_result = None
        self.model = None
        self.parameter_choices = None
        self.hyperparameters = None
        self.checked = None
        self.result = None
        self.tuning_result_saving_address = None
        self.object_saving_address = None
        self._up_to = 0
        self._seed = 19421221
        self.best_score = -np.inf
        self.best_combo = None
        self.best_clf = None
        self.clf_type = None
        self.combos = None
        self.n_items = None
        self.outmost_layer = None
        self._core = None
        self._relative_combos = None
        self._both_combos = None
        self._dealt_with = None
        self._pos_neg_combos = None
        self._abs_max = None
        self._new_combos = None
        self._parameter_value_map_index = None

        self.regression_extra_output_columns = ['Train r2', 'Val r2', 'Test r2', 
            'Train RMSE', 'Val RMSE', 'Test RMSE', 'Train MAPE', 'Val MAPE', 'Test MAPE', 'Time']
        self.classification_extra_output_columns = ['Train accu', 'Val accu', 'Test accu', 
            'Train balanced_accu', 'Val balanced_accu', 'Test balanced_accu', 'Train f1', 'Val f1', 'Test f1', 
            'Train precision', 'Val precision', 'Test precision', 'Train recall', 'Val recall', 'Test recall', 'Time']

        

    def read_in_data(self, train_x, train_y, val_x, val_y, test_x, test_y):
        """ Reads in train validate test data for tuning """

        self.train_x = train_x
        print("Read in Train X data")

        self.train_y = train_y
        print("Read in Train x data")

        self.val_x = val_x
        print("Read in Val X data")

        self.val_y = val_y
        print("Read in Val y data")

        self.test_x = test_x
        print("Read in Test X data")

        self.test_y = test_y
        print("Read in Test y data")



    def read_in_model(self, model, type):
        """ Reads in underlying model object for tuning, and also read in what type of model it is """

        assert type == 'Classification' or type == 'Regression' # check

        # record
        self.model = model
        self.clf_type = type 

        print(f'Successfully read in model {self.model}, which is a {self.clf_type} model')



    def set_hyperparameters(self, parameter_choices):
        """ Input hyperparameter choices """

        self.parameter_choices = parameter_choices
        self._sort_hyperparameter_choices()

        self.hyperparameters = list(parameter_choices.keys())

        # automatically calculate how many different values in each hyperparameter
        self.n_items = [len(parameter_choices[key]) for key in self.hyperparameters]

        # automatically calculate all combinations and setup checked and result arrays and tuning result dataframe
        self._get_combinations()
        self._get_checked_and_result_array()
        self._setup_tuning_result_df()

        print("Successfully recorded hyperparameter choices")



    def _sort_hyperparameter_choices(self):
        """ Helper to ensure all hyperparameter choice values are in order from lowest to highest """

        for key in self.parameter_choices:
            tmp = copy.deepcopy(list(self.parameter_choices[key]))
            tmp.sort()
            self.parameter_choices[key] = tuple(tmp)



    def _get_combinations(self):
        """ Helper to calculate all combinations """

        ##ALGORITHM

        # recursively append values to get every combination in ordinal/numerical form
        self.combos = [[]]
        for i in range(len(self.n_items)):

            tmp = copy.deepcopy(self.combos)
            self.combos = list()

            for x in tmp:

                for k in range(self.n_items[i]):
                    y = copy.deepcopy(x)
                    
                    y.append(k)

                    self.combos.append(y)



    def _get_checked_and_result_array(self):
        """ Helper to set up checked and result array """

        self.checked = np.zeros(shape=self.n_items)
        self.result = np.zeros(shape=self.n_items)



    def _setup_tuning_result_df(self):
        """ Helper to set up tuning result dataframe """

        tune_result_columns = copy.deepcopy(self.hyperparameters)

        # Different set of metric columns for different types of models
        if self.clf_type == 'Classification':
            tune_result_columns.extend(self.classification_extra_output_columns)
        elif self.clf_type == 'Regression':
            tune_result_columns.extend(self.regression_extra_output_columns)

        self.tuning_result = pd.DataFrame({col:list() for col in tune_result_columns})


    
    def change_tuning_style(self, type, seed = None, outer_most_layer = 2, randomise = True): 
        # Function which determines how to order the combinations for tuning

        if not self.combos:
            print("Missing hyperparameter choices, please run .set_hyperparameters() first")
            return

        self.combos.sort() # to ensure functionality of seed, always sort first

        if type == 'a': # Sorted order (nested loops)
            # sorting operation conducted previously
            print('Changed tuning order to sorted')
        
        elif type == 'b': # Random order
            
            if seed:
                random.seed(seed)
            else:
                random.seed(self._seed)
            
            random.shuffle(self.combos)
            print('Changed tuning_order to randomised')
        
        elif type == 'c': # Layer by Layer 
            self._change_tuning_style_c(outer_most_layer, randomise, seed)
            if randomise:
                print(f'Changed tuning order to "Layer by Layer": {self.outmost_layer} Layers, randomised')
            else:
                print(f'Changed tuning order to "Layer by Layer": {self.outmost_layer} Layers, not randomised')

        elif type == 'd': # Diagonal + Horizontal first, before conducting layer by layer
            self._change_tuning_style_d()
            print(f'Changed tuning order to Diag-Hor -> "Layer by Layer": {self.outmost_layer} Layers, randomised')



    def _get_core(self):
        """ Helper to calculate core """
        self._core = [int(i/2) for i in self.n_items]



    def _get_relative_combos(self):
        """ Helper to calculate relative coordinates of combinations""" 
        self._relative_combos = [[combo[j] - self._core[j] for j in range(len(self.n_items))] for combo in self.combos]



    ### TYPE C
    def _get_both_combos(self):
        """ Helper to put (combos, relative combos) together into a tuple """
        self._get_relative_combos()
        self._both_combos = [[self.combos[i], self._relative_combos[i]] for i in range(len(self.combos))]



    def _get_layer_by_layer(self):
        """ Helper to get Layer by Layer order """

        retain = copy.deepcopy(self._both_combos)
        self._dealt_with = list()

        # starting with the outmost layer, ending with -1 because of the >
        for i in range(self.outmost_layer, -2, -1):
            tmp_retain = list()
            tmp_dealt_with = list()
            
            for item in retain:

                trigger = 1
                for j in item[1]:
                    if abs(j) > i:
                        tmp_dealt_with.append(item[0])
                        trigger = 0
                        break
                
                if trigger:
                    tmp_retain.append(item)

            retain = tmp_retain
            self._dealt_with.append(tmp_dealt_with)



    def _get_c_combos(self):
        """ Helper to get the combinations into Layer by Layer order """

        self.combos = list()
        # working with dealt_with backwards because we want the innermost layers first
        for i in range(len(self._dealt_with)-1, -1, -1):
            self.combos.extend(self._dealt_with[i])
            


    def _change_tuning_style_c(self, outmost_layer, randomise, seed):
        """ Helper to run all type-c helpers to get combinations into Layer by Layer order """

        if randomise:
            if seed:
                random.seed(seed)
            else:
                random.seed(self._seed)
        
        random.shuffle(self.combos)

        self.outmost_layer = outmost_layer
        self._get_core()
        self._get_both_combos()
        self._get_layer_by_layer()
        self._get_c_combos()



    ### TYPE D    
    def _change_tuning_style_d(self):
        """ Helper to all type-d helpers to get combinations into Layer by Layer order """

        self._change_tuning_style_c(outmost_layer= max(self.n_items), randomise = True, seed = self._seed)

        self._get_hor_combos()
        self._get_diag_combos()

        self._get_d_combos()



    def _get_hor_combos(self):
        """ Helper to get all combinations that lie on horizontal line from core """
        self._hor_combos = list()
        for i in range(len(self.n_items)):
            for j in range(self.n_items[i]):
                tmp = copy.deepcopy(self._core)
                tmp[i] = j
                self._hor_combos.append(tmp)



    def _get_pos_neg_combos(self):
        """ Helper to get all combinations of -1 and 1 """

        ##ALGORITHM
        self._pos_neg_combos = [[]]
        for i in range(len(self.n_items)):

            tmp = copy.deepcopy(self._pos_neg_combos)
            self._pos_neg_combos = list()

            for x in tmp:

                for k in (-1, 1):
                    y = copy.deepcopy(x)
                    
                    y.append(k)

                    self._pos_neg_combos.append(y)



    def _get_abs_max(self):
        """ Helper to get maximum absolute value of the relative combos """

        max_pos = max([self.n_items[i] - self._core[i] for i in range(len(self.n_items))])
        min_neg = min([0-self._core[i] for i in range(len(self.n_items))])

        self._abs_max = max((max_pos, abs(min_neg)))



    def _get_diag_combos(self):
        """ Helper to get all combinations that lie on diagonal line from core """

        # Implementation idea: first get all the diagonal combos (even those that go outside the field-space) 
        # by using all combinations of (-1, 1) multipled by each value from 1 to _abs_max, before eliminating combos that go
        # outside the field-space
        self._get_pos_neg_combos()
        self._get_abs_max()

        diag_rel_combos = list()
        for i in range(self._abs_max):
            diag_rel_combos.extend([[(i+1)*pos_neg_combo[j] for j in range(len(self.n_items))] for pos_neg_combo in self._pos_neg_combos])
        
        tmp_diag_combos = [[combo[j] + self._core[j] for j in range(len(self.n_items))] for combo in diag_rel_combos]

        self._diag_combos = list()
        
        for combo in tmp_diag_combos:
            trigger = 1
            for i in range(len(combo)):
                if combo[i] < 0 or combo[i] >= self.n_items[i]: # if outside field space then eliminate
                    trigger = 0
                    break
            
            if trigger:
                self._diag_combos.append(combo)



    def _get_d_combos(self):
        """ Helper to run all type-c helpers to get combinations into Diag-Hor -> Layer by Layer order """

        self._new_combos = list()

        self._new_combos.append(self._core)

        # put in diagonal first to get more variety
        for combo in self._diag_combos:
            if combo not in self._new_combos:
                self._new_combos.append(combo)

        for combo in self._hor_combos:
            if combo not in self._new_combos:
                self._new_combos.append(combo)

        # put in rest of the combos - already sorted in layer by layer order
        for combo in self.combos:
            if combo not in self._new_combos:
                self._new_combos.append(combo)
        
        self.combos = self._new_combos


        
    def tune(self): #TODO
        """ Begin tuning """

        if self.train_x is None or self.train_y is None or self.val_x is None or self.val_y is None or self.test_x is None or self.test_y is None:
            print(" Missing one of the datasets, please run .read_in_data() ")
            return

        if self.model is None:
            print(" Missing model, please run .read_in_model() ")
            return
        
        if self.combos is None:
            print("Missing hyperparameter choices, please run .set_hyperparameters() first")
            return

        if self.tuning_result_saving_address is None:
            print("Missing tuning result csv saving address, please run ._save_tuning_result() first")


        self._up_to = 0     # reset

        for combo in self.combos:
            
            self._up_to += 1

            if not self.checked[tuple(combo)]:

                self._train_and_test_combo(combo)
            
            else:
                print(f'Already Trained and Tested combination {self._up_to}')
      


    def _train_and_test_combo(self, combo):
        """ Helper to train and test each combination as part of tune() """

        combo = tuple(combo)
        
        params = {self.hyperparameters[i]:self.parameter_choices[self.hyperparameters[i]][combo[i]] for i in range(len(self.hyperparameters))}

        # initialise object
        clf = self.model(**params)

        # get time and fit
        start = time.time()
        clf.fit(self.train_x, self.train_y)
        end = time.time()

        # get predicted labels/values for three datasets
        train_pred = clf.predict(self.train_x)
        val_pred = clf.predict(self.val_x)
        test_pred = clf.predict(self.test_x)

        # get scores and time used
        time_used = end-start

        # build output dictionary and save result
        df_building_dict = params

        if self.clf_type == 'Regression':
            train_score = r2_score(self.train_y, train_pred)
            val_score = r2_score(self.val_y, val_pred)
            test_score = r2_score(self.test_y, test_pred)

            train_rmse = np.sqrt(mean_squared_error(self.train_y, train_pred))
            val_rmse = np.sqrt(mean_squared_error(self.val_y, val_pred))
            test_rmse = np.sqrt(mean_squared_error(self.test_y, test_pred))

            train_mape = mean_absolute_percentage_error(self.train_y, train_pred)
            val_mape = mean_absolute_percentage_error(self.val_y, val_pred)
            test_mape = mean_absolute_percentage_error(self.test_y, test_pred)

            df_building_dict['Train r2'] = [np.round(train_score, 4)]
            df_building_dict['Val r2'] = [np.round(val_score, 4)]
            df_building_dict['Test r2'] = [np.round(test_score, 4)]
            df_building_dict['Train RMSE'] = [np.round(train_rmse, 4)]
            df_building_dict['Val RMSE'] = [np.round(val_rmse, 4)]
            df_building_dict['Test RMSE'] = [np.round(test_rmse, 4)]
            df_building_dict['Train MAPE'] = [np.round(train_mape, 4)]
            df_building_dict['Val MAPE'] = [np.round(val_mape, 4)]
            df_building_dict['Test MAPE'] = [np.round(test_mape, 4)]
            df_building_dict['Time'] = [np.round(time_used, 2)]

        
        elif self.clf_type == 'Classification':
            train_score = accuracy_score(self.train_y, train_pred)
            val_score = clf.score(self.val_y, val_pred)
            test_score = clf.score(self.test_y, test_pred)

            train_bal_accu = balanced_accuracy_score(self.train_y, train_pred)
            val_bal_accu = balanced_accuracy_score(self.val_y, val_pred)
            test_bal_accu = balanced_accuracy_score(self.test_y, test_pred)

            train_f1 = f1_score(self.train_y, train_pred, average='weighted')
            val_f1 = f1_score(self.val_y, val_pred, average='weighted')
            test_f1 = f1_score(self.test_y, test_pred, average='weighted')

            train_precision = precision_score(self.train_y, train_pred, average='weighted')
            val_precision = precision_score(self.val_y, val_pred, average='weighted')
            test_precision = precision_score(self.test_y, test_pred, average='weighted')
        
            train_recall = recall_score(self.train_y, train_pred, average='weighted')
            val_recall = recall_score(self.val_y, val_pred, average='weighted')
            test_recall = recall_score(self.test_y, test_pred, average='weighted')

            df_building_dict['Train accu'] = [np.round(train_score, 4)]
            df_building_dict['Val accu'] = [np.round(val_score, 4)]
            df_building_dict['Test accu'] = [np.round(test_score, 4)]
            df_building_dict['Train balanced_accuracy'] = [np.round(train_bal_accu, 4)]
            df_building_dict['Val balanced_accuracy'] = [np.round(val_bal_accu, 4)]
            df_building_dict['Test balanced_accuracy'] = [np.round(test_bal_accu, 4)]
            df_building_dict['Train f1'] = [np.round(train_f1, 4)]
            df_building_dict['Val f1'] = [np.round(val_f1, 4)]
            df_building_dict['Test f1'] = [np.round(test_f1, 4)]
            df_building_dict['Train precision'] = [np.round(train_precision, 4)]
            df_building_dict['Val precision'] = [np.round(val_precision, 4)]
            df_building_dict['Test precision'] = [np.round(test_precision, 4)]
            df_building_dict['Train recall'] = [np.round(train_recall, 4)]
            df_building_dict['Val recall'] = [np.round(val_recall, 4)]
            df_building_dict['Test recall'] = [np.round(test_recall, 4)]
            df_building_dict['Time'] = [np.round(time_used, 2)]

        tmp = pd.DataFrame(df_building_dict)

        self.tuning_result = self.tuning_result.append(tmp)
        self._save_tuning_result()

        # update best score stats
        if val_score > self.best_score: 
            self.best_score = val_score
            self.best_clf = clf
            self.best_combo = combo

        # update internal governing DataFrames
        self.checked[combo] = 1
        self.result[combo] = val_score


        print(f'''Trained and Tested combination {self._up_to}, taking {np.round(time_used, 2)} seconds
        Current best combo: {self.best_combo} with val score {self.best_score}''')



    def _save_tuning_result(self):
        """ Helper to export tuning result csv """

        self.tuning_result.to_csv(f'{self.tuning_result_saving_address}.csv', index=False)


    
    def view_best_combo_and_score(self):
        """ View best combination and its validation score """
        
        print(f'(Current) Best combo: {self.best_combo} with val score {self.best_score}')

    

    def read_in_tuning_result_df(self, address): 
        """ Read in tuning result csv and read data into checked and result arrays """

        if self.parameter_choices is None:
            print("Missing parameter_choices to build parameter_value_map_index, please run set_hyperparameters() first")

        if self.clf_type is None:
            print('Missing clf_type. Please run .read_in_model() first.')

        self.tuning_result = pd.read_csv(address)
        print(f"Successfully read in tuning result of {len(self.tuning_result)} rows")

        self._create_parameter_value_map_index()

        # read DataFrame data into internal governing DataFrames of JiXi
        for row in self.tuning_result.iterrows():
    
            combo = tuple([self._parameter_value_map_index[hyperparam][row[1][hyperparam]] for hyperparam in self.hyperparameters])
            
            self.checked[combo] = 1
            
            if self.clf_type == 'Regression':
                self.result[combo] = row[1]['Val r2']
            elif self.clf_type == 'Classification':
                self.result[combo] = row[1]['Val accu']
        
            # update best score stats
            if self.result[combo] > self.best_score: 
                self.best_score = self.result[combo]
                self.best_clf = None
                print(f"As new Best Combo {combo} is read in, best_clf is set to None")
                self.best_combo = combo


    
    def _create_parameter_value_map_index(self):
        """ Helper to create parameter-value index map """

        self._parameter_value_map_index = dict()
        for key in self.parameter_choices.keys():
            tmp = dict()
            for i in range(len(self.parameter_choices[key])):
                tmp[self.parameter_choices[key][i]] = i
            self._parameter_value_map_index[key] = tmp
    


    def set_tuning_result_saving_address(self, address):
        """ Read in where to save tuning object """

        self.tuning_result_saving_address = address
        print('Successfully set tuning output address')


    
    def _set_object_saving_address(self, address):
        """ Read in where to save the JiXi object """

        self.object_saving_address = address
        print('Successfully set object output address')



    def export_jixi(self, address):
        """ Export jixi object """

        self._set_object_saving_address(address)

        # copy object and set big objects to None
        object_save = copy.deepcopy(self)
        
        object_save.train_x = None
        object_save.train_y = None
        object_save.val_x = None
        object_save.val_y = None
        object_save.test_x = None
        object_save.test_y = None
        object_save._up_to = 0

        # Export
        with open(f'{self.object_saving_address}.pickle', 'wb') as f:
            pickle.dump(object_save, f)

        print(f'Successfully exported JiXi object as {self.object_saving_address}')

In [3]:
from ZhongShan import *
import pickle
import copy
import numpy as np
import json

In [4]:
class JiaoCheng:

    def __init__(self, sanmin):
        """ Initialise class """
        self.sanmin = sanmin
        self._initialise_objects()

        print('JiaoCheng Initialised')



    def _initialise_objects(self):
        pass
    

    
    def get_feature_combinations(self, score_type, label, penalty_function_type, export_address = None):
        """ Function that gets combinations based on JiaoCheng's algorithm along with its score, based on
        inputted score type, label and penalty function type. Has option to export"""

        if score_type not in ('NMI', 'Abs Corr'):
            print("score_type should be either 'NMI' or 'Abs Corr'")
            return
        
        if label not in self.sanmin.label_columns:
            print("label should be in the designated labels column")
            return

        if penalty_function_type not in ('None', 'Mean', 'Max'):
            print("penalty_function_type should be either 'None' or 'Mean' or 'Max'")
            return

        # get the correct matrix
        if score_type == 'Abs Corr':
            score_matrix = self.sanmin.abs_corr_matrix
        elif score_type == 'NMI':
            score_matrix = self.sanmin.NMI_matrix
            
        # get the correct penalty function
        if penalty_function_type == 'None':
            funct = self._return_zero
        elif penalty_function_type == 'Mean':
            funct = np.mean
        elif penalty_function_type == 'Max':
            funct = max

        # object to output
        feature_combos_with_score = list()

        # starting with each individual feature
        for feature in self.sanmin.final_features[label]:
            if feature in self.sanmin.label_columns: # don't add if it is a label
                continue

            # initial current combo
            curr_combo = [feature]

            # initial current score
            curr_combo_score = score_matrix.loc[feature][label]

            # initial combo appended with its score
            feature_combos_with_score.append((copy.deepcopy(curr_combo), curr_combo_score))

            switch = True

            while switch is True:

                # temporary scores
                curr_added_value = 0
                curr_feature_to_add = None

                # for all try-able combinations
                for candidate_feature in self.sanmin.final_features[label]: 
                    if candidate_feature in curr_combo or candidate_feature in self.sanmin.label_columns: # don't try those already in, nor those that are labels
                        continue
                    
                    # get candidate's own corr with label
                    candidate_feature_score = score_matrix.loc[candidate_feature][label]

                    # get list of corr between candidate and features currently in combo
                    candidate_feature_relation_scores = list()
                    for curr_combo_feature in curr_combo:
                        candidate_feature_relation_scores.append(score_matrix.loc[candidate_feature][curr_combo_feature])
                    
                    # if candidate score - penalty > current best, then accept; else, don't accept
                    if candidate_feature_score - funct(candidate_feature_relation_scores) >= curr_added_value:
                        curr_added_value = candidate_feature_score - funct(candidate_feature_relation_scores)
                        curr_feature_to_add = candidate_feature

                # if managed to add something to combo, then continue loop and add to overall list, else break this loop
                if curr_feature_to_add is None:
                    switch = False

                else:
                    curr_combo.append(curr_feature_to_add)
                    curr_combo_score += curr_added_value
                    feature_combos_with_score.append((copy.deepcopy(curr_combo), curr_combo_score))

        # Remove duplicates and sort
        feature_combos_with_score = self._features_duplicate_removal(feature_combos_with_score)
        feature_combos_with_score.sort(key = lambda x: x[1])

        # Get pure feature combinations, and scores of feature combinations
        feature_combo = [feature_combo[0] for feature_combo in feature_combos_with_score]
        feature_combo_scores = [feature_combo[1] for feature_combo in feature_combos_with_score]

        print(f"{len(feature_combos_with_score)} feature combinations, with combo scores ranging from {round(feature_combo_scores[0], 4)} to {round(feature_combo_scores[-1], 4)}")
        

        # Export combinations and scores as a json
        if export_address:
            json_output = {'feature_combos_with_score': feature_combos_with_score, 
                            'feature_combo': feature_combo, 
                            'feature_combo_scores': feature_combo_scores}
                            
            with open(f'{export_address}.json', 'w') as f:
                json.dump(json_output, f, indent=2) 
            print("Export Completed")
    
        return feature_combos_with_score, feature_combo, feature_combo_scores


    def _return_zero(self, dummy):
        """ Helper function that returns 0 for penalty, no matter input """
        return 0
    


    def _features_duplicate_removal(self, feature_combos_with_score):
        """ Helper function that remove duplicate combinations """
        for i in range(len(feature_combos_with_score)):
            feature_combos_with_score[i][0].sort()

        feature_combos_with_score.sort(key = lambda x:x[0])

        duplicate_i = []

        feature_combos_no_duplicates = []

        for i in range(len(feature_combos_with_score)-1):
            if i in duplicate_i:
                continue

            if feature_combos_with_score[i][0] == feature_combos_with_score[i+1][0]:
                # retain the higher score
                if feature_combos_with_score[i][1] >= feature_combos_with_score[i+1][1]:
                    duplicate_i.append(i)
                else:
                    duplicate_i.append(i+1)
            
            if i not in duplicate_i:
                feature_combos_no_duplicates.append(feature_combos_with_score[i])

        return feature_combos_no_duplicates

In [5]:
with open(f'../models/AFL_pipeline_{DATA_TYPE}.pickle', 'rb') as f:
    sanmin = pickle.load(f)

In [6]:
jiaocheng = JiaoCheng(sanmin)

JiaoCheng Initialised


In [7]:
feature_combos_with_score_none, feature_combo_none, feature_combo_scores_none = jiaocheng.get_feature_combinations('Abs Corr', LABEL, 'None')
feature_combos_with_score_mean, feature_combo_mean, feature_combo_scores_mean =jiaocheng.get_feature_combinations('Abs Corr', LABEL, 'Mean')
feature_combos_with_score_max, feature_combo_max, feature_combo_scores_max =jiaocheng.get_feature_combinations('Abs Corr', LABEL, 'Max')

409 feature combinations, with combo scores ranging from 0.1016 to 3.6463
36 feature combinations, with combo scores ranging from 0.1016 to 0.2017
36 feature combinations, with combo scores ranging from 0.1016 to 0.2017


In [8]:
all_combos = list()
all_combos.extend(feature_combos_with_score_none)
all_combos.extend(feature_combos_with_score_mean)
all_combos.extend(feature_combos_with_score_max)
len(all_combos)

481

In [9]:
def features_duplicate_removal(feature_combos_with_score):
        """ Helper function that remove duplicate combinations """
        for i in range(len(feature_combos_with_score)):
            feature_combos_with_score[i][0].sort()

        feature_combos_with_score.sort(key = lambda x:x[0])

        duplicate_i = []

        feature_combos_no_duplicates = []

        for i in range(len(feature_combos_with_score)-1):
            if i in duplicate_i:
                continue

            if feature_combos_with_score[i][0] == feature_combos_with_score[i+1][0]:
                duplicate_i.append(i)
            
            if i not in duplicate_i:
                feature_combos_no_duplicates.append(feature_combos_with_score[i])

        return feature_combos_no_duplicates

In [10]:
all_combos = features_duplicate_removal(all_combos)

In [11]:
len(all_combos)

386

In [12]:
import pandas as pd
train_data = pd.read_csv(f'../data/curated/modelling/{DATA_TYPE}_Train_{LABEL}.csv')
val_data = pd.read_csv(f'../data/curated/modelling/{DATA_TYPE}_Validate_{LABEL}.csv')
test_data = pd.read_csv(f'../data/curated/modelling/{DATA_TYPE}_Test_{LABEL}.csv')

In [13]:
train_x = train_data.drop([LABEL], axis=1)
train_y = train_data[LABEL]
val_x = val_data.drop([LABEL], axis=1)
val_y = val_data[LABEL]
test_x = test_data.drop([LABEL], axis=1)
test_y = test_data[LABEL]

In [14]:
import statsmodels.api as sm

In [15]:
def get_accuracy_df(model, x, y, combo):
    pred_y = list((model.predict(x[combo])))

    accu_df = pd.DataFrame({'pred': pred_y, 'y': list(y)})

    accu_df['y_mean'] = np.mean(y)
    
    accu_df['TotErr'] = accu_df['y'] - accu_df['y_mean']
    accu_df['TotSqErr'] = np.power(accu_df['TotErr'], 2)

    accu_df['ResErr'] = accu_df['y'] - accu_df['pred']
    accu_df['ResSqErr'] = np.power(accu_df['ResErr'], 2)

    return accu_df

In [16]:
def get_r2(accu_df):

    return 1 - sum(accu_df['ResSqErr'])/sum(accu_df['TotSqErr'])

In [17]:
def get_adj_r2(accu_df, n, k):
    
    return 1-(sum(accu_df['ResSqErr'])/sum(accu_df['TotSqErr']))*((n-1)/(n-k-1))

In [18]:
def get_norm_llh(accu_df, n):

    llh = -(n/2)*np.log(sum(accu_df['ResSqErr'])/n)
    
    # sd_hat = sum(np.sqrt(accu_df['ResSqErr']))

    # llh = -0.5*n * np.log(2*np.pi) - n * np.log(sd_hat) - (0.5*sd_hat) * sum(accu_df['ResSqErr'])

    return llh

In [19]:
def get_aic(llh, k):

    aic = 2*k - 2*llh
    
    return aic

In [20]:
def get_bic(llh, n, k):

    bic = (k+1)*np.log(n) - 2*llh
    
    return bic

In [21]:
tuning_result = pd.DataFrame()

for combo in all_combos:

    OLS_lm_fit = sm.OLS(train_y, train_x[combo[0]]).fit()


    train_accu_df = get_accuracy_df(OLS_lm_fit, train_x, train_y, combo[0])
    val_accu_df = get_accuracy_df(OLS_lm_fit, val_x, val_y, combo[0])
    test_accu_df = get_accuracy_df(OLS_lm_fit, test_x, test_y, combo[0])
    
    n_train = len(train_accu_df)
    n_val = len(val_accu_df)
    n_test = len(test_accu_df)
    k = len(combo)

    train_r2 = get_r2(train_accu_df)
    val_r2 = get_r2(val_accu_df)
    test_r2 = get_r2(test_accu_df)

    train_r2_adj = get_adj_r2(train_accu_df, n_train, k)
    val_r2_adj = get_adj_r2(val_accu_df, n_val, k)
    test_r2_adj = get_adj_r2(test_accu_df, n_test, k)

    train_llh = get_norm_llh(train_accu_df, n_train)
    val_llh = get_norm_llh(val_accu_df, n_val)
    test_llh = get_norm_llh(test_accu_df, n_test)

    train_aic = get_aic(train_llh, k)
    val_aic = get_aic(val_llh, k)
    test_aic = get_aic(test_llh, k)

    train_bic = get_bic(train_llh, n_train, k)
    val_bic = get_bic(val_llh, n_val, k)
    test_bic = get_bic(test_llh, n_test, k)


    tmp = pd.DataFrame()
    tmp['combo'] = [combo[0]]
    tmp['score'] = [combo[1]]

    # tmp['Train r2'] = [OLS_lm_fit.rsquared]
    tmp['Train r2'] = [train_r2]
    tmp['Val r2'] = [val_r2]
    tmp['Test r2'] = [test_r2]

    # tmp['Train r2_adj'] = [OLS_lm_fit.rsquared_adj]
    tmp['Train r2_adj'] = [train_r2_adj]
    tmp['Val r2_adj'] = [val_r2_adj]
    tmp['Test r2_adj'] = [test_r2_adj]

    # tmp['Train AIC'] = [OLS_lm_fit.aic]
    tmp['Train AIC'] = [train_aic]
    tmp['Val AIC'] = [val_aic]
    tmp['Test AIC'] = [test_aic]

    # tmp['Train BIC'] = [OLS_lm_fit.bic]
    tmp['Val BIC'] = [train_bic]
    tmp['Val BIC'] = [val_bic]
    tmp['Test BIC'] = [test_bic]

    # tmp['Train llh'] = [OLS_lm_fit.llf]
    tmp['Train llh'] = [train_llh]
    tmp['Val llh'] = [val_llh]
    tmp['Test llh'] = [test_llh]    

    tuning_result = tuning_result.append(tmp)

tuning_result.to_csv(f'../models/tuning/{DATA_TYPE}_lm_{LABEL}.csv')