In [46]:
import pandas as pd
import copy
import time
import numpy as np
import random

In [47]:
class TaiZhou:
    """Grid-search helper that trains one model per hyperparameter combination.

    Typical usage, in order:

    1. ``read_in_data``   - store the train / validation / test splits.
    2. ``read_in_model``  - store the model *class* (it is called as
       ``model(**params)`` for every combination).
    3. ``set_parameters`` - store the candidate values and expand the grid.
    4. ``set_address``    - choose where the results CSV is written.
    5. ``change_tuning_style`` (optional) - choose the evaluation order.
    6. ``tune``           - fit and score every combination not yet checked.

    Combinations are represented as lists of *indices* into the candidate
    tuples, one index per hyperparameter, in the order of ``hyperparameters``.
    The best combination is the one with the highest validation score.
    """

    def __init__(self):
        self._initialise_objects()

        print('TaiZhou Initialised')

    def _initialise_objects(self):
        """Create every attribute up front so state is never partially defined."""

        # data splits
        self.train_x = None
        self.train_y = None
        self.val_x = None
        self.val_y = None
        self.test_x = None
        self.test_y = None

        # model class and search space
        self.model = None
        self.parameter_choices = None
        self.hyperparameters = None
        self.n_items = None   # number of candidate values per hyperparameter
        self.combos = None    # list of index lists, one per grid point

        # bookkeeping (arrays are shaped by n_items, indexed by a combo tuple)
        self.checked = None
        self.result = None
        self.tuning_result = None
        self.tuning_result_saving_address = None
        self._up_to = 0
        self._seed = 19421221

        # Start at -inf rather than 0: scores such as R^2 can be negative, and
        # the best of an all-negative run must still be recorded.
        self.best_score = -np.inf
        self.best_combo = None
        self.best_clf = None

        self.extra_output_columns = ['Train score', 'Val score', 'Test score', 'Time']

    def read_in_data(self, train_x, train_y, val_x, val_y, test_x, test_y):
        """Store the six data splits (references only; nothing is copied)."""

        self.train_x = train_x
        print("Read in Train X data")

        self.train_y = train_y
        print("Read in Train y data")

        self.val_x = val_x
        print("Read in Val X data")

        self.val_y = val_y
        print("Read in Val y data")

        self.test_x = test_x
        print("Read in Test X data")

        self.test_y = test_y
        print("Read in Test y data")

    def read_in_model(self, model):
        """Store the model class; it is instantiated once per combination."""

        self.model = model

        print(f'Successfully read in model {self.model}')

    def set_parameters(self, parameter_choices):
        """Store the search space and rebuild the grid and bookkeeping.

        Parameters
        ----------
        parameter_choices : dict
            Maps each hyperparameter name to an indexable sequence of
            candidate values.
        """

        self.parameter_choices = parameter_choices

        self.hyperparameters = list(parameter_choices.keys())

        self.n_items = [len(parameter_choices[key]) for key in self.hyperparameters]

        self._get_combinations()
        self._get_checked_and_result_array()
        self._setup_tuning_result_df()

        print("Successfully recorded parameter choices")

    def set_address(self, address):
        """Set the path the results CSV is written to after each combination."""

        self.tuning_result_saving_address = address
        print('Successfully set tuning output address')

    def _get_combinations(self):
        """Expand ``n_items`` into every index combination, in sorted order."""

        combos = [[]]
        for n_choices in self.n_items:
            # Extend every partial combination with each index of this
            # hyperparameter; `prefix + [k]` builds a new list, so no copy
            # of the prefix is needed.
            combos = [prefix + [k] for prefix in combos for k in range(n_choices)]

        self.combos = combos

    def _get_checked_and_result_array(self):
        """Allocate the per-combination 'already tried' and 'val score' arrays."""

        self.checked = np.zeros(shape=self.n_items)
        self.result = np.zeros(shape=self.n_items)

    def _setup_tuning_result_df(self):
        """Create the empty results table: hyperparameters, then score/time columns."""

        tune_result_columns = list(self.hyperparameters)
        tune_result_columns.extend(self.extra_output_columns)

        self.tuning_result = pd.DataFrame({col: list() for col in tune_result_columns})

    def change_tuning_style(self, type, seed=None):  # TODO: styles 'c' and 'd'
        """Change the order in which ``tune`` walks the combinations.

        Parameters
        ----------
        type : str
            'a' sorts the combinations; 'b' shuffles them; 'c' and 'd' are
            reserved and currently do nothing. Any other value is ignored.
            (The name shadows the builtin but is kept for existing callers.)
        seed : optional
            Seed for the shuffle in style 'b'. ``None`` falls back to the
            instance default; an explicit ``0`` is honoured.
        """

        if type == 'a':
            self.combos.sort()
            print('Changed tuning order to sorted')

        elif type == 'b':
            # `is not None` so that seed=0 is not mistaken for "no seed".
            chosen_seed = seed if seed is not None else self._seed

            # A private generator gives the same order as seeding the global
            # one, without disturbing other users of the `random` module.
            random.Random(chosen_seed).shuffle(self.combos)
            print('Changed tuning_order to randomised')

        elif type == 'c':
            pass

        elif type == 'd':
            pass

    def tune(self):  # TODO
        """Train and score every combination that has not been checked yet.

        Raises
        ------
        RuntimeError
            If the model or the parameter choices have not been provided.
        """

        if self.model is None or self.combos is None:
            raise RuntimeError(
                'Call read_in_model() and set_parameters() before tune()'
            )

        # `_up_to` is the 1-based position within the current order; deriving
        # it from enumerate keeps it correct when tune() is called again.
        for position, combo in enumerate(self.combos, start=1):

            self._up_to = position

            if not self.checked[tuple(combo)]:

                self._train_and_test_combo(combo)

    def _train_and_test_combo(self, combo):
        """Fit one combination, score it on all splits and record the result."""

        params = {
            name: self.parameter_choices[name][choice]
            for name, choice in zip(self.hyperparameters, combo)
        }

        # initialise object
        clf = self.model(**params)

        # time the fit only
        start = time.perf_counter()
        clf.fit(self.train_x, self.train_y)
        time_used = time.perf_counter() - start

        # get scores
        train_score = clf.score(self.train_x, self.train_y)
        val_score = clf.score(self.val_x, self.val_y)
        test_score = clf.score(self.test_x, self.test_y)

        # Build a one-row table. Every value is wrapped in a list so that a
        # tuple/list-valued hyperparameter is stored as a single cell, and
        # `params` itself is left untouched.
        row = {name: [value] for name, value in params.items()}
        row['Train score'] = [np.round(train_score, 4)]
        row['Val score'] = [np.round(val_score, 4)]
        row['Test score'] = [np.round(test_score, 4)]
        row['Time'] = [np.round(time_used, 2)]

        row_df = pd.DataFrame(row)

        # DataFrame.append no longer exists in pandas 2.x; use concat. The
        # first row replaces the empty frame (same columns, same order).
        if self.tuning_result.empty:
            self.tuning_result = row_df
        else:
            self.tuning_result = pd.concat(
                [self.tuning_result, row_df], ignore_index=True
            )
        self._save_tuning_result()

        # update best score stats
        if val_score > self.best_score:
            self.best_score = val_score
            self.best_clf = clf
            self.best_combo = combo

        # update underlying governing arrays
        combo = tuple(combo)

        self.checked[combo] = 1
        self.result[combo] = val_score

        print(f'Trained and tested combination {self._up_to}, taking {np.round(time_used, 2)} seconds\n')

    def _save_tuning_result(self):
        """Write the full results table to the configured CSV path, if any."""

        # With no address there is nowhere to write; skip explicitly.
        if self.tuning_result_saving_address is None:
            return

        self.tuning_result.to_csv(self.tuning_result_saving_address, index=False)

    def read_in_checked_result(self):  # ???
        """Placeholder, not implemented yet. Currently does nothing."""
        pass


In [48]:
# Create the tuner; the constructor prints a confirmation message.
taizhou = TaiZhou()

TaiZhou Initialised


Read in and Prepare Data

In [49]:
# Load the train / validation / test tables.
# The paths are relative, so they resolve against the kernel's working directory.
train_data = pd.read_csv('../data/curated/modelling/Train_1.csv')
val_data = pd.read_csv('../data/curated/modelling/Validate_1.csv')
test_data = pd.read_csv('../data/curated/modelling/Test_1.csv')

In [50]:
# Column '1' is used as the target; every remaining column is a feature.
train_x, train_y = train_data.drop(columns=['1']), train_data['1']
val_x, val_y = val_data.drop(columns=['1']), val_data['1']
test_x, test_y = test_data.drop(columns=['1']), test_data['1']

In [51]:
# Hand the six splits to the tuner; it only stores them, nothing is trained yet.
taizhou.read_in_data(train_x, train_y, val_x, val_y, test_x, test_y)

Read in Train X data
Read in Train x data
Read in Val X data
Read in Val y data
Read in Test X data
Read in Test y data


Set parameters

In [52]:
from sklearn.ensemble import RandomForestRegressor as RFR

In [53]:
# Keep the class itself (not an instance): the tuner calls it with each
# parameter combination to build a fresh model.
rfr = RFR

In [54]:
# Register the model class with the tuner.
taizhou.read_in_model(rfr)

Successfully read in model <class 'sklearn.ensemble._forest.RandomForestRegressor'>


In [55]:
# Search space: each key is passed as a keyword argument to the model
# constructor, and each tuple lists the candidate values for that key.
# A one-element tuple (random_state) holds that argument fixed.
# Grid size: 2 * 5 * 1 * 6 * 2 * 2 = 240 combinations.
parameter_choices = {
    'n_estimators': (50, 100),
    'max_depth': (5, 10, 25, 50, 100) ,   
    'random_state': (19421220,),
    'ccp_alpha': (0, 0.0001, 0.01, 1, 100, 10000),
    'max_samples': (0.5, 0.75),
    'max_features': (0.5, 0.75)
}

In [56]:
# Expand the grid into index combinations and (re)create the bookkeeping
# arrays and the empty results table.
taizhou.set_parameters(parameter_choices)

Successfully recorded parameter choices


In [57]:
# Style 'a' sorts the index combinations, so they are tried in sorted order.
taizhou.change_tuning_style('a')

Changed tuning order to sorted


In [58]:
# import ZhongShan
# zhongshan = ZhongShan

# zhongshan.setup_directory('../models/tuning')

In [59]:
# Results CSV path (relative to the working directory).
# NOTE(review): assumes ../models/tuning already exists - confirm before running.
taizhou.set_address('../models/tuning/rfr_1.csv')

Successfully set tuning output address


In [60]:
# Fit and score every combination not yet checked, in the order chosen above.
# The results CSV is rewritten after each combination.
taizhou.tune()

Trained and tested combination 1, taking 6.45 seconds

Trained and tested combination 2, taking 9.6 seconds

Trained and tested combination 3, taking 8.56 seconds

Trained and tested combination 4, taking 12.77 seconds

Trained and tested combination 5, taking 6.43 seconds

Trained and tested combination 6, taking 9.58 seconds

Trained and tested combination 7, taking 8.56 seconds

Trained and tested combination 8, taking 12.78 seconds

Trained and tested combination 9, taking 6.43 seconds

Trained and tested combination 10, taking 9.58 seconds

Trained and tested combination 11, taking 8.56 seconds

Trained and tested combination 12, taking 12.77 seconds

Trained and tested combination 13, taking 6.43 seconds

Trained and tested combination 14, taking 9.59 seconds

Trained and tested combination 15, taking 8.56 seconds

Trained and tested combination 16, taking 12.77 seconds

Trained and tested combination 17, taking 6.43 seconds

Trained and tested combination 18, taking 9.59 seconds