# Random Forest Regressor
## YangZhou Class Tuner (Heuristic/Greedy)

In [1]:
import pandas as pd
import os

In [2]:
from sklearn.ensemble import RandomForestRegressor

In [4]:
from YangZhou.GuidanceAlgorithm import *
from YangZhou.CruiseEngine import *
from YangZhou.CruiseAlgorithm import *

In [12]:
# create directory to store tuning results
output_relative_dirs = ['../data/curated/tuning']

# check if it exists as it makedir will raise an error if it does exist
for output_relative_dir in output_relative_dirs:
    if not os.path.exists(output_relative_dir):
        os.makedirs(output_relative_dir)

# Import Data

In [13]:
# Import Data
Train_data_X = pd.read_csv('../data/curated/ModelBuilding/Continuous/XTrain_16-1_16-5.csv')
Train_data_y = pd.read_csv('../data/curated/ModelBuilding/Continuous/yTrain_16-1_16-5.csv')
XTrain = Train_data_X
yTrain = Train_data_y['count']

In [14]:
Val_data_X = pd.read_csv('../data/curated/ModelBuilding/Continuous/XVal_16-5_16-6.csv')
Val_data_y = pd.read_csv('../data/curated/ModelBuilding/Continuous/yVal_16-5_16-6.csv')
XVal = Val_data_X
yVal = Val_data_y['count']

In [15]:
Test_data_X = pd.read_csv('../data/curated/ModelBuilding/Continuous/XTest_16-6_16-6.csv')
Test_data_y = pd.read_csv('../data/curated/ModelBuilding/Continuous/yTest_16-6_16-6.csv')
XTest = Test_data_X
yTest = Test_data_y['count']

# Extra One Hot Encoding (On the Fly)

In [None]:
from sklearn.preprocessing import OneHotEncoder

# OHE for DOLocationID

ohe = OneHotEncoder(handle_unknown='ignore')

Train_data_to_ohe = XTrain[['DOLocationID']]
Train_data_ohe = ohe.fit_transform(Train_data_to_ohe).toarray()

Train_data_ohe = pd.DataFrame(Train_data_ohe,
                              columns=list(ohe.get_feature_names_out(['DOLocationID'])))

XTrain = Train_data_X.drop(['DOLocationID'], axis=1)

for col in Train_data_ohe.columns:
    new_col = Train_data_ohe[col]
    new_col.index = range(len(new_col))

    XTrain[str(col)] = new_col

XTrain = XTrain.drop('datetime', axis=1)

Val_data_to_ohe = XVal[['DOLocationID']]
Val_data_ohe = ohe.transform(Val_data_to_ohe).toarray()

Val_data_ohe = pd.DataFrame(Val_data_ohe,
                            columns=list(ohe.get_feature_names_out(['DOLocationID'])))

XVal = XVal.drop(['DOLocationID'], axis=1)

for col in Val_data_ohe.columns:
    new_col = Val_data_ohe[col]
    new_col.index = range(len(new_col))

    XVal[str(col)] = new_col

XVal = XVal.drop('datetime', axis=1)

Test_data_to_ohe = XTest[['DOLocationID']]
Test_data_ohe = ohe.transform(Test_data_to_ohe).toarray()

Test_data_ohe = pd.DataFrame(Test_data_ohe,
                             columns=list(ohe.get_feature_names_out(['DOLocationID'])))

XTest = XTest.drop(['DOLocationID'], axis=1)

for col in Test_data_ohe.columns:
    new_col = Test_data_ohe[col]
    new_col.index = range(len(new_col))

    XTest[str(col)] = new_col

XTest = XTest.drop('datetime', axis=1)


In [None]:
# OHE for PULocationID

ohe = OneHotEncoder(handle_unknown='ignore')

Train_data_to_ohe = XTrain[['PULocationID']]
Train_data_ohe = ohe.fit_transform(Train_data_to_ohe).toarray()

Train_data_ohe = pd.DataFrame(Train_data_ohe,
                              columns=list(ohe.get_feature_names_out(['PULocationID'])))

XTrain = Train_data_X.drop(['PULocationID'], axis=1)

for col in Train_data_ohe.columns:
    new_col = Train_data_ohe[col]
    new_col.index = range(len(new_col))

    XTrain[str(col)] = new_col


Val_data_to_ohe = XVal[['PULocationID']]
Val_data_ohe = ohe.transform(Val_data_to_ohe).toarray()

Val_data_ohe = pd.DataFrame(Val_data_ohe,
                            columns=list(ohe.get_feature_names_out(['PULocationID'])))

XVal = XVal.drop(['PULocationID'], axis=1)

for col in Val_data_ohe.columns:
    new_col = Val_data_ohe[col]
    new_col.index = range(len(new_col))

    XVal[str(col)] = new_col


Test_data_to_ohe = XTest[['PULocationID']]
Test_data_ohe = ohe.transform(Test_data_to_ohe).toarray()

Test_data_ohe = pd.DataFrame(Test_data_ohe,
                             columns=list(ohe.get_feature_names_out(['PULocationID'])))

XTest = XTest.drop(['PULocationID'], axis=1)

for col in Test_data_ohe.columns:
    new_col = Test_data_ohe[col]
    new_col.index = range(len(new_col))

    XTest[str(col)] = new_col



# Processing data for YangZhou Algorithm Input

In [17]:
# Set values to try in each hyperparameter value

arguments = {
    "n_estimators":{1:50, 2:100, 3:150},
    "max_depth": {1:6, 2:8, 3:12, 4:16},
    "max_samples": {1:0.25, 2:0.5, 3:0.75, 4:1},
    "ccp_alpha": {1:0, 2:0.001, 3:0.01, 4:0.1, 5:1, 6:10}
}
arguments

{'n_estimators': {1: 50, 2: 100, 3: 150},
 'max_depth': {1: 6, 2: 8, 3: 12, 4: 16},
 'max_samples': {1: 0.25, 2: 0.5, 3: 0.75, 4: 1},
 'ccp_alpha': {1: 0, 2: 0.001, 3: 0.01, 4: 0.1, 5: 1, 6: 10}}

In [18]:
num_arg_vals = [len(arguments[i]) for i in arguments]
num_arg_vals

[3, 4, 4, 6]

# YangZhou Algorithm

In [27]:
def get_new_cores(core, num_arg_val, surrounding_vectors, found, checked_boxes,
                  checked_core, XTrain, yTrain, XVal, XTest, arguments, tuning_results):

    # if (should be rare) case where core has been a core before, then skip. For prevention of infinite loops
    if checked_core[flatten_coordinates_h(core, num_arg_val)] == 2:
        prev_checked = 1
    else:
        checked_core[flatten_coordinates_h(core, num_arg_val)] = 2
        prev_checked = 0

    if prev_checked:
        print('Prev checked:', core, '!\n')
        return [], found, checked_boxes, checked_core

    # prepare data for welch test
    surrounding_coordinates = get_surrounding_coordinates(core,
                                            surrounding_vectors, num_arg_val)

    indices = get_indices(core)

    # put coordinates into treatments and nulls
    treatment, null, direction = get_blocks(core, surrounding_coordinates,
                                            surrounding_vectors, indices, num_arg_val)

    # actually tune the surrounding coordinates
    for coord in surrounding_coordinates:
        #TODO: TRAIN AND TEST
        index = flatten_coordinates_h(coord, num_arg_val)
        if checked_boxes[index] == 0:
            val_accu, tuning_results = train_and_get_accuracies(arguments,
                                            coord, XTrain, yTrain, XVal, XTest, tuning_results)

            found[index] = val_accu
            checked_boxes[index] = 1

    # perform welch test and return surrounding coordinates that should be used as new core
    new_cores = find_new_core1(treatment, null, direction, num_arg_val, found)

    return new_cores, found, checked_boxes, checked_core, tuning_results

In [28]:
def YangZhou_GuidanceSystem(core, num_arg_val, surrounding_vectors, found,
                            checked_boxes, restarts, checked_core, been_best,
                            XTrain, yTrain, XVal, XTest, arguments, tuning_results):

    if restarts == 0:
        print("BEGIN INITIAL GUIDANCE\n")
    else:
        print("RESTART GUIDANCE: ROUND", restarts, '\n')

    print('ROUND', restarts, 'ITERATION: ', 0, '\n')

    # first get a surrounding 3^d tuned
    new_cores, found, checked_boxes, checked_core, tuning_results = get_new_cores(core,
                                                num_arg_val, surrounding_vectors, found,
                                                checked_boxes, checked_core, XTrain, yTrain,
                                                XVal, XTest, arguments, tuning_results)
    for new_core in new_cores: # only add if not checked before
        if checked_core[flatten_coordinates_h(new_core, num_arg_val)] == 0:
            checked_core[flatten_coordinates_h(new_core, num_arg_val)] = 1

    round = 1
    while new_cores: # while new cores are being added
        print('ROUND', restarts, "ITERATION: ", round, "\n")
        round += 1

        print('New cores:', new_cores, '\n')
        old_new_cores = deepcopy(new_cores)
        new_cores = list()

        # for each of the new cores, 'recursively' tune and grab new cores;
        # but each Iteration doesn't end until all cores of current round has been checked
        for new_core in old_new_cores:

            new_new_cores, found, checked_boxes, checked_core, tuning_results = get_new_cores(new_core,
                                                    num_arg_val, surrounding_vectors, found, checked_boxes,
                                                    checked_core, XTrain, yTrain, XVal, XTest, arguments,
                                                    tuning_results)
            for new_new_core in new_new_cores:
                if checked_core[flatten_coordinates_h(new_new_core, num_arg_val)] == 0:
                    new_cores.append(new_new_core)
                    checked_core[flatten_coordinates_h(new_new_core, num_arg_val)] = 1

    # for current max, get 3^d block. if new max happens to be found, continue to do 3^d block until no new max is found
    # just a cheap way to flesh out the max (the goal of YangZhou)
    max_combo = recreate_coordinates_h(np.argmax(found), num_arg_val)

    while been_best[flatten_coordinates_h((max_combo), num_arg_val)] == 0:

        been_best[flatten_coordinates_h((max_combo), num_arg_val)] = 1
        #add surrounding find!! ##functionalise
        surrounding_coordinates = get_surrounding_coordinates(max_combo, surrounding_vectors, num_arg_val)
        for coord in surrounding_coordinates:
            #TODO: TRAIN AND TEST
            index = flatten_coordinates_h(coord, num_arg_val)
            if checked_boxes[index] == 0:

                val_accu, tuning_results = train_and_get_accuracies(arguments, coord, XTrain,
                                                            yTrain, XVal, XTest, tuning_results)

                found[index] = val_accu
                checked_boxes[index] = 1

        max_combo = recreate_coordinates_h(np.argmax(found),num_arg_val)

    max_accuracy = max(found)

    # print information of this round
    if restarts == 0:
        print(f"\nYANGZHOU INITIAL GUIDE STAGE ENDED\n")
    else:
        print(f"\nYANGZHOU GUIDE STAGE {restarts} ENDED\n")


    if len(arguments) == 2:
        print('Found: \n', np.array(found).reshape(num_arg_val).round(4), '\n')
        print('Checked Boxes: \n', np.array(checked_boxes).reshape(num_arg_val).round(4), '\n')

    print('Max Accuracy From This Guidance Round: \n', max(found))
    print('Max Combo From This Guidance Round: \n', max_combo)

    print('% Combos Checked Thus Far:', int(sum(checked_boxes)), 'out of', cond_prod(num_arg_val),
          'which is', f'{np.mean(checked_boxes).round(8)*100}%')


    return max_combo, max_accuracy, found, checked_boxes, checked_core, been_best, tuning_results

In [29]:
def YangZhou_CruiseSystem(arguments, num_arg, num_arg_val, surrounding_vectors, max_combo,
                          max_accuracy, found, checked_boxes, restarts, checked_core, XTrain,
                          yTrain, XVal, XTest, tuning_results):

    print(f"BEGIN CRUISING: ROUND {restarts}\n")

    # get cruise coordinates in sorted order (furthest away from current max)
    sorted_cruise_coordinates = get_sorted_cruise_coordinates(arguments, num_arg, max_combo)

    # calculate warning threshold
    max_surrounding_mean, max_surrounding_sd = get_max_surrounding_mean_sd(max_combo,
                                                max_accuracy, surrounding_vectors, num_arg_val, found)

    #TODO: add parameter to specify which warning_threshold

    # warning_threshold = cruise_warning_threshold1(max_accuracy, max_surrounding_sd, len(surrounding_vectors)-1)
    warning_threshold = cruise_warning_threshold2(max_accuracy, max_surrounding_sd, len(surrounding_vectors)-1)
    # warning_threshold = cruise_warning_threshold3(max_surrounding_mean, max_surrounding_sd, len(surrounding_vectors)-1)
    # warning_threshold = cruise_warning_threshold4(max_surrounding_mean, max_surrounding_sd, len(surrounding_vectors)-1)

    # check each cruise coordinate
    for cruise_coord in sorted_cruise_coordinates:
        index = flatten_coordinates_h(cruise_coord, num_arg_val)

        # only search if it hasn't been checked before (if has then is not an artifect of significance)
        if not checked_boxes[index]:
            val_accu, tuning_results = train_and_get_accuracies(arguments, cruise_coord,
                                                    XTrain, yTrain, XVal, XTest, tuning_results)

            found[index] = val_accu
            checked_boxes[index] = 1

            # if above warning threshold, then stop cruise and restart guide
            if found[index] >= warning_threshold:
                if len(arguments) == 2:
                    print('Found: \n', np.array(found).reshape(num_arg_val).round(4), '\n')
                    print('Checked Boxes: \n', np.array(checked_boxes).reshape(num_arg_val).round(4), '\n')

                print('Max Accuracy From This Guidance Round: \n', max(found))
                print('Max Combo From This Guidance Round: \n', max_combo)

                print('% Combos Checked Thus Far:', int(sum(checked_boxes)), 'out of',
                      cond_prod(num_arg_val), 'which is', f'{np.mean(checked_boxes).round(8)*100}%')

                print(f"YANGZHOU CRUISE STAGE {restarts} ENDED, RESTARTING GUIDANCE\n")
                return 1, cruise_coord, found, checked_boxes, checked_core, tuning_results

    # if reach here then all cruise indicies checked. can safely say end cruise
    print(f"YANGZHOU CRUISE STAGE {restarts} ENDED\n")
    print(f"YANGZHOU CRUISE SYSTEM SHUTDOWN\n\n")

    return 0, [], found, checked_boxes, checked_core, tuning_results

In [30]:
# def YangZhou_trial(arguments, XTrain, yTrain, XVal, XTest):
#
#     print("YANGZHOU SYSTEM ACTIVATED\n\n")
#
#     # Process inputs and initiate arguments
#     num_arg = {arg:len(arguments[arg]) for arg in arguments}
#     num_arg_val = list(num_arg.values())
#
#     arg_median_values = {arg:(num_arg[arg]//2)+1 for arg in arguments}
#     core = [arg_median_values[arg] for arg in num_arg]
#     print('Initial core:', core, '\n')
#     surrounding_vectors = get_surrounding_vectors(core)
#
#     found = np.zeros(np.prod(num_arg_val))
#     checked_boxes = np.zeros(np.prod(num_arg_val))
#     checked_core = np.zeros(np.prod(num_arg_val))
#     been_best = np.zeros(np.prod(num_arg_val))
#
#     tuning_results = pd.DataFrame()
#
#     print("YANGZHOU GUIDE SYSTEM ACTIVATED\n")
#
#     # Initial Round of Guidance
#     checked_core[flatten_coordinates_h(core, num_arg_val)] = 1
#     max_combo, max_accuracy, found, checked_boxes, checked_core, been_best, tuning_results = YangZhou_GuidanceSystem(core, num_arg_val, surrounding_vectors, found, checked_boxes, 0, checked_core, been_best, XTrain, yTrain, XVal, XTest, arguments, tuning_results)
#
#     # Recursively Cruise and restart Guide if find a combo that is within halfwidth of max
#     print("YANGZHOU CRUISE SYSTEM ACTIVATED\n")
#     cruising = 1
#     restarts = 1
#     while cruising:
#         cruising, restart_core, found, checked_boxes, checked_core, tuning_results = YangZhou_CruiseSystem(arguments, num_arg, num_arg_val, surrounding_vectors, max_combo, max_accuracy, found, checked_boxes, restarts, checked_core, XTrain, yTrain, XVal, XTest, tuning_results)
#
#         if cruising:
#             max_combo, max_accuracy, found, checked_boxes, checked_core, been_best, tuning_results = YangZhou_GuidanceSystem(restart_core, num_arg_val, surrounding_vectors, found, checked_boxes, restarts, checked_core, been_best, XTrain, yTrain, XVal, XTest, arguments, tuning_results)
#             restarts += 1
#
#     # Final extensive search around maxes.
#     print("YANGZHOU FINAL GUIDANCE ACTIVATED\n")
#     old_max_accuracy = deepcopy(max_accuracy)
#     max_combo, max_accuracy, found, checked_boxes, checked_core, been_best, tuning_results = YangZhou_GuidanceSystem(max_combo, num_arg_val, surrounding_vectors, found, checked_boxes, 'FINAL', checked_core, been_best, XTrain, yTrain, XVal, XTest, arguments, tuning_results)
#     while(max_accuracy-old_max_accuracy > 0):
#         old_max_accuracy = deepcopy(max_accuracy)
#         max_combo, max_accuracy, found, checked_boxes, checked_core, tuning_results = YangZhou_GuidanceSystem(max_combo, num_arg_val, surrounding_vectors, found, checked_boxes, 'FINAL', checked_core, been_best, XTrain, yTrain, XVal, XTest, arguments, tuning_results)
#
#
#     # Display final information
#     print("YANGZHOU FINAL GUIDANCE STAGE ENDED")
#     print("YANGZHOU MISSION ACCOMPLISHED\n")
#
#     if len(arguments) == 2:
#         print('Final Found: \n', np.array(found).reshape(num_arg_val).round(4), '\n')
#         print('Final Checked Boxes: \n', np.array(checked_boxes).reshape(num_arg_val).round(4), '\n')
#         print('Final Checked Cores: \n', np.array(checked_core).reshape(num_arg_val).round(4), '\n')
#
#     print('Max Accuracy: \n', max(found))
#     print('Max Combo: \n', max_combo)
#
#     print('% Combos Checked:', int(sum(checked_boxes)), 'out of', cond_prod(num_arg_val), 'which is', f'{np.mean(checked_boxes).round(8)*100}%')
#
#     return max_combo, max_accuracy, found, checked_boxes, checked_core, been_best

In [31]:
# max_combo, max_accuracy, found, checked_boxes, checked_core, been_best = YangZhou_trial(arguments, XTrain, yTrain, XVal, XTest)

The helper function which specifies what type of model we are tuning and also records tuning results

In [32]:
def train_and_get_accuracies(arguments, coord, XTrain, yTrain, XVal, XTest, tuning_results):

    RF = RandomForestRegressor(n_estimators=arguments['n_estimators'][coord[0]],
                               max_depth = arguments['max_depth'][coord[1]],
                               max_samples = arguments['max_samples'][coord[2]],
                               ccp_alpha = arguments['ccp_alpha'][coord[3]])

    RF.fit(XTrain, yTrain)
    train_accu = RF.score(XTrain, yTrain)
    val_accu = RF.score(XVal, yVal)
    test_accu = RF.score(XTest, yTest)

    one_result = pd.DataFrame({'n_estimators': [arguments['n_estimators'][coord[0]]],
                               'max_depth': [arguments['max_depth'][coord[1]]],
                               'max_samples': [arguments['max_samples'][coord[2]]],
                               'ccp_alpha': [arguments['ccp_alpha'][coord[3]]],
                               'training_accuracy': [train_accu],
                               'validation_accuracy': [val_accu],
                               'testing_accuracy': [test_accu]})

    tuning_results = tuning_results.append(one_result)
    tuning_results.to_csv('../data/curated/tuning/RFR_Unified.csv')

    return val_accu, tuning_results

# The MAIN of YangZhou - pulled out of function to make it more flexible in use

In [34]:
print("YANGZHOU SYSTEM ACTIVATED\n\n")

# Process inputs and initiate arguments
num_arg = {arg:len(arguments[arg]) for arg in arguments}
num_arg_val = list(num_arg.values())

arg_median_values = {arg:(num_arg[arg]//2)+1 for arg in arguments}
core = [arg_median_values[arg] for arg in num_arg]
print('Initial core:', core, '\n')
surrounding_vectors = get_surrounding_vectors(core)

# found = np.zeros(np.prod(num_arg_val))
# checked_boxes = np.zeros(np.prod(num_arg_val))
# checked_core = np.zeros(np.prod(num_arg_val))
# been_best = np.zeros(np.prod(num_arg_val))

tuning_results = pd.DataFrame()

print("YANGZHOU GUIDE SYSTEM ACTIVATED\n")

# Initial Round of Guidance
checked_core[flatten_coordinates_h(core, num_arg_val)] = 1
max_combo, max_accuracy, found, checked_boxes, checked_core, been_best, tuning_results = YangZhou_GuidanceSystem(core,
                                                                                num_arg_val, surrounding_vectors, found,
                                                                                checked_boxes, 0, checked_core, been_best,
                                                                                XTrain, yTrain, XVal, XTest, arguments,
                                                                                tuning_results)

# Recursively Cruise and restart Guide if find a combo that is within halfwidth of max
print("YANGZHOU CRUISE SYSTEM ACTIVATED\n")
cruising = 1
restarts = 1
while cruising:
    cruising, restart_core, found, checked_boxes, checked_core, tuning_results = YangZhou_CruiseSystem(arguments,
                                                                                    num_arg, num_arg_val, surrounding_vectors,
                                                                                    max_combo, max_accuracy, found, checked_boxes,
                                                                                    restarts, checked_core, XTrain, yTrain, XVal,
                                                                                    XTest, tuning_results)

    if cruising:
        max_combo, max_accuracy, found, checked_boxes, checked_core, been_best, tuning_results = YangZhou_GuidanceSystem(restart_core,
                                                                                    num_arg_val, surrounding_vectors, found, checked_boxes,
                                                                                    restarts, checked_core, been_best, XTrain, yTrain, XVal,
                                                                                    XTest, arguments, tuning_results)
        restarts += 1

# Final extensive search around maxes.
print("YANGZHOU FINAL GUIDANCE ACTIVATED\n")
old_max_accuracy = deepcopy(max_accuracy)
max_combo, max_accuracy, found, checked_boxes, checked_core, been_best, tuning_results = YangZhou_GuidanceSystem(max_combo,
                                                                                num_arg_val, surrounding_vectors, found,
                                                                                checked_boxes, 'FINAL', checked_core, been_best,
                                                                                XTrain, yTrain, XVal, XTest, arguments, tuning_results)
while(max_accuracy-old_max_accuracy > 0):
    old_max_accuracy = deepcopy(max_accuracy)
    max_combo, max_accuracy, found, checked_boxes, checked_core, tuning_results = YangZhou_GuidanceSystem(max_combo,
                                                                                num_arg_val, surrounding_vectors, found, checked_boxes,
                                                                                'FINAL', checked_core, been_best, XTrain, yTrain, XVal,
                                                                                XTest, arguments, tuning_results)


# Display final information
print("YANGZHOU FINAL GUIDANCE STAGE ENDED")
print("YANGZHOU MISSION ACCOMPLISHED\n")

if len(arguments) == 2:
    print('Final Found: \n', np.array(found).reshape(num_arg_val).round(4), '\n')
    print('Final Checked Boxes: \n', np.array(checked_boxes).reshape(num_arg_val).round(4), '\n')
    print('Final Checked Cores: \n', np.array(checked_core).reshape(num_arg_val).round(4), '\n')

print('Max Accuracy: \n', max(found))
print('Max Combo: \n', max_combo)

print('% Combos Checked:', int(sum(checked_boxes)), 'out of', cond_prod(num_arg_val),
      'which is', f'{np.mean(checked_boxes).round(8)*100}%')

YANGZHOU SYSTEM ACTIVATED


Initial core: [2, 3, 3, 4] 

YANGZHOU GUIDE SYSTEM ACTIVATED

BEGIN INITIAL GUIDANCE

ROUND 0 ITERATION:  0 



UnboundLocalError: local variable 'max_key' referenced before assignment