# 0 Initialization #

## 0.1 Imports and Path ##

In [1]:
# Imports
# Python = 3.10.12
import numpy as np  # V1.24.3
import pandas as pd # V1.5.3
import warnings
import matplotlib  # V3.7.1
import xgboost  # V1.7.3
import shap  # V0.42.1
from sklearn.metrics import r2_score as r2s  # V1.3.0
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.base import is_regressor
from sklearn import clone
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, AdaBoostRegressor
from sklearn.preprocessing import MinMaxScaler as Scaler
from matplotlib import pyplot as plt
from itertools import product
from tqdm import trange
# <editor-fold desc="Protected Imports">
from sklearn.utils import check_random_state, _safe_indexing
from sklearn.utils.validation import _num_samples, _check_sample_weight
# </editor-fold>

IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html


In [2]:
# Path of data and output
path_tar, col_tar = r'C:\Users\Administrator\Desktop\Open Source\FRP_Dataset.xls', 'I'
path_sou, col_sou = r'C:\Users\Administrator\Desktop\Open Source\Steel_Dataset.xls', 'I'
save_path = r'C:\Users\Administrator\Desktop\Open Source\Results\Re_'

## 0.2 Define algorithm classes ##

### 0.2.1 TrAdaBoost ###

In [3]:
class TrAdaBoostR:
    def __init__(self, learner):
        self.learner = learner
        self.weight = 0
        self.p, self.err, self.beta_t, self.p_train = None, None, None, None
        self.learners = []
        self.n_iters = 20

    def fit(self, x_source: np.ndarray, y_source: np.ndarray, x_target: np.ndarray, y_target: np.ndarray,
            n_iters: int = 20):
        self.n_iters = n_iters
        y_source, y_target = np.ravel(y_source), np.ravel(y_target)
        n, m = y_source.shape[0], y_target.shape[0]  # Data Points (diff and same distribution)
        x_train = np.concatenate((x_source, x_target), axis=0)
        y_train = np.concatenate((y_source, y_target), axis=0)
        # 1 Initialization
        self.weight = np.zeros((n_iters, n + m))
        self.weight[0] = np.ones(n + m) / (n + m)
        # 2 Loop
        # 2.0 Initialize vector
        self.p = np.zeros((n_iters, n + m))
        self.p_train = np.zeros((n_iters, n + m))
        self.err = np.zeros(n_iters)
        self.beta_t = np.zeros(n_iters)
        for t in range(n_iters):
            # 2.1 get p
            self.p[t] = self.weight[t] / self.weight[t].sum()
            # 2.2 predict using Learner
            learner = clone(self.learner)
            learner.fit(x_train, y_train, sample_weight=self.p[t])
            self.p_train[t] = learner.predict(x_train)
            self.learners.append(learner)
            # 2.3 get Learner Loss
            loss = self.p_train[t, :] - y_train
            l1_same = np.max(np.abs(loss[n:n + m]))
            weight_same = self.weight[t, n:n + m].sum()
            for i in range(n, n + m):
                self.err[t] += abs(self.p_train[t, i] - y_train[i]) * self.weight[t, i] / weight_same / l1_same
            # 2.3.1 termination condition
            if self.err[t] >= 0.5:
                self.p = self.p[:t, :]
                self.err = self.err[:t]
                self.beta_t = self.beta_t[:t]
                self.p_train = self.p_train[:t, :]
                break
            # 2.4 Get beta
            self.beta_t[t] = self.err[t] / (1 - self.err[t])
            beta = 1 / (1 + (2 * np.log(n) / n_iters) ** 0.5)
            # 2.5 update weight vector
            l1_diff = np.max(np.abs(loss[0:n]))
            if t == n_iters - 1:
                break
            for i in range(n):
                self.weight[t + 1, i] = self.weight[t, i] * beta ** (abs(self.p_train[t, i] - y_train[i]) / l1_diff)
            for i in range(n, n + m):
                self.weight[t + 1, i] = self.weight[t, i] * self.beta_t[t] ** \
                                        (- abs(self.p_train[t, i] - y_train[i]) / l1_same)

    def predict(self, x: np.ndarray):
        # 1 get weight vector by beta_t
        weights = self.beta_t / self.beta_t.sum()
        # 2 output
        predicts = None
        for reg in self.learners:
            p_x = reg.predict(x)
            p_x = p_x.reshape(p_x.shape[0], 1)
            if predicts is None:
                predicts = p_x
            else:
                predicts = np.concatenate([predicts, p_x], axis=1)
        return predicts.mean(axis=1)

### 0.2.2 Two-stage TrAdaboost ###

In [4]:
class AdaBoostR2T(AdaBoostRegressor):
    n_protected = 0
    sample_weight = None

    def fit(self, X, y, sample_weight=None):
        self._validate_params()

        X, y = self._validate_data(
            X,
            y,
            accept_sparse=["csr", "csc"],
            ensure_2d=True,
            allow_nd=True,
            dtype=None,
            y_numeric=is_regressor(self),
        )

        sample_weight = _check_sample_weight(
            sample_weight, X, np.float64, copy=True, only_non_negative=True
        )
        sample_weight /= sample_weight.sum()

        # Check parameters
        self._validate_estimator()

        # Clear any previous fit results
        self.estimators_ = []
        self.estimator_weights_ = np.zeros(self.n_estimators, dtype=np.float64)
        self.estimator_errors_ = np.ones(self.n_estimators, dtype=np.float64)

        # Initialization of the random number instance that will be used to
        # generate a seed at each iteration
        random_state = check_random_state(self.random_state)
        epsilon = np.finfo(sample_weight.dtype).eps

        zero_weight_mask = sample_weight == 0.0
        for iboost in range(self.n_estimators):
            # avoid extremely small sample weight, for details see issue #20320
            sample_weight = np.clip(sample_weight, a_min=epsilon, a_max=None)
            # do not clip sample weights that were exactly zero originally
            sample_weight[zero_weight_mask] = 0.0

            # Boosting step
            sample_weight, estimator_weight, estimator_error = self._boost(
                iboost, X, y, sample_weight, random_state
            )

            # Early termination
            if sample_weight is None:
                break
            self.estimator_weights_[iboost] = estimator_weight
            self.estimator_errors_[iboost] = estimator_error

            # Stop if error is zero
            if estimator_error == 0:
                break

            sample_weight_sum = np.sum(sample_weight)

            if not np.isfinite(sample_weight_sum):
                warnings.warn(
                    "Sample weights have reached infinite values,"
                    f" at iteration {iboost}, causing overflow. "
                    "Iterations stopped. Try lowering the learning rate.",
                    stacklevel=2,
                )
                break

            # Stop if the sum of sample weights has become non-positive
            if sample_weight_sum <= 0:
                break
            # =======  override  =======
            sum_n = np.sum(sample_weight[:self.n_protected])
            sum_m = np.sum(sample_weight[self.n_protected:])
            if iboost < self.n_estimators - 1:
                # Normalize m samples while n are protected
                sample_weight[self.n_protected:] *= (1 - sum_n) / sum_m
            self.sample_weight = sample_weight
            # ==========================
        return self

    def _boost(self, iboost, X, y, sample_weight, random_state):
        estimator = self._make_estimator(random_state=random_state)

        # Weighted sampling of the training set with replacement
        bootstrap_idx = random_state.choice(
            np.arange(_num_samples(X)),
            size=_num_samples(X),
            replace=True,
            p=sample_weight,
        )

        # Fit on the bootstrapped sample and obtain a prediction
        # for all samples in the training set
        X_ = _safe_indexing(X, bootstrap_idx)
        y_ = _safe_indexing(y, bootstrap_idx)
        estimator.fit(X_, y_)
        y_predict = estimator.predict(X)

        error_vect = np.abs(y_predict - y)
        sample_mask = sample_weight > 0

        # ========  override  ========
        # freeze first n's weight
        sample_mask[:self.n_protected].fill(False)
        # ============================

        masked_sample_weight = sample_weight[sample_mask]
        masked_error_vector = error_vect[sample_mask]
        error_max = masked_error_vector.max()
        if error_max != 0:
            masked_error_vector /= error_max

        if self.loss == "square":
            masked_error_vector **= 2
        elif self.loss == "exponential":
            masked_error_vector = 1.0 - np.exp(-masked_error_vector)

        # Calculate the average loss
        estimator_error = (masked_sample_weight * masked_error_vector).sum()

        if estimator_error <= 0:
            # Stop if fit is perfect
            return sample_weight, 1.0, 0.0

        elif estimator_error >= 0.5:
            # Discard current estimator only if it isn't the only one
            if len(self.estimators_) > 1:
                self.estimators_.pop(-1)
            return None, None, None

        beta = estimator_error / (1.0 - estimator_error)

        # Boost weight using AdaBoost.R2 alg
        estimator_weight = self.learning_rate * np.log(1.0 / beta)

        if not iboost == self.n_estimators - 1:
            sample_weight[sample_mask] *= np.power(
                beta, (1.0 - masked_error_vector) * self.learning_rate
            )

        return sample_weight, estimator_weight, estimator_error

    def set_protected(self, n_protected):
        self.n_protected = n_protected


class TwoStageTrAdaboost:
    s = 20  # Hyperparameter S : the number of steps
    regs = []  # Regressor saved
    learner = None
    w = None  # Weight
    train_indicator, valid_indicator = {}, {}
    verification, display_warning = False, False
    n = 0
    last_regressor = 0
    boost_learner = None
    beta_record = []
    random_state = 0

    def __init__(self, steps: int = 20, base_learner=None, boost_learner=AdaBoostR2T()
                 , display_warning: bool = False, random_state=0):
        from sklearn.tree import DecisionTreeRegressor
        self.s = steps
        self.display_warning = display_warning
        if base_learner is None:
            self.learner = DecisionTreeRegressor(max_depth=3, random_state=self.random_state)
        else:
            self.learner = base_learner
        self.regs = []
        self.w = None
        self.train_indicator, self.valid_indicator = {}, {}
        self.verification, self.display_warning = False, False
        self.n = 0
        self.last_regressor = 0
        self.boost_learner = boost_learner
        self.beta_record = []

    def fit(self, x_source: np.ndarray, y_source: np.ndarray, x_target: np.ndarray, y_target: np.ndarray):
        from sklearn.metrics import r2_score as r2s
        # Initialize
        n, m = len(y_source), len(y_target)
        self.n = n
        self.w = np.zeros((self.s, n + m))
        self.w[0, :] = np.ones(n + m) / (n + m)
        # Form dataset
        x_train = np.concatenate((x_source, x_target), axis=0)
        y_train = np.concatenate((y_source, y_target), axis=0)
        # Loop
        for t in range(self.s):
            # Call Adaboost.R2t
            r2t = AdaBoostR2T(estimator=self.learner, learning_rate=0.5, n_estimators=20, loss='square',
                              random_state=self.random_state)
            r2t.set_protected(n)  # freeze first n instances
            r2t.fit(x_train, y_train, sample_weight=self.w[t, :])
            self.regs.append(r2t)
            # Call Learner
            learner = self.learner
            learner.fit(x_train, y_train, sample_weight=self.w[t, :])
            hyp_train = learner.predict(x_train)
            # get error
            e_learner = np.abs(y_train - hyp_train)
            e_t = e_learner / np.max(e_learner)
            # Update weight vector
            if t == self.s - 1:
                break
            #   Initialize binary search
            beta_0, beta_1 = -20, 1
            target_sum = m / (n + m) + t / (self.s - 1) * (1 - m / (n + m))
            curr_sum = 10
            while abs(target_sum - curr_sum) > 10 ** -3:
                curr_beta = beta_0 / 2 + beta_1 / 2
                self.w[t + 1, :n] = self.w[t, :n] * np.power(10 ** curr_beta, e_t[:n])
                self.w[t + 1, n:] = self.w[t, n:]
                self.w[t + 1] /= np.sum(self.w[t + 1])  # Normalize
                curr_sum = np.sum(self.w[t + 1, n:])
                if curr_sum > target_sum:
                    beta_0 = (beta_0 + beta_1) / 2
                else:
                    beta_1 = (beta_0 + beta_1) / 2
                if beta_1 - beta_0 < 10 ** -10:
                    if not self.display_warning:
                        break
                    print('Convergence Failed in Iter {3} with E = {0:.4f}'
                          .format(target_sum - curr_sum, beta_0, beta_1, t))
                    break
            self.beta_record.append(10 ** beta_0)
        # Return Train Indicator
        #   Initialize
        n_4 = len(self.regs)
        r2_target, mae_target, mse_target = np.zeros(n_4), np.zeros(n_4), np.zeros(n_4)
        #   Calculate
        for i in range(n_4):
            p_train = self.regs[i].predict(x_train)
            r2_target[i] = r2s(y_train[n:], p_train[n:])
            mae_target[i] = np.mean(np.abs(y_train[n:] - p_train[n:]))
            mse_target[i] = np.mean((y_train[n:] - p_train[n:]) ** 2)
        #   Record
        self.train_indicator['r2_target'] = r2_target
        self.train_indicator['mae_target'] = mae_target
        self.train_indicator['mse_target'] = mse_target
        # Refine Index
        self.verification = False

    def valid(self, x_valid: np.ndarray, y_valid: np.ndarray):
        from sklearn.metrics import r2_score as r2s
        # Return Valid Indicator
        #   Initialize
        n_4 = len(self.regs)
        n = self.n
        r2, mae, mse = np.zeros(n_4), np.zeros(n_4), np.zeros(n_4)
        #   Calculate
        for i in range(n_4):
            p_valid = self.regs[i].predict(x_valid)
            r2[i] = r2s(y_valid, p_valid)
            mae[i] = np.mean(np.abs(y_valid - p_valid))
            mse[i] = np.mean((y_valid - p_valid) ** 2)
        #   Record
        self.valid_indicator['r2'] = r2
        self.valid_indicator['mae'] = mae
        self.valid_indicator['mse'] = mse
        # Refine Index
        self.verification = True

    def predict(self, x: np.ndarray, principle='mae', valid_indicator=True):
        if not self.verification:
            raise AssertionError('No Validation Present.')
        # Get Regressor
        reg_principle = self.valid_indicator.get(principle)
        reg_arg = np.argmin(reg_principle)
        reg = self.regs[reg_arg]
        self.last_regressor = reg_arg
        # Predict
        return reg.predict(x)

### 0.2.3 Proposed Algorithm ###

In [5]:
class ProposedAlg:
    s = 20  # Hyperparameter S : the number of steps
    regs = []  # Regressor saved
    learner = None
    w = None  # Weight
    train_indicator, valid_indicator = {}, {}
    verification, display_warning = False, False
    n = 0
    last_regressor = 0
    boost_learner = None
    beta_record = []
    feature_importance = None

    def __init__(self, steps: int = 20, base_learner=None, boost_learner=None
                 , display_warning: bool = False):
        from sklearn.tree import DecisionTreeRegressor
        self.s = steps
        self.display_warning = display_warning
        if base_learner is None:
            self.learner = DecisionTreeRegressor(max_depth=3)
        else:
            self.learner = base_learner
        if boost_learner is None:
            self.boost_learner = AdaBoostR2T()
        else:
            self.boost_learner = boost_learner
        self.regs = []
        self.w = None
        self.train_indicator, self.valid_indicator = {}, {}
        self.verification, self.display_warning = False, False
        self.n = 0
        self.last_regressor = 0
        self.boost_learner = boost_learner
        self.beta_record = []

    def fit(self, x_source: np.ndarray, y_source: np.ndarray, x_target: np.ndarray, y_target: np.ndarray):
        from sklearn.metrics import r2_score as r2s
        from sklearn.base import clone
        # Initialize
        n, m = len(y_source), len(y_target)
        self.n = n
        self.w = np.zeros((self.s, n + m))
        self.w[0, :] = np.ones(n + m) / (n + m)
        # Form dataset
        x_train = np.concatenate((x_source, x_target), axis=0)
        y_train = np.concatenate((y_source, y_target), axis=0)
        # Loop
        for t in range(self.s):
            # Call boost_learner
            r2t = clone(self.boost_learner)
            r2t.fit(x_train, y_train, sample_weight=self.w[t, :])
            self.regs.append(r2t)
            # Call Learner
            learner = clone(self.learner)
            learner.fit(x_train, y_train, sample_weight=self.w[t, :])
            hyp_train = learner.predict(x_train)
            # Cal. error
            e_learner = np.abs(y_train - hyp_train)
            e_t = e_learner / np.max(e_learner)
            # Update weight vector
            if t == self.s - 1:
                break
            # Initialize binary search
            beta_0, beta_1 = -20, 1
            target_sum = m / (n + m) + t / (self.s - 1) * (1 - m / (n + m))
            curr_sum = 10
            while abs(target_sum - curr_sum) > 10 ** -3:
                curr_beta = beta_0 / 2 + beta_1 / 2
                self.w[t + 1, :n] = self.w[t, :n] * np.power(10 ** curr_beta, e_t[:n])
                self.w[t + 1, n:] = self.w[t, n:]
                self.w[t + 1] /= np.sum(self.w[t + 1])
                curr_sum = np.sum(self.w[t + 1, n:])
                if curr_sum > target_sum:
                    beta_0 = (beta_0 + beta_1) / 2
                else:
                    beta_1 = (beta_0 + beta_1) / 2
                if beta_1 - beta_0 < 10 ** -10:
                    if not self.display_warning:
                        break
                    print('Convergence Failed in Iter {3} with E = {0:.4f}'
                          .format(target_sum - curr_sum, beta_0, beta_1, t))
                    break
            self.beta_record.append(10 ** beta_0)
        # Return Train Indicator
        #   Initialize
        n_4 = len(self.regs)
        r2_target, mae_target, mse_target = np.zeros(n_4), np.zeros(n_4), np.zeros(n_4)
        #   Calculate
        for i in range(n_4):
            p_train = self.regs[i].predict(x_train)
            r2_target[i] = r2s(y_train[n:], p_train[n:])
            mae_target[i] = np.mean(np.abs(y_train[n:] - p_train[n:]))
            mse_target[i] = np.mean((y_train[n:] - p_train[n:]) ** 2)
        #   Record
        self.train_indicator['r2_target'] = r2_target
        self.train_indicator['mae_target'] = mae_target
        self.train_indicator['mse_target'] = mse_target
        # Refine Index
        self.verification = False

    def predict(self, x: np.ndarray, principle='mae'):
        # Get Regressor
        reg_principle = self.train_indicator.get(principle + '_target')
        reg_arg = reg_principle < np.median(reg_principle)
        self.last_regressor = reg_arg
        # Predict
        predicts = np.zeros([x.shape[0], 1])
        for i in range(len(self.regs)):
            if reg_arg[i]:
                curr_predict = self.regs[i].predict(x)
                predicts = np.concatenate([predicts, curr_predict.reshape([curr_predict.shape[0], 1])], axis=1)
        predicts = predicts[:, 1:]
        return np.mean(predicts, axis=1)

## 0.3 Base Methods for main process ##

In [6]:
def load_data(d_path: str, label_col_str: str, feature_col_str: str = 'B', c_index: bool = False):
    import pandas as pd
    # get column index
    label_col, count = 0, 0
    for i in label_col_str:
        count += 1
        if len(label_col_str) - count == 1:
            label_col += (ord(i.lower()) - 96) * 26
        else:
            label_col += (ord(i.lower()) - 96)
    feature_col, count = 0, 0
    for i in feature_col_str:
        count += 1
        if len(feature_col_str) - count == 1:
            feature_col += (ord(i.lower()) - 96) * 26
        else:
            feature_col += (ord(i.lower()) - 96)
    # get data
    db = pd.read_excel(d_path)
    x = db.iloc[:, feature_col - 1:label_col - 1]
    y = db.iloc[:, label_col - 1]
    x_index = x.columns.to_list()
    y_index = y.name
    x = np.array(x, dtype='float64')
    y = np.array(y, dtype='float64')
    if c_index:
        return x, y, x_index, y_index
    else:
        return x, y

In [7]:
def load_data_line(d_path: str, col_index: str):
    import pandas as pd
    label_col, count = 0, 0
    for i in col_index:
        count += 1
        if len(col_index) - count == 1:
            label_col += (ord(i.lower()) - 96) * 26
        else:
            label_col += (ord(i.lower()) - 96)
    db = pd.read_excel(d_path)
    y = db.iloc[:, label_col - 1]
    return y

In [8]:
def get_subplot_id(n_row: int, n_col: int, curr_id: int):
    curr_row = 0
    while curr_id >= n_row:
        curr_row += 1
        curr_id -= n_row
    return curr_row, curr_id

In [9]:
def sort_lists(a: list, b: list, reverse=True):
    d = dict(zip(a, b))
    a_sorted = sorted(d, reverse=reverse)
    b_sorted = []
    for i in a_sorted:
        b_sorted.append(d.get(i))
    return a_sorted, b_sorted

In [10]:
def a20_index(true_values: np.ndarray, predict_values: np.ndarray, variation: float = 0.2) -> float:
    m20 = 0
    for ii, jj in zip(true_values, predict_values):
        if jj * (1-variation) <= ii <= jj * (1+variation):
            m20 += 1
    return m20 / true_values.shape[0]

In [11]:
def fractional_bias(true_values: np.ndarray, predict_values: np.ndarray) -> float:
    a_sub_p = true_values - predict_values
    a_add_p = true_values + predict_values
    return 2 * a_sub_p.sum() / a_add_p.sum()

# 1 Import Data #

In [12]:
x_original_tar, y_tar, x_name, y_name = load_data(path_tar, col_tar, c_index=True)
x_original_s, ys = load_data(path_sou, col_sou)
ids = load_data_line(path_tar, 'A').tolist()
feature_names = ['$A_{core}$', "$f_{co}'$", '$d_t$', '$s_t$', '$E_t$', '$f_{fu}$', '$Shape$']
feature_units = ['(mm$^2$)', '(MPa)', '(mm)', '(mm)', '(GPa)', '(MPa)', '']
label_name = '$f_{cc}\'$'
reg_names = ['RF', 'Extra Trees', 'AdaBoost.R2', 'XGBoost',
             'TrAdaBoost.R2', 'Two-stage TrAdaBoost.R2', 'proposed model']
reg_names_simplified = ['RF', 'ET', 'AD', 'XG', 'TA', 'TS', 'PM']
metric_names = ['R2TR', 'R2', 'RMSE', 'MAE', 'MAPE', 'A20', 'FB']
rec_space = list(product(metric_names, reg_names_simplified))
rec_cols = ['RS']
rec_cols.extend([i[1] + '_' + i[0] for i in rec_space])
df = pd.DataFrame(columns=rec_cols)

# 2 Monte Carlo Simulation #

In [13]:
rs_initial, rs_steps = 0, 1500
for random_state in trange(rs_initial, rs_steps):
    # 2.1 Define Models
    np.random.seed(random_state)
    learner = RandomForestRegressor(n_estimators=200, max_features=7, max_depth=12,
                                    criterion='absolute_error', random_state=random_state)
    base_learner = DecisionTreeRegressor(max_depth=15, criterion='friedman_mse', random_state=random_state)
    regs = [RandomForestRegressor(n_estimators=200, max_features=7, max_depth=11,
                                  criterion='poisson', random_state=random_state),
            ExtraTreesRegressor(n_estimators=200, max_features=6, max_depth=5,
                                criterion='poisson', random_state=random_state),
            AdaBoostRegressor(n_estimators=200, learning_rate=0.7740, loss='linear', random_state=random_state),
            xgboost.XGBRegressor(n_estimators=200, max_depth=15, learning_rate=1.5583,
                                 reg_alpha=21.469, reg_lambda=446.24),
            #  Transfer
            TrAdaBoostR(base_learner),
            TwoStageTrAdaboost(),
            ProposedAlg(base_learner=base_learner, boost_learner=learner)
            ]
    # 2.2 Normalization
    sc = Scaler()
    sc.fit(x_original_tar)
    x_tar, xs = sc.transform(x_original_tar), sc.transform(x_original_s)
    # 2.3 Train-Test set spilt
    xt, x_test, yt, y_test = train_test_split(x_tar, y_tar, test_size=0.2, random_state=random_state)
    ids_train, ids_test, _, _ = train_test_split(ids, y_tar, test_size=0.2, random_state=random_state)
    # 2.4 Model Training
    p_trains, p_tests = [], []
    metrics = []
    for i in range(len(regs)):
        if regs[i] is None:
            continue
        reg = regs[i]
        if i <= 3:  # 0, 1, 2, 3
            reg.fit(xt, yt)
        elif i == 5:  # 5
            x_train, x_valid, y_train, y_valid = train_test_split(xt, yt,
                                                                  test_size=0.2, random_state=random_state)
            reg.fit(xs, ys, x_train, y_train)
            reg.valid(x_valid, y_valid)
        else:  # 4, 6
            reg.fit(xs, ys, xt, yt)
        p_train, p_test = reg.predict(xt), reg.predict(x_test)
        p_trains.append(p_train)
        p_tests.append(p_test)
        # Metrics
        metric = [r2s(yt, p_train),
                  r2s(y_test, p_test),
                  mean_squared_error(y_test, p_test) ** 0.5,
                  mean_absolute_error(y_test, p_test),
                  mean_absolute_percentage_error(y_test, p_test),
                  a20_index(y_test, p_test),
                  fractional_bias(y_test, p_test)]
        metrics.append(metric)
    metrics_flatten = [random_state]
    for i in range(len(metrics)):
        for j in range(len(reg_names)):
            metrics_flatten.append(metrics[j][i])
    df.loc[len(df)] = metrics_flatten
    # 2.5 Record to Excel
df.to_excel(save_path + 'RS = {0} to {1}.xlsx'.format(rs_initial, rs_steps))

100%|██████████████████████████████████████████████████████████████████████████| 1500/1500 [5:13:16<00:00, 12.53s/it]


# 3 Get Metrics #

In [14]:
mc_results_path = save_path + 'RS = {0} to {1}.xlsx'.format(rs_initial, rs_steps)

In [15]:
df = pd.DataFrame(columns=['Mean', 'Left CI', 'Right CI'])
r2 = ['J', 'K', 'L', 'M', 'N', 'O', 'P']

for i in r2:
    pm_r2 = np.array(load_data_line(mc_results_path, i))
    pm_r2_sorted, pm_length = np.sort(pm_r2), pm_r2.shape[0]
    left_index, right_index = int(pm_length * 2.5 / 100), int(pm_length * 97.5 / 100)
    confidence = [pm_r2_sorted.mean(), pm_r2_sorted[left_index], pm_r2_sorted[right_index]]
    df.loc[len(df)] = confidence
df = df.rename(index=dict(zip(list(range(7)), reg_names)))

In [16]:
df

Unnamed: 0,Mean,Left CI,Right CI
RF,0.813155,0.612408,0.925509
Extra Trees,0.817259,0.61517,0.923669
AdaBoost.R2,0.793693,0.584316,0.90629
XGBoost,0.794286,0.598201,0.904228
TrAdaBoost.R2,0.791583,0.547868,0.914719
Two-stage TrAdaBoost.R2,0.731366,0.460023,0.885549
proposed model,0.826906,0.652269,0.923808


In [17]:
df.round(4)

Unnamed: 0,Mean,Left CI,Right CI
RF,0.8132,0.6124,0.9255
Extra Trees,0.8173,0.6152,0.9237
AdaBoost.R2,0.7937,0.5843,0.9063
XGBoost,0.7943,0.5982,0.9042
TrAdaBoost.R2,0.7916,0.5479,0.9147
Two-stage TrAdaBoost.R2,0.7314,0.46,0.8855
proposed model,0.8269,0.6523,0.9238
