# Regression Trees
* We'll build a regression tree model from scratch using functions to find
the split that minimizes the MSE and then we'll average the leafs instead of
counting the number of classes (like in classification).
* We'll use it on a toy dataset and then we'll build a regression tree model
using scikit-learn on another notebook.
* The quality of the splitting point is measured by the weighted MSE of two
children (variance of all target values). The samller the weighted MSE, the
better the split.
![House_dataset](./dataset/toy_dataset_house_prices.png)

## Step 1: Define the MSE and weighted MSE functions

In [15]:
import numpy as np
# This calculates the single variance (MSE) for the targets
def mse(targets):
    # When the set is empty, the MSE is 0
    if targets.size == 0:
        return 0
    # Otherwise, return the variance
    return np.var(targets)

# Define the weighted MSE after a split in a node
def weighted_mse(groups):
    """
    Calculate the weighted MSE of children after a split
    """
    total = sum(len(group) for group in groups)
    weighted_sum = 0.0

    # Iterating on each group and using the mse function declared before
    for group in groups:
        weighted_sum += len(group) / float(total) * mse(group)
    return weighted_sum


In [16]:
# Testing the functions:
print(f"mse: {mse(np.array([1, 2, 3])):.4f}")
print(f"weighted mse: {weighted_mse([np.array([1, 2, 3]), np.array([1, 2])]):.4f}")


mse: 0.6667
weighted mse: 0.5000


## Step 2: Building the splitting node function.
* To build the house price regressio tree, we exhaust all possible pairs of
feature and value and we compute the corresponding MSE.
* Once we're satisfied with the nodes, we calculate the averaged target price
 for each leaf
![house_split](dataset/toy_dataset_house_prices_splitting_nodes.png)

In [17]:
# Node splitting utility function
def split_node(X, y, index, value):
    """
    Split data set X, y based on a feature and a value
    @param index: index of the feature used for splitting
    @param value: value of the feature used for splitting
    @return: left and right child, a child is in the format of [X, y]
    """
    x_index = X[:, index]
    # If the feature is numerical, apply a mask based on value
    if type(X[0, index]) in [int, float]:
        mask = x_index >= value
    # If this feature is categorical:
    else:
        mask = x_index == value
    # split into left and right child
    left = [X[~mask, :], y[~mask]]
    right = [X[mask, :], y[mask]]
    return left, right

# Define the greedy search function, trying out all possible splits and
# returning the one with the least weighted MSE

def get_best_split(X, y):
    """
    Obtain the best splitting point and resulting children for the data set X, y
    @return: {index: index of the feature, value: feature value, children: left and right children}
    """
    # Using placeholders for the best index, value, score and children
    best_index, best_value, best_score, children = None, None, 1e10, None

    for index in range(len(X[0])):
        # Trying out each column
        for value in np.sort(np.unique(X[:, index])):
            # Calling the split_node to get left and right children
            groups = split_node(X, y, index, value)
            # Getting the weighted_mse from the children (left, right)
            evaluation_metric = weighted_mse([groups[0][1], groups[1][1]])

            # If we find a best score, we update all values
            if evaluation_metric < best_score:
                best_index, best_values, best_score, children = index, value,\
                                                                evaluation_metric, groups
    return {'index': best_index, 'value': best_value,
            'children': children}


## Step 3: Building the recursive function:
* Our final function is called split(). The purpose is to link it all together.
* It checks whether any stopping criteria are met and assigns the leaf node if so, or proceeds with further separation otherwise:

In [18]:
def get_leaf(targets):
    # Obtain the leaf as the mean of the targets
    return np.mean(targets)

def split(node, max_depth, min_size, depth):
    # Split children of a node to construct new nodes or assign them terminals
    # node: dictionary with children information
    # max_depth: maximal depth of the tree
    # min_size: minimal samples required to further split a child
    # depth: current depth of the node

    # Splitting node into left and right children
    left, right = node['children']
    # Delete the 'children' key from the node dictionary
    del(node['children'])

    # If left child has no samples, set the right child as the leaf node and return
    if left[1].size == 0:
        node['right'] = get_leaf(right[1])
        return

    # If right child has no samples, set the left child as the leaf node and return
    if right[1].size == 0:
        node['left'] = get_leaf(left[1])
        return

    # Check if the current depth exceeds the maximal depth
    if depth >= max_depth:
        # Set both left and right children as leaf nodes and return
        node['left'], node['right'] = get_leaf(left[1]), get_leaf(right[1])
        return

    # Check if the left child has enough samples
    if left[1].size <= min_size:
        # If not, set the left child as a leaf node
        node['left'] = get_leaf(left[1])
    else:
        # If it has enough samples, further split the left child
        result = get_best_split(left[0], left[1])
        result_left, result_right = result['children']

        # Handle the cases where one of the result's children has no samples
        if result_left[1].size == 0:
            node['left'] = get_leaf(result_right[1])
        elif result_right[1].size == 0:
            node['left'] = get_leaf(result_left[1])
        else:
            node['left'] = result
            # Recursively call the split function for the left child
            split(node['left'], max_depth, min_size, depth + 1)

    # Check if the right child has enough samples
    if right[1].size <= min_size:
        # If not, set the right child as a leaf node
        node['right'] = get_leaf(right[1])
    else:
        # If it has enough samples, further split the right child
        result = get_best_split(right[0], right[1])
        result_left, result_right = result['children']

        # Handle the cases where one of the result's children has no samples
        if result_left[1].size == 0:
            node['right'] = get_leaf(result_right[1])
        elif result_right[1].size == 0:
            node['right'] = get_leaf(result_left[1])
        else:
            node['right'] = result
            # Recursively call the split function for the right child
            split(node['right'], max_depth, min_size, depth + 1)


In [19]:
# The entry point of the regression tree construction is as follows:

def train_regression_tree(X_train, y_train, max_depth, min_size):
    # Getting a root node (the one with the smallest MSE)
    root = get_best_split(X_train, y_train)
    # Calling the recursive function to get the best tree
    split(root, max_depth, min_size, 1)
    return root



## Final step: Testing the functions

In [22]:
X_train = np.array([['semi', 3],
                    ['detached', 2],
                    ['detached', 3],
                    ['semi', 2],
                    ['semi', 4]], dtype=object)
y_train = np.array([600, 700, 800, 400, 700])

tree = train_regression_tree(X_train, y_train, 2, 2)
tree

{'index': 0,
 'value': None,
 'left': {'index': 1, 'value': None, 'left': 400.0, 'right': 650.0},
 'right': 750.0}

In [21]:
# Visualizing the tree (an auxiliary function to compare our tree with the
# one drawn by hand
CONDITION = {'numerical': {'yes': '>=', 'no': '<'},
              'categorical': {'yes': 'is', 'no': 'is not'}}
def visualize_tree(node, depth=0):
     if isinstance(node, dict):
         if type(node['value']) in [int, float]:
             condition = CONDITION['numerical']
         else:
             condition = CONDITION['categorical']
         print('{}|- X{} {} {}'.format(depth * ' ',
                  node['index'] + 1, condition['no'],
                  node['value']))
         if 'left' in node:
             visualize_tree(node['left'], depth + 1)
         print('{}|- X{} {} {}'.format(depth * ' ',
                 node['index'] + 1, condition['yes'],
                 node['value']))
         if 'right' in node:
             visualize_tree(node['right'], depth + 1)
     else:
         print('{}[{}]'.format(depth * ' ', node))
visualize_tree(tree)



|- X1 is not None
 |- X2 is not None
  [400.0]
 |- X2 is None
  [650.0]
|- X1 is None
 [750.0]
