<h1>Gradient Boosting</h1>

In [23]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import copy
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import r2_score

from IPython.display import HTML

In [19]:
class DecisionTreeRegressor_:
    '''
    DecisionTreeRegressor_ grows a regression tree with the CART algorithm.
    Candidate splits are scored by the summed squared error of both children.
    *****************************************************************************************
    Attributes:
    max_depth - int or np.inf; maximum depth of the tree (np.inf means unlimited)
    min_samples_split - int; a group smaller than this becomes a leaf instead of being split
    node - dict; root of the fitted tree, set by fit
    *****************************************************************************************
    Methods:
    make_dataset - stack X and y into one matrix (target in the last column)
    MSE - sum of squared errors around the mean of every group
    test_split - split the dataset into two groups with a threshold
    get_split - find the best split using the lowest criterion value
    to_terminal - make a leaf value (mean target of the group)
    split - build the tree, recursively
    fit - start the tree building
    print_tree - print the tree
    predict_row - predict one row of data using the tree
    predict - predict the whole data
    '''
    # np.inf instead of np.infty: the np.infty alias was removed in NumPy 2.0.
    def __init__(self, max_depth=np.inf, min_samples_split=2):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split

    def make_dataset(self, X, y):
        '''Return X with y appended as the last column; accepts any array-likes.'''
        return np.concatenate((np.asarray(X), np.asarray(y).reshape(-1, 1)), axis=1)

    def MSE(self, groups):
        '''
        Return the split criterion for groups.

        Despite the name this is not a mean: it is the plain (unweighted) sum of squared
        deviations of the targets from their group mean, added up over all groups.
        Empty groups contribute nothing.
        '''
        criterion = 0
        for group in groups:
            if len(group) == 0:
                continue
            targets = group[:, -1]
            criterion += np.sum((np.mean(targets) - targets) ** 2)
        return criterion

    def test_split(self, index, value, dataset):
        '''
        Partition dataset on column index.

        Returns (left, right): rows with dataset[:, index] < value go left, all other
        rows (including NaN comparisons) go right. Row order is kept.
        '''
        dataset = np.asarray(dataset)
        if dataset.size == 0:
            # Nothing to partition; also covers a 1-D empty array that has no columns.
            return dataset.copy(), dataset.copy()
        goes_left = dataset[:, index] < value
        return dataset[goes_left], dataset[~goes_left]

    def get_split(self, dataset):
        '''
        Try every observed value of every feature as a threshold and keep the best one.

        Returns a dict with the keys 'index' (feature column), 'value' (threshold) and
        'groups' (the (left, right) pair produced by that split).
        '''
        dataset = np.asarray(dataset)
        b_index, b_value, b_score, b_groups = None, None, np.inf, None
        MSE_history = []
        for index in range(dataset.shape[1] - 1):  # last column is the target
            for value in dataset[:, index]:
                groups = self.test_split(index, value, dataset)
                criterion = self.MSE(groups)
                MSE_history.append(criterion)
                # '<=' keeps the LAST candidate among ties; do not change, it decides
                # which of several equally good splits ends up in the tree.
                if criterion <= b_score:
                    b_index, b_value, b_score, b_groups = index, value, criterion, groups
        if len(set(MSE_history)) == 1:
            # Every candidate scored the same, so splitting gains nothing. Force an empty
            # left group; split() turns a node with an empty side into a leaf.
            b_groups = self.test_split(0, np.min(dataset[:, 0]), dataset)
        left_group = b_groups[0]
        if len(left_group) > 0:
            # Move the threshold halfway between the largest value sent left and the
            # chosen value (the smallest value sent right).
            b_value = (b_value + np.max(left_group[:, b_index])) / 2
        return {'index': b_index, 'value': b_value, 'groups': b_groups}

    def to_terminal(self, group):
        '''Return the leaf prediction for group: the mean of its target column.'''
        return np.mean(group[:, -1])

    def split(self, node, depth):
        '''
        Recursively replace node['groups'] with 'left' and 'right' children.

        A child is either a nested node dict or a leaf value. depth is the depth of node
        itself, with the root at depth 1.
        '''
        left, right = node.pop('groups')
        if len(left) == 0 or len(right) == 0:
            # Degenerate split: both branches share one leaf built from all the rows.
            node['left'] = node['right'] = self.to_terminal(np.concatenate((left, right)))
            return
        if depth >= self.max_depth:
            node['left'], node['right'] = self.to_terminal(left), self.to_terminal(right)
            return
        for side, group in (('left', left), ('right', right)):
            if len(group) < self.min_samples_split:
                node[side] = self.to_terminal(group)
            else:
                node[side] = self.get_split(group)
                self.split(node[side], depth + 1)

    def fit(self, X, y):
        '''Build the tree from features X and targets y and store its root in self.node.'''
        train = self.make_dataset(X, y)
        root = self.get_split(train)
        self.split(root, 1)
        self.node = root

    def print_tree(self, node, depth=0):
        '''Print the subtree below node, one line per split or leaf, marking depth with ">".'''
        if isinstance(node, dict):
            print('{0}[X[{1}] < {2}]'.format(depth * '>', node['index'], np.round(node['value'], 3)))
            self.print_tree(node['left'], depth+1)
            self.print_tree(node['right'], depth+1)
        else:
            print('{0}[{1}]'.format('   ' + depth * '>', node))

    def predict_row(self, node, row):
        '''Walk from node down to a leaf following row's feature values; return the leaf.'''
        while isinstance(node, dict):
            node = node['left'] if row[node['index']] < node['value'] else node['right']
        return node

    def predict(self, X):
        '''Return a 1-D float array with one prediction per row of X.'''
        rows = np.asarray(X)
        # Build the result in one pass; np.append inside a loop copies the array each time.
        return np.array([self.predict_row(self.node, row) for row in rows], dtype=float)

In [20]:
class GradientBoostingRegressor:
    '''
    GradientBoostingRegressor class is used to perform gradient boosting on trees.
    Every tree is fitted to the gradient of the squared loss (prediction minus target)
    of the ensemble built so far, and the ensemble takes a step of size lr against it.
    The ensemble starts from an all-zero approximation.
    *****************************************************************************************
    Attributes:
    lr - learning rate
    n_iter - number of iterations (one tree per iteration)
    model - unfitted DecisionTreeRegressor template built from model_args
    models - list of n_iter independent copies of model, fitted by fit
    *****************************************************************************************
    Keyword arguments (model_args) are forwarded unchanged to DecisionTreeRegressor, e.g.:
    max_depth - int; adjusts the depth of the tree
    min_samples_split - int; set the minimum size to split
    *****************************************************************************************
    Methods:
    fit - build the algorithm
    predict - predict the whole data
    '''
    def __init__(self, lr=0.1, n_iter=100, **model_args):
        self.lr = lr
        self.n_iter = n_iter
        self.model = DecisionTreeRegressor(**model_args)
        # Deep copies so every boosting stage owns its own estimator.
        self.models = [copy.deepcopy(self.model) for _ in range(n_iter)]

    def fit(self, x, y):
        '''
        Fit the stages one after another on x (n_samples rows) and y (n_samples targets).

        y may be any array-like (list, column vector, pandas Series); it is flattened to
        1-D. x is handed to the base models unchanged. Returns self.
        '''
        target = np.asarray(y).reshape(-1)
        approximation = np.zeros(np.shape(x)[0])

        for model in self.models:
            # Gradient of 0.5 * (target - approximation)^2 w.r.t. the approximation.
            grad = -(target - approximation)

            model.fit(x, grad)
            # Step against the predicted gradient.
            approximation -= self.lr * model.predict(x)
        return self

    def predict(self, x):
        '''Return the ensemble prediction for x: minus lr times the sum of stage outputs.'''
        approximation = np.zeros(np.shape(x)[0])
        for model in self.models:
            approximation -= self.lr * model.predict(x)
        return approximation

    def __repr__(self):
        return f'GradientBoostingRegressor(lr={self.lr}, n_iter={self.n_iter})'

In [21]:
# Load the Boston housing data and hold out 33% of the rows for testing; the fixed
# random_state makes the split reproducible.
# NOTE(review): load_boston was removed in scikit-learn 1.2, so this cell (and the
# matching import) only runs on older scikit-learn releases.
data = load_boston()
x, y = data['data'], data['target']
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.33,
    random_state=42,
)

In [53]:
# Boosted ensemble on the training split: n_iter keeps its default (100 stages) and
# max_depth=3 is forwarded to every base tree.
boosting = GradientBoostingRegressor(lr=0.1, max_depth=3)
# fit returns the estimator itself, so its repr is shown as the cell output.
boosting.fit(x_train, y_train)

GradientBoostingRegressor(lr=0.1, n_iter=100)

In [54]:
# R^2 of the boosted ensemble on the held-out test split.
print(f'{r2_score.__name__}: {r2_score(y_test, boosting.predict(x_test))}')

r2_score: 0.887289369399193


In [55]:
# Single hand-written CART tree with the same depth limit as the boosted base trees.
my_tree = DecisionTreeRegressor_(max_depth=3)
my_tree.fit(x_train, y_train)

In [56]:
# R^2 of the hand-written tree on the held-out test split.
print(f'{r2_score.__name__}: {r2_score(y_test, my_tree.predict(x_test))}')

r2_score: 0.6860847825591382


In [57]:
# scikit-learn tree with the same depth limit, used as a reference for the
# hand-written implementation above.
skl_tree = DecisionTreeRegressor(max_depth=3)
skl_tree.fit(x_train, y_train)

DecisionTreeRegressor(max_depth=3)

In [58]:
# R^2 of the scikit-learn tree on the held-out test split, for comparison with my_tree.
print(f'{r2_score.__name__}: {r2_score(y_test, skl_tree.predict(x_test))}')

r2_score: 0.6860847825591381


In [4]:
# Read ./style.css (path is relative to the kernel's working directory) and render its
# text as HTML output; presumably the file wraps its rules in a <style> tag - verify.
with open('./style.css', 'r') as f:
    style = f.read()
HTML(style)