<h1>Gradient Boosting</h1>

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import copy
from sklearn.datasets import load_boston, make_classification
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import r2_score, accuracy_score
from sklearn.preprocessing import OneHotEncoder

from IPython.display import HTML
import warnings
warnings.filterwarnings('ignore')

In [2]:
class DecisionTreeRegressor_:
    '''
    DecisionTreeRegressor_ class is used to perform the CART algorithm for a Decision Tree Regressor.
    *****************************************************************************************
    Attributes:
    max_depth - int; adjusts the depth of the tree (unlimited by default)
    min_samples_split - int; set the minimum size to split
    node - dict; root of the fitted tree, available after fit
    *****************************************************************************************
    Methods:
    make_dataset - from X and y makes one matrix
    MSE - count the split criterion (sum of squared deviations)
    test_split - split the dataset into two groups with the threshold
    get_split - find the best split using the best criterion value
    to_terminal - make the final node
    split - build the tree, recursively
    fit - starts the tree building
    print_tree - print the tree
    predict_row - predict the row of data using the tree
    predict - predict the whole data
    '''
    # np.inf is used instead of the np.infty alias, which NumPy 2.0 removed.
    def __init__(self, max_depth=np.inf, min_samples_split=2):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split

    def make_dataset(self, X, y):
        '''
        Append y as the last column of X so that rows carry their own target.
        '''
        return np.concatenate((X, y.reshape(-1, 1)), axis=1)

    def MSE(self, groups):
        '''
        Split criterion: squared deviation of the targets (last column) from their
        group mean, summed over all non-empty groups. Despite the name this is a
        plain sum of squared residuals, it is not divided by the group sizes.
        '''
        criterion = 0
        for group in groups:
            if len(group) == 0:
                continue
            targets = group[:, -1]
            criterion += np.sum((np.mean(targets) - targets) ** 2)
        return criterion

    def test_split(self, index, value, dataset):
        '''
        Send rows with row[index] < value to the left group and the rest to the right.
        Returns the two groups as arrays; an empty group is an empty 1-D array.
        '''
        left, right = list(), list()
        for row in dataset:
            if row[index] < value:
                left.append(row)
            else:
                right.append(row)
        return np.array(left), np.array(right)

    def get_split(self, dataset):
        '''
        Try every value of every feature column as a threshold and keep the one with
        the smallest criterion (ties go to the candidate examined last).
        Returns a dict with keys 'index', 'value' and 'groups'.
        '''
        b_index, b_value, b_score, b_groups = None, None, np.inf, None
        seen_scores = set()
        for index in range(len(dataset[0]) - 1):
            for row in dataset:
                groups = self.test_split(index, row[index], dataset)
                criterion = self.MSE(groups)
                seen_scores.add(criterion)
                if criterion <= b_score:
                    b_index, b_value, b_score, b_groups = index, row[index], criterion, groups
        if len(seen_scores) == 1:
            # Every candidate scored the same, so no split helps: put all rows in the
            # right group (nothing is below the column minimum) and let split() turn
            # this node into a leaf.
            b_groups = self.test_split(0, np.min(dataset[:, 0]), dataset)
        if len(b_groups[0]) > 0:
            # Move the threshold to the midpoint between the chosen value and the
            # largest left-group value of that feature. With an empty left group
            # there is no such value and the threshold is kept as is.
            b_value = (b_value + np.max(b_groups[0][:, b_index])) / 2
        return {'index':b_index, 'value':b_value, 'groups':b_groups}

    def to_terminal(self, group):
        '''
        Leaf value: mean of the targets (last column) of the group.
        '''
        outcomes = group[:, -1]
        return np.mean(outcomes)

    def split(self, node, depth):
        '''
        Grow the tree below node in place. The node's 'groups' entry is consumed and
        replaced by 'left' / 'right', each being either a child dict or a leaf value.
        '''
        left, right = node['groups']
        del(node['groups'])
        # One side is empty: both branches get the mean of all rows that reached the node.
        if len(left) == 0 or len(right) == 0:
            node['left'] = node['right'] = self.to_terminal(np.array(left.tolist() + right.tolist()))
            return
        # Depth limit reached: close both sides with leaves.
        if depth >= self.max_depth:
            node['left'], node['right'] = self.to_terminal(left), self.to_terminal(right)
            return
        # Otherwise each side is a leaf when it is too small to split, else it recurses.
        if len(left) < self.min_samples_split:
            node['left'] = self.to_terminal(left)
        else:
            node['left'] = self.get_split(left)
            self.split(node['left'], depth+1)
        if len(right) < self.min_samples_split:
            node['right'] = self.to_terminal(right)
        else:
            node['right'] = self.get_split(right)
            self.split(node['right'], depth+1)

    def fit(self, X, y):
        '''
        Build the tree from X and y and store its root in self.node.
        The root is handled as depth 1.
        '''
        train = self.make_dataset(X, y)
        root = self.get_split(train)
        self.split(root, 1)
        self.node = root

    def print_tree(self, node, depth=0):
        '''
        Print the subtree below node, one line per node, marking depth with '>'.
        '''
        if isinstance(node, dict):
            print('{0}[X[{1}] < {2}]'.format(depth * '>', node['index'], np.round(node['value'], 3)))
            self.print_tree(node['left'], depth+1)
            self.print_tree(node['right'], depth+1)
        else:
            print('{0}[{1}]'.format('   ' + depth * '>', node))

    def predict_row(self, node, row):
        '''
        Walk from node down to a leaf following the thresholds and return the leaf value.
        '''
        branch = 'left' if row[node['index']] < node['value'] else 'right'
        child = node[branch]
        if isinstance(child, dict):
            return self.predict_row(child, row)
        return child

    def predict(self, X):
        '''
        Predict every row of X with the fitted tree; returns a 1-D array.
        '''
        # Collect first and build the array once instead of re-allocating it per row.
        return np.array([self.predict_row(self.node, row) for row in X])

In [3]:
class GradientBoostingRegressor:
    '''
    GradientBoostingRegressor class is used to perform gradient boosting on regression trees.
    *****************************************************************************************
    Attributes:
    lr - learning rate; scales the contribution of every tree
    n_iter - number of iterations (one tree per iteration)
    model - DecisionTreeRegressor prototype built from model_args
    models - list with n_iter independent deep copies of the prototype
    *****************************************************************************************
    Methods:
    fit - train the trees one after another
    predict - predict the whole data
    '''
    def __init__(self, lr=0.1, n_iter=100, **model_args):
        self.lr = lr
        self.n_iter = n_iter
        # model_args (e.g. max_depth, min_samples_split) go straight to the tree constructor
        self.model = DecisionTreeRegressor(**model_args)
        self.models = [copy.deepcopy(self.model) for _ in range(n_iter)]

    def fit(self, x, y):
        '''
        Fit every tree on the negated residuals of the running prediction, which
        starts at zero, and move the prediction by lr times the tree output.
        Returns self.
        '''
        targets = y.reshape(-1)
        running = np.zeros(x.shape[0])

        for tree in self.models:
            negated_residuals = -(targets - running)
            tree.fit(x, negated_residuals)
            # the tree learned the negated residuals, hence the subtraction
            running -= self.lr * tree.predict(x)
        return self

    def predict(self, x):
        '''
        Replay the updates of all trees starting from zero; returns a 1-D array.
        '''
        total = np.zeros(x.shape[0])
        for tree in self.models:
            total -= self.lr * tree.predict(x)
        return total

    def __repr__(self):
        return f'GradientBoostingRegressor(lr={self.lr}, n_iter={self.n_iter})'

In [4]:
# Load the Boston housing data and hold out 33% of the rows for testing;
# random_state=42 fixes the split so the scores below are repeatable.
# NOTE(review): load_boston is not available in recent scikit-learn releases -
# confirm the version this notebook is pinned to.
data = load_boston()
x = data['data']
y = data['target']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=42)

In [5]:
# n_iter is left at the class default; max_depth=3 is forwarded to every tree
# through **model_args.
boosting = GradientBoostingRegressor(lr=0.1, max_depth=3)
boosting.fit(x_train, y_train)

GradientBoostingRegressor(lr=0.1, n_iter=100)

In [6]:
# R^2 of the boosted ensemble on the held-out test rows.
print(f'{r2_score.__name__}: {r2_score(y_test, boosting.predict(x_test))}')

r2_score: 0.8828142228476458


In [7]:
# Baseline: a single hand-written CART tree (DecisionTreeRegressor_) with the
# same depth limit as the boosted trees.
my_tree = DecisionTreeRegressor_(max_depth=3)
my_tree.fit(x_train, y_train)

In [8]:
# R^2 of the single hand-written tree on the held-out test rows.
print(f'{r2_score.__name__}: {r2_score(y_test, my_tree.predict(x_test))}')

r2_score: 0.6860847825591382


In [9]:
# Reference: scikit-learn's tree with the same depth limit, used to cross-check
# the hand-written implementation.
skl_tree = DecisionTreeRegressor(max_depth=3)
skl_tree.fit(x_train, y_train)

In [10]:
# R^2 of the scikit-learn tree on the held-out test rows.
print(f'{r2_score.__name__}: {r2_score(y_test, skl_tree.predict(x_test))}')

r2_score: 0.6860847825591382


## Classification

In [11]:
# Synthetic 4-class problem. random_state pins the generated dataset, so that a
# fresh Restart & Run All reproduces the same data (the split below is seeded too).
x, y = make_classification(n_samples=2000, n_features=5, n_classes=4, n_clusters_per_class=1,
                           random_state=42)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# The classifier below reads the number of classes from y.shape[1], so the
# integer labels are replaced by dense one-hot rows. The encoder is fitted on
# the train labels only and reused for the test labels.
ohe = OneHotEncoder(sparse=False)
y_train = ohe.fit_transform(y_train.reshape(-1, 1))
y_test = ohe.transform(y_test.reshape(-1, 1))

In [12]:
def softmax(z):
    '''
    Softmax function, z - matrix with shape [n_objects; n_classes]

    Returns a matrix of the same shape whose rows sum to 1.
    The row maximum is subtracted before exponentiating: softmax is invariant to
    a per-row shift, and the shift keeps np.exp from overflowing to inf (which
    would turn the row into NaN) when the scores are large.
    '''
    z = np.asarray(z)
    shifted = z - np.max(z, axis=1, keepdims=True)
    exp_z = np.exp(shifted)
    return exp_z / np.sum(exp_z, axis=1, keepdims=True)

In [13]:
class GradientBoostingClassifier:
    '''
    GradientBoostingClassifier class is used to perform multiclass gradient boosting on trees.
    Every iteration fits one regression tree per class on that class' column of
    (softmax(scores) - y) and moves the raw scores against the tree outputs.
    *****************************************************************************************
    Attributes:
    lr - learning rate
    n_iter - number of iterations
    model - DecisionTreeRegressor prototype built from model_args, deep-copied for every tree
    models - dict; models[i][class_] is the tree fitted at iteration i for class class_
    n_classes - int; number of classes, read from the one-hot targets in fit
    *****************************************************************************************
    Methods:
    loss - cross-entropy of predicted probabilities
    fit - build the algorithm
    predict - predict class probabilities for the whole data
    '''
    def __init__(self, lr=0.1, n_iter=100, **model_args):
        self.lr = lr
        self.n_iter = n_iter
        self.model = DecisionTreeRegressor(**model_args) # yes, we build regressor
        self.models = dict()
        self.n_classes = None

    def loss(self, y_true, a_pred):
        '''
        Compute cross-entropy loss (summed over objects, not averaged).
        y_true - integer class labels with shape [n_objects]
        a_pred - matrix of probabilities with shape [n_objects; n_classes]
        '''
        labels = np.asarray(y_true, dtype=np.intp)
        probs = np.asarray(a_pred)
        # probability assigned to the true class of every object
        picked = probs[np.arange(labels.shape[0]), labels]
        return -np.sum(np.log(picked))

    def _stage_predictions(self, stage, x):
        '''
        Raw outputs of the trees of one iteration, shape [n_objects; n_classes].
        '''
        preds = np.zeros((x.shape[0], self.n_classes))
        for class_ in range(self.n_classes):
            preds[:, class_] = stage[class_].predict(x)
        return preds

    def fit(self, x, y, test_set=None, echo=True):
        '''
        Train n_iter * n_classes trees.
        x - feature matrix with shape [n_objects; n_features]
        y - one-hot targets with shape [n_objects; n_classes]
        test_set - optional (x_test, y_test_one_hot) pair; when given and echo is
                   on, the test loss is printed after every iteration
        echo - print the train loss before every iteration
        Returns self.
        '''
        self.n_classes = y.shape[1]
        # start from scratch so that stages of an earlier fit never leak into predict
        self.models = dict()
        y_not_ohe = np.argmax(y, axis=1)

        # The test scores are accumulated stage by stage instead of re-running
        # every stored tree on each iteration, and only when they get printed.
        track_test = bool(echo) and test_set is not None
        if track_test:
            x_t, y_t = test_set
            y_t = np.argmax(y_t, axis=1)
            test_approximation = np.zeros((x_t.shape[0], self.n_classes))

        # raw (pre-softmax) score for each object and class
        approximation = np.zeros((x.shape[0], self.n_classes))
        for i in range(self.n_iter):
            if echo:
                print(f'LOSS {i+1}: {self.loss(y_not_ohe, softmax(approximation))}')

            # because we build models independently, we need to normalize their preds with softmax
            grad = -(y - softmax(approximation))

            # in multiclass classification we need to build n models for n classes
            stage = dict()
            preds = np.zeros(shape=approximation.shape)
            for class_ in range(self.n_classes):
                model = copy.deepcopy(self.model)
                model.fit(x, grad[:, class_]) # fit model on residuals
                preds[:, class_] = model.predict(x)
                stage[class_] = model

            # save models for iteration
            self.models[i] = stage
            approximation -= self.lr * preds

            if track_test:
                test_approximation -= self.lr * self._stage_predictions(stage, x_t)
                # softmax is applied exactly once, to the raw scores
                print(f'TEST LOSS {i+1}: {self.loss(y_t, softmax(test_approximation))}')

        return self

    def predict(self, x):
        '''
        Class probabilities with shape [n_objects; n_classes]: the raw scores are
        rebuilt from all stored iterations and passed through softmax.
        '''
        approximation = np.zeros((x.shape[0], self.n_classes))
        for stage in self.models.values():
            approximation -= self.lr * self._stage_predictions(stage, x)
        return softmax(approximation)

    def __repr__(self):
        return f'GradientBoostingClassifier(lr={self.lr}, n_iter={self.n_iter})'

In [14]:
# max_depth=4 is forwarded to every tree through **model_args.
# NOTE(review): nothing in the notebook explains why n_iter is 463 - confirm or document.
boosting_clf = GradientBoostingClassifier(lr=0.1, n_iter=463, max_depth=4)

In [15]:
# Fits one tree per class on every iteration; echo defaults to True, so the
# training loss is printed once per iteration.
boosting_clf.fit(x_train, y_train)

LOSS 1: 2218.0709777917327
LOSS 2: 2124.8102895477955
LOSS 3: 2036.2085162079009
LOSS 4: 1952.2448309812635
LOSS 5: 1872.8987104748887
LOSS 6: 1798.1813118926648
LOSS 7: 1727.6540558724314
LOSS 8: 1660.9182707283755
LOSS 9: 1598.4428696209434
LOSS 10: 1539.685174954901
LOSS 11: 1484.4864760676435
LOSS 12: 1432.6232764327658
LOSS 13: 1383.9251535469305
LOSS 14: 1338.2183210467865
LOSS 15: 1295.347858330585
LOSS 16: 1255.1553130076895
LOSS 17: 1217.4237880190576
LOSS 18: 1181.878876523908
LOSS 19: 1148.2829859115802
LOSS 20: 1116.3690202923265
LOSS 21: 1086.1797784393157
LOSS 22: 1058.087177581934
LOSS 23: 1031.3688740236992
LOSS 24: 1006.1449474234932
LOSS 25: 982.4357423031788
LOSS 26: 959.9761234528261
LOSS 27: 938.9562211621941
LOSS 28: 919.0371342002221
LOSS 29: 900.1024522880687
LOSS 30: 882.1913477264865
LOSS 31: 865.2992111067829
LOSS 32: 849.1008698955894
LOSS 33: 833.8201238375004
LOSS 34: 819.2888040498311
LOSS 35: 805.5193243744591
LOSS 36: 792.3546006383817
LOSS 37: 779.6766

LOSS 305: 242.1631538753679
LOSS 306: 241.64035121138681
LOSS 307: 241.09000193258592
LOSS 308: 240.30071163805326
LOSS 309: 239.773972184268
LOSS 310: 239.25521821340286
LOSS 311: 238.6278029563948
LOSS 312: 237.98767213166954
LOSS 313: 237.45410938360013
LOSS 314: 236.8865816104125
LOSS 315: 236.4352929767478
LOSS 316: 235.8564282627429
LOSS 317: 235.2845580865909
LOSS 318: 234.70891928186617
LOSS 319: 234.16428442588963
LOSS 320: 233.64803264534464
LOSS 321: 233.02812292726512
LOSS 322: 232.44792856152333
LOSS 323: 231.86458339077436
LOSS 324: 231.38013581430167
LOSS 325: 230.88295846393478
LOSS 326: 230.31422068521198
LOSS 327: 229.7520011672354
LOSS 328: 229.14034393973174
LOSS 329: 228.5738176142164
LOSS 330: 228.05964412610987
LOSS 331: 227.6617295792345
LOSS 332: 227.3079836427407
LOSS 333: 226.85639904169582
LOSS 334: 226.2757213337857
LOSS 335: 225.87159756531892
LOSS 336: 225.51386451363507
LOSS 337: 224.99790946349782
LOSS 338: 224.53236809834905
LOSS 339: 223.8655836486239

GradientBoostingClassifier(lr=0.1, n_iter=463)

In [16]:
# predict returns softmax-ed scores: one row per test object, one column per class.
test_preds = boosting_clf.predict(x_test)
test_preds

array([[0.00594921, 0.98338552, 0.00582911, 0.00483616],
       [0.00706225, 0.00943849, 0.00715206, 0.9763472 ],
       [0.95854109, 0.0091478 , 0.02357806, 0.00873304],
       ...,
       [0.00620146, 0.00964713, 0.00710539, 0.97704603],
       [0.00818807, 0.01936631, 0.00953136, 0.96291426],
       [0.00652486, 0.98134839, 0.00648617, 0.00564058]])

In [17]:
# Targets are one-hot and predictions are probability rows, so argmax over
# axis 1 turns both into class labels before scoring.
train_preds = boosting_clf.predict(x_train)
print(f'TRAIN acc: {accuracy_score(np.argmax(y_train, axis=1), np.argmax(train_preds, axis=1))}')
print(f'TEST acc: {accuracy_score(np.argmax(y_test, axis=1), np.argmax(test_preds, axis=1))}')

TRAIN acc: 0.9825
TEST acc: 0.8875


In [18]:
# Read style.css from the working directory and hand its contents to IPython's
# HTML display object as the cell result. open() raises if the file is missing.
with open('./style.css', 'r') as f:
    style = f.read()
HTML(style)