gradient_boosting.py
import numpy as np
from numpy_ml.utils import to_categorical
from ..deep_learning.loss_functions import SquaredLoss, BinomialDeviance
from .decision_tree import RegressionTree
# Reference: https://explained.ai/gradient-boosting/
#
# Why does a GBM have a learning rate?
# Answer: It shrinks the contribution of each successive tree so the ensemble can
# settle on a more precise value. Suppose the optimal value is 5.23 and each tree
# predicts a step of 1. With learning_rate = 1 the closest we can land is 5 or 6;
# with learning_rate = 0.1 we can reach 5.2; with learning_rate = 0.01 we can reach
# 5.23, the optimum.
#
# Why does a GBM fit the gradient instead of the residuals?
# Answer: The idea of boosting is that each new model corrects the mistakes made by
# the previous weak learners. With squared-error loss, after t-1 iterations the
# objective for the t-th model is
#     obj_t = [y - (y_pred_(t-1) + f_t)]^2
#           = constant + [2 * f_t * (y_pred_(t-1) - y) + f_t^2],
# a quadratic in f_t minimized at f_t = y - y_pred_(t-1), i.e. the negative gradient
# of the loss with respect to the current prediction (for squared error the negative
# gradient coincides with the residual up to a constant factor; for other losses the
# negative gradient replaces the residual).
#
# How to understand that the gradient predicted by each new tree is subtracted from
# the previous predictions?
# Answer: Subtracting the gradient (equivalently, adding the tree's fit of the
# negative gradient) is a steepest-descent step in function space, so it gives the
# largest local improvement in the prediction.
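# The shrinkage intuition above can be checked numerically. The helper below is a
# hypothetical illustration only (it is not part of the estimator API): each "tree"
# contributes a unit step toward a fixed target, scaled by the learning rate.
def _shrinkage_demo(target=5.23, learning_rate=0.01, max_steps=10000):
    """Return the closest value reachable by unit steps scaled by learning_rate."""
    pred = 0.0
    for _ in range(max_steps):
        if abs(target - pred) < learning_rate:
            break  # within one scaled step of the target; cannot get any closer
        pred += learning_rate if target > pred else -learning_rate
    return pred  # ~5 for learning_rate=1, ~5.2 for 0.1, ~5.23 for 0.01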
class GradientBoosting(object):
"""
Super class of GradientBoostingClassifier and GradientBoostingRegressor. Use
a collection of regression trees that trains on predicting the gradient of
the loss function
Parameters
---------------------------
n_estimators: int
The number of classification trees that are used.
learning_rate: float
The step length that will be taken when following the negative gradient
during training
min_samples_split: int
The minimum number of samples needed to make a split when building a tree
min_impurity: float
The minimum impurity required to further split the tree
max_depth: int
The maximum depth of a tree
regression: boolean
True if it is a regression problem otherwise False
subsample: float (shoud be in (0,1])
The fraction of samples to be used for fitting the individual base learners.
random_state: int
The random seed for subsample
"""
    def __init__(self, n_estimators, learning_rate, min_samples_split, min_impurity,
                 max_depth, regression, subsample=1, max_features=1, random_state=0):
        self.n_estimators = n_estimators
        self.learning_rate = learning_rate
        self.min_samples_split = min_samples_split
        self.min_impurity = min_impurity
        self.max_depth = max_depth
        self.regression = regression
        self.subsample = subsample
        self.max_features = max_features
        self.random_state = random_state
        assert 0 < self.subsample <= 1
        assert 0 < self.max_features <= 1
        # Squared loss for regression, binomial deviance (log loss) for classification
        if self.regression:
            self.loss = SquaredLoss()
        else:
            self.loss = BinomialDeviance()
        # Initialize the ensemble of regression trees
        self.tree_list = []
        for _ in range(self.n_estimators):
            tree = RegressionTree(min_samples_split=self.min_samples_split,
                                  min_impurity=self.min_impurity,
                                  max_depth=self.max_depth,
                                  max_features=self.max_features,
                                  random_state=self.random_state)
            self.tree_list.append(tree)
    def fit(self, X, y):
        if self.subsample < 1:
            # Stochastic gradient boosting: fit on a random fraction of the samples,
            # drawn without replacement
            np.random.seed(self.random_state)
            index = np.random.choice(len(X), int(self.subsample * len(X)), replace=False)
            X = X[index]
            y = y[index]
        # Initialize predictions with the mean of y; for classification, convert the
        # positive-class prior into log-odds
        y_pred = np.full(np.shape(y), np.mean(y))
        if not self.regression:
            y_pred = np.log(y_pred / (1 - y_pred))
        for i in range(self.n_estimators):
            gradient = self.loss.negative_gradient(y, y_pred)
            # Each tree fits the negative gradient rather than the raw residual
            # why negative gradient? https://datascience.stackexchange.com/a/56040
            self.tree_list[i].fit(X, gradient)
            # Update leaf values using a line search
            # ref: https://stats.stackexchange.com/questions/330849/how-do-newton-raphson-updates-work-in-gradient-boosting
            self.loss.update_terminal_region(X, y, gradient, self.tree_list[i])
            update = self.tree_list[i].predict(X)
            # The tree was fit on the negative gradient, so adding its (scaled)
            # prediction is a gradient-descent step on the loss
            y_pred += np.multiply(self.learning_rate, update)
    def predict(self, X):
        raise NotImplementedError('predict is not implemented yet')
        # y_pred = np.array([])
        # for tree in self.tree_list:
        #     update = tree.predict(X)
        #     update = np.multiply(self.learning_rate, update)
        #     y_pred = update if not y_pred.any() else y_pred + update
        #
        # if not self.regression:
        #     # this is softmax
        #     y_pred = np.exp(y_pred) / np.expand_dims(np.sum(np.exp(y_pred), axis=1), axis=1)
        #     # Set label to the value that maximizes probability
        #     y_pred = np.argmax(y_pred, axis=1)
        #
        # return y_pred
    def staged_decision_function(self, X):
        """Compute the decision function of X after each boosting iteration."""
        y_pred = np.array([])
        staged_pred = []
        for tree in self.tree_list:
            update = np.multiply(self.learning_rate, tree.predict(X))
            # Accumulate the scaled contribution of each tree
            y_pred = update if y_pred.size == 0 else y_pred + update
            staged_pred.append(y_pred)
        return np.array(staged_pred)
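# predict() above is left unimplemented. For the binary-classification case handled by
# BinomialDeviance, the decision values returned by staged_decision_function can be
# mapped to positive-class probabilities with the logistic sigmoid. The helper below is
# a hypothetical convenience sketch, not part of the original API.
def _decision_to_proba(decision_values):
    """Map log-odds decision values to probabilities via the logistic sigmoid."""
    return 1.0 / (1.0 + np.exp(-decision_values))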
class GradientBoostingRegressor(GradientBoosting):
    def __init__(self, n_estimators=200, learning_rate=0.5, min_samples_split=2,
                 min_var_red=1e-7, max_depth=4):
        super(GradientBoostingRegressor, self).__init__(n_estimators=n_estimators,
                                                        learning_rate=learning_rate,
                                                        min_samples_split=min_samples_split,
                                                        min_impurity=min_var_red,
                                                        max_depth=max_depth,
                                                        regression=True)
class GradientBoostingClassifier(GradientBoosting):
    def __init__(self, n_estimators=200, learning_rate=.5, min_samples_split=2,
                 min_info_gain=1e-7, max_depth=2, subsample=1, max_features=1, random_state=0):
        super(GradientBoostingClassifier, self).__init__(n_estimators=n_estimators,
                                                         learning_rate=learning_rate,
                                                         min_samples_split=min_samples_split,
                                                         min_impurity=min_info_gain,
                                                         max_depth=max_depth,
                                                         regression=False,
                                                         subsample=subsample,
                                                         max_features=max_features,
                                                         random_state=random_state)

    def fit(self, X, y):
        # y = to_categorical(y)
        super(GradientBoostingClassifier, self).fit(X, y)
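# Minimal usage sketch. It assumes the imported RegressionTree and BinomialDeviance
# behave as the code above expects and that the classifier is given binary labels in
# {0, 1}; the data below is synthetic and purely illustrative.
if __name__ == '__main__':
    rng = np.random.RandomState(0)
    X = rng.randn(200, 5)
    y = (X[:, 0] + 0.5 * X[:, 1] > 0).astype(int)  # synthetic binary target
    clf = GradientBoostingClassifier(n_estimators=50, learning_rate=0.1,
                                     subsample=0.8, random_state=0)
    clf.fit(X, y)
    # Decision function after each boosting stage, e.g. for monitoring convergence
    staged = clf.staged_decision_function(X)
    print(staged.shape)  # (n_estimators, n_samples)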