## Example of applying the Gradient Boosting Machine

In [2]:
import pandas as pd

In [8]:
# Worked example of gradient boosting for regression.
# Assume we have six instances with a numerical feature x and regression values y

df = pd.DataFrame({"x":range(6),"y":[1,1,2,2,3,3]},index=["i_"+str(i) for i in range(1,7)])

# The first model M_0 just predicts the mean of y for every instance

df["M_0"] = df["y"].mean()

# The new target values, res_0, are the residuals: the original values minus the prediction of M_0.
# They are what the subsequent models still have to account for in order to reach a zero error.

df["res_0"] = df["y"] - df["M_0"]

# Assume that we generate a decision stump based on the split x > 3

df["x > 3"] = df["x"] > 3

# The prediction of M_1 is the mean of the target values (res_0) in each leaf.
# transform("mean") computes the mean per leaf and broadcasts it back to every
# row of that leaf (aligned on the index), so no per-row Python lookup is needed.

df["M_1"] = df.groupby("x > 3")["res_0"].transform("mean")

# The new target values, res_1, are what remains of the original regression values
# after subtracting the values output by M_0 and M_1

df["res_1"] = df["res_0"] - df["M_1"]

# Assume that we generate a new decision stump based on the split x < 2, using res_1 as the target

df["x < 2"] = df["x"] < 2

# Again, the prediction of M_2 is the mean of the target values (res_1) in each leaf

df["M_2"] = df.groupby("x < 2")["res_1"].transform("mean")

# The new target values, res_2, represent what still needs to be subtracted for each instance
# after subtracting the output of M_0, M_1 and M_2 in sequence

df["res_2"] = df["res_1"] - df["M_2"]

# The final model M_3 (the third stump, fitted on res_2) may reuse a previous split (x > 3),
# but now with leaves that predict different values than above

df["M_3"] = df.groupby("x > 3")["res_2"].transform("mean")

display(df)

Unnamed: 0,x,y,M_0,res_0,x > 3,M_1,res_1,x < 2,M_2,res_2,M_3
i_1,0,1,2.0,-1.0,False,-0.5,-0.5,True,-0.5,0.0,0.125
i_2,1,1,2.0,-1.0,False,-0.5,-0.5,True,-0.5,0.0,0.125
i_3,2,2,2.0,0.0,False,-0.5,0.5,False,0.25,0.25,0.125
i_4,3,2,2.0,0.0,False,-0.5,0.5,False,0.25,0.25,0.125
i_5,4,3,2.0,1.0,True,1.0,0.0,False,0.25,-0.25,-0.25
i_6,5,3,2.0,1.0,True,1.0,0.0,False,0.25,-0.25,-0.25


In [9]:
# When using this sequence of models (M_0, M_1, M_2, M_3) on some instance,
# the final prediction is the sum of the output of the models.
# For example, given an instance with x = 6, M_0 would output 2 (independently of the value of x),
# M_1 would output 1.0 (since x > 3), M_2 would output 0.25 (since x < 2 is false),
# and M_3 would output -0.25 (again since x>3).
# Hence the prediction will be 2.0 + 1.0 + 0.25 - 0.25 = 3
# Note that this is the same as will be predicted for i_5 and i_6,
# and hence their residuals after applying the full sequence are 0