# How XGBoost works

## Base Tree Model Split

In [8]:
import numpy as np

# Toy 1-D regression set: one feature column and its targets.
x = np.array([1, 2, 3, 4, 5])
y = np.array([5, 6, 7, 15, 16])

# Reorder feature and target with the same permutation (ascending feature
# value), so that every prefix / suffix of the arrays is one candidate
# left / right partition of the samples.
sorted_idx = x.argsort()
x, y = x[sorted_idx], y[sorted_idx]

In [10]:
# State of the split search: no threshold chosen yet, and a score that any
# real candidate will beat.
best_split = None
best_mse = float("inf")

In [12]:
# Exhaustive search for the best single threshold of a regression stump.
# Expects `x` / `y` sorted by ascending `x`, and `best_mse` / `best_split`
# initialised (inf / None), as the earlier cells do.
n = len(x)

for i in range(1, n):
    # Two samples with the same feature value cannot be separated by any
    # threshold, so the boundary between them is not a real candidate.
    if x[i - 1] == x[i]:
        continue

    # Candidate partition: the first i samples go left, the rest go right.
    left_y = y[:i]
    right_y = y[i:]

    # MSE of each side around its own mean (the value that side would predict).
    left_mse = np.mean((left_y - np.mean(left_y)) ** 2)
    right_mse = np.mean((right_y - np.mean(right_y)) ** 2)

    # Size-weighted average of the two side MSEs. The weights len/n must sum
    # to 1, so the normaliser is the sample count n, not the number of sides.
    total_mse = (len(left_y) * left_mse + len(right_y) * right_mse) / n

    if total_mse < best_mse:
        best_mse = total_mse
        # Place the threshold halfway between the two neighbouring samples.
        best_split = (x[i - 1] + x[i]) / 2
print(f"Best split at x = {best_split}, MSE = {best_mse:.4f}")


Best split at x = 3.5, MSE = 0.5000


## XGBoost Split

In [15]:
# Toy data: one feature column and its targets.
x = np.array([1, 2, 3, 4, 5])
y = np.array([5, 6, 7, 15, 16])

# Per-sample first- and second-order loss derivatives, hard-coded for this
# example. The gradient values equal 5 - y.
# NOTE(review): presumably squared-error loss evaluated at a constant starting
# prediction of 5 (gradient = prediction - y, hessian = 1) — confirm.
g = np.array([0, -1, -2, -10, -11])     # gradient
h = np.ones_like(g) * 1.0               # hessian (float array of ones)

# Sorting: apply the same permutation to every per-sample array so that the
# feature, target, gradient and hessian of each sample stay aligned.
sorted_idx = np.argsort(x)
x = x[sorted_idx]
y = y[sorted_idx]
g = g[sorted_idx]
h = h[sorted_idx]

# Init
best_gain = -np.inf
best_split = None
lambda_ = 1.0  # L2 term, added to every hessian sum in the gain denominators
gamma = 0.0    # constant subtracted from the gain of every candidate split

# Gradient / hessian sums over all samples (the unsplit parent node).
G_total = np.sum(g)
H_total = np.sum(h)

# Running sums for the left child; the split scan grows them sample by sample.
G_left = 0.0
H_left = 0.0

In [16]:
# Scan every candidate threshold on the sorted feature and keep the one with
# the largest gain:
#   gain = 0.5 * (G_L^2/(H_L+lambda) + G_R^2/(H_R+lambda) - G^2/(H+lambda)) - gamma
# Expects x, g, h, G_total, H_total, lambda_, gamma, best_gain and best_split
# from the setup cell.

# Restart the running left-side sums here: they are accumulated in place, so
# re-running this cell on its own would otherwise continue from the previous
# run's totals and produce wrong gains.
G_left = 0.0
H_left = 0.0

# The parent (unsplit) term does not depend on the threshold; compute it once.
parent_score = G_total**2 / (H_total + lambda_)

for i in range(1, len(x)):
    # Move sample i-1 from the right child to the left child.
    G_left += g[i - 1]
    H_left += h[i - 1]

    # Two samples with the same feature value cannot be separated by any
    # threshold; keep accumulating but do not score this boundary.
    if x[i - 1] == x[i]:
        continue

    # Whatever is not in the left child is in the right child.
    G_right = G_total - G_left
    H_right = H_total - H_left

    gain = 0.5 * (G_left**2 / (H_left + lambda_) +
                  G_right**2 / (H_right + lambda_) -
                  parent_score) - gamma

    if gain > best_gain:
        best_gain = gain
        # Place the threshold halfway between the two neighbouring samples.
        best_split = (x[i - 1] + x[i]) / 2

print(f"Best split at x = {best_split}, gain = {best_gain:.4f}")

Best split at x = 3.5, gain = 26.6250
