# Notebook: GBDTs Explained

## Setup: Import Libraries and Generate Data

In [1]:
import numpy as np
import lightgbm as lgb
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split

# Pin NumPy's global RNG so any later NumPy-based sampling is repeatable
np.random.seed(123)

# Synthetic regression problem: 1000 rows, 10 features, noise=10
X, y = make_regression(
    n_samples=1000,
    n_features=10,
    noise=10,
    random_state=123,
)

# Hold out 20% of the rows; the split itself is seeded as well
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=123,
)

## Train a Simple LightGBM Model

We'll train a model with:
- **1 tree** (to make verification easier)
- **boost_from_average=False** (so initial predictions are 0)
- **learning_rate=0.3** and **lambda_l2=0.5** (to test the full formula)

In [2]:
np.random.seed(123)

# Booster configuration: a single regression tree with L2 regularisation on the leaf values
params = {
    'objective': 'regression',
    'metric': 'mse',
    'num_leaves': 15,
    'learning_rate': 0.3,
    'lambda_l2': 0.5,
    'verbose': -1,
    'seed': 123,
    'boost_from_average': False,  # start boosting from an initial prediction of 0
}

# Fit exactly one boosting round on the training split
train_data = lgb.Dataset(X_train, label=y_train)
model = lgb.train(params, train_data, num_boost_round=1)

# Echo the hyperparameters that the verification cells below rely on
summary_lines = (
    f"Model trained with {params['num_leaves']} leaves",
    f"Learning rate η = {params['learning_rate']}",
    f"Regularization λ = {params['lambda_l2']}",
)
print("\n".join(summary_lines))

Model trained with 15 leaves
Learning rate η = 0.3
Regularization λ = 0.5


## Compute Gradients and Hessians

For the first tree with `boost_from_average=False`, initial predictions are 0.

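For the squared-error loss written as $L_i = \tfrac{1}{2}(\hat{y}_i - y_i)^2$ (the factor $\tfrac{1}{2}$ is what makes the Hessian exactly 1, which matters once $\lambda$ enters the formula):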
- $g_i = \hat{y}_i - y_i = 0 - y_i = -y_i$
- $h_i = 1$ (constant)

In [3]:
# Boosting starts from a zero score for every training row
n_train = len(X_train)
preds_initial = np.zeros(n_train)

# First- and second-order derivatives of the squared-error objective at that starting point
hessians = np.ones(len(y_train))     # h_i = 1 for every sample
gradients = preds_initial - y_train  # g_i = ŷ_i - y_i = -y_i

## Extract Leaf Assignments and Actual Leaf Weights

We'll get:
1. Which leaf each training sample falls into
2. The actual leaf weights stored by LightGBM

In [4]:
# Leaf index that each training sample lands in, restricted to the first boosting round.
# flatten() collapses the result to a 1-D array with one entry per sample.
leaf_indices = model.predict(X_train, num_iteration=1, pred_leaf=True).flatten()

# Pull the fitted leaf values out of the model's tabular tree description
tree_df = model.trees_to_dataframe()

# Leaf rows have no split (split_gain is NaN). Restrict to tree 0 as well: leaf numbers
# restart in every tree, so without this filter a model trained with more than one
# boosting round would silently overwrite entries in leaf_value_map.
is_leaf = tree_df['split_gain'].isna()
in_first_tree = tree_df['tree_index'] == 0
leaf_nodes = tree_df[is_leaf & in_first_tree].copy()

# Parse the numeric leaf id that follows the "L" in node_index.
# expand=False returns a Series instead of a one-column DataFrame.
leaf_nodes['leaf_num'] = (
    leaf_nodes['node_index'].str.extract(r'L(\d+)', expand=False).astype(int)
)
leaf_value_map = dict(zip(leaf_nodes['leaf_num'], leaf_nodes['value']))

# Every leaf that a sample was assigned to must have a stored weight,
# otherwise the per-leaf comparison later on would fail with a KeyError.
unmapped_leaves = set(np.unique(leaf_indices).tolist()) - set(leaf_value_map)
assert not unmapped_leaves, f"Leaves without a stored value: {sorted(unmapped_leaves)}"

print(f"Number of leaves: {len(leaf_value_map)}")
print(f"Unique leaf assignments: {np.unique(leaf_indices)}")
print(f"\nSample of actual leaf weights from LightGBM:")
# Shows the first five leaves in the dataframe's row order, listed by ascending leaf id
for leaf_id in sorted(list(leaf_value_map.keys())[:5]):
    print(f"  Leaf {leaf_id}: w = {leaf_value_map[leaf_id]:.6f}")

Number of leaves: 15
Unique leaf assignments: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14]

Sample of actual leaf weights from LightGBM:
  Leaf 0: w = -89.803428
  Leaf 3: w = -69.701744
  Leaf 5: w = -29.439445
  Leaf 6: w = -16.978203
  Leaf 11: w = -22.564865


## Verification Part 1: Leaf Weights Match the Formula

Now we'll compute leaf weights using the formula and compare them to LightGBM's actual values:

$$w_j^{\text{computed}} = \eta \cdot \left(-\frac{G_j}{H_j + \lambda}\right) = 0.3 \cdot \left(-\frac{\sum_{i \in R_j} g_i}{\sum_{i \in R_j} h_i + 0.5}\right)$$

In [5]:
# Hyperparameters that enter the closed-form leaf weight
eta = params['learning_rate']
lambda_l2 = params['lambda_l2']

# Per-leaf results, appended in ascending leaf-id order
actual_weights = []
computed_weights = []
leaf_sizes = []

print("Formula: w_j = η · (-Σg_i / (Σh_i + λ))")
print(f"Parameters: η={eta}, λ={lambda_l2}\n")

table_header = "{:<10} {:<15} {:<15} {:<15} {:<12}".format(
    "Leaf", "Actual w_j", "Computed w_j", "Error", "N_samples")
print(table_header)
print("-" * 75)

# np.unique already returns the distinct leaf ids in ascending order
for leaf_id in np.unique(leaf_indices):
    # Boolean selector for the training rows that fall into this leaf
    mask = (leaf_indices == leaf_id)
    n_samples = mask.sum()

    # Aggregate first- and second-order statistics over the leaf
    G_j = gradients[mask].sum()
    H_j = hessians[mask].sum()

    # Stored weight vs. the regularised Newton step scaled by the learning rate
    w_j_actual = leaf_value_map[int(leaf_id)]
    w_j_computed = eta * (-G_j / (H_j + lambda_l2))
    error = abs(w_j_actual - w_j_computed)

    actual_weights.append(w_j_actual)
    computed_weights.append(w_j_computed)
    leaf_sizes.append(n_samples)

    print(f"{leaf_id:<10} {w_j_actual:<15.6f} {w_j_computed:<15.6f} {error:<15.2e} {n_samples:<12}")

# Convert to arrays so later cells can do element-wise arithmetic on them
actual_weights = np.array(actual_weights)
computed_weights = np.array(computed_weights)

Formula: w_j = η · (-Σg_i / (Σh_i + λ))
Parameters: η=0.3, λ=0.5

Leaf       Actual w_j      Computed w_j    Error           N_samples   
---------------------------------------------------------------------------
0          -89.803428      -89.803429      6.32e-07        62          
1          -66.485838      -66.485838      1.81e-07        20          
2          -16.484145      -16.484145      3.93e-08        28          
3          -69.701744      -69.701743      2.67e-07        28          
4          91.781561       91.781561       5.58e-08        68          
5          -29.439445      -29.439445      1.42e-07        55          
6          -16.978203      -16.978203      1.80e-07        74          
7          -5.895929       -5.895929       7.67e-08        49          
8          -14.786499      -14.786499      1.92e-07        27          
9          37.008983       37.008983       5.94e-08        64          
10         45.140052       45.140052       4.16e-08        79     

## Summary of Leaf Weight Verification

In [6]:
# Absolute per-leaf discrepancy, computed once and reused by every statistic below
weight_abs_errors = np.abs(actual_weights - computed_weights)
mae = weight_abs_errors.mean()
rmse = np.sqrt(np.mean(weight_abs_errors ** 2))
max_error = weight_abs_errors.max()

banner = "=" * 80
print(banner)
print("SUMMARY: Leaf Weights")
print(banner)
print(f"Number of leaves: {len(actual_weights)}")
print(f"Mean Absolute Error between actual and computed weights: {mae:.2e}")
print(f"Root Mean Squared Error between actual and computed weights: {rmse:.2e}")
print(f"Maximum Error between actual and computed weights: {max_error:.2e}")

# The check passes only when even the worst leaf is within the absolute tolerance
verified = max_error < 1e-6
if not verified:
    print("\n✗ Verification failed - errors exceed machine precision")
else:
    print("\n✓✓✓ VERIFICATION SUCCESSFUL!")
    print("The formula is EXACT (errors are only due to floating point precision)")

SUMMARY: Leaf Weights
Number of leaves: 15
Mean Absolute Error between actual and computed weights: 1.37e-07
Root Mean Squared Error between actual and computed weights: 2.04e-07
Maximum Error between actual and computed weights: 6.32e-07

✓✓✓ VERIFICATION SUCCESSFUL!
The formula is EXACT (errors are only due to floating point precision)


## Verification Part 2: model.predict() Matches Formula-Based Predictions

Now we verify that predictions from `model.predict()` match (up to floating-point precision) what we get by:
1. Finding which leaf each sample falls into
2. Looking up that leaf's weight

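This confirms $\hat{y}_i = w_{j(x_i)}$, where $j(x_i)$ is the leaf assignment for sample $i$. The identity holds here because the model has a single tree and the initial prediction is 0; with more trees, the prediction is the sum of one leaf weight per tree.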

In [7]:
# Reference predictions straight from the booster, using the first boosting round only
model_predictions = model.predict(X_train, num_iteration=1)

# Rebuild the same predictions from the formula-derived leaf weights.
# computed_weights was filled in ascending leaf-id order, which is the order np.unique
# returns; the inverse index therefore gives, for every sample, the position of its
# leaf inside computed_weights. This replaces a per-sample list.index() scan with a
# single vectorised lookup.
unique_leaf_ids, leaf_positions = np.unique(leaf_indices, return_inverse=True)
unique_leaves_sorted = list(unique_leaf_ids)  # same name and list type as before

# Guard the ordering assumption: one computed weight per distinct leaf
assert len(computed_weights) == len(unique_leaves_sorted), (
    "computed_weights must contain exactly one entry per distinct leaf"
)
manual_predictions = np.asarray(computed_weights, dtype=float)[leaf_positions]

# Compare the two prediction vectors element-wise
pred_abs_errors = np.abs(model_predictions - manual_predictions)
pred_mae = np.mean(pred_abs_errors)
pred_rmse = np.sqrt(np.mean((model_predictions - manual_predictions)**2))
pred_max_error = np.max(pred_abs_errors)

print("=" * 80)
print("VERIFICATION: model.predict() vs formula-based predictions")
print("=" * 80)
print(f"Mean Absolute Error between model.predict() and formula-based: {pred_mae:.2e}")
print(f"Root Mean Squared Error between model.predict() and formula-based: {pred_rmse:.2e}")
print(f"Maximum Error between model.predict() and formula-based: {pred_max_error:.2e}")

VERIFICATION: model.predict() vs formula-based predictions
Mean Absolute Error between model.predict() and formula-based: 1.27e-07
Root Mean Squared Error between model.predict() and formula-based: 2.05e-07
Maximum Error between model.predict() and formula-based: 6.32e-07


## Show Sample Predictions

In [8]:
# Column headings for the side-by-side comparison table
table_header = "\n{:<10} {:<20} {:<20} {:<15} {:<10}".format(
    "Sample", "model.predict()", "Formula-based", "Error", "Leaf")
print(table_header)
print("-" * 80)

# Show at most the first ten training rows
n_shown = min(10, len(X_train))
for i in range(n_shown):
    error = abs(model_predictions[i] - manual_predictions[i])
    print(f"{i:<10} {model_predictions[i]:<20.6f} {manual_predictions[i]:<20.6f} {error:<15.2e} {leaf_indices[i]:<10}")

# Only announce success when the worst-case discrepancy is within the absolute tolerance
perfect_match = pred_max_error < 1e-6
if perfect_match:
    success_lines = (
        "\n✓✓✓ PERFECT MATCH!",
        "model.predict() exactly equals the formula-based predictions!",
        "\nThis proves the complete chain:",
        "  1. Leaf weights: w_j = η · (-Σg_i / (Σh_i + λ))",
        "  2. Predictions: ŷ_i = w_j where j = leaf(x_i)",
    )
    for line in success_lines:
        print(line)


Sample     model.predict()      Formula-based        Error           Leaf      
--------------------------------------------------------------------------------
0          -16.978203           -16.978203           1.80e-07        6         
1          -89.803428           -89.803429           6.32e-07        0         
2          30.433308            30.433308            6.31e-09        13        
3          -16.484145           -16.484145           3.93e-08        2         
4          -22.564865           -22.564865           1.22e-08        11        
5          30.433308            30.433308            6.31e-09        13        
6          91.781561            91.781561            5.58e-08        4         
7          37.008983            37.008983            5.94e-08        9         
8          -16.978203           -16.978203           1.80e-07        6         
9          91.781561            91.781561            5.58e-08        4         

✓✓✓ PERFECT MATCH!
model.predict() ex