# Imports

In [1]:
import sys
import pathlib
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np

from sklearn.base import BaseEstimator
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble._gb import BaseGradientBoosting
from sklearn.ensemble._gb_losses import LeastSquaresError
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Make the resolved current working directory importable so that the local
# `pilot` package can be found. `Path()` is ".", and the parent of "." is
# still ".", so the entry registered here is the working directory itself
# (not its parent directory).
if pathlib.Path().resolve().as_posix() not in sys.path:
    sys.path.append(pathlib.Path().resolve().as_posix())

from pilot import Pilot

%load_ext autoreload
%autoreload 2

# Load data

In [2]:
# Download the UCI "Concrete Compressive Strength" workbook into a DataFrame.
concrete = pd.read_excel(
    "https://archive.ics.uci.edu/ml/machine-learning-databases/concrete/compressive/Concrete_Data.xls"
)

# Every column except the last is a feature; the last column is the target.
# Hold out 20% of the rows for testing, with a fixed seed so the split is
# reproducible.
(
    concrete_X_train,
    concrete_X_test,
    concrete_y_train,
    concrete_y_test,
) = train_test_split(
    concrete.iloc[:, :-1],
    concrete.iloc[:, -1],
    test_size=0.2,
    random_state=42,
)

# Custom Gradient Boosting

In [53]:
class GradientBoostedPilot(GradientBoostingRegressor):
    """Gradient boosting regressor whose stage learners are PILOT trees.

    The boosting loop, initial estimator, and bookkeeping are inherited from
    ``GradientBoostingRegressor``; only the per-stage learner (``_fit_stage``)
    and ``predict`` are overridden so that each stage fits a ``Pilot.PILOT``
    model to the current residuals.

    Parameters
    ----------
    max_depth : int, default=12
        Forwarded both to the parent constructor and to every ``Pilot.PILOT``
        stage learner.
    split_criterion : str, default='BIC'
        Forwarded to ``Pilot.PILOT(split_criterion=...)``.
    min_sample_split : int, default=10
        Forwarded to ``Pilot.PILOT(min_sample_split=...)``.
    min_sample_leaf : int, default=5
        Forwarded to ``Pilot.PILOT(min_sample_leaf=...)``.
    step_size : int, default=1
        Forwarded to ``Pilot.PILOT(step_size=...)``.
    categorical_idx : np.ndarray, default=np.array([-1])
        Passed as ``categorical=`` to every ``PILOT.fit`` call.
        NOTE(review): ``[-1]`` presumably means "no categorical features";
        confirm against the PILOT implementation. The default array is
        created once, when the class body is executed, and is shared by all
        instances that rely on the default, so it must not be mutated in
        place.
    loss, learning_rate, n_estimators, subsample, criterion, \
    min_samples_split, min_samples_leaf, min_weight_fraction_leaf, \
    min_impurity_decrease, init, random_state, max_features, alpha, verbose, \
    max_leaf_nodes, warm_start, validation_fraction, n_iter_no_change, tol, \
    ccp_alpha
        Forwarded unchanged to ``GradientBoostingRegressor.__init__``. Of
        these, ``_fit_stage`` and ``predict`` only read ``learning_rate`` and
        ``subsample``; the sklearn tree settings (``criterion``,
        ``min_samples_split``, ``min_samples_leaf``, ...) are not handed to
        the PILOT learners.

    Notes
    -----
    ``_fit_stage`` hands the PILOT object itself to
    ``loss.update_terminal_regions`` and ``predict`` sums raw stage outputs.
    NOTE(review): this presumably only works for ``loss="squared_error"``;
    verify before using another loss.
    """

    def __init__(
        self,
        *,
        max_depth: int = 12,
        split_criterion: str = 'BIC',
        min_sample_split: int = 10,
        min_sample_leaf: int = 5,
        step_size: int = 1,
        categorical_idx: np.ndarray = np.array([-1]),
        loss="squared_error",
        learning_rate=0.1,
        n_estimators=100,
        subsample=1.0,
        criterion="friedman_mse",
        min_samples_split=2,
        min_samples_leaf=1,
        min_weight_fraction_leaf=0.0,
        min_impurity_decrease=0.0,
        init=None,
        random_state=None,
        max_features=None,
        alpha=0.9,
        verbose=0,
        max_leaf_nodes=None,
        warm_start=False,
        validation_fraction=0.1,
        n_iter_no_change=None,
        tol=1e-4,
        ccp_alpha=0.0,
        ):
        # The parent constructor records every sklearn hyper-parameter,
        # including max_depth, so it is not assigned a second time below.
        super().__init__(
            loss=loss,
            learning_rate=learning_rate,
            n_estimators=n_estimators,
            criterion=criterion,
            min_samples_split=min_samples_split,
            min_samples_leaf=min_samples_leaf,
            min_weight_fraction_leaf=min_weight_fraction_leaf,
            max_depth=max_depth,
            init=init,
            subsample=subsample,
            max_features=max_features,
            min_impurity_decrease=min_impurity_decrease,
            random_state=random_state,
            alpha=alpha,
            verbose=verbose,
            max_leaf_nodes=max_leaf_nodes,
            warm_start=warm_start,
            validation_fraction=validation_fraction,
            n_iter_no_change=n_iter_no_change,
            tol=tol,
            ccp_alpha=ccp_alpha
        )
        # PILOT-specific settings, stored verbatim and read in _fit_stage.
        self.split_criterion = split_criterion
        self.min_sample_split = min_sample_split
        self.min_sample_leaf = min_sample_leaf
        self.step_size = step_size
        self.categorical_idx = categorical_idx

    def _fit_stage(
        self,
        i,
        X,
        y,
        raw_predictions,
        sample_weight,
        sample_mask,
        random_state,
        X_csc=None,
        X_csr=None
        ):
        """Fit boosting stage ``i`` with one PILOT tree per loss output.

        Parameters
        ----------
        i : int
            Index of the stage; the fitted tree is stored at
            ``self.estimators_[i, k]``.
        X : array-like
            Training features. Replaced by ``X_csr`` when that is not None.
        y : array-like
            Training targets.
        raw_predictions : ndarray
            Current ensemble predictions; passed to
            ``loss.update_terminal_regions`` for updating and then returned.
        sample_weight : array-like
            Passed to the loss functions only; it is not passed to
            ``PILOT.fit``.
        sample_mask : ndarray of bool
            In-bag indicator. When ``self.subsample < 1.0`` the PILOT tree is
            fitted on the rows where the mask is True.
        random_state, X_csc
            Accepted for signature compatibility with the parent; unused here.
        X_csr : optional
            Used in place of ``X`` when given.

        Returns
        -------
        ndarray
            ``raw_predictions`` after the loss has applied this stage.
        """
        assert sample_mask.dtype == bool
        loss = self._loss
        original_y = y

        # Need to pass a copy of raw_predictions to negative_gradient()
        # because raw_predictions is partially updated at the end of the loop
        # in update_terminal_regions(), and gradients need to be evaluated at
        # iteration i - 1.
        raw_predictions_copy = raw_predictions.copy()

        # Loop-invariant: pick the feature matrix once.
        X = X_csr if X_csr is not None else X

        # With subsample < 1.0 only the in-bag rows may be used to fit the
        # stage learner; otherwise the `subsample` setting would have no
        # effect on the trees. With subsample == 1.0 the full data is used
        # without copying, exactly as before.
        use_in_bag_only = self.subsample < 1.0
        X_fit = X[sample_mask] if use_in_bag_only else X

        for k in range(loss.K):
            if loss.is_multi_class:
                y = np.array(original_y == k, dtype=np.float64)

            residual = loss.negative_gradient(
                y, raw_predictions_copy, k=k, sample_weight=sample_weight
            )

            # induce PILOT tree on residuals
            tree = Pilot.PILOT(
                max_depth=self.max_depth,
                split_criterion=self.split_criterion,
                min_sample_split=self.min_sample_split,
                min_sample_leaf=self.min_sample_leaf,
                step_size=self.step_size,
            )

            residual_fit = residual[sample_mask] if use_in_bag_only else residual
            tree.fit(X_fit, residual_fit, categorical=self.categorical_idx)

            # update the ensemble predictions on ALL rows (in-bag and
            # out-of-bag); the PILOT object itself is handed to the loss.
            loss.update_terminal_regions(
                tree,
                X,
                y,
                residual,
                raw_predictions,
                sample_weight,
                sample_mask,
                learning_rate=self.learning_rate,
                k=k,
            )

            # add tree to ensemble
            self.estimators_[i, k] = tree

        return raw_predictions

    def predict(self, X):
        """Predict regression targets for ``X``.

        The prediction is the initial estimator's raw prediction plus
        ``learning_rate`` times the sum of all stage predictions.

        Parameters
        ----------
        X : array-like
            Feature matrix; converted with ``np.asarray`` before use.

        Returns
        -------
        ndarray of shape (n_samples,)
            Flattened to 1-D so that element-wise arithmetic against a 1-D
            target does not broadcast to a matrix.
        """
        X = np.asarray(X)
        base_prediction = self._raw_predict_init(X)
        tree_predictions = np.array(
            [self.learning_rate * e.predict(X) for e in self.estimators_.flatten()]
        )
        stacked = base_prediction + tree_predictions.sum(axis=0).reshape(-1, 1)
        return stacked.ravel()
            



In [54]:
# Train the PILOT-based booster on the training split and report its
# mean squared error on the held-out split (displayed as the cell output).
pilot_gb = GradientBoostedPilot(
    n_estimators=100,
    categorical_idx=np.array([-1]),
).fit(concrete_X_train, concrete_y_train)
y_pred = pilot_gb.predict(concrete_X_test)
mean_squared_error(y_true=concrete_y_test, y_pred=y_pred)

32.93692538225475

In [55]:
# Baseline for comparison: scikit-learn's stock gradient boosting regressor
# with default settings, scored with the same metric on the same test split.
gbd = GradientBoostingRegressor().fit(concrete_X_train, concrete_y_train)
y_pred = gbd.predict(concrete_X_test)
mean_squared_error(y_true=concrete_y_test, y_pred=y_pred)

30.696433666134