# Gradient Boosting Regressor

![1_dIHrPFBT2fmXuTXMb-3_Xw.webp](attachment:3246910c-3bb2-453e-b268-e0580dbd8a18.webp)

https://towardsdatascience.com/all-you-need-to-know-about-gradient-boosting-algorithm-part-1-regression-2520a34a502

In [19]:
from sklearn.ensemble import GradientBoostingRegressor, GradientBoostingClassifier
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.metrics import mean_absolute_error, mean_squared_error, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn import datasets
import numpy as np

In [3]:
class CustomGradientBoostingRegressor:
    '''Custom Gradient Boosting Regressot'''
    def __init__(self, learning_rate, n_estimators, max_depth = 1):
        
        self.learning_rate = learning_rate
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.trees  = []
        
    def fit(self, X, y):
        '''Fit function'''
        self.F0 = y.mean()
        Fm = self.F0
        
        for i in range(self.n_estimators):
            r = y - Fm
            tree = DecisionTreeRegressor(max_depth=self.max_depth, random_state=0)
            tree.fit(X, r)
            gamma = tree.predict(X)
            Fm += self.learning_rate * gamma
            self.trees.append(tree)
            
            
    def predict(self, X):
        '''Predict function'''
        
        Fm = self.F0
        for i in range(self.n_estimators):
            Fm += self.learning_rate * self.trees[i].predict(X)
        
        return Fm
        

# Test

In [5]:
custom_reg = CustomGradientBoostingRegressor(learning_rate=0.01 ,n_estimators=20, max_depth=10)
sklearn_reg = GradientBoostingRegressor(learning_rate=0.01 ,n_estimators=20, max_depth=10)

In [7]:
diabets = datasets.load_diabetes()

In [10]:
print(diabets['DESCR'])

.. _diabetes_dataset:

Diabetes dataset
----------------

Ten baseline variables, age, sex, body mass index, average blood
pressure, and six blood serum measurements were obtained for each of n =
442 diabetes patients, as well as the response of interest, a
quantitative measure of disease progression one year after baseline.

**Data Set Characteristics:**

  :Number of Instances: 442

  :Number of Attributes: First 10 columns are numeric predictive values

  :Target: Column 11 is a quantitative measure of disease progression one year after baseline

  :Attribute Information:
      - age     age in years
      - sex
      - bmi     body mass index
      - bp      average blood pressure
      - s1      tc, total serum cholesterol
      - s2      ldl, low-density lipoproteins
      - s3      hdl, high-density lipoproteins
      - s4      tch, total cholesterol / HDL
      - s5      ltg, possibly log of serum triglycerides level
      - s6      glu, blood sugar level

Note: Each of these 1

In [13]:
X = diabets["data"]
y = diabets["target"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [14]:
custom_reg.fit(X_train, y_train)
sklearn_reg.fit(X_train, y_train)

In [15]:
custom_pred = custom_reg.predict(X_test)
sklearn_pred = sklearn_reg.predict(X_test)

In [17]:
print(f"custom MAE : {mean_absolute_error(custom_pred, y_test)}\nsklearn MAE : {mean_absolute_error(sklearn_pred, y_test)}")

print(f"custom MSE : {mean_squared_error(custom_pred, y_test)}\nsklearn MSE : {mean_squared_error(sklearn_pred, y_test)}")

custom MAE : 48.87092234778761
sklearn MAE : 57.577353188277186
custom MSE : 3498.1922738931908
sklearn MSE : 4762.168538693377


# Gradient Boosting Classifier

In [248]:
class CustomGradientBoostingClassifier:
    
    def __init__(self, n_estimators, learning_rate = 0.01, max_depth=1):
        self.learning_rate = learning_rate
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.trees = []

        
    def fit(self, X, y):
        '''Fit function'''
        F0 = np.log(y.mean()) / (1-y.mean())
        self.F0 = np.full(len(y), F0)
        Fm = self.F0.copy()
        
        for _ in range(self.n_estimators):
            
            p = np.exp(Fm) / (1+np.exp(Fm))
        
            r = y - p
            
            tree = DecisionTreeRegressor(max_depth=self.max_depth, random_state=0)
            tree.fit(X, r)
            ids = tree.apply(X)
            
            for j in np.unique(ids):
                fltr = ids == j
                
                num = r[fltr].sum()
                den = (p[fltr]*(1-p[fltr])).sum()
                gamma = num/den
                
                Fm[fltr] += self.learning_rate * gamma
                
                tree.tree_.value[j,0,0] = gamma
            self.trees.append(tree)
            
    def predict(self, X):
        '''Predict funtion'''
        
        Fm = self.F0
        
        for i in range(self.n_estimators):
            Fm += self.learning_rate * self.trees[i].predict(X)
        
        return np.exp(Fm) / (1 + np.exp(Fm))
        
        

In [249]:
custom_classifier = CustomGradientBoostingClassifier(n_estimators=20, max_depth=10, learning_rate=0.01)
sklearn_classifier = GradientBoostingClassifier(n_estimators=20, max_depth=10, learning_rate=0.01)

In [250]:
iris = datasets.load_iris()

In [251]:
print(iris['DESCR'])

.. _iris_dataset:

Iris plants dataset
--------------------

**Data Set Characteristics:**

    :Number of Instances: 150 (50 in each of three classes)
    :Number of Attributes: 4 numeric, predictive attributes and the class
    :Attribute Information:
        - sepal length in cm
        - sepal width in cm
        - petal length in cm
        - petal width in cm
        - class:
                - Iris-Setosa
                - Iris-Versicolour
                - Iris-Virginica
                
    :Summary Statistics:

                    Min  Max   Mean    SD   Class Correlation
    sepal length:   4.3  7.9   5.84   0.83    0.7826
    sepal width:    2.0  4.4   3.05   0.43   -0.4194
    petal length:   1.0  6.9   3.76   1.76    0.9490  (high!)
    petal width:    0.1  2.5   1.20   0.76    0.9565  (high!)

    :Missing Attribute Values: None
    :Class Distribution: 33.3% for each of 3 classes.
    :Creator: R.A. Fisher
    :Donor: Michael Marshall (MARSHALL%PLU@io.arc.nasa.gov)
    :

In [252]:
X = iris['data']
y = iris['target']

In [253]:
X_train, X_test, y_train, y_test = train_test_split(X,y , test_size=0.1 , random_state=0, shuffle=True)

In [254]:
sklearn_classifier.fit(X_train, y_train)

In [255]:
custom_classifier.fit(X_train, y_train)

In [256]:
custom_pred_class = custom_classifier.predict(X_test)

ValueError: operands could not be broadcast together with shapes (135,) (15,) (135,) 

In [220]:
sklearn_pred_class = sklearn_classifier.predict(X_test)

In [221]:
print(f"Accuacy  Custom : {accuracy_score(y_test, custom_pred_class)}")

ValueError: Classification metrics can't handle a mix of multiclass and continuous-multioutput targets

In [223]:
custom_pred_class.shape

(15, 135)

In [224]:
y_test.shape

(15,)

In [257]:
custom_classifier.predict(X_test[0])

ValueError: Expected 2D array, got 1D array instead:
array=[5.8 2.8 5.1 2.4].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

In [247]:
X_test[1]

array([6. , 2.2, 4. , 1. ])