### ===Task===

Modify the above scratch code such that:
- Notice that we are still using `max_depth = 1`.  Try tuning `min_samples_split` and `max_depth` for the regression and see whether we can achieve a better MSE on our Boston data.
- Notice that we only wrote scratch code for gradient boosting for regression.  Add some code so that it also works for binary classification.  Load the breast cancer data from sklearn and check that it works.
- Further change the code so that it works for multiclass classification.  Load the digits data from sklearn and check that it works.
- Put everything into a class.

In [166]:
import numpy as np
from sklearn.tree import DecisionTreeRegressor
from sklearn.dummy import DummyRegressor

class Gradient_Boosting:
    """Gradient boosting built from regression trees.

    The ensemble starts from a constant "mean" model and then adds ``S``
    regression trees, each fitted to the residual ``y - prediction`` of the
    ensemble so far and scaled by ``learning_rate``.

    With ``regression=False`` the targets are one-hot encoded, every tree
    predicts one score per class, and the summed scores are passed through a
    softmax, so the same code handles binary and multiclass classification.

    Parameters
    ----------
    S : int
        Number of boosting rounds (trees).
    learning_rate : float
        Shrinkage applied to each tree's contribution.
    max_depth : int
        ``max_depth`` passed to every tree.
    min_samples_split : int
        ``min_samples_split`` passed to every tree.
    regression : bool
        ``True`` for regression, ``False`` for classification.
    """

    def __init__(self, S=100, learning_rate=0.1, max_depth=1, min_samples_split=2, regression=True):
        self.learning_rate = learning_rate
        self.regression = regression
        # Constant baseline: predicts the (per-column) mean of the targets.
        self.first_model = DummyRegressor(strategy='mean')
        tree_params = {'max_depth': max_depth, 'min_samples_split': min_samples_split}
        self.models = [DecisionTreeRegressor(**tree_params) for _ in range(S)]

    def grad(self, y, h):
        """Return the residual ``y - h`` that the next tree is fitted to."""
        return y - h

    def encode(self, y):
        """One-hot encode a 1-D label array.

        Columns follow the sorted unique values of ``y`` (the order returned
        by ``np.unique``), so labels do not have to be the integers
        ``0..K-1``.

        Returns
        -------
        numpy.ndarray of shape (len(y), n_classes) with a single 1 per row.
        """
        # return_inverse gives, for every sample, the column of its class.
        classes, class_idx = np.unique(y, return_inverse=True)
        class_idx = class_idx.ravel()
        y_encode = np.zeros((class_idx.shape[0], len(classes)))
        y_encode[np.arange(class_idx.shape[0]), class_idx] = 1
        return y_encode

    @staticmethod
    def _softmax(scores):
        """Row-wise softmax; the row maximum is subtracted to avoid overflow."""
        shifted = scores - np.max(scores, axis=1, keepdims=True)
        exp_scores = np.exp(shifted)
        return exp_scores / np.sum(exp_scores, axis=1, keepdims=True)

    def _link(self, raw):
        """Map raw ensemble scores to predictions (identity or softmax)."""
        return raw if self.regression else self._softmax(raw)

    def fit(self, X, y):
        """Fit the baseline model and then every tree on the running residual.

        For classification a 1-D ``y`` is one-hot encoded and its sorted
        unique labels are stored in ``self.classes_``; a 2-D ``y`` is taken
        as already encoded and ``self.classes_`` becomes the column indices.
        The fitted models are stored in ``self.models_trained`` (baseline
        first).
        """
        y = np.asarray(y)
        if not self.regression:
            if y.ndim == 1:
                self.classes_ = np.unique(y)
                y = self.encode(y)
            else:
                self.classes_ = np.arange(y.shape[1])

        self.first_model.fit(X, y)
        self.models_trained = [self.first_model]

        # Keep a running raw score instead of re-predicting with every
        # already-trained tree on each round.
        raw = self.first_model.predict(X)
        for model in self.models:
            # Residual of the ensemble built so far.
            residual = self.grad(y, self._link(raw))
            model.fit(X, residual)
            self.models_trained.append(model)
            raw = raw + self.learning_rate * model.predict(X)

    def predict(self, X, Argmax=True):
        """Predict with the fitted ensemble.

        Parameters
        ----------
        X : array-like
            Samples to predict.
        Argmax : bool
            Classification only. ``True`` returns the predicted class label
            per sample; ``False`` returns the softmax probabilities.

        Returns
        -------
        Regression: the summed predictions.
        Classification: class labels, or an (n_samples, n_classes)
        probability array when ``Argmax`` is ``False``.
        """
        models = self.models_trained
        raw = models[0].predict(X)  # baseline (mean) prediction
        for model in models[1:]:
            raw = raw + self.learning_rate * model.predict(X)

        if self.regression:
            return raw

        proba = self._softmax(raw)
        if not Argmax:
            return proba

        winners = np.argmax(proba, axis=1)
        # Translate column indices back to the labels seen during fit.
        classes = getattr(self, 'classes_', None)
        return winners if classes is None else classes[winners]

In [170]:
# Regression
# Fits the scratch Gradient_Boosting model (regression mode, the default) on the
# Boston data and prints the mean squared error on a held-out 30% test split.

# NOTE(review): load_boston is reportedly deprecated/removed in newer scikit-learn
# releases -- confirm that the installed version still provides it.
from sklearn.datasets import load_boston
from sklearn.metrics import mean_squared_error
# train_test_split is imported only here; the classification cells below reuse
# it, so this cell has to be run before them.
from sklearn.model_selection import train_test_split

X, y = load_boston(return_X_y=True)

# 70/30 train/test split with a fixed seed for the split.
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.3, random_state=42)

# Fit 200 boosting rounds of depth-3 trees.
reg_model=Gradient_Boosting(S=200, max_depth = 3) 
reg_model.fit(X_train, y_train)

# Predict on the held-out split.
y_pred = reg_model.predict(X_test)

# Report the test-set mean squared error.
print("MSE: ", mean_squared_error(y_test, y_pred))

MSE:  7.505305617226334


In [171]:
# Binary classification
# Fits the scratch Gradient_Boosting model in classification mode
# (regression=False) on the breast cancer data and prints a per-class report
# for a held-out 30% test split.

from sklearn.datasets import load_breast_cancer
from sklearn.metrics import classification_report

X, y = load_breast_cancer(return_X_y=True)

# Relies on train_test_split imported by the regression cell above.
# 70/30 train/test split with a fixed seed for the split.
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.3, random_state=42)

# Fit 200 boosting rounds of depth-3 trees; regression=False makes fit one-hot
# encode the 1-D labels and predict apply a softmax over the class scores.
bi_class_model=Gradient_Boosting(S=200, max_depth = 3, regression=False) 
bi_class_model.fit(X_train, y_train)

# Predict on the held-out split (default Argmax=True takes the argmax over classes).
y_pred = bi_class_model.predict(X_test)

# Report precision / recall / f1 per class.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.94      0.97      0.95        63
           1       0.98      0.96      0.97       108

    accuracy                           0.96       171
   macro avg       0.96      0.97      0.96       171
weighted avg       0.97      0.96      0.97       171



In [177]:
# Multiclass classification
# Fits the scratch Gradient_Boosting model in classification mode
# (regression=False) on the digits data and prints a per-class report for a
# held-out 30% test split.

from sklearn.datasets import load_digits

X, y = load_digits(return_X_y=True)

# Relies on train_test_split (regression cell) and classification_report
# (binary classification cell) imported by earlier cells.
# 70/30 train/test split with a fixed seed for the split.
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.3, random_state=42)

# Fit 200 boosting rounds of depth-3 trees; each tree is fitted to a residual
# with one column per class.
multi_class_model=Gradient_Boosting(S=200, max_depth = 3, regression=False) 
multi_class_model.fit(X_train, y_train)

# Predict on the held-out split (default Argmax=True takes the argmax over classes).
y_pred = multi_class_model.predict(X_test)

# Report precision / recall / f1 per class.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      0.96      0.98        53
           1       0.96      0.90      0.93        50
           2       0.98      0.91      0.95        47
           3       0.92      0.87      0.90        54
           4       0.95      0.95      0.95        60
           5       0.93      0.94      0.93        66
           6       0.98      0.98      0.98        53
           7       0.92      0.98      0.95        55
           8       0.80      0.93      0.86        43
           9       0.90      0.88      0.89        59

    accuracy                           0.93       540
   macro avg       0.93      0.93      0.93       540
weighted avg       0.93      0.93      0.93       540

