### ===Task===

Modify the above scratch code such that:
- Notice that we are still using max_depth = 1. Try tuning min_samples_split and max_depth for the regression and see whether we can achieve a better MSE on our Boston data.
- Notice that we have only written scratch code for gradient boosting regression. Add code so that it also works for binary classification, then load the breast cancer data from sklearn and check that it works.
- Further change the code so that it works for multiclass classification.  Load the digits data from sklearn and see that it works
- Put everything into a class.

In [21]:
from scipy.special import expit
from sklearn.tree import DecisionTreeRegressor
from sklearn.dummy import DummyRegressor

def grad(y, h):
    """Return the residual between the targets ``y`` and the predictions ``h``.

    The result is ``y - h``; ``fit`` uses it as the target for the next
    weak learner.
    """
    residual = y - h
    return residual

def fit(X, y, models):
    """Train a boosted ensemble and return its fitted learners in order.

    A depth-1 ``DecisionTreeRegressor`` is fitted to ``y`` as the starting
    model. Each entry of ``models`` is then fitted, in order, to the residual
    left by everything trained before it.

    Parameters
    ----------
    X : training features, passed unchanged to every learner's ``fit``.
    y : training targets.
    models : iterable of unfitted learners exposing ``fit`` and ``predict``;
        they are fitted in place.

    Returns
    -------
    list
        The starting model at index 0 followed by every learner from
        ``models`` in training order.
    """
    # Starting model: a single decision stump fitted directly on the targets.
    # (A DummyRegressor is another common choice for this baseline.)
    stump_kwargs = {'max_depth': 1}
    base_learner = DecisionTreeRegressor(**stump_kwargs)
    base_learner.fit(X, y)
    ensemble = [base_learner]

    for weak_learner in models:
        # Prediction of the ensemble built so far, before adding this learner.
        current_prediction = predict(X, ensemble)

        # The next learner is trained on what the ensemble still gets wrong.
        weak_learner.fit(X, grad(y, current_prediction))
        ensemble.append(weak_learner)

    return ensemble
        
def predict(X, models, learning_rate=0.1):
    """Predict with a boosted ensemble.

    Parameters
    ----------
    X : features, passed unchanged to every model's ``predict``.
    models : sequence of fitted models; ``models[0]`` is the starting model
        and the remaining entries are the boosted weak learners.
    learning_rate : float, default 0.1
        Shrinkage applied to each weak learner's contribution. The default
        matches the value that used to be hard-coded here, so existing
        two-argument callers get the same result.

    Returns
    -------
    The starting model's prediction plus ``learning_rate`` times the sum of
    the weak learners' predictions.
    """
    f0 = models[0].predict(X)  # starting model is used at full weight
    boosting = sum(learning_rate * model.predict(X) for model in models[1:])
    return f0 + boosting

In [26]:
# Regression
# Demo cell: boost depth-1 regression trees on the Boston housing data and
# report the mean squared error on a held-out test split.

# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in
# 1.2, so this import fails on current releases - confirm the installed
# version or switch to another regression dataset.
from sklearn.datasets import load_boston
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

X, y = load_boston(return_X_y=True)

# Hold out 30% of the rows for testing; random_state fixes the split.
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.3, random_state=42)

# Build 200 unfitted depth-1 trees to serve as the weak learners.
n_estimators = 200
tree_params = {'max_depth': 1}
models = [DecisionTreeRegressor(**tree_params) for _ in range(n_estimators)]


# Fit the models. `fit` returns the trained list, which has one more entry
# than was passed in: the starting model sits at index 0.
models = fit(X_train, y_train, models)

# Predict on the held-out rows with the trained ensemble.
y_pred = predict(X_test, models)

# Print the test-set mean squared error.
print("Our MSE: ", mean_squared_error(y_test, y_pred))

Our MSE:  13.990261066816064
(354, 13)


In [None]:
import numpy as np
from sklearn.tree import DecisionTreeRegressor
from sklearn.dummy import DummyRegressor


class GradientBoosting:
    """Gradient boosting built from scikit-learn regression trees.

    The ensemble starts from a ``DummyRegressor`` that predicts the mean of
    the targets and then adds ``n_estimators`` regression trees, each fitted
    to the residual ``y - prediction`` of the ensemble built so far.

    Parameters
    ----------
    learning_rate : float, default 0.01
        Shrinkage applied to every tree's contribution.
    min_sample_split : int, default 2
        Forwarded to the trees as ``min_samples_split``.
    max_depth : int, default 1
        Maximum depth of every tree.
    n_estimators : int, default 5
        Number of boosted trees.
    reg : bool, default True
        ``True`` for regression: ``predict`` returns the raw additive score.
        ``False`` for classification: ``predict`` squashes the score into
        probabilities (sigmoid for 1-D scores, row-wise softmax for 2-D
        scores). Because the residual is computed as ``y - predict(X)``,
        classification targets must be numeric and shaped like the output of
        ``predict`` (0/1 labels for binary, one-hot rows for multiclass).
    """

    def __init__(self, learning_rate=0.01, min_sample_split=2, max_depth=1, n_estimators=5, reg=True):
        self.learning_rate = learning_rate
        self.reg = reg
        self.min_sample_split = min_sample_split
        self.max_depth = max_depth
        tree_params = {
            'max_depth': self.max_depth,
            'min_samples_split': self.min_sample_split,
        }
        self.models = [DecisionTreeRegressor(**tree_params) for _ in range(n_estimators)]
        self.first_model = DummyRegressor(strategy='mean')

    def grad(self, y, h):
        """Return the residual ``y - h`` used as the next tree's target."""
        return y - h

    def fit(self, X, y, models=None):
        """Fit the starting model and then each weak learner on the residual.

        Parameters
        ----------
        X : training features.
        y : training targets (see the class docstring for classification).
        models : optional iterable of unfitted learners. When omitted, the
            trees created in ``__init__`` (``self.models``) are used.

        After the call, ``self.models_trained`` holds the starting model at
        index 0 followed by the fitted weak learners in training order.
        """
        if models is None:
            models = self.models

        self.first_model.fit(X, y)
        self.models_trained = [self.first_model]

        for model in models:
            # Use this instance's own predict/grad so that learning_rate and
            # the regression/classification switch are honoured in training.
            y_pred = self.predict(X)
            residual = self.grad(y, y_pred)
            model.fit(X, residual)
            self.models_trained.append(model)

    def predict(self, X):
        """Predict with the learners trained so far.

        Returns the starting model's prediction plus ``learning_rate`` times
        the sum of the weak learners' predictions. When ``reg`` is false that
        score is converted to probabilities: a sigmoid for 1-D scores, a
        row-wise softmax for 2-D scores.
        """
        f0 = self.models_trained[0].predict(X)  # starting model, full weight
        boosting = sum(self.learning_rate * model.predict(X) for model in self.models_trained[1:])
        yhat = f0 + boosting
        if not self.reg:
            yhat = np.asarray(yhat, dtype=float)
            if yhat.ndim == 1:
                # Binary case: one score per sample.
                yhat = 1.0 / (1.0 + np.exp(-yhat))
            else:
                # Subtracting the row maximum leaves the softmax unchanged
                # but keeps exp() from overflowing; keepdims makes the
                # normalisation divide each row by its own sum.
                shifted = yhat - yhat.max(axis=1, keepdims=True)
                exp_scores = np.exp(shifted)
                yhat = exp_scores / exp_scores.sum(axis=1, keepdims=True)
        return yhat

In [14]:
# Show the class of the second entry in `models`.
print(type(models[1]))

<class 'sklearn.tree._classes.DecisionTreeRegressor'>
