In [1]:
from scipy.special import expit
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import DecisionTreeClassifier
from sklearn.dummy import DummyRegressor
from sklearn.model_selection import train_test_split
import numpy as np

In [2]:
class GradientBoosting:
    """Additive ensemble of regression trees fitted stage by stage on residuals.

    The model starts from a constant baseline (the per-output mean of ``y``)
    and then adds ``S`` regression trees. Each tree is fitted to the residual
    ``y - current_prediction`` and its output is scaled by the learning rate
    ``alpha`` before being added to the ensemble.

    Parameters
    ----------
    S : int
        Number of boosting stages (trees).
    alpha : float
        Learning rate (shrinkage) applied to every tree's contribution.
    max_depth : int
        ``max_depth`` forwarded to each ``DecisionTreeRegressor``.
    min_samples_split : int
        ``min_samples_split`` forwarded to each ``DecisionTreeRegressor``.
    regression : bool, default True
        If True, predictions are the raw ensemble output. If False, the raw
        output is treated as per-class scores of shape
        ``(n_samples, n_classes)`` (so ``y`` must be one-hot encoded) and is
        passed through a softmax.
    """

    def __init__(self,S,alpha,max_depth,min_samples_split,regression=True):
        self.S = S
        self.alpha = alpha
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.regression = regression

        tree_params = {'max_depth': self.max_depth,
                      'min_samples_split': self.min_samples_split}
        # Unfitted weak learners; ``fit`` trains them in order.
        self.models = [DecisionTreeRegressor(**tree_params) for _ in range(S)]

    def grad(self,y, h):
        """Return the residual ``y - h`` that the next tree is fitted to."""
        return y - h

    def _link(self, raw):
        """Map raw ensemble scores to the model's output space.

        Regression returns ``raw`` unchanged. Classification applies a
        row-wise softmax; the row maximum is subtracted first so that
        ``np.exp`` cannot overflow to ``inf`` (which would produce NaNs).
        The shift does not change the softmax value.
        """
        if self.regression:
            return raw
        shifted = raw - np.max(raw, axis=1, keepdims=True)
        exp_scores = np.exp(shifted)
        return exp_scores / np.sum(exp_scores, axis=1, keepdims=True)

    def fit(self,X, y):
        """Train the baseline model and the ``S`` boosting stages.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training features.
        y : array-like
            Targets: shape ``(n_samples,)`` for regression, one-hot
            ``(n_samples, n_classes)`` when ``regression=False``.

        After fitting, ``self.models_trained`` holds the baseline model
        followed by the fitted trees.
        """
        self.models_trained = []

        # Constant baseline: predicts the mean of y for every sample.
        first_model = DummyRegressor(strategy='mean')
        first_model.fit(X, y)
        self.models_trained.append(first_model)

        # Running raw score of the ensemble on the training set. Updating it
        # incrementally avoids re-predicting with every earlier tree at each
        # stage (which made training quadratic in the number of stages).
        raw = first_model.predict(X)

        for model in self.models:
            # Residual of everything trained so far.
            y_pred = self._link(raw)
            residual = self.grad(y, y_pred)

            # The next tree learns to correct that residual.
            model.fit(X, residual)
            self.models_trained.append(model)

            raw = raw + self.alpha * model.predict(X)

    def predict(self,X,argmax=True):
        """Predict with the fitted ensemble.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Samples to predict.
        argmax : bool, default True
            Only used when ``regression=False``: if True return the index of
            the most probable class per sample, otherwise return the softmax
            probabilities.

        Returns
        -------
        numpy.ndarray
            Raw predictions (regression), class probabilities
            (classification with ``argmax=False``) or class indices
            (classification with ``argmax=True``).
        """
        models = self.models_trained

        # Baseline first, then each tree's contribution scaled by the
        # learning rate given to the constructor.
        raw = models[0].predict(X)
        for model in models[1:]:
            raw = raw + self.alpha * model.predict(X)

        yhat = self._link(raw)
        if not self.regression and argmax:
            yhat = np.argmax(yhat, axis=1)
        return yhat


In [3]:
from sklearn.datasets import load_boston
from sklearn.metrics import mean_squared_error
# Boston-housing regression benchmark: 200 boosting stages of depth-2 trees.
# NOTE(review): load_boston was removed in scikit-learn 1.2, so this cell
# needs an older scikit-learn (or a different dataset loader) to run.
X, y = load_boston(return_X_y=True)

# Hold out 30% of the rows for evaluation; the fixed seed pins the split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

model = GradientBoosting(
    S=200,
    alpha=0.1,
    max_depth=2,
    min_samples_split=2,
)
model.fit(X_train, y_train)
yhat = model.predict(X_test)

# Report the held-out mean squared error.
test_mse = mean_squared_error(y_test, yhat)
print("MSE for max_depth2: ", test_mse)


MSE for max_depth2:  10.843779891542544


In [4]:
# Boston-housing regression benchmark: 200 boosting stages of depth-3 trees.
# NOTE(review): load_boston was removed in scikit-learn 1.2, so this cell
# needs an older scikit-learn (or a different dataset loader) to run.
X, y = load_boston(return_X_y=True)

# Hold out 30% of the rows for evaluation; the fixed seed pins the split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

model = GradientBoosting(
    S=200,
    alpha=0.1,
    max_depth=3,
    min_samples_split=2,
)
model.fit(X_train, y_train)
yhat = model.predict(X_test)

# Report the held-out mean squared error.
test_mse = mean_squared_error(y_test, yhat)
print("MSE for max_depth 3: ", test_mse)

MSE for max_depth 3:  7.776238096139652


In [71]:
# Boston-housing regression benchmark: 200 boosting stages of depth-4 trees.
# NOTE(review): load_boston was removed in scikit-learn 1.2, so this cell
# needs an older scikit-learn (or a different dataset loader) to run.
X, y = load_boston(return_X_y=True)

# Hold out 30% of the rows for evaluation; the fixed seed pins the split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

model = GradientBoosting(
    S=200,
    alpha=0.1,
    max_depth=4,
    min_samples_split=2,
)
model.fit(X_train, y_train)
yhat = model.predict(X_test)

# Report the held-out mean squared error.
test_mse = mean_squared_error(y_test, yhat)
print("MSE for max_depth 4: ", test_mse)

MSE for max_depth 4:  8.311887519118308


In [63]:
from sklearn.datasets import load_breast_cancer
from sklearn.metrics import accuracy_score
from sklearn.ensemble import GradientBoostingClassifier

# Breast-cancer classification benchmark with the boosting model in
# classification mode (regression=False).
X, y = load_breast_cancer(return_X_y=True)

# Hold out 30% of the rows for evaluation; the fixed seed pins the split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# One-hot encode the training labels: column c is 1.0 exactly where the
# label equals c, 0.0 elsewhere. The number of columns is the number of
# distinct labels in the full y.
n_classes = len(set(y))
y_train_encoded = (y_train[:, None] == np.arange(n_classes)).astype(float)

model = GradientBoosting(
    S=200,
    alpha=0.1,
    max_depth=3,
    min_samples_split=2,
    regression=False,
)
model.fit(X_train, y_train_encoded)
yhat = model.predict(X_test)

# Report held-out accuracy against the original (integer) labels.
test_accuracy = accuracy_score(y_test, yhat)
print("Our accuracy: ", test_accuracy)

Our accuracy:  0.9649122807017544


In [66]:
from sklearn.datasets import load_digits
from sklearn.metrics import accuracy_score
from sklearn.ensemble import GradientBoostingClassifier

# Handwritten-digits classification benchmark with the boosting model in
# classification mode (regression=False).
X, y = load_digits(return_X_y=True)

# Hold out 30% of the rows for evaluation; the fixed seed pins the split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# One-hot encode the training labels: column c is 1.0 exactly where the
# label equals c, 0.0 elsewhere. The number of columns is the number of
# distinct labels in the full y.
n_classes = len(set(y))
y_train_encoded = (y_train[:, None] == np.arange(n_classes)).astype(float)

model = GradientBoosting(
    S=200,
    alpha=0.1,
    max_depth=3,
    min_samples_split=2,
    regression=False,
)
model.fit(X_train, y_train_encoded)
yhat = model.predict(X_test)

# Report held-out accuracy against the original (integer) labels.
test_accuracy = accuracy_score(y_test, yhat)
print("Our accuracy: ", test_accuracy)

Our accuracy:  0.9314814814814815
