In this notebook, im going through two boosting methods, in particular:

Gradient Boosting 


Adaptive Boosting

In [1]:
import pandas as pd
import numpy as np

def get_data(filename):

    data = pd.read_csv(filename, sep = '\t', header = None)

    data.columns = ['Frequency','Angle of Attack','Chord length','Free stream velocity','Suction','Pressure']

    return data

filename = 'airfoil_self_noise.dat'

df = get_data(filename)

for _ in range(10):
    df.sample(frac = 1)

Y = df['Pressure']
X = df[['Frequency','Angle of Attack', 'Chord length','Free stream velocity','Suction']]

split = 0.8
split*=len(X)
split = int(split)

X_train ,X_val = X[:split], X[split:]
y_train, y_val = Y[:split], Y[split:]


N, p= X_train.shape

In [22]:
# what would vanilla linear regression give
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

model = LinearRegression()

model.fit(X_train, y_train)

y_pred = model.predict(X_val)
mean_squared_error(y_pred, y_val)

26.42246255104156

In [23]:
# now let us try some gradient boosting:

In [169]:
class GradientBoosting:
    def __init__(self, loss_function, dloss_function):
        self.loss = loss_function
        self.dloss = dloss_function
        self.models = []
        self.gammas = []
        
    def setUp(self, X_train, y_train):
        self.X = X_train
        self.N, self.p = self.X.shape
        self.y = y_train
        
    def _find_gamma(self, gamma, F_init, h_model):
        return self.loss(self.y, F_init + gamma*h_model.predict(self.X))
                         
    def _find_h_model(self, model, F_init):
        """
        Assumes to have a model.fit API call.
        """
        #F_init = np.mean(self.y)
        L_diff = -self.dloss(self.y,F_init)
        model.fit(X_train, L_diff)
        return model
    
    def _minimise_for_gamma(self, gamma_0, F_init, h_model, opt_verbose = False):
        def get_gamma(gamma):
            return self._find_gamma(gamma, F_init, h_model)
        
        res = minimize(get_gamma, gamma_0, method='nelder-mead',
                       options={'xatol': 1e-8, 'disp': opt_verbose})

        gamma_next = res['x']
        return gamma_next
        
        #F_next = F_init + gamma_next*hm.predict(X_train)
        
    def _get_F_next(self, model, F_init, gamma_0,opt_verbose = False, print_current_loss = False):
        h_model = self._find_h_model(model, F_init)

        gamma_next = self._minimise_for_gamma(gamma_0, F_init, h_model, opt_verbose)

        F_next = F_init + gamma_next*h_model.predict(self.X)
        
        if print_current_loss:
            print(self.loss(F_next, self.y))
        return F_next, gamma_next
    
    def run_iteration(self, F_init, model, gamma_0, print_every = 50):

        gamma_0 = 200
        if print_every and print_every%50 == 0:
            F_next,gamma_next = gb._get_F_next(model, F_init, gamma_0, False, True)
        else:
            F_next,gamma_next = gb._get_F_next(model, F_init, gamma_0, False, False)
            
        self.models.append(model)
        self.gammas.append(gamma_next)
        F_init = F_next
        
        
        return F_init
    
    
    def fit(self, X, y, gamma_0 = 0.5, models = None, max_iteration = 10, print_every = None):
        self.setUp(X, y)
        F_init = np.mean(self.y)
        self.F0 = F_init
        if models is None:
            for M in range(0,max_iteration):
                model = LinearRegression()
                F_init = gb.run_iteration(F_init, model, gamma_0, print_every)
            
            
    def predict(self, X):
        M = len(self.gammas)
        y_pred = np.sum(np.array([self.gammas[i]*self.models[i].predict(X) for i in range(0,M)]),0) + self.F0
        return y_pred

In [175]:
def loss(x,y):
    diff = x-y
    diff = diff**2
    return np.mean(diff)

def dloss(x,y):
    diff = (y - x)/len(x)
    return diff

gb = GradientBoosting(loss, dloss)
gb.setUp(X_train, y_train)

F_init = np.mean(gb.y)
model = LinearRegression()
gamma_0 = 0.0
gb.fit(X_train, y_train)
loss(gb.predict(X_val), y_val)

26.42246255217496

In [177]:
# tree algorithm?

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

model = RandomForestRegressor()

model.fit(X_train, y_train)

mean_squared_error(model.predict(X_val),y_val)

18.02107903751532

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [20]:
from sklearn.metrics import mean_squared_error
from sklearn.tree import DecisionTreeRegressor
from tqdm import tqdm_notebook
from scipy.optimize import minimize

class GradientBoostingTreeRegressor:
    def __init__(self, max_depth, loss_function,  max_leaf_nodes = None):
        self.max_depth = max_depth
        self.max_leaf_nodes = max_leaf_nodes
        self.loss = loss_function
        
    def _get_gamma_general(self, gamma, idx, F_prev):
        return self.loss(self.y[idx],F_prev[idx] + gamma)

    def _get_all_gammas(self,regions, all_leaves_indices, F_prev, gamma_0 = 0):
        gamma_per_region = {}
        for region in tqdm_notebook(regions, leave = True):
            idx, = np.where(all_leaves_indices== region)

            def get_gamma(gamma):
                return self._get_gamma_general(gamma, idx, F_prev)

            res = minimize(get_gamma, gamma_0, method='nelder-mead',  options={'xatol': 1e-8, 'disp': False})

            gamma_per_region[region] = res['x'][0]

        return gamma_per_region
    
    def setUp(self ,X, y):
        self.X = X
        self.y = y
        
    def _train_F0_model(self):
        model_0 = DecisionTreeRegressor(max_depth = self.max_depth, max_leaf_nodes = self.max_leaf_nodes)
        model_0.fit(self.X, self.y)
        self.model_0 = model_0
        return model_0
        
    def fit(self, X, y, max_iterations = 2):
        self.setUp(X,y)
        self.gamma_per_m = {}
        model_0 = self._train_F0_model()
        
        all_leaves = model_0.apply(self.X)
        regions = np.unique(all_leaves)
        J = len(regions)

        F_prev = np.array([np.mean(y_train)]*len(y_train))

        for i in range(0,max_iterations):
            print(self.loss(F_prev, y_train))


            gamma_per_region = self._get_all_gammas(regions,all_leaves, F_prev )
            self.gamma_per_m[i] = gamma_per_region
            gamma_per_region_function = lambda x: gamma_per_region[x]

            F_prev = F_prev + list(map(gamma_per_region_function,model_0.apply(X_train)))
            
    def predict(self, X):
        
        def get_gammas(i):
            return lambda x: self.gamma_per_m[i][x]
        
        res = np.mean(y_train)
        for k in self.gamma_per_m.keys():
            res += np.array(list(map(get_gammas(k),self.model_0.apply(self.X))))
        
        return res
        


In [21]:
gb = GradientBoostingTreeRegressor(max_depth = 7, loss_function = mean_squared_error)
gb.fit(X_train, y_train)

45.060230917761174


HBox(children=(FloatProgress(value=0.0, max=116.0), HTML(value='')))


7.691384372374305


HBox(children=(FloatProgress(value=0.0, max=116.0), HTML(value='')))




In [24]:
diff = gb.predict(X_train) - y_train
np.mean(diff**2)

7.691384372374305

In [174]:
# let's see how this compares with XGBoost.

import xgboost

xgb = xgboost.XGBRegressor(n_estimators=100, learning_rate=0.08, gamma=0, subsample=0.75,
                           colsample_bytree=1, max_depth=7)

xgb.fit(X_train,y_train)

y_pred = xgb.predict(X_val)

loss(y_pred, y_val)

13.492315679317933