In [25]:
import pandas as pd
import numpy as np

def get_data(filename):

    data = pd.read_csv(filename, sep = '\t', header = None)

    data.columns = ['Frequency','Angle of Attack','Chord length','Free stream velocity','Suction','Pressure']

    return data

filename = 'airfoil_self_noise.dat'

df = get_data(filename)

for _ in range(10):
    df.sample(frac = 1)

Y = df['Pressure']
X = df[['Frequency','Angle of Attack', 'Chord length','Free stream velocity','Suction']]

split = 0.8
split*=len(X)
split = int(split)

X_train ,X_val = X[:split], X[split:]
y_train, y_val = Y[:split], Y[split:]


N, p= X_train.shape

In [22]:
# what would vanilla linear regression give
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

model = LinearRegression()

model.fit(X_train, y_train)

y_pred = model.predict(X_val)
mean_squared_error(y_pred, y_val)

26.42246255104156

In [23]:
# now let us try some gradient boosting:

In [169]:
class GradientBoosting:
    def __init__(self, loss_function, dloss_function):
        self.loss = loss_function
        self.dloss = dloss_function
        self.models = []
        self.gammas = []
        
    def setUp(self, X_train, y_train):
        self.X = X_train
        self.N, self.p = self.X.shape
        self.y = y_train
        
    def _find_gamma(self, gamma, F_init, h_model):
        return self.loss(self.y, F_init + gamma*h_model.predict(self.X))
                         
    def _find_h_model(self, model, F_init):
        """
        Assumes to have a model.fit API call.
        """
        #F_init = np.mean(self.y)
        L_diff = -self.dloss(self.y,F_init)
        model.fit(X_train, L_diff)
        return model
    
    def _minimise_for_gamma(self, gamma_0, F_init, h_model, opt_verbose = False):
        def get_gamma(gamma):
            return self._find_gamma(gamma, F_init, h_model)
        
        res = minimize(get_gamma, gamma_0, method='nelder-mead',
                       options={'xatol': 1e-8, 'disp': opt_verbose})

        gamma_next = res['x']
        return gamma_next
        
        #F_next = F_init + gamma_next*hm.predict(X_train)
        
    def _get_F_next(self, model, F_init, gamma_0,opt_verbose = False, print_current_loss = False):
        h_model = self._find_h_model(model, F_init)

        gamma_next = self._minimise_for_gamma(gamma_0, F_init, h_model, opt_verbose)

        F_next = F_init + gamma_next*h_model.predict(self.X)
        
        if print_current_loss:
            print(self.loss(F_next, self.y))
        return F_next, gamma_next
    
    def run_iteration(self, F_init, model, gamma_0, print_every = 50):

        gamma_0 = 200
        if print_every and print_every%50 == 0:
            F_next,gamma_next = gb._get_F_next(model, F_init, gamma_0, False, True)
        else:
            F_next,gamma_next = gb._get_F_next(model, F_init, gamma_0, False, False)
            
        self.models.append(model)
        self.gammas.append(gamma_next)
        F_init = F_next
        
        
        return F_init
    
    
    def fit(self, X, y, gamma_0 = 0.5, models = None, max_iteration = 10, print_every = None):
        self.setUp(X, y)
        F_init = np.mean(self.y)
        self.F0 = F_init
        if models is None:
            for M in range(0,max_iteration):
                model = LinearRegression()
                F_init = gb.run_iteration(F_init, model, gamma_0, print_every)
            
            
    def predict(self, X):
        M = len(self.gammas)
        y_pred = np.sum(np.array([self.gammas[i]*self.models[i].predict(X) for i in range(0,M)]),0) + self.F0
        return y_pred

In [175]:
def loss(x,y):
    diff = x-y
    diff = diff**2
    return np.mean(diff)

def dloss(x,y):
    diff = (y - x)/len(x)
    return diff

gb = GradientBoosting(loss, dloss)
gb.setUp(X_train, y_train)

F_init = np.mean(gb.y)
model = LinearRegression()
gamma_0 = 0.0
gb.fit(X_train, y_train)
loss(gb.predict(X_val), y_val)

26.42246255217496

In [None]:
from sklearn.metrics import m

In [177]:
# tree algorithm?

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

model = RandomForestRegressor()

model.fit(X_train, y_train)

mean_squared_error(model.predict(X_val),y_val)

18.02107903751532

In [302]:
class GradientBoostingTreeRegressor:
    def __init__(self, n_estimators, max_depth, max_leaf_nodes, loss_function):
        self.n_estimators  = n_estimators
        self.max_depth = max_depth
        self.max_leaf_nodes = max_leaf_nodes
        self.loss = loss_function
        self.models = []
        self.errors = []
        self.F0 = None
        
    def setUp(self, X, y):
        self.X = X
        self.y = y
    
    def init_model(self):
        self.F0 = np.mean(self.y)
        return np.array([self.F0]*len(self.y))

        
    def run_iteration(self, y_init, model):
        error = self.y - y_init
        model.fit(self.X, error)
        y_pred = y_init + model.predict(self.X)
        #self.F.append(y_pred)
        self.models.append(model)
        self.errors.append(error)
        return self.loss(y_pred, y_train), y_pred
    
    def fit(self, X, y, print_every = 10):
        self.setUp(X, y)
        y_pred = self.init_model()
        for _ in range(self.n_estimators):
            model = DecisionTreeRegressor(max_depth = self.max_depth, max_leaf_nodes = self.max_leaf_nodes)
            loss, y_pred = gbtr.run_iteration(y_pred, model)
            #self.F.append(y_pred)
            if _%print_every == 0:
                print(loss)
    
    def predict(self, X):
        
        return np.sum(np.array([self.models[i].predict(X) for i in range(0,self.n_estimators)]),0) + self.F0
        
    
        
        

In [303]:
gbtr = GradientBoostingTreeRegressor(n_estimators = 60, max_depth = 7, max_leaf_nodes = None, 
                                     loss_function = mean_squared_error)

gbtr.fit(X_train, y_train)

7.691384372374303
0.35596546918175387
0.0700148503629082
0.014550623232145612
0.0034839129483134118
0.0007301605760442672


In [306]:
mean_squared_error(gbtr.predict(X_train), y_train)

0.00012652557976002718

In [307]:
mean_squared_error(gbtr.predict(X_val), y_val)

30.571446399343692

In [None]:
# This is wrong - need to examine how the to take a derivative of the decision tree.

In [276]:
gbtr.F0

124.60557820299503

In [246]:
models = gbtr.models

In [248]:
gbtr.F0 + models[0].predict(X_train) + models[1].predict(X_train) + models[2].predict(X_train)

array([126.201     , 125.716415  , 125.716415  , ..., 127.78471469,
       132.66143563, 138.11592713])

In [249]:
y_train

0       126.201
1       125.201
2       125.951
3       127.591
4       127.461
         ...   
1197    118.416
1198    120.766
1199    127.676
1200    136.886
1201    139.226
Name: Pressure, Length: 1202, dtype: float64

In [215]:
gbtr = GradientBoostingTreeRegressor(n_estimators = 100, max_depth = 7, max_leaf_nodes = None, 
                                     loss_function = mean_squared_error)

gbtr.setUp(X_train, y_train)
model, y_pred = gbtr.init_model()

for _ in range(0,30):
    model = DecisionTreeRegressor( max_depth = 7)
    loss, y_pred = gbtr.run_iteration(y_pred, model)
    print(loss)

4.012896016807978
2.410318900260539
1.6672565083221378
1.3159623746623579
0.8827646630431142
0.7419458340871492
0.5612162399182049
0.5237981730856289
0.4318269937802575
0.35596546918175453
0.3250971694886142
0.28406979104250757
0.24295514739453994
0.1785937401588718
0.14788741568926456
0.12250339934003468
0.10697934346109872
0.09299953627204012
0.08048857714589873
0.0700148503629082
0.05728736295562408
0.04404291441367991
0.03885994245139565
0.031405173572276485
0.029385697833963425
0.02704171537041256
0.0194956577589255
0.01802475673922405
0.016293872094202633
0.014550623232145662


In [174]:
# let's see how this compares with XGBoost.

import xgboost

xgb = xgboost.XGBRegressor(n_estimators=100, learning_rate=0.08, gamma=0, subsample=0.75,
                           colsample_bytree=1, max_depth=7)

xgb.fit(X_train,y_train)

y_pred = xgb.predict(X_val)

loss(y_pred, y_val)

13.492315679317933