In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_hastie_10_2

In [18]:
class AdaBoost(object):
    def __init__(self,estimator,n_estimator):
        self.estimator = estimator
        self.n_estimator = n_estimator

    def train(self,X_train,y_train):
        num = len(X_train)
        self.W = []
        w = np.ones(num)/num
        self.W.append(w)
        prediction_train = np.zeros(num)

        self.ALPHA=[1.]
        self.Fn=[]
        for n in range(self.n_estimator):
            self.estimator.fit(X_train,y_train,sample_weight=w)
            self.Fn.append(self.estimator)
            y_pred_train = self.estimator.predict(X_train)

            misclass_index=[]
            classify = []
            for idx,boolean in enumerate(np.equal(y_pred_train,y_train)):
                if not boolean:
                    misclass_index.append(idx)
                    classify.append(-1)
                else:
                    classify.append(1)

            missclass_w = [w[i] for i in misclass_index]
            error = np.sum(missclass_w)

            alpha = 0.5 * np.log((1-error)/float(error))
            self.ALPHA.append(alpha)
            w = np.multiply(w,np.exp([(-x)*alpha for x in classify]))
            w_normal = w/np.sum(w)
            self.W.append(w_normal)
            self.prediction_train=[np.sum(x) for x in zip(prediction_train,[x*alpha for x in y_pred_train])]
            self.prediction_train_final = np.sign(self.prediction_train)

    def test(self,X_test):
        num=len(X_test)
        self.prediction_test = np.zeros(num)
        for n in range(self.n_estimator):
            self.prediction_test += self.ALPHA[n]*(self.Fn[n].predict(X_test))
        self.prediction_test_final = np.sign(self.prediction_test)
    
    def test_score(self,y_test):
        self.test_acc = (np.sum(np.equal(self.prediction_test_final,y_test))/len(y_test))
        print("The test accuracy is:",self.test_acc)

In [19]:
x, y = make_hastie_10_2()
df = pd.DataFrame(x)
df['Y'] = y

    # Split into training and test set
train, test = train_test_split(df, test_size = 0.2)
X_train, Y_train = train.iloc[:,:-1], train.iloc[:,-1]
X_test, Y_test = test.iloc[:,:-1], test.iloc[:,-1]
    
    # Fit a simple decision tree first
clf_tree = DecisionTreeClassifier(max_depth = 1, random_state = 1)

In [20]:
clf = AdaBoost(estimator=clf_tree,n_estimator=100)

In [21]:
clf.train(X_train,Y_train)
clf.test(X_test)
clf.test_score(Y_test)

The test accuracy is: 0.46791666666666665


In [2]:
def get_error_rate(pred,y):
    return sum(pred!=y)/float(len(y))

def print_error_rate(error):
    print('Error rate: Training: %.4f, Test: %.4f' % error)
    
def generic_clf(y_train,X_train,y_test,X_test,clf):
    clf.fit(X_train,y_train)
    pred_train=clf.predict(X_train)
    pred_test=clf.predict(X_test)
    return get_error_rate(pred_train,y_train),get_error_rate(pred_test,y_test)

def adaboost_clf(y_train,X_train,y_test,X_test,M,clf):
    n_train,n_test=len(X_train),len(X_test)
    
    w=np.ones(n_train)/n_train
    pred_train,pred_test=[np.zeros(n_train),np.zeros(n_test)]
    
    for i in range(M):
        clf.fit(X_train,y_train,sample_weight=w)
        pred_train_i=clf.predict(X_train)
        pred_test_i=clf.predict(X_test)
        
        miss=[int(x) for x in (pred_train_i != y_train)]
        
        miss2=[x if x==1 else -1 for x in miss]
        
        err_m = np.dot(w,miss)/sum(w)
        
        alpha_m = 0.5*np.log((1-err_m)/float(err_m))
        
        w=np.multiply(w,np.exp([float(x)*alpha_m for x in miss2]))
        
        pred_train=[sum(x) for x in zip(pred_train,[x*alpha_m for x in pred_train_i])]
        
        pred_test=[sum(x) for x in zip(pred_test,[x*alpha_m for x in pred_test_i])]
        
    pred_train,pred_test = np.sign(pred_train),np.sign(pred_test)
    
    return get_error_rate(pred_train,y_train),get_error_rate(pred_test,y_test)

def plot_error_rate(er_train,er_test):
    df_error=pd.DataFrame([er_train,er_test]).T
    df_error.columns=['Training','Test']
    plot1=df.error.plot(linewidth=3,figsize=(8,6),
                       color=['blue','orange'],grid=True)
    plot1.set_xlabel('Number of iterations',fontsize=12)
    plot1.set_xticklabels(range(0,450,50))
    plot1.set_ylabel('Error rate',fontsize=12)
    plot.set_title('Error rate vs number of iterations',fontsize=16)
    plt.axhline(y=er_test[0],lw=1,color='red',ls='dashed')