In [87]:
import numpy as np

In [215]:
class AdaBoostClassifer:
    """AdaBoost binary classifier built from single-feature decision stumps.

    Labels must be encoded as -1 / +1: every stump emits exactly those two
    values and the sample-weight update multiplies labels by predictions.

    Parameters
    ----------
    max_estimators : int
        Upper bound on the number of boosting rounds (one stump per round).
    num_steps : int
        Number of equal-width steps used to scan each feature's value range
        when searching for a stump threshold.
    """

    def __init__(self, max_estimators=50, num_steps=10):
        self._max_estimators = max_estimators
        self._num_steps = num_steps

    def _stumClassify(self, X, featureIndex, threshVal, threshSymbol):
        """Classify every row of X with one threshold test on one feature.

        With threshSymbol == 'lt', rows whose feature value is <= threshVal
        get -1; with any other symbol, rows whose value is > threshVal get -1.
        All remaining rows get +1.

        Returns a float array of shape (X.shape[0],).
        """
        retList = np.ones((X.shape[0],))
        column = X[:, featureIndex]

        if threshSymbol == 'lt':
            retList[column <= threshVal] = -1
        else:
            retList[column > threshVal] = -1

        return retList

    def _buildStump(self, X, Y, D):
        """Find the stump with the lowest D-weighted error on (X, Y).

        For every feature, candidate thresholds run from one step below the
        feature minimum up to its maximum, and both inequality directions
        are tried. Ties keep the first candidate found.

        Returns
        -------
        (bestStump, minError, bestPredictions)
            bestStump is a dict with keys 'dim', 'thresh' and 'symbol';
            minError is its weighted error; bestPredictions holds its -1/+1
            outputs on X.
        """
        m, n = X.shape
        numSteps = self._num_steps
        bestStump = {}
        # Same shape as the stump outputs, so callers can rely on (m,).
        bestPredictions = np.zeros((m,))
        minError = float('inf')

        for i in range(n):
            rangeMin = X[:, i].min()
            rangeMax = X[:, i].max()
            stepSize = (rangeMax - rangeMin) / numSteps

            # j == -1 puts the threshold below every sample, which yields the
            # two constant classifiers (all +1 for 'lt', all -1 for 'gt').
            for j in range(-1, numSteps + 1):
                threshVal = rangeMin + j * stepSize

                for symbol in ('lt', 'gt'):
                    predictions = self._stumClassify(X, i, threshVal, symbol)

                    # Builtin int replaces the np.int alias, which NumPy 1.24
                    # removed.
                    errList = (predictions != Y).astype(int)
                    weightError = np.dot(D, errList)

                    if weightError < minError:
                        minError = weightError
                        bestPredictions = predictions
                        bestStump['dim'] = i
                        bestStump['thresh'] = threshVal
                        bestStump['symbol'] = symbol

        return bestStump, minError, bestPredictions

    def fit(self, X, Y):
        """Train up to max_estimators stumps on X (m x n) and labels Y (m,).

        Boosting stops early once the weighted vote of the stumps trained so
        far classifies every training sample correctly. The trained stumps
        are stored in self._weakClassList; nothing is returned.

        Raises
        ------
        ValueError
            If X is not a non-empty 2-D array or Y does not hold exactly one
            label per row of X.
        """
        X = np.asarray(X)
        Y = np.asarray(Y).ravel()
        if X.ndim != 2 or X.size == 0:
            raise ValueError('X must be a non-empty 2-D array')
        if Y.shape[0] != X.shape[0]:
            raise ValueError('Y must contain one label per row of X')

        weakClassList = []
        m = X.shape[0]
        D = np.ones((m,)) / m  # start from uniform sample weights
        predictions = np.zeros((m,))  # running alpha-weighted vote

        for i in range(self._max_estimators):
            bestStump, minError, bestPredictions = self._buildStump(X, Y, D)
            # The floor on minError keeps alpha finite for a perfect stump.
            alpha = float(0.5 * np.log((1.0 - minError) / max(minError, 1e-16)))
            bestStump['alpha'] = alpha
            weakClassList.append(bestStump)

            # Y * prediction is +1 for a hit and -1 for a miss, so misses are
            # scaled up and hits are scaled down before renormalising.
            expon = -1 * Y * bestPredictions * alpha
            D = D * np.exp(expon)
            D = D / D.sum()

            predictions += alpha * bestPredictions

            error = (np.sign(predictions) != Y).sum()
            errorRate = error / m

            if errorRate == 0:
                break

        self._weakClassList = weakClassList

    def predict(self, X):
        """Return the sign of the alpha-weighted stump vote for each row of X.

        Values are -1.0 or +1.0, or 0.0 when the vote is exactly tied.
        Requires fit() to have been called first.
        """
        X = np.asarray(X)
        m = X.shape[0]
        finalPredictions = np.zeros((m,))

        for subClassifer in self._weakClassList:
            subPredictions = self._stumClassify(X,
                                                subClassifer['dim'],
                                                subClassifer['thresh'],
                                                subClassifer['symbol'])

            finalPredictions += subClassifer['alpha'] * subPredictions

        return np.sign(finalPredictions)

    def accuracy(self, Y, predictions):
        """Return the fraction of positions where Y equals predictions."""
        # asarray makes list-vs-list compare element-wise instead of
        # collapsing to a single bool.
        return np.mean(np.asarray(Y) == np.asarray(predictions))

In [216]:
def loadData(path):
    """Load a tab-separated numeric file into a feature matrix and labels.

    Every non-blank line is split on tabs and each field is parsed as a
    float; the last field becomes the label and the preceding fields become
    the feature row. Whitespace-only lines (for example a trailing empty
    line at the end of the file) are skipped.

    Returns
    -------
    (X, Y) : tuple of numpy.ndarray
        X holds one row of features per data line, Y the matching labels.

    Raises
    ------
    ValueError
        If a field on a non-blank line cannot be parsed as a float.
    """
    X = []
    Y = []
    with open(path) as f:
        for line in f:
            # A blank line would otherwise reach float('\n') and abort the load.
            if not line.strip():
                continue
            parts = [float(part) for part in line.split('\t')]
            X.append(parts[:-1])
            Y.append(parts[-1])

    return np.array(X), np.array(Y)

In [217]:
# Read the training and test splits from tab-separated files under ./data;
# loadData takes the last column of every row as the label.
X_train, Y_train = loadData('./data/horseColicTraining2.txt')
X_test, Y_test = loadData('./data/horseColicTest2.txt')

In [218]:
# Display the shapes of the feature matrices and label vectors of both splits.
X_train.shape, Y_train.shape, X_test.shape, Y_test.shape

((299, 21), (299,), (67, 21), (67,))

In [235]:
# Build the boosted classifier, capped at 30 boosting rounds (max_estimators).
adaClassifer = AdaBoostClassifer(30)

In [236]:
# Train on the training split; fit() stores the learned stumps on the
# classifier and returns nothing, so this cell produces no output.
adaClassifer.fit(X_train, Y_train)

In [237]:
# Score the held-out test split. Arguments follow the declared order
# accuracy(Y, predictions); the value is the fraction of matching labels.
prediction_test = adaClassifer.predict(X_test)
adaClassifer.accuracy(Y_test, prediction_test)

0.7910447761194029