# Hypothesis boosting.

## AdaBoost or Gradient Boost.
These ensemble methods train each classifier using the one before it.

In [1]:
from sklearn.metrics import accuracy_score
def compare_accuracy(classifiers):
    for classifier in classifiers:
        #classifier.fit(X_train,y_train)
        y_pred = classifier.predict(X_test)
        print(classifier.__class__.__name__,accuracy_score(y_test,y_pred))

from sklearn.datasets import make_moons
X,y = make_moons(n_samples=2000, noise=0.20)
# This is a wrapper for ShuffleSplit
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = \
    train_test_split(X, y, test_size=0.30)

## AdaBoost

Training is serial, not parallel.

Loop: train model, evaluate model, assign model weight.
Add model weight to each misclassified instance.
(This upweights instances that were misclassified by good models.)
Train new model on weighted instances.
Finally, use the ensemble of all models.

Prediction uses majority weighted voting (best models count more).

SciKit-Learn has a multi-class AdaBoost.
Algorithm is SAMME for classification, SAMME.R for probabilities.

In [2]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
# Increasing max_depth improves the descision tree but degrades the ada boost.
dtc = DecisionTreeClassifier(max_depth=1)
dtc.fit(X_train,y_train)  # only for comparison; adaboost retrains it
abc = AdaBoostClassifier(
    dtc,
    n_estimators=200,
    algorithm='SAMME.R',
    learning_rate=0.5
)
abc.fit(X_train,y_train)

AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=1),
                   learning_rate=0.5, n_estimators=200)

In [16]:
compare_accuracy([dtc,abc])

DecisionTreeClassifier 0.805
AdaBoostClassifier 0.9566666666666667


## Gradient boost
Gradient boosted regression trees (GBRT).
Each model trains on the residuals of the predecessor.
Whereas ERROR is deviation from unknown true population mean,
RESIDUAL is deviation from observed sample average.

### First, we implement gradient boost manually.
Use regression.

In [25]:
from sklearn.tree import DecisionTreeRegressor
#
dtr1 = DecisionTreeRegressor(max_depth=2)
dtr1.fit(X_train,y_train)
residual1 = y_train - dtr1.predict(X_train)
#
dtr2 = DecisionTreeRegressor(max_depth=2)
dtr2.fit(X_train,residual1)
residual2 = residual1 - dtr2.predict(X_train)
#
dtr3 = DecisionTreeRegressor(max_depth=2)
dtr3.fit(X_train,residual1)
residual3 = residual2 - dtr3.predict(X_train)
# 
# THE FINAL PREDICTION IS THE SUM OF THE CLASSIFIER PREDICTIONS!
# This is an additive model.
# No individual classier except the first would work on its own.
#
def ensemble_predict (X,classifiers):
    return sum(TREE.predict(X) for TREE in classifiers)
trees=[dtr1,dtr2,dtr3]
y_pred=ensemble_predict(X_test,trees)
y_pred[:5]

array([0.09996956, 0.99275727, 0.09996956, 0.09996956, 1.11986277])

### Second, we implement gradient boost the right way.

Shrinkage.
Lower the learning rate but increase # trees.

We'll start with few trees, high rate.
Note that too many trees will overfit.
Use early stopping to optimize tree number.
Use warm_start=True to stop and restart training.
Use the staged_predict() method to measure error per successive model.

In [30]:
from sklearn.ensemble import GradientBoostingRegressor
gbr = GradientBoostingRegressor(
    max_depth=2,
    n_estimators=3,
    learning_rate=1.0   
)
gbr.fit(X_train,y_train)

GradientBoostingRegressor(learning_rate=1.0, max_depth=2, n_estimators=3)

In [32]:
y_pred=gbr.predict(X_test)
y_pred[:5]

array([0.08095442, 0.94173762, 0.08095442, 0.08095442, 1.00529036])

### Third, an even better way.
This is mentioned in the book but it is not part of sklearn.
XGBoost, part of DMLC. 

In [34]:
# Book stops here.
# We will try again as classification (not regression).

from sklearn.ensemble import GradientBoostingClassifier
gbc = GradientBoostingClassifier(
    max_depth=2,
    n_estimators=3,
    learning_rate=1.0   
)
gbc.fit(X_train,y_train)

GradientBoostingClassifier(learning_rate=1.0, max_depth=2, n_estimators=3)

In [36]:
y_pred=gbc.predict(X_test)
y_pred[:5]

array([0, 1, 0, 0, 1])