In [204]:
import numpy as np

from sklearn.datasets import make_classification
from sklearn.metrics import log_loss

# `train_test_split` lives in `sklearn.model_selection` since scikit-learn 0.18;
# the old `sklearn.cross_validation` module was removed in 0.20. Try the
# current location first and fall back to the legacy one for old installs.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# classifiers
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost.sklearn import XGBClassifier

# reproducibility: one seed shared by the data generator, the split and every
# model below; NumPy's global RNG is seeded with it as well
seed = 104
np.random.seed(seed)

Let's consider a binary classification case. Generate a 20-dimensional dataset with 1000 samples, in which 8 features are informative, 3 are redundant and 2 are repeated.

In [48]:
# Synthetic dataset: 1000 samples x 20 features, of which 8 are informative,
# 3 redundant and 2 repeated. Seeded so the data is identical on every run.
X, y = make_classification(
    n_samples=1000,
    n_features=20,
    n_informative=8,
    n_redundant=3,
    n_repeated=2,
    random_state=seed,
)

Split the dataset into train (80%) and test (20%) parts.

In [49]:
# Hold out 20% of the samples for evaluation; the split is seeded for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=seed,
)

## A. comparison
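The single decision tree uses default parameters. Each boosting model is built from 1000 depth-1 trees (decision stumps), with all other parameters left at their defaults.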

### Decision Tree

In [203]:
# Baseline: one decision tree with default settings (only the seed is fixed).
# fit() hands back the fitted estimator, so construction and training are chained.
decision_tree = DecisionTreeClassifier(random_state=seed).fit(X_train, y_train)

# class-probability estimates for the held-out samples
decision_tree_y_pred = decision_tree.predict_proba(X_test)

# score those probabilities against the true test labels
decision_tree_logloss = log_loss(y_test, decision_tree_y_pred)

print("== Decision Tree ==")
print("Log loss: {0:.2f}".format(decision_tree_logloss))
print("Number of nodes created: {}".format(decision_tree.tree_.node_count))

== Decision Tree ==
Log loss: 7.60
Number of nodes created: 167


### AdaBoost

In [184]:
# AdaBoost over 1000 decision stumps (depth-1 trees).
# The weak learner is passed positionally on purpose: its keyword was renamed
# from `base_estimator` to `estimator` in scikit-learn 1.2 and the old name was
# removed in 1.4, but it is the first positional parameter under both names,
# so this form runs on old and new releases alike.
adaboost = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=1),
    n_estimators=1000,
    random_state=seed)
adaboost.fit(X_train, y_train)

# make predictions (class probabilities for the test set)
adaboost_y_pred = adaboost.predict_proba(X_test)

# calculate log loss against the true test labels
adaboost_logloss = log_loss(y_test, adaboost_y_pred)

print("== AdaBoost ==")
print("Log loss: {0:.2f}".format(adaboost_logloss))

== AdaBoost ==
Log loss: 0.69


### Gradient Boosted Trees

In [206]:
# Gradient boosting over 1000 depth-1 trees; construction and training are
# chained because fit() hands back the fitted estimator.
gbc = GradientBoostingClassifier(
    max_depth=1,
    n_estimators=1000,
    warm_start=True,
    random_state=seed,
).fit(X_train, y_train)

# class-probability estimates for the held-out samples
gbc_y_pred = gbc.predict_proba(X_test)

# score those probabilities against the true test labels
gbc_logloss = log_loss(y_test, gbc_y_pred)

print("== Gradient Boosting ==")
print("Log loss: {0:.2f}".format(gbc_logloss))

== Gradient Boosting ==
Log loss: 0.48


### XGBoost

In [210]:
# XGBoost with the same budget as the other boosters: 1000 depth-1 trees.
xgb = XGBClassifier(n_estimators=1000, max_depth=1, seed=seed)
xgb.fit(X_train, y_train)

# class-probability estimates for the held-out samples
xgb_y_pred = xgb.predict_proba(X_test)

# score those probabilities against the true test labels
xgb_logloss = log_loss(y_test, xgb_y_pred)

print("== XGBoost ==")
print("Log loss: {0:.2f}".format(xgb_logloss))

== XGBoost ==
Log loss: 0.47
