In [1]:
# Load xgboost and stop right away if it is unavailable. A bare require()
# only returns FALSE on failure, so the cell would carry on and fail later
# with a much less obvious "could not find function" error.
if (!require(xgboost)) {
  stop("package 'xgboost' is required but could not be loaded", call. = FALSE)
}

# load in the agaricus dataset
data(agaricus.train, package = "xgboost")
data(agaricus.test, package = "xgboost")

# wrap the feature matrices and their labels into xgb.DMatrix objects
dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label)
dtest <- xgb.DMatrix(agaricus.test$data, label = agaricus.test$label)

# booster parameters for a binary logistic objective
param <- list(
  max_depth = 2,
  eta = 1,
  silent = 1,
  objective = "binary:logistic"
)

# data sets whose error is reported after every boosting round
watchlist <- list(eval = dtest, train = dtrain)

# number of boosting rounds
nround <- 2

Loading required package: xgboost
"package 'xgboost' was built under R version 3.4.3"

In [3]:
# Train the model for two rounds (nround).
# The first four arguments are matched by position; the named extra
# argument nthread comes last so it does not sit between them.
bst <- xgb.train(param, dtrain, nround, watchlist, nthread = 2)


[1]	eval-error:0.042831	train-error:0.046522 
[2]	eval-error:0.021726	train-error:0.022263 


In [4]:
cat("start testing prediction from first n trees\n")

# true labels stored in the test DMatrix
labels <- getinfo(dtest, "label")

# prediction restricted to the first tree only
ypred1 <- predict(bst, dtest, ntreelimit = 1)
# by default, prediction uses all the trees
ypred2 <- predict(bst, dtest)

# Error rate: share of test rows whose thresholded prediction (> 0.5)
# disagrees with the label.
cat("error of ypred1=", mean((ypred1 > 0.5) != labels), "\n")
cat("error of ypred2=", mean((ypred2 > 0.5) != labels), "\n")

start testing prediction from first n trees
error of ypred1= 0.04283054 
error of ypred2= 0.02172564 


## Generalized Linear Model

In [5]:
# Load xgboost and stop right away if it is unavailable. A bare require()
# only returns FALSE on failure, so the cell would carry on and fail later
# with a much less obvious "could not find function" error.
if (!require(xgboost)) {
  stop("package 'xgboost' is required but could not be loaded", call. = FALSE)
}

# load in the agaricus dataset
data(agaricus.train, package = "xgboost")
data(agaricus.test, package = "xgboost")

# wrap the feature matrices and their labels into xgb.DMatrix objects
dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label)
dtest <- xgb.DMatrix(agaricus.test$data, label = agaricus.test$label)

In [7]:
##
#  This script demonstrates how to fit a generalized linear model in xgboost.
#  Basically, we use a linear model instead of trees as the booster, so you
#  can fit a linear regression or a logistic regression model.
##

# Change the booster to gblinear so that we are fitting a linear model.
param <- list(
  objective = "binary:logistic",
  booster = "gblinear",
  nthread = 2,
  alpha = 0.0001,   # alpha is the L1 regularizer
  lambda = 1,       # lambda is the L2 regularizer
  lambda_bias = 1   # optional: L2 regularizer on the bias term
)


In [8]:
# Normally you do not need to set eta (the step size).
# XGBoost uses a parallel coordinate descent algorithm (shotgun), and the
# parallelization can affect convergence in some cases. Setting eta to a
# smaller value, e.g. 0.5, can make the optimization more stable.

##
# The rest of the settings are the same as for the tree booster.
##
watchlist <- list(eval = dtest, train = dtrain)
num_round <- 2
bst <- xgb.train(param, dtrain, num_round, watchlist)

# Error rate: share of test rows whose thresholded prediction (> 0.5)
# disagrees with the label stored in the test DMatrix.
labels <- getinfo(dtest, "label")
ypred <- predict(bst, dtest)
cat("error of preds=", mean((ypred > 0.5) != labels), "\n")


[1]	eval-error:0.016760	train-error:0.024106 
[2]	eval-error:0.004966	train-error:0.005681 
error of preds= 0.00496586 
