In [1]:
# Attach xgboost with library(): unlike require(), which only returns FALSE,
# library() stops immediately when the package is not installed.
library(xgboost)

# Load the agaricus train/test datasets shipped in the xgboost package.
data(agaricus.train, package = "xgboost")
data(agaricus.test, package = "xgboost")

# Wrap each feature matrix together with its labels in an xgb.DMatrix.
dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label)
dtest <- xgb.DMatrix(agaricus.test$data, label = agaricus.test$label)


Loading required package: xgboost
"package 'xgboost' was built under R version 3.4.3"

In [2]:
# Number of boosting rounds and booster parameters used by the
# cross-validation runs.
nround <- 2
param <- list(
  max_depth = 2,
  eta = 1,
  silent = 1,
  nthread = 2,
  objective = "binary:logistic"
)

cat("running cross validation\n")
# 5-fold cross validation on the "error" metric. Each iteration is printed as
#   [iteration]  metric_name:mean_value+std_value
# where std_value is the standard deviation of the metric.
xgb.cv(
  params = param,
  data = dtrain,
  nrounds = nround,
  nfold = 5,
  metrics = "error"
)

running cross validation
[1]	train-error:0.046523+0.002404	test-error:0.046526+0.009616 
[2]	train-error:0.022263+0.001024	test-error:0.022264+0.004100 


In [8]:
cat("running cross validation, disable standard deviation display\n")
# 5-fold cross validation on the "error" metric with showsd = FALSE, which
# asks xgb.cv not to show the standard deviation of the metric.
# NOTE(review): the output recorded for this cell still contains the
# "+std_value" part -- confirm how the installed xgboost version treats showsd.
xgb.cv(
  params = param,
  data = dtrain,
  nrounds = nround,
  nfold = 5,
  metrics = "error",
  showsd = FALSE
)


running cross validation, disable standard deviation display
[1]	train-error:0.046522+0.000825	test-error:0.046522+0.003304 
[2]	train-error:0.022263+0.000404	test-error:0.022263+0.001616 


In [9]:
# ----------------------------------------------------------------------
# Cross validation can also be run with a customized loss function.
# See custom_objective.R
# ----------------------------------------------------------------------
print("running cross validation, with cutomsized loss function")

# Customized logistic-regression objective.
#   preds  : raw scores; they are passed through the logistic function here.
#   dtrain : object whose "label" field is read with getinfo().
# Returns a list with the gradient (`grad`) and hessian (`hess`) of the
# logistic loss, one value per prediction.
logregobj <- function(preds, dtrain) {
  y <- getinfo(dtrain, "label")
  prob <- 1 / (1 + exp(-preds))
  list(
    grad = prob - y,
    hess = prob * (1 - prob)
  )
}
# Customized evaluation metric: classification error rate.
#   preds  : raw scores; a score above 0 counts as a positive prediction.
#   dtrain : object whose "label" field is read with getinfo().
# Returns list(metric = "error", value = <share of rows where the
# thresholded prediction differs from the label>).
evalerror <- function(preds, dtrain) {
  y <- getinfo(dtrain, "label")
  predicted.positive <- preds > 0
  n.wrong <- sum(y != predicted.positive)
  list(metric = "error", value = as.numeric(n.wrong) / length(y))
}

# Plug the user-defined objective and evaluation metric into the parameters.
param <- list(
  max_depth = 2,
  eta = 1,
  silent = 1,
  objective = logregobj,
  eval_metric = evalerror
)
# 5-fold cross validation driven by the customized objective and metric.
xgb.cv(
  params = param,
  data = dtrain,
  nrounds = nround,
  nfold = 5
)


[1] "running cross validation, with cutomsized loss function"
[1]	train-error:0.046522+0.000979	test-error:0.046521+0.003915 
[2]	train-error:0.022263+0.000854	test-error:0.022261+0.003415 


In [10]:
# Cross validation that also keeps the prediction values (prediction = TRUE).
res <- xgb.cv(
  params = param,
  data = dtrain,
  nrounds = nround,
  nfold = 5,
  prediction = TRUE
)
# Show the per-iteration evaluation log, then the number of stored predictions.
res$evaluation_log
length(res$pred)

[1]	train-error:0.046522+0.001026	test-error:0.046522+0.004105 
[2]	train-error:0.022263+0.000388	test-error:0.022264+0.001550 


iter,train_error_mean,train_error_std,test_error_mean,test_error_std
1,0.04652231,0.0010256487,0.04652188,0.004104821
2,0.02226323,0.0003877655,0.02226424,0.001549918


## Predict leaf indices

In [11]:
# Attach dependencies with library(): unlike require(), which only returns
# FALSE, library() stops immediately when a package is not installed.
library(xgboost)
library(data.table)
library(Matrix)

# Fix the random seed for this section.
set.seed(1982)

# Load the agaricus train/test datasets shipped in the xgboost package.
data(agaricus.train, package = "xgboost")
data(agaricus.test, package = "xgboost")

# Wrap each feature matrix together with its labels in an xgb.DMatrix.
dtrain <- xgb.DMatrix(data = agaricus.train$data, label = agaricus.train$label)
dtest <- xgb.DMatrix(data = agaricus.test$data, label = agaricus.test$label)

Loading required package: data.table
Loading required package: Matrix


In [15]:
# Booster parameters and number of boosting rounds for the base model.
param <- list(
  max_depth = 2,
  eta = 1,
  silent = 1,
  objective = "binary:logistic"
)
nround <- 4

# Train the model for `nround` rounds.
bst <- xgb.train(
  params = param,
  data = dtrain,
  nrounds = nround,
  nthread = 2
)

# Model accuracy without new features: share of test rows whose prediction,
# thresholded at 0.5, equals the label.
accuracy.before <- sum(
  (predict(bst, agaricus.test$data) >= 0.5) == agaricus.test$label
) / length(agaricus.test$label)

In [22]:
# Display the accuracy computed for the model without new features.
accuracy.before

In [16]:
# Ask predict() for leaf indices (predleaf = TRUE) instead of scores;
# by default all trees of the model are used.
pred_with_leaf <- predict(bst, dtest, predleaf = TRUE)
# Peek at the first rows of the result.
head(pred_with_leaf)

0,1,2,3
4,3,5,4
3,3,5,3
4,3,5,4
4,3,5,4
5,4,5,3
3,3,5,3


In [17]:
# Append one-hot encoded leaf indices to a feature matrix.
#
# For every row of `original.features` the leaf index reached in each tree is
# predicted, each tree's leaf indices are turned into a factor, and the
# resulting sparse indicator columns are bound to the original features.
#
#   model             : fitted booster; must support
#                       predict(model, ..., predleaf = TRUE) and expose
#                       model$niter (number of leaf-index columns to encode).
#   original.features : feature matrix the model can predict on.
#
# Returns `original.features` with the leaf-indicator columns appended.
#
# NOTE(review): factor levels are taken from the leaf ids observed in the data
# passed in, so two data sets that reach different sets of leaves would yield
# different columns -- confirm this cannot happen for the train/test pair used.
create.new.tree.features <- function(model, original.features) {
  pred_with_leaf <- predict(model, original.features, predleaf = TRUE)
  # seq_len() instead of 1:n so that a zero count does not iterate over c(1, 0).
  cols <- lapply(seq_len(model$niter), function(i) {
    leaf.id <- sort(unique(pred_with_leaf[, i]))
    # `levels` spelled out in full; the former `level =` relied on partial
    # argument matching.
    factor(x = pred_with_leaf[, i], levels = leaf.id)
  })
  # cbind() replaces Matrix::cBind(), which is deprecated/defunct in current
  # Matrix releases; base cbind() dispatches to the Matrix methods.
  cbind(original.features, sparse.model.matrix(~ . - 1, as.data.frame(cols)))
}

In [18]:
# Convert previous features to one hot encoding
new.features.train <- create.new.tree.features(bst, agaricus.train$data)
new.features.test <- create.new.tree.features(bst, agaricus.test$data)

In [20]:
# learning with new features
new.dtrain <- xgb.DMatrix(data = new.features.train, label = agaricus.train$label)
new.dtest <- xgb.DMatrix(data = new.features.test, label = agaricus.test$label)
watchlist <- list(train = new.dtrain)
bst <- xgb.train(params = param, data = new.dtrain, nrounds = nround, nthread = 2)

# Model accuracy with new features
accuracy.after <- sum((predict(bst, new.dtest) >= 0.5) == agaricus.test$label) /
                    length(agaricus.test$label)

## DART

In [37]:
# Parameters selecting the DART booster, and the number of boosting rounds.
param <- list(
  max_depth = 2,
  eta = 1,
  objective = "binary:logistic",
  booster = "dart"
)
nround <- 1000

# Train the DART model for `nround` rounds.
bst_dart <- xgb.train(
  params = param,
  data = dtrain,
  nrounds = nround,
  nthread = 2
)


In [38]:
# Print the trained DART booster.
bst_dart

##### xgb.Booster
raw: 202.9 Kb 
call:
  xgb.train(params = param, data = dtrain, nrounds = nround, nthread = 2)
params (as set within xgb.train):
  max_depth = "2", eta = "1", objective = "binary:logistic", booster = "dart", nthread = "2", silent = "1"
xgb.attributes:
  niter
callbacks:
  cb.print.evaluation(period = print_every_n)
niter: 1000

In [39]:
# Accuracy of the DART model on the test set.
# bst_dart was trained on dtrain (original features), so it is scored on dtest,
# which has the same feature set, rather than on new.dtest, whose appended
# leaf-indicator columns this model was never trained on.
# NOTE: the result is stored in `accuracy.after`, overwriting the value
# computed for the model trained with new features.
accuracy.after <- sum((predict(bst_dart, dtest) >= 0.5) == agaricus.test$label) /
  length(agaricus.test$label)

## Monotonic Constraints

In [45]:
# Baseline model trained without monotonic constraints.
# - The watchlist evaluates on dtrain, the data the model is trained on; the
#   previous watchlist pointed at new.dtrain, which carries a different
#   (augmented) feature set.
# - The number of rounds is given once through `nrounds`; `num_boost_round`
#   is the Python argument name and was only forwarded as an extra parameter.
# - Training stops early once the watched metric has not improved for
#   10 rounds.
model_no_constraints <- xgb.train(
  params = param,
  data = dtrain,
  nrounds = nround,
  watchlist = list(train = dtrain),
  early_stopping_rounds = 10
)

[1]	train-error:0.046522 
Will train until train_error hasn't improved in 10 rounds.

[2]	train-error:0.022263 
[3]	train-error:0.007063 
[4]	train-error:0.015200 
[5]	train-error:0.007063 
[6]	train-error:0.001228 
[7]	train-error:0.001228 
[8]	train-error:0.001228 
[9]	train-error:0.001228 
[10]	train-error:0.000000 
[11]	train-error:0.000000 
[12]	train-error:0.000000 
[13]	train-error:0.000000 
[14]	train-error:0.000000 
[15]	train-error:0.000000 
[16]	train-error:0.000000 
[17]	train-error:0.000000 
[18]	train-error:0.000000 
[19]	train-error:0.000000 
[20]	train-error:0.000000 
Stopping. Best iteration:
[10]	train-error:0.000000



In [58]:
# Parameters with a monotone_constraints entry: one value per feature column.
# The length is derived from the training matrix instead of hard-coding 126.
# All entries are 0 here.
# NOTE(review): per the xgboost documentation 0 means "no constraint", so this
# setup is expected to behave like the unconstrained model -- confirm intent.
params_constrained <- list(
  max_depth = 2,
  eta = 1,
  objective = "binary:logistic",
  booster = "dart",
  monotone_constraints = rep(0, ncol(agaricus.train$data))
)

# Train with the constraint parameters.
# - The watchlist evaluates on dtrain, the data the model is trained on; the
#   previous watchlist pointed at new.dtrain, which carries a different
#   (augmented) feature set.
# - The number of rounds is given once through `nrounds`; `num_boost_round`
#   is the Python argument name and was only forwarded as an extra parameter.
# - Training stops early once the watched metric has not improved for
#   10 rounds.
model_with_constraints <- xgb.train(
  params = params_constrained,
  data = dtrain,
  nrounds = nround,
  watchlist = list(train = dtrain),
  early_stopping_rounds = 10
)

[1]	train-error:0.046522 
Will train until train_error hasn't improved in 10 rounds.

[2]	train-error:0.022263 
[3]	train-error:0.007063 
[4]	train-error:0.015200 
[5]	train-error:0.007063 
[6]	train-error:0.001228 
[7]	train-error:0.001228 
[8]	train-error:0.001228 
[9]	train-error:0.001228 
[10]	train-error:0.000000 
[11]	train-error:0.000000 
[12]	train-error:0.000000 
[13]	train-error:0.000000 
[14]	train-error:0.000000 
[15]	train-error:0.000000 
[16]	train-error:0.000000 
[17]	train-error:0.000000 
[18]	train-error:0.000000 
[19]	train-error:0.000000 
[20]	train-error:0.000000 
Stopping. Best iteration:
[10]	train-error:0.000000

