In [1]:
library(data.table)
library(lightgbm)
library(caret)

Loading required package: R6
Loading required package: lattice
Loading required package: ggplot2

Attaching package: ‘caret’

The following object is masked from ‘package:httr’:

    progress



In [2]:

#' Predict with the per-fold boosters of a cross-validation result
#'
#' For every entry `lgb_cv$boosters[[i]]`, its first element is handed to
#' `predict()`. Depending on `type` the per-booster predictions are either
#' stitched together into out-of-fold predictions or averaged.
#'
#' @param lgb_cv Object exposing `$boosters` (a list whose i-th entry holds a
#'   model as its first element) and `$best_iter`. Presumably the value
#'   returned by `lightgbm::lgb.cv()` -- confirm against the caller.
#' @param data Two-dimensional feature container (e.g. a numeric matrix) that
#'   supports `data[rows, , drop = FALSE]`.
#' @param num_iteration Iteration count forwarded to `predict()`. `NULL`
#'   (default) means `lgb_cv$best_iter`.
#' @param folds List with one vector of row indices into `data` per booster,
#'   in the same order as `lgb_cv$boosters`. Required when `type = "cv"`,
#'   ignored otherwise.
#' @param type `"cv"` (default): booster i predicts only the rows in
#'   `folds[[i]]`; the result is returned in ascending row-index order.
#'   `"test"`: every booster predicts all of `data` and the predictions are
#'   averaged.
#' @return A double vector of predictions.
LGB_CV_Predict <- function(lgb_cv, data, num_iteration = NULL, folds = NULL, type = c("cv", "test")) {
  # Collapse the default c("cv", "test") to a single validated string.
  # Comparing the raw length-2 default inside `if` is an error in R >= 4.2.
  type <- match.arg(type)

  if (is.null(num_iteration)) {
    num_iteration <- lgb_cv$best_iter
  }

  n_boosters <- length(lgb_cv$boosters)

  # Shared prediction call so both modes honour `num_iteration` identically.
  predict_with_booster <- function(i, newdata) {
    predict(lgb_cv$boosters[[i]][[1]],
            newdata,
            num_iteration = num_iteration,
            rawscore = FALSE, predleaf = FALSE, header = FALSE, reshape = FALSE)
  }

  if (type == "cv") {
    # A mismatch would silently misalign predictions and row indices below.
    if (length(folds) != n_boosters) {
      stop("`folds` must contain one vector of row indices per booster when type = \"cv\"",
           call. = FALSE)
    }
    print("create cross validation predictions")
    fold_preds <- lapply(seq_len(n_boosters), function(i) {
      # drop = FALSE keeps a one-row fold two-dimensional.
      predict_with_booster(i, data[folds[[i]], , drop = FALSE])
    })
    # Predictions are concatenated in fold order; reorder them by the
    # concatenated row indices so element k belongs to the k-th smallest index.
    as.double(unlist(fold_preds))[order(unlist(folds))]
  } else {
    print("create test set predictions")
    fold_preds <- lapply(seq_len(n_boosters), function(i) {
      predict_with_booster(i, data)
    })
    # Element-wise mean over all boosters.
    as.double(Reduce(`+`, fold_preds)) / n_boosters
  }
}



In [3]:
# Load the training and test tables from disk.
t1 <- fread("../input/train.csv")
s1 <- fread("../input/test.csv")

# Mark row origin so both tables can live in one data.table:
# filter == 0 -> training rows, filter == 2 -> test rows.
# Test rows receive a placeholder target of -1.
t1[, filter := 0]
s1[, `:=`(target = -1, filter = 2)]

# NOTE(review): `target` is appended as a trailing column on s1, so the two
# tables may differ in column order; this relies on rbind aligning columns
# by name -- confirm.
ts1 <- rbind(t1, s1)

# Fixed seed, then 30 folds built from the training targets.
set.seed(500)
cvFoldsList <- createFolds(ts1[filter == 0][["target"]], k = 30)

# Feature columns = everything except the identifier, the label and the
# origin marker.
varnames <- setdiff(colnames(ts1), c("ID_code", "target", "filter"))
dtrain <- lgb.Dataset(
  data.matrix(ts1[filter == 0, varnames, with = FALSE]),
  label = ts1[filter == 0][["target"]],
  free_raw_data = FALSE
)


In [4]:
# Parameter list handed to lgb.cv() in the training cell.
params <- list(
  # task, booster type and evaluation metric
  objective = "binary",
  boost = "gbdt",
  metric = "auc",
  boost_from_average = "false",
  # execution
  num_threads = 30,
  # learning rate and tree settings
  learning_rate = 0.01,
  num_leaves = 11,
  max_depth = -1,
  tree_learner = "serial",
  # feature / bagging fractions
  feature_fraction = 0.05,
  bagging_freq = 5,
  bagging_fraction = 0.3,
  # per-leaf minimums
  min_data_in_leaf = 90,
  min_sum_hessian_in_leaf = 10.0,
  # logging
  verbosity = 1
)



In [5]:
# Time the cross-validated training run on the predefined folds.
tme <- Sys.time()
lgb1 <- lgb.cv(
  params = params,
  data = dtrain,
  nrounds = 1200000,
  folds = cvFoldsList,
  early_stopping_rounds = 1900,
  eval_freq = 3000,
  seed = 44000
)
# Elapsed wall-clock time (auto-printed at top level).
Sys.time() - tme



[1]:	valid's auc:0.575009+0.0114029 
[3001]:	valid's auc:0.894882+0.00624815 
[6001]:	valid's auc:0.899879+0.00622734 
[9001]:	valid's auc:0.900646+0.00636565 
[12001]:	valid's auc:0.90053+0.00637894 


Time difference of 3.295708 hours

In [6]:
# Predict the test rows (filter == 2) with every fold model, averaged.
test_preds <- LGB_CV_Predict(
  lgb_cv = lgb1,
  data = data.matrix(ts1[filter == 2, varnames, with = FALSE]),
  type = "test"
)


Loading required package: foreach


[1] "create test set predictions"


In [7]:

# Pair each test identifier with its prediction and write the submission file.
dt <- data.table(
  ID_code = ts1[filter == 2][["ID_code"]],
  target = test_preds
)
fwrite(dt, file = "./submission.csv")