# Boosting methods in R
### by Xiaoqi Zheng, 07/24/2020

In [76]:
library(rsample)      # data splitting utilities
library(gbm)          # basic gradient boosting machine implementation
library(xgboost)      # a faster implementation of gradient boosting
library(caret)        # an aggregator package for training and tuning many machine learning models

In [77]:
## Load the College data set provided by the ISLR package
library(tidyverse)
library(ISLR)

ml_data <- College

# Show the top-left 5 x 5 corner of the table, then its overall dimensions
ml_data[seq_len(5), seq_len(5)]
dim(ml_data)

Unnamed: 0_level_0,Private,Apps,Accept,Enroll,Top10perc
Unnamed: 0_level_1,<fct>,<dbl>,<dbl>,<dbl>,<dbl>
Abilene Christian University,Yes,1660,1232,721,23
Adelphi University,Yes,2186,1924,512,16
Adrian College,Yes,1428,1097,336,22
Agnes Scott College,Yes,417,349,137,60
Alaska Pacific University,Yes,193,146,55,16


In [78]:
# Split the rows into a training set (p = 0.7) and a held-out test set,
# partitioning on the outcome column Private. The seed is fixed so the
# same split is drawn each time this cell is run.
set.seed(42)
index <- createDataPartition(
  y = ml_data[["Private"]],
  p = 0.7,
  list = FALSE
)
test_data  <- ml_data[-index, ]
train_data <- ml_data[index, ]

## 1. Gradient Boosting Machines (GBM)

In [79]:
# Fit a gradient boosting classifier for Private on all other columns.
# Resampling is 5-fold cross-validation repeated 3 times; no preProcess
# step is requested, so the predictors are used as they are.
# `verbose = 0` is forwarded through train()'s `...` to silence the
# per-iteration output of the underlying gbm fit.
model_gbm <- caret::train(
  Private ~ .,
  data = train_data,
  method = "gbm",
  verbose = 0,
  trControl = trainControl(
    method = "repeatedcv",
    repeats = 3,
    number = 5,
    verboseIter = FALSE
  )
)
model_gbm

Stochastic Gradient Boosting 

545 samples
 17 predictor
  2 classes: 'No', 'Yes' 

No pre-processing
Resampling: Cross-Validated (5 fold, repeated 3 times) 
Summary of sample sizes: 437, 436, 436, 435, 436, 436, ... 
Resampling results across tuning parameters:

  interaction.depth  n.trees  Accuracy   Kappa    
  1                   50      0.9369738  0.8366455
  1                  100      0.9388143  0.8424288
  1                  150      0.9443022  0.8584537
  2                   50      0.9430959  0.8521284
  2                  100      0.9449251  0.8583012
  2                  150      0.9442967  0.8579690
  3                   50      0.9381859  0.8399493
  3                  100      0.9412385  0.8490957
  3                  150      0.9485780  0.8686399

Tuning parameter 'shrinkage' was held constant at a value of 0.1

Tuning parameter 'n.minobsinnode' was held constant at a value of 10
Accuracy was used to select the optimal model using the largest value.
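The final values used for the model were n.trees = 150, interaction.depth = 3, shrinkage = 0.1 and n.minobsinnode = 10.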

In [80]:
## Evaluate the GBM model on the held-out test set:
## predicted classes are cross-tabulated against the true Private labels
caret::confusionMatrix(
  reference = test_data$Private,
  data = predict(model_gbm, newdata = test_data)
)

Confusion Matrix and Statistics

          Reference
Prediction  No Yes
       No   56   6
       Yes   7 163
                                          
               Accuracy : 0.944           
                 95% CI : (0.9061, 0.9698)
    No Information Rate : 0.7284          
    P-Value [Acc > NIR] : <2e-16          
                                          
                  Kappa : 0.8577          
                                          
 Mcnemar's Test P-Value : 1               
                                          
            Sensitivity : 0.8889          
            Specificity : 0.9645          
         Pos Pred Value : 0.9032          
         Neg Pred Value : 0.9588          
             Prevalence : 0.2716          
         Detection Rate : 0.2414          
   Detection Prevalence : 0.2672          
      Balanced Accuracy : 0.9267          
                                          
       'Positive' Class : No              
                              

## 2. eXtreme Gradient Boosting (XGBoost)

In [96]:
# Tune and fit an XGBoost classifier (caret method "xgbTree") for Private.
# NOTE: the fitted object keeps the name `rf_fit` because later cells
# reference it, even though the model is xgbTree and not a random forest.

# Fix the RNG state so the cross-validation fold assignment can be
# reproduced when this cell is re-run on its own.
set.seed(42)

# Plain 5-fold cross-validation
trctrl <- trainControl(method = "cv", number = 5)

# Only nrounds varies (100 to 150 boosting iterations); every other
# xgbTree tuning parameter is pinned to a single value.
tune_grid <- expand.grid(nrounds = 100:150,
                         max_depth = 5,
                         eta = 0.05,
                         gamma = 0.01,
                         colsample_bytree = 0.75,
                         min_child_weight = 0,
                         subsample = 0.5)

# tuneGrid already lists every candidate, so tuneLength is not passed:
# caret only uses tuneLength to generate a grid when tuneGrid is absent.
rf_fit <- caret::train(Private ~ .,
                       data = train_data,
                       method = "xgbTree",
                       trControl = trctrl,
                       tuneGrid = tune_grid)

In [98]:
# Print the resampling summary of the fitted model.
# Despite its name, rf_fit is the result of train() with method = "xgbTree".
rf_fit

eXtreme Gradient Boosting 

545 samples
 17 predictor
  2 classes: 'No', 'Yes' 

No pre-processing
Resampling: Cross-Validated (5 fold) 
Summary of sample sizes: 436, 437, 436, 435, 436 
Resampling results across tuning parameters:

  nrounds  Accuracy   Kappa    
   10      0.9339422  0.8283897
   11      0.9412480  0.8459599
   12      0.9394131  0.8411165
   13      0.9394131  0.8411165
   14      0.9412480  0.8452833
   15      0.9376119  0.8367592
   16      0.9376119  0.8376159
   17      0.9375782  0.8376041
   18      0.9357264  0.8320201
   19      0.9394131  0.8419746
   20      0.9394131  0.8418589
   21      0.9375782  0.8381638
   22      0.9394131  0.8431954
   23      0.9375782  0.8381638
   24      0.9412480  0.8474790
   25      0.9394131  0.8431954
   26      0.9375612  0.8376403
   27      0.9394301  0.8435936
   28      0.9394131  0.8431954
   29      0.9375782  0.8381638
   30      0.9412649  0.8476126
   31      0.9375782  0.8370127
   32      0.9412649  0.8465522

In [99]:
# Predict the class of every row in the held-out test set
# with the tuned xgbTree model
test_predict <- predict(
  object = rf_fit,
  newdata = test_data
)

In [100]:
# Cross-tabulate the xgbTree predictions against the true Private labels
caret::confusionMatrix(
  reference = test_data[["Private"]],
  data = test_predict
)

Confusion Matrix and Statistics

          Reference
Prediction  No Yes
       No   54   7
       Yes   9 162
                                          
               Accuracy : 0.931           
                 95% CI : (0.8904, 0.9601)
    No Information Rate : 0.7284          
    P-Value [Acc > NIR] : 4.085e-15       
                                          
                  Kappa : 0.8239          
                                          
 Mcnemar's Test P-Value : 0.8026          
                                          
            Sensitivity : 0.8571          
            Specificity : 0.9586          
         Pos Pred Value : 0.8852          
         Neg Pred Value : 0.9474          
             Prevalence : 0.2716          
         Detection Rate : 0.2328          
   Detection Prevalence : 0.2629          
      Balanced Accuracy : 0.9079          
                                          
       'Positive' Class : No              
                              

Also see: https://www.hackerearth.com/zh/practice/machine-learning/machine-learning-algorithms/beginners-tutorial-on-xgboost-parameter-tuning-r/tutorial/