@@ -36,8 +36,10 @@ Let's try the Titanic data set to see encoding in action. `xgboost` requires a n
3636``` r
3737library("titanic")
3838library("xgboost")
39+ library("sigr")
3940library("WVPlots")
4041
42+ # select example data set
4143data(titanic_train)
4244str(titanic_train)
4345```
@@ -86,18 +88,23 @@ summary(titanic_train)
8688 ##
8789
8890``` r
91+ outcome <- 'Survived'
92+ target <- 1
8993shouldBeCategorical <- c('PassengerId', 'Pclass', 'Parch')
9094for (v in shouldBeCategorical) {
9195 titanic_train[[v]] <- as.factor(titanic_train[[v]])
9296}
93- outcome <- 'Survived'
9497tooDetailed <- c("Ticket", "Cabin", "Name", "PassengerId")
9598vars <- setdiff(colnames(titanic_train), c(outcome, tooDetailed))
9699
100+ dTrain <- titanic_train
101+
102+
103+
97104set.seed(3425656)
98- crossValPlan <- vtreat::kWayStratifiedY(nrow(titanic_train),
105+ crossValPlan <- vtreat::kWayStratifiedY(nrow(dTrain),
99106 10,
100- titanic_train,
107+ dTrain,
101108 outcome)
102109
103110evaluateModelingProcedure <- function(xMatrix, outcomeV, crossValPlan) {
@@ -128,10 +135,10 @@ Our preferred way to encode data is to use the `vtreat` package either in the "n
128135``` r
129136library("vtreat")
130137set.seed(3425656)
131- tplan <- vtreat::designTreatmentsZ(titanic_train, vars, verbose = FALSE)
138+ tplan <- vtreat::designTreatmentsZ(dTrain, vars, verbose = FALSE)
132139sf <- tplan$scoreFrame
133140newvars <- sf$varName[sf$code %in% c('clean', 'lev', 'isBad')]
134- trainVtreat <- as.matrix(vtreat::prepare(tplan, titanic_train,
141+ trainVtreat <- as.matrix(vtreat::prepare(tplan, dTrain,
135142 varRestriction = newvars))
136143print(dim(trainVtreat))
137144```
@@ -149,12 +156,20 @@ print(colnames(trainVtreat))
149156 ## [13] "Embarked_lev_x.Q" "Embarked_lev_x.S"
150157
151158``` r
152- titanic_train$predVtreatZ <- evaluateModelingProcedure(trainVtreat,
153- titanic_train[[outcome]]==1,
159+ dTrain$predVtreatZ <- evaluateModelingProcedure(trainVtreat,
160+ dTrain[[outcome]]==target,
154161 crossValPlan)
155- WVPlots::ROCPlot(titanic_train,
162+ sigr::permTestAUC(dTrain,
163+ 'predVtreatZ',
164+ outcome, target)
165+ ```
166+
167+ ## [1] "AUC test alt. hyp. AUC>AUC(permuted): (AUC=0.86, s.d.=0.02, p<1e-05)."
168+
169+ ``` r
170+ WVPlots::ROCPlot(dTrain,
156171 'predVtreatZ',
157- outcome, 1,
172+ outcome, target,
158173 'vtreat encoder performance')
159174```
160175
@@ -170,7 +185,7 @@ f <- paste('~ 0 + ', paste(vars, collapse = ' + '))
170185oldOpt <- getOption('na.action')
171186options(na.action = 'na.pass')
172187trainModelMatrix <- stats::model.matrix(as.formula(f),
173- titanic_train)
188+ dTrain)
174189# note model.matrix does not conveniently store the encoding
175190# plan, so you may run into difficulty if you were to encode
176191# new data which didn't have all the levels seen in the training
@@ -191,12 +206,20 @@ print(colnames(trainModelMatrix))
191206 ## [16] "EmbarkedS"
192207
193208``` r
194- titanic_train$predModelMatrix <- evaluateModelingProcedure(trainModelMatrix,
195- titanic_train[[outcome]]==1,
209+ dTrain$predModelMatrix <- evaluateModelingProcedure(trainModelMatrix,
210+ dTrain[[outcome]]==target,
196211 crossValPlan)
197- WVPlots::ROCPlot(titanic_train,
212+ sigr::permTestAUC(dTrain,
213+ 'predModelMatrix',
214+ outcome, target)
215+ ```
216+
217+ ## [1] "AUC test alt. hyp. AUC>AUC(permuted): (AUC=0.86, s.d.=0.019, p<1e-05)."
218+
219+ ``` r
220+ WVPlots::ROCPlot(dTrain,
198221 'predModelMatrix',
199- outcome, 1,
222+ outcome, target,
200223 'model.matrix encoder performance')
201224```
202225
@@ -215,8 +238,8 @@ library("caret")
215238``` r
216239set.seed(3425656)
217240f <- paste('~', paste(vars, collapse = ' + '))
218- encoder <- caret::dummyVars(as.formula(f), titanic_train)
219- trainCaret <- predict(encoder, titanic_train)
241+ encoder <- caret::dummyVars(as.formula(f), dTrain)
242+ trainCaret <- predict(encoder, dTrain)
220243print(dim(trainCaret))
221244```
222245
@@ -232,12 +255,20 @@ print(colnames(trainCaret))
232255 ## [16] "Embarked" "EmbarkedC" "EmbarkedQ" "EmbarkedS"
233256
234257``` r
235- titanic_train$predCaret <- evaluateModelingProcedure(trainCaret,
236- titanic_train[[outcome]]==1,
258+ dTrain$predCaret <- evaluateModelingProcedure(trainCaret,
259+ dTrain[[outcome]]==target,
237260 crossValPlan)
238- WVPlots::ROCPlot(titanic_train,
261+ sigr::permTestAUC(dTrain,
262+ 'predCaret',
263+ outcome, target)
264+ ```
265+
266+ ## [1] "AUC test alt. hyp. AUC>AUC(permuted): (AUC=0.86, s.d.=0.019, p<1e-05)."
267+
268+ ``` r
269+ WVPlots::ROCPlot(dTrain,
239270 'predCaret',
240- outcome, 1,
271+ outcome, target,
241272 'caret encoder performance')
242273```
243274
@@ -249,39 +280,47 @@ You can also try y-aware encoding, but it isn't adding much in this situation.
249280set.seed(3425656)
250281# for y aware evaluation must cross-validate whole procedure, designing
251282# on data you intend to score on can leak information.
252- preds <- rep(NA_real_, nrow(titanic_train))
283+ preds <- rep(NA_real_, nrow(dTrain))
253284for (ci in crossValPlan) {
254- cfe <- vtreat::mkCrossFrameCExperiment(titanic_train[ci$train, , drop = FALSE],
285+ cfe <- vtreat::mkCrossFrameCExperiment(dTrain[ci$train, , drop = FALSE],
255286 vars,
256- outcome, 1)
287+ outcome, target)
257288 tplan <- cfe$treatments
258289 sf <- tplan$scoreFrame
259290 newvars <- sf$varName[sf$sig < 1/nrow(sf)]
260291 trainVtreat <- cfe$crossFrame[, c(newvars, outcome), drop = FALSE]
261292 nrounds <- 1000
262293 cv <- xgb.cv(data = as.matrix(trainVtreat[, newvars, drop = FALSE]),
263- label = trainVtreat[[outcome]]==1,
294+ label = trainVtreat[[outcome]]==target,
264295 objective = 'binary:logistic',
265296 nrounds = nrounds,
266297 verbose = 0,
267298 nfold = 5)
268299 # nrounds <- which.min(cv$evaluation_log$test_rmse_mean) # regression
269300 nrounds <- which.min(cv$evaluation_log$test_error_mean) # classification
270301 model <- xgboost(data = as.matrix(trainVtreat[, newvars, drop = FALSE]),
271- label = trainVtreat[[outcome]]==1,
302+ label = trainVtreat[[outcome]]==target,
272303 objective = 'binary:logistic',
273304 nrounds = nrounds,
274305 verbose = 0)
275306 appVtreat <- vtreat::prepare(tplan,
276- titanic_train[ci$app, , drop = FALSE],
307+ dTrain[ci$app, , drop = FALSE],
277308 varRestriction = newvars)
278309 preds[ci$app] <- predict(model,
279310 as.matrix(appVtreat[, newvars, drop = FALSE]))
280311}
281- titanic_train$predVtreatC <- preds
282- WVPlots::ROCPlot(titanic_train,
312+ dTrain$predVtreatC <- preds
313+ sigr::permTestAUC(dTrain,
314+ 'predVtreatC',
315+ outcome, target)
316+ ```
317+
318+ ## [1] "AUC test alt. hyp. AUC>AUC(permuted): (AUC=0.85, s.d.=0.019, p<1e-05)."
319+
320+ ``` r
321+ WVPlots::ROCPlot(dTrain,
283322 'predVtreatC',
284- outcome, 1,
323+ outcome, target,
285324 'vtreat y-aware encoder performance')
286325```
287326
0 commit comments