Skip to content

Commit b608549

Browse files
committed
make names a bit more generic
add text AUC
1 parent 0b96923 commit b608549

File tree

2 files changed

+117
-60
lines changed

2 files changed

+117
-60
lines changed

Encoding/Example.Rmd

Lines changed: 49 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -22,27 +22,33 @@ stats::model.matrix(~x, dTest)
2222

2323
Let's try the Titanic data set to see encoding in action. `xgboost` requires a numeric matrix for its input, so unlike many `R` modeling methods we must manage the data encoding ourselves (instead of leaving that to `R` which often hides the encoding plan in the trained model).
2424

25-
```{r titanicexample}
25+
```{r modelingexample}
2626
library("titanic")
2727
library("xgboost")
28+
library("sigr")
2829
library("WVPlots")
2930
31+
# select example data set
3032
data(titanic_train)
3133
str(titanic_train)
3234
summary(titanic_train)
33-
35+
outcome <- 'Survived'
36+
target <- 1
3437
shouldBeCategorical <- c('PassengerId', 'Pclass', 'Parch')
3538
for(v in shouldBeCategorical) {
3639
titanic_train[[v]] <- as.factor(titanic_train[[v]])
3740
}
38-
outcome <- 'Survived'
3941
tooDetailed <- c("Ticket", "Cabin", "Name", "PassengerId")
4042
vars <- setdiff(colnames(titanic_train), c(outcome, tooDetailed))
4143
44+
dTrain <- titanic_train
45+
46+
47+
4248
set.seed(3425656)
43-
crossValPlan <- vtreat::kWayStratifiedY(nrow(titanic_train),
49+
crossValPlan <- vtreat::kWayStratifiedY(nrow(dTrain),
4450
10,
45-
titanic_train,
51+
dTrain,
4652
outcome)
4753
4854
evaluateModelingProcedure <- function(xMatrix, outcomeV, crossValPlan) {
@@ -73,19 +79,22 @@ Our preferred way to encode data is to use the `vtreat` package either in the "n
7379
```{r vtreatZ}
7480
library("vtreat")
7581
set.seed(3425656)
76-
tplan <- vtreat::designTreatmentsZ(titanic_train, vars, verbose=FALSE)
82+
tplan <- vtreat::designTreatmentsZ(dTrain, vars, verbose=FALSE)
7783
sf <- tplan$scoreFrame
7884
newvars <- sf$varName[sf$code %in% c('clean', 'lev', 'isBad')]
79-
trainVtreat <- as.matrix(vtreat::prepare(tplan, titanic_train,
85+
trainVtreat <- as.matrix(vtreat::prepare(tplan, dTrain,
8086
varRestriction = newvars))
8187
print(dim(trainVtreat))
8288
print(colnames(trainVtreat))
83-
titanic_train$predVtreatZ <- evaluateModelingProcedure(trainVtreat,
84-
titanic_train[[outcome]]==1,
89+
dTrain$predVtreatZ <- evaluateModelingProcedure(trainVtreat,
90+
dTrain[[outcome]]==target,
8591
crossValPlan)
86-
WVPlots::ROCPlot(titanic_train,
92+
sigr::permTestAUC(dTrain,
93+
'predVtreatZ',
94+
outcome, target)
95+
WVPlots::ROCPlot(dTrain,
8796
'predVtreatZ',
88-
outcome, 1,
97+
outcome, target,
8998
'vtreat encoder performance')
9099
```
91100

@@ -99,7 +108,7 @@ f <- paste('~ 0 + ', paste(vars, collapse = ' + '))
99108
oldOpt <- getOption('na.action')
100109
options(na.action='na.pass')
101110
trainModelMatrix <- stats::model.matrix(as.formula(f),
102-
titanic_train)
111+
dTrain)
103112
# note model.matrix does not conveniently store the encoding
104113
# plan, so you may run into difficulty if you were to encode
105114
# new data which didn't have all the levels seen in the training
@@ -108,12 +117,15 @@ options(na.action=oldOpt)
108117
print(dim(trainModelMatrix))
109118
print(colnames(trainModelMatrix))
110119
111-
titanic_train$predModelMatrix <- evaluateModelingProcedure(trainModelMatrix,
112-
titanic_train[[outcome]]==1,
120+
dTrain$predModelMatrix <- evaluateModelingProcedure(trainModelMatrix,
121+
dTrain[[outcome]]==target,
113122
crossValPlan)
114-
WVPlots::ROCPlot(titanic_train,
123+
sigr::permTestAUC(dTrain,
124+
'predModelMatrix',
125+
outcome, target)
126+
WVPlots::ROCPlot(dTrain,
115127
'predModelMatrix',
116-
outcome, 1,
128+
outcome, target,
117129
'model.matrix encoder performance')
118130
```
119131

@@ -123,17 +135,20 @@ WVPlots::ROCPlot(titanic_train,
123135
library("caret")
124136
set.seed(3425656)
125137
f <- paste('~', paste(vars, collapse = ' + '))
126-
encoder <- caret::dummyVars(as.formula(f), titanic_train)
127-
trainCaret <- predict(encoder, titanic_train)
138+
encoder <- caret::dummyVars(as.formula(f), dTrain)
139+
trainCaret <- predict(encoder, dTrain)
128140
print(dim(trainCaret))
129141
print(colnames(trainCaret))
130142
131-
titanic_train$predCaret <- evaluateModelingProcedure(trainCaret,
132-
titanic_train[[outcome]]==1,
143+
dTrain$predCaret <- evaluateModelingProcedure(trainCaret,
144+
dTrain[[outcome]]==target,
133145
crossValPlan)
134-
WVPlots::ROCPlot(titanic_train,
146+
sigr::permTestAUC(dTrain,
147+
'predCaret',
148+
outcome, target)
149+
WVPlots::ROCPlot(dTrain,
135150
'predCaret',
136-
outcome, 1,
151+
outcome, target,
137152
'caret encoder performance')
138153
```
139154

@@ -143,38 +158,41 @@ You can also try y-aware encoding, but it isn't adding much in this situation.
143158
set.seed(3425656)
144159
# for y aware evaluation must cross-validate whole procedure, designing
145160
# on data you intend to score on can leak information.
146-
preds <- rep(NA_real_, nrow(titanic_train))
161+
preds <- rep(NA_real_, nrow(dTrain))
147162
for(ci in crossValPlan) {
148-
cfe <- vtreat::mkCrossFrameCExperiment(titanic_train[ci$train, , drop=FALSE],
163+
cfe <- vtreat::mkCrossFrameCExperiment(dTrain[ci$train, , drop=FALSE],
149164
vars,
150-
outcome, 1)
165+
outcome, target)
151166
tplan <- cfe$treatments
152167
sf <- tplan$scoreFrame
153168
newvars <- sf$varName[sf$sig < 1/nrow(sf)]
154169
trainVtreat <- cfe$crossFrame[ , c(newvars, outcome), drop=FALSE]
155170
nrounds <- 1000
156171
cv <- xgb.cv(data= as.matrix(trainVtreat[, newvars, drop=FALSE]),
157-
label= trainVtreat[[outcome]]==1,
172+
label= trainVtreat[[outcome]]==target,
158173
objective= 'binary:logistic',
159174
nrounds= nrounds,
160175
verbose= 0,
161176
nfold= 5)
162177
#nrounds <- which.min(cv$evaluation_log$test_rmse_mean) # regression
163178
nrounds <- which.min(cv$evaluation_log$test_error_mean) # classification
164179
model <- xgboost(data= as.matrix(trainVtreat[, newvars, drop=FALSE]),
165-
label= trainVtreat[[outcome]]==1,
180+
label= trainVtreat[[outcome]]==target,
166181
objective= 'binary:logistic',
167182
nrounds= nrounds,
168183
verbose= 0)
169184
appVtreat <- vtreat::prepare(tplan,
170-
titanic_train[ci$app, , drop=FALSE],
185+
dTrain[ci$app, , drop=FALSE],
171186
varRestriction = newvars)
172187
preds[ci$app] <- predict(model,
173188
as.matrix(appVtreat[, newvars, drop=FALSE]))
174189
}
175-
titanic_train$predVtreatC <- preds
176-
WVPlots::ROCPlot(titanic_train,
190+
dTrain$predVtreatC <- preds
191+
sigr::permTestAUC(dTrain,
192+
'predVtreatC',
193+
outcome, target)
194+
WVPlots::ROCPlot(dTrain,
177195
'predVtreatC',
178-
outcome, 1,
196+
outcome, target,
179197
'vtreat y-aware encoder performance')
180198
```

Encoding/Example.md

Lines changed: 68 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -36,8 +36,10 @@ Let's try the Titanic data set to see encoding in action. `xgboost` requires a n
3636
``` r
3737
library("titanic")
3838
library("xgboost")
39+
library("sigr")
3940
library("WVPlots")
4041

42+
# select example data set
4143
data(titanic_train)
4244
str(titanic_train)
4345
```
@@ -86,18 +88,23 @@ summary(titanic_train)
8688
##
8789

8890
``` r
91+
outcome <- 'Survived'
92+
target <- 1
8993
shouldBeCategorical <- c('PassengerId', 'Pclass', 'Parch')
9094
for(v in shouldBeCategorical) {
9195
titanic_train[[v]] <- as.factor(titanic_train[[v]])
9296
}
93-
outcome <- 'Survived'
9497
tooDetailed <- c("Ticket", "Cabin", "Name", "PassengerId")
9598
vars <- setdiff(colnames(titanic_train), c(outcome, tooDetailed))
9699

100+
dTrain <- titanic_train
101+
102+
103+
97104
set.seed(3425656)
98-
crossValPlan <- vtreat::kWayStratifiedY(nrow(titanic_train),
105+
crossValPlan <- vtreat::kWayStratifiedY(nrow(dTrain),
99106
10,
100-
titanic_train,
107+
dTrain,
101108
outcome)
102109

103110
evaluateModelingProcedure <- function(xMatrix, outcomeV, crossValPlan) {
@@ -128,10 +135,10 @@ Our preferred way to encode data is to use the `vtreat` package either in the "n
128135
``` r
129136
library("vtreat")
130137
set.seed(3425656)
131-
tplan <- vtreat::designTreatmentsZ(titanic_train, vars, verbose=FALSE)
138+
tplan <- vtreat::designTreatmentsZ(dTrain, vars, verbose=FALSE)
132139
sf <- tplan$scoreFrame
133140
newvars <- sf$varName[sf$code %in% c('clean', 'lev', 'isBad')]
134-
trainVtreat <- as.matrix(vtreat::prepare(tplan, titanic_train,
141+
trainVtreat <- as.matrix(vtreat::prepare(tplan, dTrain,
135142
varRestriction = newvars))
136143
print(dim(trainVtreat))
137144
```
@@ -149,12 +156,20 @@ print(colnames(trainVtreat))
149156
## [13] "Embarked_lev_x.Q" "Embarked_lev_x.S"
150157

151158
``` r
152-
titanic_train$predVtreatZ <- evaluateModelingProcedure(trainVtreat,
153-
titanic_train[[outcome]]==1,
159+
dTrain$predVtreatZ <- evaluateModelingProcedure(trainVtreat,
160+
dTrain[[outcome]]==target,
154161
crossValPlan)
155-
WVPlots::ROCPlot(titanic_train,
162+
sigr::permTestAUC(dTrain,
163+
'predVtreatZ',
164+
outcome, target)
165+
```
166+
167+
## [1] "AUC test alt. hyp. AUC>AUC(permuted): (AUC=0.86, s.d.=0.02, p<1e-05)."
168+
169+
``` r
170+
WVPlots::ROCPlot(dTrain,
156171
'predVtreatZ',
157-
outcome, 1,
172+
outcome, target,
158173
'vtreat encoder performance')
159174
```
160175

@@ -170,7 +185,7 @@ f <- paste('~ 0 + ', paste(vars, collapse = ' + '))
170185
oldOpt <- getOption('na.action')
171186
options(na.action='na.pass')
172187
trainModelMatrix <- stats::model.matrix(as.formula(f),
173-
titanic_train)
188+
dTrain)
174189
# note model.matrix does not conveniently store the encoding
175190
# plan, so you may run into difficulty if you were to encode
176191
# new data which didn't have all the levels seen in the training
@@ -191,12 +206,20 @@ print(colnames(trainModelMatrix))
191206
## [16] "EmbarkedS"
192207

193208
``` r
194-
titanic_train$predModelMatrix <- evaluateModelingProcedure(trainModelMatrix,
195-
titanic_train[[outcome]]==1,
209+
dTrain$predModelMatrix <- evaluateModelingProcedure(trainModelMatrix,
210+
dTrain[[outcome]]==target,
196211
crossValPlan)
197-
WVPlots::ROCPlot(titanic_train,
212+
sigr::permTestAUC(dTrain,
213+
'predModelMatrix',
214+
outcome, target)
215+
```
216+
217+
## [1] "AUC test alt. hyp. AUC>AUC(permuted): (AUC=0.86, s.d.=0.019, p<1e-05)."
218+
219+
``` r
220+
WVPlots::ROCPlot(dTrain,
198221
'predModelMatrix',
199-
outcome, 1,
222+
outcome, target,
200223
'model.matrix encoder performance')
201224
```
202225

@@ -215,8 +238,8 @@ library("caret")
215238
``` r
216239
set.seed(3425656)
217240
f <- paste('~', paste(vars, collapse = ' + '))
218-
encoder <- caret::dummyVars(as.formula(f), titanic_train)
219-
trainCaret <- predict(encoder, titanic_train)
241+
encoder <- caret::dummyVars(as.formula(f), dTrain)
242+
trainCaret <- predict(encoder, dTrain)
220243
print(dim(trainCaret))
221244
```
222245

@@ -232,12 +255,20 @@ print(colnames(trainCaret))
232255
## [16] "Embarked" "EmbarkedC" "EmbarkedQ" "EmbarkedS"
233256

234257
``` r
235-
titanic_train$predCaret <- evaluateModelingProcedure(trainCaret,
236-
titanic_train[[outcome]]==1,
258+
dTrain$predCaret <- evaluateModelingProcedure(trainCaret,
259+
dTrain[[outcome]]==target,
237260
crossValPlan)
238-
WVPlots::ROCPlot(titanic_train,
261+
sigr::permTestAUC(dTrain,
262+
'predCaret',
263+
outcome, target)
264+
```
265+
266+
## [1] "AUC test alt. hyp. AUC>AUC(permuted): (AUC=0.86, s.d.=0.019, p<1e-05)."
267+
268+
``` r
269+
WVPlots::ROCPlot(dTrain,
239270
'predCaret',
240-
outcome, 1,
271+
outcome, target,
241272
'caret encoder performance')
242273
```
243274

@@ -249,39 +280,47 @@ You can also try y-aware encoding, but it isn't adding much in this situation.
249280
set.seed(3425656)
250281
# for y aware evaluation must cross-validate whole procedure, designing
251282
# on data you intend to score on can leak information.
252-
preds <- rep(NA_real_, nrow(titanic_train))
283+
preds <- rep(NA_real_, nrow(dTrain))
253284
for(ci in crossValPlan) {
254-
cfe <- vtreat::mkCrossFrameCExperiment(titanic_train[ci$train, , drop=FALSE],
285+
cfe <- vtreat::mkCrossFrameCExperiment(dTrain[ci$train, , drop=FALSE],
255286
vars,
256-
outcome, 1)
287+
outcome, target)
257288
tplan <- cfe$treatments
258289
sf <- tplan$scoreFrame
259290
newvars <- sf$varName[sf$sig < 1/nrow(sf)]
260291
trainVtreat <- cfe$crossFrame[ , c(newvars, outcome), drop=FALSE]
261292
nrounds <- 1000
262293
cv <- xgb.cv(data= as.matrix(trainVtreat[, newvars, drop=FALSE]),
263-
label= trainVtreat[[outcome]]==1,
294+
label= trainVtreat[[outcome]]==target,
264295
objective= 'binary:logistic',
265296
nrounds= nrounds,
266297
verbose= 0,
267298
nfold= 5)
268299
#nrounds <- which.min(cv$evaluation_log$test_rmse_mean) # regression
269300
nrounds <- which.min(cv$evaluation_log$test_error_mean) # classification
270301
model <- xgboost(data= as.matrix(trainVtreat[, newvars, drop=FALSE]),
271-
label= trainVtreat[[outcome]]==1,
302+
label= trainVtreat[[outcome]]==target,
272303
objective= 'binary:logistic',
273304
nrounds= nrounds,
274305
verbose= 0)
275306
appVtreat <- vtreat::prepare(tplan,
276-
titanic_train[ci$app, , drop=FALSE],
307+
dTrain[ci$app, , drop=FALSE],
277308
varRestriction = newvars)
278309
preds[ci$app] <- predict(model,
279310
as.matrix(appVtreat[, newvars, drop=FALSE]))
280311
}
281-
titanic_train$predVtreatC <- preds
282-
WVPlots::ROCPlot(titanic_train,
312+
dTrain$predVtreatC <- preds
313+
sigr::permTestAUC(dTrain,
314+
'predVtreatC',
315+
outcome, target)
316+
```
317+
318+
## [1] "AUC test alt. hyp. AUC>AUC(permuted): (AUC=0.85, s.d.=0.019, p<1e-05)."
319+
320+
``` r
321+
WVPlots::ROCPlot(dTrain,
283322
'predVtreatC',
284-
outcome, 1,
323+
outcome, target,
285324
'vtreat y-aware encoder performance')
286325
```
287326

0 commit comments

Comments
 (0)