Skip to content

Commit

Permalink
use underbar separators in new column names
Browse files Browse the repository at this point in the history
  • Loading branch information
JohnMount committed Jun 16, 2018
1 parent 829859a commit 39d0ce9
Show file tree
Hide file tree
Showing 38 changed files with 486 additions and 474 deletions.
1 change: 1 addition & 0 deletions NEWS.md
Expand Up @@ -4,6 +4,7 @@
* Translate treatment plans to rquery.
* Minor documentation improvements.
* Improve error messages on argument checking.
* Improve name generation (remove dots).
* Remove use of dplyr.

# vtreat 1.0.4 2018/05/05
Expand Down
2 changes: 1 addition & 1 deletion R/cleanTreatment.R
Expand Up @@ -47,7 +47,7 @@ as_rquery.vtreat_pass_through <- function(tstep,
if(max(xcol)<=min(xcol)) {
return(c())
}
newVarName <- make.names(paste(origVarName,'clean',sep='_'))
newVarName <- vtreat_make_names(paste(origVarName,'clean',sep='_'))
treatment <- list(origvar=origVarName,
newvars=newVarName,
f=.passThrough,
Expand Down
4 changes: 2 additions & 2 deletions R/customCoder.R
Expand Up @@ -60,7 +60,7 @@ makeCustomCoder <- function(customCode, coder, codeSeq,
conditionalScore <- as.list(as.numeric(agg$pred))
names(conditionalScore) <- as.character(agg$x)
conditionalScore <- conditionalScore[names(conditionalScore)!='zap'] # don't let zap group code
newVarName <- make.names(paste(v, customCode, sep='_'))
newVarName <- vtreat_make_names(paste(v, customCode, sep='_'))
treatment <- list(origvar=v,
newvars=newVarName,
f=.customCode,
Expand Down Expand Up @@ -174,7 +174,7 @@ makeCustomCoderNum <- function(customCode, coder, codeSeq,
ord <- order(agg$x)
predXs <- predXs[ord]
predYs <- predYs[ord]
newVarName <- make.names(paste(v, customCode, sep='_'))
newVarName <- vtreat_make_names(paste(v, customCode, sep='_'))
treatment <- list(origvar=v,
newvars=newVarName,
f=.customCodeNum,
Expand Down
2 changes: 1 addition & 1 deletion R/deviationFact.R
Expand Up @@ -36,7 +36,7 @@
}
scores <- as.list(scores)
scores <- scores[names(scores)!='zap'] # don't let zap code
newVarName <- make.names(paste(origVarName,'catD',sep='_'))
newVarName <- vtreat_make_names(paste(origVarName,'catD',sep='_'))
treatment <- list(origvar=origVarName,
newvars=newVarName,
f=.catD,
Expand Down
2 changes: 1 addition & 1 deletion R/effectTreatmentC.R
Expand Up @@ -58,7 +58,7 @@ as_rquery.vtreat_cat_Bayes <- function(tstep,
conditionalScore <- as.list(conditionalScore)
conditionalScore <- conditionalScore[names(conditionalScore)!='zap'] # don't let zap group code
# fall back for novel levels, use zero impact
newVarName <- make.names(paste(origVarName,'catB',sep='_'))
newVarName <- vtreat_make_names(paste(origVarName,'catB',sep='_'))
treatment <- list(origvar=origVarName,
newvars=newVarName,
f=.catBayes,
Expand Down
2 changes: 1 addition & 1 deletion R/effectTreatmentN.R
Expand Up @@ -26,7 +26,7 @@
den <- tapply(weights,vcol,sum)
scores <- as.list((num+smFactor*baseMean)/(den+smFactor)-baseMean)
scores <- scores[names(scores)!='zap'] # don't let zap code
newVarName <- make.names(paste(origVarName,'catN',sep='_'))
newVarName <- vtreat_make_names(paste(origVarName,'catN',sep='_'))
treatment <- list(origvar=origVarName,
newvars=newVarName,
f=.catNum,
Expand Down
7 changes: 4 additions & 3 deletions R/indicatorTreatment.R
Expand Up @@ -23,11 +23,12 @@ as_rquery.vtreat_cat_ind <- function(tstep,
li <- tstep$arg$tracked[[i]]
vi <- tstep$newvars[[i]]
if(li == "NA") {
expri <- vi %:=% paste0("ifelse(is.na(", origvar, "), 1, 0)")
expri <- paste0("ifelse(is.na(", origvar, "), 1, 0)")
} else {
li <- gsub("^x ", "", li)
expri <- vi %:=% paste0("ifelse(is.na(", origvar, "), 0, ifelse(", origvar, " == \"", li, "\", 1, 0))")
expri <- paste0("ifelse(is.na(", origvar, "), 0, ifelse(", origvar, " == \"", li, "\", 1, 0))")
}
names(expri) <- vi
exprs <- c(exprs, expri)
}
list(
Expand All @@ -48,7 +49,7 @@ as_rquery.vtreat_cat_ind <- function(tstep,
if(length(tracked)<=0) {
return(c())
}
newVarNames <- make.names(paste(origVarName,'lev',tracked,sep="_"),unique=TRUE)
newVarNames <- vtreat_make_names(paste(origVarName,'lev',tracked,sep="_"))
treatment <- list(origvar=origVarName,
newvars=newVarNames,
f=.catInd,
Expand Down
2 changes: 1 addition & 1 deletion R/isBadTreatment.R
Expand Up @@ -25,7 +25,7 @@ as_rquery.vtreat_is_bad <- function(tstep,
if((nna<=0)||(nna>=length(xcol))) {
return(c())
}
newVarName <- make.names(paste(origVarName,'isBAD',sep='_'))
newVarName <- vtreat_make_names(paste(origVarName,'isBAD',sep='_'))
treatment <- list(origvar=origVarName,
newvars=newVarName,
f=.isBAD,
Expand Down
2 changes: 1 addition & 1 deletion R/prevalenceFact.R
Expand Up @@ -22,7 +22,7 @@
den <- sum(weights)
scores <- num/den
scores <- as.list(scores)
newVarName <- make.names(paste(origVarName,'catP',sep='_'))
newVarName <- vtreat_make_names(paste(origVarName,'catP',sep='_'))
treatment <- list(origvar=origVarName,
newvars=newVarName,
f=.catP,
Expand Down
18 changes: 11 additions & 7 deletions R/rquery_treatment.R
Expand Up @@ -190,6 +190,7 @@ rquery_code_categorical <- function(colname, resname,
if(!requireNamespace("rquery", quietly = TRUE)) {
stop("vtreat::rquery_code_categorical requires the rquery package")
}
effect_values <- unlist(effect_values)
wrapr::stop_if_dot_args(substitute(list(...)),
"vtreat:::rquery_code_categorical")
# work out coding table
Expand All @@ -203,21 +204,24 @@ rquery_code_categorical <- function(colname, resname,
}
tnum <- tnum + 1
}
coding_levels <- c(coding_levels, new_novel_level, NA)
new_novel_value <- as.numeric(effect_values[.preProcCat(new_novel_level, levRestriction)])
if(is.na(new_novel_value)) {
new_novel_value <- default_value
}
na_value <- as.numeric(effect_values[.preProcCat(NA_character_, levRestriction)])
if(is.na(na_value)) {
na_value <- default_value
}
ctab <- data.frame(levels = coding_levels,
stringsAsFactors = FALSE)
codes <- .preProcCat(ctab$levels, levRestriction)
ctab$levels[[nrow(ctab)]] <- "NA"
ctab$effect <- as.numeric(unlist(effect_values)[codes])
ctab$effect <- as.numeric(effect_values[codes])
ctab$effect[is.na(ctab$effect)] <- default_value
new_novel_value <- ctab$effect[ctab$levels == new_novel_level]
na_value <- ctab$effect[ctab$levels == "NA"]
ctab <- ctab[seq_len(nrow(ctab)-2), , drop = FALSE]
names(ctab) <- c(colname, resname)
code_tab <- name_source()
ctabd <- rquery::table_source(code_tab, c(colname, resname))
expr <- resname %:=% paste0("ifelse(is.na(", colname, "), ", na_value,
", ifelse(is.na(", resname, "), ", default_value, ", ", resname, "))")
", ifelse(is.na(", resname, "), ", new_novel_value, ", ", resname, "))")
f <- function(d) {
rquery::natural_join(d, ctabd, jointype = "LEFT", by = colname) %.>%
rquery::extend_se(., expr)
Expand Down
7 changes: 6 additions & 1 deletion R/utils.R
Expand Up @@ -277,5 +277,10 @@ catScore <- function(varName,x,yC,yTarget,weights,numberOfHiddenDegrees=0) {
stringsAsFactors=FALSE)
}


vtreat_make_names <- function(nms_in) {
nms <- gsub("[^A-Za-z0-9]+", "_", nms_in)
nms <- make.names(nms, unique = TRUE, allow_ = TRUE)
nms <- gsub("[^A-Za-z0-9]+", "_", nms)
nms
}

52 changes: 26 additions & 26 deletions README.md
Expand Up @@ -149,14 +149,14 @@ dTestC <- data.frame(x=c('a','b','c',NA),z=c(10,20,30,NA))
treatmentsC <- designTreatmentsC(dTrainC,colnames(dTrainC),'y',TRUE,
verbose=FALSE)
print(treatmentsC$scoreFrame[,c('origName', 'varName', 'code', 'rsq', 'sig', 'extraModelDegrees')])
# origName varName code rsq sig extraModelDegrees
# 1 x x_catP catP 0.166956795 0.20643885 2
# 2 x x_catB catB 0.254788311 0.11858143 2
# 3 z z_clean clean 0.237601767 0.13176020 0
# 4 z z_isBAD isBAD 0.296065432 0.09248399 0
# 5 x x_lev_NA lev 0.296065432 0.09248399 0
# 6 x x_lev_x.a lev 0.130005705 0.26490379 0
# 7 x x_lev_x.b lev 0.006067337 0.80967242 0
# origName varName code rsq sig extraModelDegrees
# 1 x x_catP catP 1.559780e-01 0.22202097 2
# 2 x x_catB catB 1.142159e-05 0.99166241 2
# 3 z z_clean clean 2.376018e-01 0.13176020 0
# 4 z z_isBAD isBAD 2.960654e-01 0.09248399 0
# 5 x x_lev_NA lev 2.960654e-01 0.09248399 0
# 6 x x_lev_x_a lev 1.300057e-01 0.26490379 0
# 7 x x_lev_x_b lev 6.067337e-03 0.80967242 0

# help("prepare")

Expand All @@ -166,21 +166,21 @@ varsC <- setdiff(colnames(dTrainCTreated),'y')
sapply(dTrainCTreated[,varsC,drop=FALSE],mean)
# x_catP x_catB z_clean z_isBAD x_lev_NA
# 1.585994e-16 0.000000e+00 7.927952e-18 -7.926292e-18 3.965082e-18
# x_lev_x.a x_lev_x.b
# x_lev_x_a x_lev_x_b
# -1.982154e-17 9.917546e-19
# all non NA slopes should be 1
sapply(varsC,function(c) { lm(paste('y',c,sep='~'),
data=dTrainCTreated)$coefficients[[2]]})
# x_catP x_catB z_clean z_isBAD x_lev_NA x_lev_x.a x_lev_x.b
# x_catP x_catB z_clean z_isBAD x_lev_NA x_lev_x_a x_lev_x_b
# 1 1 1 1 1 1 1
dTestCTreated <- prepare(treatmentsC,dTestC,pruneSig=c(),scale=TRUE)
print(dTestCTreated)
# x_catP x_catB z_clean z_isBAD x_lev_NA x_lev_x.a
# x_catP x_catB z_clean z_isBAD x_lev_NA x_lev_x_a
# 1 -0.2380952 -0.1897682 1.194595 -0.1714286 -0.1714286 -0.2380952
# 2 0.1785714 -0.1489924 2.951351 -0.1714286 -0.1714286 0.1785714
# 3 1.0119048 -0.1320682 4.708108 -0.1714286 -0.1714286 0.1785714
# 4 0.1785714 0.4336447 0.000000 0.4285714 0.4285714 0.1785714
# x_lev_x.b
# x_lev_x_b
# 1 0.02857143
# 2 -0.07142857
# 3 0.02857143
Expand All @@ -196,38 +196,38 @@ dTestN <- data.frame(x=c('a','b','c',NA),z=c(10,20,30,NA))
treatmentsN = designTreatmentsN(dTrainN,colnames(dTrainN),'y',
verbose=FALSE)
print(treatmentsN$scoreFrame[,c('origName', 'varName', 'code', 'rsq', 'sig', 'extraModelDegrees')])
# origName varName code rsq sig extraModelDegrees
# 1 x x_catP catP 4.047085e-01 0.08994062 2
# 2 x x_catN catN 2.822908e-01 0.17539581 2
# 3 x x_catD catD 2.096931e-02 0.73225708 2
# 4 z z_clean clean 2.880952e-01 0.17018920 0
# 5 z z_isBAD isBAD 3.333333e-01 0.13397460 0
# 6 x x_lev_NA lev 3.333333e-01 0.13397460 0
# 7 x x_lev_x.a lev 2.500000e-01 0.20703125 0
# 8 x x_lev_x.b lev 1.110223e-16 0.99999998 0
# origName varName code rsq sig extraModelDegrees
# 1 x x_catP catP 2.941176e-01 0.1649303 2
# 2 x x_catN catN 6.583561e-02 0.5396025 2
# 3 x x_catD catD 9.777348e-03 0.8158041 2
# 4 z z_clean clean 2.880952e-01 0.1701892 0
# 5 z z_isBAD isBAD 3.333333e-01 0.1339746 0
# 6 x x_lev_NA lev 3.333333e-01 0.1339746 0
# 7 x x_lev_x_a lev 2.500000e-01 0.2070312 0
# 8 x x_lev_x_b lev 1.110223e-16 1.0000000 0
dTrainNTreated <- prepare(treatmentsN,dTrainN,pruneSig=1.0,scale=TRUE)
varsN <- setdiff(colnames(dTrainNTreated),'y')
# all input variables should be mean 0
sapply(dTrainNTreated[,varsN,drop=FALSE],mean)
# x_catP x_catN x_catD z_clean z_isBAD
# 2.775558e-17 0.000000e+00 -2.775558e-17 4.857226e-17 6.938894e-18
# x_lev_NA x_lev_x.a x_lev_x.b
# x_lev_NA x_lev_x_a x_lev_x_b
# 6.938894e-18 0.000000e+00 7.703720e-34
# all non NA slopes should be 1
sapply(varsN,function(c) { lm(paste('y',c,sep='~'),
data=dTrainNTreated)$coefficients[[2]]})
# x_catP x_catN x_catD z_clean z_isBAD x_lev_NA x_lev_x.a
# x_catP x_catN x_catD z_clean z_isBAD x_lev_NA x_lev_x_a
# 1 1 1 1 1 1 1
# x_lev_x.b
# x_lev_x_b
# 1
dTestNTreated <- prepare(treatmentsN,dTestN,pruneSig=c(),scale=TRUE)
print(dTestNTreated)
# x_catP x_catN x_catD z_clean z_isBAD x_lev_NA x_lev_x.a
# x_catP x_catN x_catD z_clean z_isBAD x_lev_NA x_lev_x_a
# 1 -0.25 -0.25 -0.06743804 0.9952381 -0.1666667 -0.1666667 -0.25
# 2 0.25 0.00 -0.25818161 2.5666667 -0.1666667 -0.1666667 0.25
# 3 0.75 0.00 -0.25818161 4.1380952 -0.1666667 -0.1666667 0.25
# 4 0.25 0.50 0.39305768 0.0000000 0.5000000 0.5000000 0.25
# x_lev_x.b
# x_lev_x_b
# 1 -2.266233e-17
# 2 6.798700e-17
# 3 -2.266233e-17
Expand Down
2 changes: 1 addition & 1 deletion docs/articles/SavingTreamentPlans.html

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit 39d0ce9

Please sign in to comment.