Join GitHub today
GitHub is home to over 50 million developers working together to host and review code, manage projects, and build software together.
Sign up| # low frequency model | |
| # encode x as lambdas | |
| # x_i = previous + lambda * next_x | |
| # copying forward the variable causes later models to pick up the previous as above. | |
| encode_x_as_lambdas <- function(x, minx, maxx, xs) { | |
| n <- length(x) | |
| k <- length(xs) | |
| x[is.na(x)] <- (minx+maxx)/2 | |
| x <- pmin(maxx, pmax(minx, x)) | |
| ff <- data.frame("intercept" = rep(1, n)) | |
| for(ki in seq_len(k)) { | |
| vname <- paste0("lambda_", sprintf("%05g", ki)) | |
| v <- numeric(n) | |
| left <- xs[[ki]] | |
| if(ki<k) { | |
| right <- xs[[ki+1]] | |
| } else { | |
| right <- maxx | |
| } | |
| v <- (x-left)/(right-left) | |
| v[x<left] <- 0 | |
| v[x>=right] <- 1 # copy to rest of the models | |
| ff[[vname]] <- v | |
| } | |
| ff | |
| } | |
| # Fit a piecewise linear function at cut-points | |
| fit_segments <- function(x, y, k, | |
| ..., | |
| w = NULL) { | |
| wrapr::stop_if_dot_args(substitute(list(...)), "vtreat::fit_segments") | |
| if(is.null(w)) { | |
| w = numeric(length(x)) + 1 | |
| } | |
| meany = mean(y) | |
| missing_pred = meany | |
| na_posns = is.na(x) | |
| if(sum(na_posns)>20) { | |
| missing_pred = mean(y[na_posns]) | |
| } | |
| x <- x[!na_posns] | |
| y <- y[!na_posns] | |
| w <- w[!na_posns] | |
| n <- length(x) | |
| minx <- min(x) | |
| maxx <- max(x) | |
| xs <- sort(x) | |
| idxs <- sort(unique(c(1, round(seq(1, n, length.out = k))))) | |
| idxs <- pmin(n, pmax(1, idxs)) | |
| idxs <- idxs[idxs<n] | |
| xs <- sort(unique(xs[idxs])) | |
| xs <- xs[xs<maxx] | |
| ff <- encode_x_as_lambdas(x, minx, maxx, xs) | |
| vars <- colnames(ff) | |
| ff$y <- y | |
| f <- paste("y", paste(c("0", vars), collapse = " + "), sep = " ~ ") | |
| f <- as.formula(f) | |
| model <- stats::lm(f, data = ff, weights = w) | |
| coef <- model$coefficients | |
| coef[is.na(coef)] <- 0 | |
| list(k =k, | |
| minx = minx, | |
| maxx = maxx, | |
| xs = xs, | |
| meany = meany, | |
| missing_pred = missing_pred, | |
| coef = coef) | |
| } | |
| pred_segs <- function(model, x) { | |
| ff <- encode_x_as_lambdas(x, model$min, model$maxx, model$xs) | |
| preds <- as.matrix(ff) %*% model$coef | |
| preds[is.na(x)] <- model$missing_pred | |
| preds | |
| } | |
| #' Solve as piecewise linear problem, numeric target. | |
| #' | |
| #' Return a vector of length y that is a piecewise function of x. | |
| #' This vector is picked as close to | |
| #' y (by square-distance) as possible for a set of x-only determined | |
| #' cut-points. Cross-validates for a good number of segments. | |
| #' | |
| #' @param varName character, name of variable | |
| #' @param x numeric input (not empty, no NAs). | |
| #' @param y numeric or castable to such (same length as x no NAs), output to match | |
| #' @param w numeric positive, same length as x (weights, can be NULL) | |
| #' @return segmented y prediction | |
| #' | |
| #' | |
| #' @export | |
| #' | |
| solve_piecewise <- function(varName, x, y, w = NULL) { | |
| tryCatch({ | |
| n <- length(x) | |
| if(n<=2) { | |
| return(NULL) | |
| } | |
| nunique <- length(unique(x)) | |
| if(nunique<=2) { | |
| return(NULL) | |
| } | |
| if(is.null(w)) { | |
| w <- numeric(n) + 1 | |
| } | |
| xorig <- x | |
| order <- order(x) | |
| x <- x[order] | |
| y <- y[order] | |
| w <- w[order] | |
| if(n<=20) { | |
| # too small, 1 or 2 segments | |
| k <- 2 | |
| } else { | |
| # cross-val for a good k | |
| ks <- c(1, 2, 4, 8, 16, 32, 64) | |
| ks <- ks[ks<=min(n/10, nunique)] | |
| if(length(ks)<1) { | |
| return(NULL) | |
| } | |
| is_test <- seq_len(n) %in% sample.int(n, n, replace = FALSE)[seq_len(floor(n/2))] | |
| xvals <- vapply( | |
| ks, | |
| function(k) { | |
| model <- fit_segments(x[!is_test], y[!is_test], k=k, w=w[!is_test]) | |
| preds <- pred_segs(model, x[is_test]) | |
| mean((y[is_test] - preds)^2) | |
| }, numeric(1)) | |
| idx <- which.min(xvals) | |
| k <- ks[[idx]] | |
| # names(xvals) <- as.character(ks) | |
| # print(xvals) | |
| # print(k) | |
| } | |
| model <- fit_segments(x, y, k=k, w=w) | |
| estimate <- pred_segs(model, xorig) | |
| approx_table <- data.frame(predXs = sort(unique(c(min(x), model$xs, max(x))))) | |
| approx_table$predYs <- pred_segs(model, approx_table$predXs) | |
| attr(estimate, "approx_table") <- approx_table | |
| attr(estimate, "method") <- "linear" | |
| return(estimate) | |
| }, | |
| error = function(e) { return(NULL) }) | |
| } | |
| #' Solve as piecewise logit problem, categorical target. | |
| #' | |
| #' Return a vector of length y that is a piecewise function of x. | |
| #' This vector is picked as close to | |
| #' y (by square-distance) as possible for a set of x-only determined | |
| #' cut-points. Cross-validates for a good number of segments. | |
| #' | |
| #' @param varName character, name of variable | |
| #' @param x numeric input (not empty, no NAs). | |
| #' @param y numeric or castable to such (same length as x no NAs), output to match | |
| #' @param w numeric positive, same length as x (weights, can be NULL) | |
| #' @return segmented y prediction | |
| #' | |
| #' | |
| #' @export | |
| #' | |
| solve_piecewisec <- function(varName, x, y, w = NULL) { | |
| v <- solve_piecewise(varName = varName, | |
| x = x, y = y , w = w) | |
| # don't copy over approx table as we are piecewise logit, not piecewise linear. | |
| .logit(v) - .logit(.wmean(y, w)) | |
| } |