Skip to content

Commit

Permalink
Created unit test for rxLinPredError, altered rxSample2Df so that it …
Browse files Browse the repository at this point in the history
…now accepts maxRowsByCols argument

Created unit test for rxLinPredError, altered rxSample2Df so that it now
accepts maxRowsByCols argument
  • Loading branch information
ChibisiAtRevolution committed Aug 30, 2013
1 parent baf5991 commit 9cc4535
Show file tree
Hide file tree
Showing 27 changed files with 405 additions and 7 deletions.
2 changes: 2 additions & 0 deletions .Rbuildignore
@@ -0,0 +1,2 @@
^.*\.Rproj$
^\.Rproj\.user$
227 changes: 227 additions & 0 deletions .Rhistory
@@ -0,0 +1,227 @@
devtools::load_all(".")
install.packages("hexbin")
install.packages("hexbin")
devtools::load_all(".")
install.packages("devtools")
install.packages("devtools")
search()
devtools::load_all(".")
install.packages("mlbench")
library(RevoEnhancements)
sampleRXD <- function(iNRows = 1E4,
rxData = "work\split.TrainingValidation.T.xdf",
outFile = NULL){
iTotalRows <- rxGetInfo(rxData)$numRows
dPropData <- iNRows/iTotalRows
.e <- new.env(parent = baseenv())
assign("dPropData", dPropData, envir = .e)
sampleFunction <- function(datalist){
n <- length(datalist[[1]])
t <- round(n * dPropData)
tv <- rep(c(TRUE, FALSE), c(t, n-t))
datalist <- lapply(datalist, function(x){
x[tv]
})
datalist
}
rxDataStep(inData = rxData, outFile = outFile, transformFunc = sampleFunction)
}
sampleRXD <- function(iNRows = 1E4,
rxData = "work\split.TrainingValidation.T.xdf",
outFile = NULL){
iTotalRows <- rxGetInfo(rxData)$numRows
dPropData <- iNRows/iTotalRows
.e <- new.env(parent = baseenv())
assign("dPropData", dPropData, envir = .e)
sampleFunction <- function(datalist){
n <- length(datalist[[1]])
t <- round(n * dPropData)
tv <- rep(c(TRUE, FALSE), c(t, n-t))
datalist <- lapply(datalist, function(x){
x[tv]
})
datalist
}
rxDataStep(inData = rxData, outFile = outFile, transformFunc = sampleFunction)
}
sampleRXD <- function(iNRows = 1E4,
rxData = "work/split.TrainingValidation.T.xdf",
outFile = NULL){
iTotalRows <- rxGetInfo(rxData)$numRows
dPropData <- iNRows/iTotalRows
.e <- new.env(parent = baseenv())
assign("dPropData", dPropData, envir = .e)
sampleFunction <- function(datalist){
n <- length(datalist[[1]])
t <- round(n * dPropData)
tv <- rep(c(TRUE, FALSE), c(t, n-t))
datalist <- lapply(datalist, function(x){
x[tv]
})
datalist
}
rxDataStep(inData = rxData, outFile = outFile, transformFunc = sampleFunction)
}
?rxGlm
?rxGetOption
require(devtools)
rxLinPredError <- function (actualVarName, predVarName, data, sWeights = NULL, blocksPerRead = 1,
reportProgress = rxGetOption("reportProgress")) {
if(exists("data", mode = "list")){
numRow = nrow(data)
}else{
datInfo <- rxGetInfo(data)
numRow <- datInfo$numRows
}
.rxGet <- function() {}
.rxSet <- function() {}
rm(.rxGet, .rxSet)
BlockCompute <- function(datalist){
# Getting the data
dActualY <- datalist[[actualVarName]]
dPredY <- datalist[[predVarName]]
# Error
dError <- (dPredY - dActualY)
# Missing boolean
bMissing <- is.na(dError)
# Keeping only non-missing data
dError <- dError[!bMissing]
# Weights
if(is.null(sWeights)){
dWeights <- rep(1, length(dError))/numRow
}else{
dWeights <- datalist[[sWeights]]
}
dWeights <- dWeights[!bMissing]
dActualY <- dActualY[!bMissing]
# For MAPE
dSumABSPropError <- sum(abs(dError/dActualY))
# For MPE
dSumPropError <- sum(dError/dActualY)
# For RSS
RSS <- sum(dError^2)
# Weighted errors
dSumSQWeightedErrors <- sum(dWeights*(dError^2))
.rxSet("dSumABSPropError", .rxGet("dSumABSPropError") + dSumABSPropError)
.rxSet("dSumPropError", .rxGet("dSumPropError") + dSumPropError)
.rxSet("dSumWeights", .rxGet("dSumWeights") + sum(dWeights))
.rxSet("dSumSQWeightedErrors", .rxGet("dSumSQWeightedErrors") + dSumSQWeightedErrors)
.rxSet("RSS", .rxGet("RSS") + RSS)
.rxSet("N", .rxGet("N") + length(dError))
return(NULL)
}
ret <- rxDataStep(
inData = data,
varsToKeep = c(actualVarName, predVarName, sWeights),
blocksPerRead = blocksPerRead,
reportProgress = reportProgress,
returnTransformObjects = TRUE,
transformFunc = BlockCompute,
transformObjects = list(dSumABSPropError = 0, dSumPropError = 0, dSumWeights = 0,
dSumSQWeightedErrors = 0, RSS = 0, N = 0)
)
# Retreiving the values
dSumABSPropError <- ret[["dSumABSPropError"]]
dSumPropError <- ret[["dSumPropError"]]
dSumWeights <- ret[["dSumWeights"]]
dSumSQWeightedErrors <- ret[["dSumSQWeightedErrors"]]
RSS <- ret[["RSS"]]
N <- ret[["N"]]
MAPE <- dSumABSPropError/N
MPE <- dSumPropError/N
MSE <- RSS/N
MSWD <- (dSumSQWeightedErrors)*((N-1)*dSumWeights/N)
list(MAPE = MAPE, MPE = MPE, MSE = MSE, MSWD = MSWD)
}
fit <- rxLinMod(Sepal.Length ~ Petal.Length + Petal.Width, data = iris)
prd <- rxPredict(fit, iris)$Sepal.Length_Pred
dat <- data.frame(Sepal.Length=iris$Sepal.Length, Sepal.Length_Pred=prd, Weights = rep(1, nrow(iris))/nrow(iris))
rxLinPredError("Sepal.Length", "Sepal.Length_Pred", data=dat, sWeights="Weights")
rxLinPredError("Sepal.Length", "Sepal.Length_Pred", data=dat)
library(RevoEnhancements)
library(RevoEnhancements)
rxGetOption("sampleDataDir")
?rxDataStep
library(RevoEnhancements)
require(roxygenize)
require(roxygen)
require(roxygen2)
?roxygenize
getwd()
setwd("../")
getwd()
roxygenize("RevoEnhancements")
roxygenize("RevoEnhancements")
data = "work/split.TrainingValidation.T.xdf"
size = 1E6; replace = FALSE; maxRowsByCols = 220E6
extraRows <- 100
dataInfo <- rxGetInfo(data, getVarInfo = TRUE)
oneVar <- names(dataInfo$varInfo[1])
dataSize <- dataInfo$numRows
if (size > 1) {
p <- size / dataSize
} else if (size > 0) {
size <- ceiling(size * dataSize)
p <- size / dataSize
} else {
stop("'size' must be greater than zero")
}
if (is.data.frame(data)) {
mySamp <- sample.int(n = dataSize, size = size,
replace = ifelse(size > dataSize, TRUE, replace))
return(data[mySamp,])
}
createRandomSample <- function(dataList) {
# Trick to pass R CMD check: create and remove variables without binding
.rxStartRow <- .rxChunkNum <- function(){}
rm(.rxStartRow, .rxChunkNum)
zP <- character()
rm(zP)
numRows <- length(dataList[[1]])
dataList$.rxRowSelection <- as.logical(rbinom(numRows,1, zP))
return(dataList)
}
createRandomSampleReplace <- function(dataList) {
# Trick to pass R CMD check: create and remove variables without binding
.rxGet <- .rxSet <- .rxStartRow <- .rxChunkNum <- function() {}
rm(.rxGet, .rxSet, .rxStartRow, .rxChunkNum)
numRows <- length(dataList[[1]])
rowNum <- seq_len(numRows) + .rxStartRow - 1
rows <- sample[sample %in% rowNum]
tmpDf <- as.data.frame(dataList)[rows - .rxStartRow + 1,]
row.names(tmpDf) <- as.numeric(row.names(tmpDf)) + .rxStartRow - 1
ret[[.rxChunkNum]] <- tmpDf
.rxSet("ret", ret)
return(NULL)
}
replace
newP <- p + extraRows / dataSize
ret <- head(rxDataStep(data,
transformFunc = createRandomSample,
transformVars = oneVar,
transformObjects = list(zP = newP),
maxRowsByCols = maxRowsByCols),
n = size)
}
newP <- p + extraRows / dataSize
ret <- head(rxDataStep(data,
transformFunc = createRandomSample,
transformVars = oneVar,
transformObjects = list(zP = newP),
maxRowsByCols = maxRowsByCols),
n = size)
library(RevoEnhancements)
setwd("..\")
setwd("../")
require(roxygen)
require(roxygen2)
roxygenize("RevoEnhancements")
library(RevoEnhancements)
rxSample2Df
?rxDataStep
require(roxygen2)
setwd("..\")
setwd("../")
roxygenize("RevoScaleR")
roxygenize("RevoEnhancements")
args(rxFormula)
9 changes: 9 additions & 0 deletions .Rproj.user/1C05EE18/pcs/files-pane.pper
@@ -0,0 +1,9 @@
{
"path" : "~/GitHub/RevoEnhancements",
"sortOrder" : [
{
"ascending" : true,
"columnIndex" : 2
}
]
}
3 changes: 3 additions & 0 deletions .Rproj.user/1C05EE18/pcs/source-pane.pper
@@ -0,0 +1,3 @@
{
"activeTab" : 2
}
14 changes: 14 additions & 0 deletions .Rproj.user/1C05EE18/pcs/windowlayoutstate.pper
@@ -0,0 +1,14 @@
{
"left" : {
"panelheight" : 794,
"splitterpos" : 261,
"topwindowstate" : "NORMAL",
"windowheight" : 831
},
"right" : {
"panelheight" : 794,
"splitterpos" : 273,
"topwindowstate" : "NORMAL",
"windowheight" : 831
}
}
4 changes: 4 additions & 0 deletions .Rproj.user/1C05EE18/pcs/workbench-pane.pper
@@ -0,0 +1,4 @@
{
"TabSet1" : 2,
"TabSet2" : 3
}
9 changes: 9 additions & 0 deletions .Rproj.user/1C05EE18/persistent-state
@@ -0,0 +1,9 @@
build-last-errors="[]"
build-last-errors-base-dir="~/GitHub/RevoEnhancements/"
build-last-outputs="[{\"output\":\"==> Rcmd.exe INSTALL --preclean --no-multiarch RevoEnhancements\\n\\n\",\"type\":0},{\"output\":\"* installing to library 'C:/Users/chib/Documents/R/win-library/2.15'\\r\\n\",\"type\":1},{\"output\":\"\",\"type\":1},{\"output\":\"* installing *source* package 'RevoEnhancements' ...\\r\\n\",\"type\":1},{\"output\":\"\",\"type\":1},{\"output\":\"** R\\r\\n\",\"type\":1},{\"output\":\"** inst\\r\\n\",\"type\":1},{\"output\":\"** preparing package for lazy loading\\r\\n\",\"type\":1},{\"output\":\"\",\"type\":1},{\"output\":\"** help\\r\\n\",\"type\":1},{\"output\":\"\",\"type\":1},{\"output\":\"*** installing help indices\\r\\n\",\"type\":1},{\"output\":\"\",\"type\":1},{\"output\":\"** building package indices\\r\\n\",\"type\":1},{\"output\":\"\",\"type\":1},{\"output\":\"** testing if installed package can be loaded\\r\\n\",\"type\":1},{\"output\":\"\",\"type\":1},{\"output\":\"\\r\\n\",\"type\":1},{\"output\":\"* DONE (RevoEnhancements)\\r\\n\",\"type\":1},{\"output\":\"\",\"type\":1}]"
compile_pdf_state="{\"errors\":[],\"output\":\"\",\"running\":false,\"tab_visible\":false,\"target_file\":\"\"}"
console_procs="[]"
files.monitored-path=""
find-in-files-state="{\"handle\":\"\",\"input\":\"\",\"path\":\"\",\"regex\":true,\"results\":{\"file\":[],\"line\":[],\"lineValue\":[],\"matchOff\":[],\"matchOn\":[]},\"running\":false}"
imageDirtyState="0"
saveActionState="0"
2 changes: 2 additions & 0 deletions .Rproj.user/1C05EE18/sdb/prop/1CA8DED9
@@ -0,0 +1,2 @@
{
}
2 changes: 2 additions & 0 deletions .Rproj.user/1C05EE18/sdb/prop/2EF25910
@@ -0,0 +1,2 @@
{
}
2 changes: 2 additions & 0 deletions .Rproj.user/1C05EE18/sdb/prop/4A27412C
@@ -0,0 +1,2 @@
{
}
2 changes: 2 additions & 0 deletions .Rproj.user/1C05EE18/sdb/prop/DD7343FF
@@ -0,0 +1,2 @@
{
}
2 changes: 2 additions & 0 deletions .Rproj.user/1C05EE18/sdb/prop/DE4AB10B
@@ -0,0 +1,2 @@
{
}
2 changes: 2 additions & 0 deletions .Rproj.user/1C05EE18/sdb/prop/ECBBE075
@@ -0,0 +1,2 @@
{
}
2 changes: 2 additions & 0 deletions .Rproj.user/1C05EE18/sdb/prop/ECBF0051
@@ -0,0 +1,2 @@
{
}
2 changes: 2 additions & 0 deletions .Rproj.user/1C05EE18/sdb/prop/ED4EAC23
@@ -0,0 +1,2 @@
{
}
8 changes: 8 additions & 0 deletions .Rproj.user/1C05EE18/sdb/prop/INDEX
@@ -0,0 +1,8 @@
C%3A%2FUsers%2Fchib%2FGoogle%20Drive%2FOgilvy%2FCode%2Fattribution-6-logit-sample-data.R="DE4AB10B"
~%2FGitHub%2FRevoEnhancements%2FDESCRIPTION="4A27412C"
~%2FGitHub%2FRevoEnhancements%2FNEWS="ED4EAC23"
~%2FGitHub%2FRevoEnhancements%2FR%2FrxAIC.R="ECBF0051"
~%2FGitHub%2FRevoEnhancements%2FR%2FrxLinPredError.R="2EF25910"
~%2FGitHub%2FRevoEnhancements%2FR%2FrxSample.R="1CA8DED9"
~%2FGitHub%2FRevoEnhancements%2Finst%2Ftests%2Ftest-rxLinPredError.R="DD7343FF"
~%2FRevoConsulting%2FRevoWork%2FRevoScaleR%2FDevelopment%2FRevoHelper%2FsampleRXD.R="ECBBE075"
15 changes: 15 additions & 0 deletions .Rproj.user/1C05EE18/sdb/s-F96D22C3/6A924DA5
@@ -0,0 +1,15 @@
{
"contents" : "#\n# RevoEnhancements/R/rxSample.R by Derek Norton and Andrie de Vries\n#\n# Copyright 2013 Revolution Analytics\n# \n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n# \n# http://www.apache.org/licenses/LICENSE-2.0\n# \n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS, \n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\n\n#' Function to sample, with or without replacement, from an XDF to a data frame.\n#' \n#' @param data xdf File\n#' @param size Number of rows to sample\n#' @param replace If TRUE, samples with replacement. Passed to \\code{\\link{sample.int}}\n#' @param maxRowsByCols the maximum number of rows x columns as specified by rxDataStep\n#' @export\n#' @return A data frame\n#' @family Data mining functions\n#' @examples\n#' library(RevoScaleR)\n#' xdfFile <- file.path(rxGetOption(\"sampleDataDir\"), \"CensusWorkers.xdf\")\n#' df <- rxSample2Df(xdfFile, size = 10000, replace = FALSE)\n#' df <- rxSample2Df(xdfFile, size = 10000, replace = TRUE)\nrxSample2Df <- function(data, size, replace = FALSE, maxRowsByCols = 3E6) {\n extraRows <- 100\n dataInfo <- rxGetInfo(data, getVarInfo = TRUE)\n oneVar <- names(dataInfo$varInfo[1])\n dataSize <- dataInfo$numRows\n if (size > 1) {\n p <- size / dataSize \n } else if (size > 0) {\n size <- ceiling(size * dataSize)\n p <- size / dataSize\n } else {\n stop(\"'size' must be greater than zero\")\n }\n if (is.data.frame(data)) {\n mySamp <- sample.int(n = dataSize, size = size, \n replace = ifelse(size > dataSize, TRUE, replace))\n return(data[mySamp,])\n }\n createRandomSample <- function(dataList) {\n # Trick to pass R CMD check: create and remove variables without binding\n .rxStartRow <- .rxChunkNum <- function(){}\n rm(.rxStartRow, .rxChunkNum)\n zP <- character()\n rm(zP)\n \n numRows <- length(dataList[[1]])\n dataList$.rxRowSelection <- as.logical(rbinom(numRows,1, zP))\n return(dataList)\n }\n createRandomSampleReplace <- function(dataList) {\n # Trick to pass R CMD check: create and remove variables without binding\n .rxGet <- .rxSet <- .rxStartRow <- .rxChunkNum <- function() {}\n rm(.rxGet, .rxSet, .rxStartRow, .rxChunkNum)\n \n numRows <- length(dataList[[1]])\n rowNum <- seq_len(numRows) + .rxStartRow - 1 \n rows <- sample[sample %in% rowNum]\n tmpDf <- as.data.frame(dataList)[rows - .rxStartRow + 1,]\n row.names(tmpDf) <- as.numeric(row.names(tmpDf)) + .rxStartRow - 1 \n ret[[.rxChunkNum]] <- tmpDf\n .rxSet(\"ret\", ret)\n return(NULL)\n }\n if (replace) {\n mySamp <- sample.int(dataSize, size = size, replace = replace)\n ret <- rxDataStep(data, \n transformFunc = createRandomSampleReplace,\n transformObjects = list(sample = mySamp, ret = list()),\n returnTransformObjects = TRUE, maxRowsByCols = maxRowsByCols)\n ret <- do.call(rbind, ret$ret)\n } else {\n newP <- p + extraRows / dataSize\n ret <- head(rxDataStep(data, \n transformFunc = createRandomSample,\n transformVars = oneVar, \n transformObjects = list(zP = newP),\n maxRowsByCols = maxRowsByCols), \n n = size)\n }\n return(ret)\n}\n\n",
"created" : 1377848616134.000,
"dirty" : false,
"encoding" : "UTF-8",
"folds" : "",
"hash" : "88026942",
"id" : "6A924DA5",
"lastKnownWriteTime" : 1377850999,
"path" : "~/GitHub/RevoEnhancements/R/rxSample.R",
"properties" : {
},
"source_on_save" : false,
"type" : "r_source"
}
15 changes: 15 additions & 0 deletions .Rproj.user/1C05EE18/sdb/s-F96D22C3/D042D746
@@ -0,0 +1,15 @@
{
"contents" : "#\n# RevoEnhancements/R/rXAIC by Derek Norton\n#\n# Copyright 2013 Revolution Analytics\n# \n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n# \n# http://www.apache.org/licenses/LICENSE-2.0\n# \n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS, \n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\n\n#' Calculates Akaikes Information Criterium (AIC) from a model object.\n#' \n#' @param object Model object, the result of \\code{\\link[RevoScaleR]{rxLogit}}, \\code{\\link[RevoScaleR]{rxLinMod}} or \\code{\\link[RevoScaleR]{rxGlm}}\n#' @param k Multiplier. Defaults to 2, the value for AIC\n#' @return Numeric\n#' @export\n#' @family Model summary statistics\n#' @examples\n#' library(RevoScaleR)\n#' sampleDataDir <- rxGetOption(\"sampleDataDir\")\n#' working.file <- file.path(sampleDataDir, \"AirlineDemoSmall.xdf\")\nrxAIC <- function(object, k = 2) {\n deviance(object) + k * object$df[1]\n}\n",
"created" : 1377851540932.000,
"dirty" : false,
"encoding" : "UTF-8",
"folds" : "",
"hash" : "2133822883",
"id" : "D042D746",
"lastKnownWriteTime" : 1377851546,
"path" : "~/GitHub/RevoEnhancements/R/rxAIC.R",
"properties" : {
},
"source_on_save" : false,
"type" : "r_source"
}

0 comments on commit 9cc4535

Please sign in to comment.