-
-
Notifications
You must be signed in to change notification settings - Fork 0
/
create_folds.R
119 lines (106 loc) · 3.36 KB
/
create_folds.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
# inspired from caret::createFolds
create_folds <- function(y, k = 10)
{
if (is.numeric(y)) {
cuts <- floor(length(y)/k)
if (cuts < 2)
cuts <- 2
if (cuts > 5)
cuts <- 5
breaks <- unique(quantile(y, probs = seq(0, 1, length = cuts)))
y <- cut(y, breaks, include.lowest = TRUE)
}
if (k < length(y)) {
y <- factor(as.character(y))
numInClass <- table(y)
foldVector <- vector(mode = "integer", length(y))
for (i in 1:length(numInClass)) {
min_reps <- numInClass[i]%/%k
if (min_reps > 0) {
spares <- numInClass[i]%%k
seqVector <- rep(1:k, min_reps)
if (spares > 0)
seqVector <- c(seqVector, sample(1:k, spares))
foldVector[which(y == names(numInClass)[i])] <- sample(seqVector)
}
else {
foldVector[which(y == names(numInClass)[i])] <- sample(1:k,
size = numInClass[i])
}
}
}
else foldVector <- seq(along = y)
out <- split(seq(along = y), foldVector)
names(out) <- paste("Fold", gsub(" ", "0", format(seq(along = out))),
sep = "")
return(out)
}
create_folds <- compiler::cmpfun(create_folds)
# borrowed from caret::createTimeSlices
create_time_slices <- function(y, initial_window, horizon = 1,
fixed_window = TRUE, skip = 0)
{
if(!is.null(ncol(y)))
{
n_y <- dim(y)[1]
} else {
n_y <- length(y)
}
stops <- seq(initial_window, (n_y - horizon), by = skip + 1)
if (fixed_window) {
starts <- stops - initial_window + 1
}
else {
starts <- rep(1, length(stops))
}
train <- mapply(seq, starts, stops, SIMPLIFY = FALSE)
test <- mapply(seq, stops + 1, stops + horizon, SIMPLIFY = FALSE)
nums <- gsub(" ", "0", format(stops))
names(train) <- paste("training", nums, sep = "")
names(test) <- paste("testing", nums, sep = "")
out <- list(train = train, test = test)
out
}
create_time_slices <- compiler::cmpfun(create_time_slices)
#' Split a time series
#'
#' @param y univariate or multivariate time series
#' @param p proportion of data in training set
#' @param return_indices return indices instead of time series?
#'
#' @return
#' @export
#'
#' @examples
split_ts <- function(y, p = 0.8, return_indices = FALSE)
{
n_y <- base::ifelse(test = is.null(dim(y)),
yes = length(y),
no = dim(y)[1])
index_train <- 1:floor(p*n_y)
if (return_indices)
return(index_train)
start_y <- stats::start(y)
frequency_y <- stats::frequency(y)
if(is.null(ncol(y))) # univariate case
{
training <- ts(y[index_train],
start = start_y,
frequency = frequency_y)
start_testing <- tsp(training )[2] + 1 / frequency_y
return(list(training = training,
testing = ts(y[-index_train],
start = start_testing,
frequency = frequency_y)))
} else { # multivariate case
training <- ts(y[index_train, ],
start = start_y,
frequency = frequency_y)
start_testing <- tsp(training)[2] + 1 / frequency_y
return(list(training = training,
testing = ts(y[-index_train, ],
start = start_testing,
frequency = frequency_y)))
}
}
split_ts <- compiler::cmpfun(split_ts)