/
LiftCurve.R
243 lines (228 loc) · 9.16 KB
/
LiftCurve.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
#' @importFrom wrapr :=
NULL
#' Plot the cumulative lift curve of a sort-order.
#'
#' Plot the cumulative lift curve of a sort-order.
#'
#' The use case for this visualization is to compare a predictive model
#' score to an actual outcome (either binary (0/1) or continuous). In this case the
#' lift curve plot measures how well the model score sorts the data compared
#' to the true outcome value.
#'
#' The x-axis represents the fraction of items seen when sorted by score, and the
#' y-axis represents the lift seen so far (cumulative value of model over cummulative value of random selection)..
#'
#' For comparison, \code{LiftCurvePlot} also plots the "wizard curve": the lift curve when the
#' data is sorted according to its true outcome.
#'
#' To improve presentation quality, the plot is limited to approximately \code{large_count} points (default: 1000).
#' For larger data sets, the data is appropriately randomly sampled down before plotting.
#'
#'
#' @param frame data frame to get values from
#' @param xvar name of the independent (input or model score) column in frame
#' @param truthVar name of the dependent (output or result to be modeled) column in frame
#' @param title title to place on plot
#' @param ... no unnamed argument, added to force named binding of later arguments.
#' @param large_count numeric, upper bound target for number of plotting points
#' @param include_wizard logical, if TRUE plot the ideal or wizard plot.
#' @param truth_target if not NULL compare to this scalar value.
#' @param model_color color for the model curve
#' @param wizard_color color for the "wizard" (best possible) curve
#' @examples
#'
#' set.seed(34903490)
#' y = abs(rnorm(20)) + 0.1
#' x = abs(y + 0.5*rnorm(20))
#' frm = data.frame(model=x, value=y)
#' WVPlots::LiftCurvePlot(frm, "model", "value",
#' title="Example Continuous Lift Curve")
#'
#' @export
LiftCurvePlot = function(frame, xvar, truthVar, title,
...,
large_count = 1000,
include_wizard = TRUE,
truth_target = NULL,
model_color='darkblue',
wizard_color='darkgreen') {
frame <- check_frame_args_list(...,
frame = frame,
name_var_list = list(xvar = xvar, truthVar = truthVar),
title = title,
funname = "WVPlots::LiftCurvePlot")
pct_outcome <- pctpop <- sort_criterion <- NULL # mark as not unbound variables
if(!is.null(truth_target)) {
truthcol <- as.numeric(frame[[truthVar]]==truth_target)
} else {
truthcol <- as.numeric(frame[[truthVar]])
}
predcol <- as.numeric(frame[[xvar]])
# data frame of pred and truth, sorted in order of the predictions
d = data.frame(predcol = predcol, truthcol = truthcol)
n <- nrow(d)
predord = order(d[['predcol']],
sample.int(n, n, replace = FALSE),
decreasing = TRUE) # reorder, with highest first
wizard = order(d[['truthcol']],
sample.int(n, n, replace = FALSE),
decreasing = TRUE)
npop = dim(d)[1]
# data frame the cumulative prediction/truth as a function
# of the fraction of the population we're considering, highest first
results = data.frame(
pctpop = (1:npop) / npop,
model = cumsum(d[predord, 'truthcol']) / sum(d[['truthcol']]),
wizard = cumsum(d[wizard, 'truthcol']) / sum(d[['truthcol']])
)
results$model_lift <- results$model/results$pctpop
results$wizard_lift <- results$wizard/results$pctpop
# transform the frame into the tall form, for plotting
r1 <- data.frame(pctpop = results$pctpop,
pct_outcome = results$model_lift,
sort_criterion = "model",
stringsAsFactors = FALSE)
r2 <- data.frame(pctpop = results$pctpop,
pct_outcome = results$wizard_lift,
sort_criterion = "wizard",
stringsAsFactors = FALSE)
results <- rbind(r1, r2, stringsAsFactors = FALSE)
# rename sort_criterion
msort_str <- paste('model: sort by', xvar)
sortKeyM <- c('model' = msort_str,
'wizard' = paste('wizard: sort by', truthVar))
results$sort_criterion <- sortKeyM[results$sort_criterion]
# rename levels of sort criterion
colorKey = as.character(sortKeyM) %:=% c(model_color, wizard_color)
names(colorKey) = c(paste('model: sort by', xvar),
paste('wizard: sort by', truthVar))
modelKey = names(colorKey)[[1]]
if(!include_wizard) {
results <- results[results$sort_criterion==msort_str, , drop=FALSE]
}
# cut down the number of points
results <- thin_frame_by_orders(results,
c("pctpop", "pct_outcome"),
"sort_criterion",
large_count)
# plot
gplot = ggplot2::ggplot(data = results) +
ggplot2::geom_point(
mapping = ggplot2::aes(
x = pctpop,
y = pct_outcome,
color = sort_criterion,
shape = sort_criterion
),
alpha = 0.5
) +
ggplot2::geom_line(
mapping = ggplot2::aes(
x = pctpop,
y = pct_outcome,
color = sort_criterion,
linetype = sort_criterion
)
) +
ggplot2::ggtitle(
title,
subtitle =
paste0(
truthVar,
'~',
xvar)) +
ggplot2::xlab("fraction items in sort order") +
ggplot2::ylab("lift") +
ggplot2::geom_hline(yintercept=1) +
ggplot2::scale_x_continuous(breaks = seq(0, 1, 0.1)) +
ggplot2::scale_color_manual(values = colorKey) +
ggplot2::theme(legend.position = "bottom")
gplot
}
#' Plot the cumulative lift curves of a sort-order.
#'
#' Plot the cumulative lift curves of a sort-order.
#'
#' The use case for this visualization is to compare a predictive model
#' score to an actual outcome (either binary (0/1) or continuous). In this case the
#' lift curve plot measures how well the model score sorts the data compared
#' to the true outcome value.
#'
#' The x-axis represents the fraction of items seen when sorted by score, and the
#' y-axis represents the lift seen so far (cumulative value of model over cummulative value of random selection)..
#'
#'
#'
#' @param frame data frame to get values from
#' @param xvars name of the independent (input or model score) columns in frame
#' @param truthVar name of the dependent (output or result to be modeled) column in frame
#' @param title title to place on plot
#' @param ... no unnamed argument, added to force named binding of later arguments.
#' @param truth_target if not NULL compare to this scalar value.
#' @param palette color palette for the model curves
#' @examples
#'
#' set.seed(34903490)
#' y = abs(rnorm(20)) + 0.1
#' x = abs(y + 0.5*rnorm(20))
#' frm = data.frame(model=x, value=y)
#' WVPlots::LiftCurvePlotList(frm, c("model", "value"), "value",
#' title="Example Continuous Lift Curves")
#'
#' @export
LiftCurvePlotList = function(frame, xvars, truthVar, title,
...,
truth_target = NULL,
palette = 'Dark2') {
frame <- check_frame_args_list(...,
frame = frame,
name_var_list = c(xvars = xvars, truthVar = truthVar),
title = title,
funname = "WVPlots::LiftCurvePlot")
curve <- lift <- percent_total <- NULL # mark as not unbound variables
pct_outcome <- pctpop <- sort_criterion <- NULL # mark as not unbound variables
if(!is.null(truth_target)) {
truthcol <- as.numeric(frame[[truthVar]]==truth_target)
} else {
truthcol <- as.numeric(frame[[truthVar]])
}
n <- nrow(frame)
# data frame the cumulative prediction/truth as a function
# of the fraction of the population we're considering, highest first
results <- data.frame(
pctpop = (1:n) / n
)
for(xvar in xvars) {
predcol <- as.numeric(frame[[xvar]])
# data frame of pred and truth, sorted in order of the predictions
d = data.frame(predcol = predcol, truthcol = truthcol)
predord <- order(d$predcol,
sample.int(n, n, replace = FALSE),
decreasing = TRUE) # reorder, with highest first
gain <- cumsum(d[predord, 'truthcol']) / sum(d[['truthcol']])
results[[xvar]] <- gain/results$pctpop
}
# transform the frame into the tall form, for plotting
results <- cdata::pivot_to_blocks(results,
nameForNewKeyColumn = 'curve',
nameForNewValueColumn = 'lift',
columnsToTakeFrom = setdiff(colnames(results), 'pctpop'))
# plot
gplot = ggplot2::ggplot(
data = results,
mapping = ggplot2::aes(
x = pctpop,
y = lift,
color = curve)) +
ggplot2::geom_point(alpha = 0.5) +
ggplot2::geom_line() +
ggplot2::scale_color_brewer(palette = palette) +
ggplot2::xlab("fraction items in sort order") +
ggplot2::ylab("lift") +
ggplot2::geom_hline(yintercept=1, alpha=0.5) +
ggplot2::theme(legend.position = "bottom")
gplot
}
#' @export
#' @rdname LiftCurvePlotList
LiftCurveListPlot <- LiftCurvePlotList