-
Notifications
You must be signed in to change notification settings - Fork 19
/
ht_single_mean_sim.R
133 lines (115 loc) 路 4.29 KB
/
ht_single_mean_sim.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
#' Hypothesis testing for one mean, using shifted bootstrapping
#'
#' Helper for the `inference()` function
#'
#' @param y Response variable, can be numerical or categorical
#' @param null null value for a hypothesis test
#' @param alternative direction of the alternative hypothesis; "less",
#' "greater", or "twosided"
#' @param y_name Name of response variable as a character string (passed
#' from inference function)
#' @param nsim number of simulations
#' @param seed seed to be set, default is NULL
#' @param show_var_types print variable types, set to verbose by default
#' @param show_summ_stats print summary stats, set to verbose by default
#' @param show_eda_plot print EDA plot, set to verbose by default
#' @param show_inf_plot print inference plot, set to verbose by default
#' @param show_res print results, set to verbose by default
ht_single_mean_sim <- function(y, null, alternative, y_name,
nsim, seed,
show_var_types, show_summ_stats, show_res,
show_eda_plot, show_inf_plot){
# set seed
if(!is.null(seed)){ set.seed(seed) }
# calculate sample size
n <- length(y)
# calculate y-bar
y_bar <- mean(y)
# create bootstrap distribution
sim_dist <- rep(NA, nsim)
for(i in 1:nsim){
boot_samp <- sample(y, size = n, replace = TRUE)
sim_dist[i] <- mean(boot_samp)
}
# center bootstrap distribution at null
sim_dist_temp <- sim_dist
sim_dist <- sim_dist_temp - (mean(sim_dist_temp) - null)
# shading cutoffs
if(alternative == "greater"){ x_min = y_bar; x_max = Inf }
if(alternative == "less"){ x_min = -Inf; x_max = y_bar }
if(alternative == "twosided"){
if(y_bar >= null){
x_min = c(null - (y_bar - null), y_bar)
x_max = c(-Inf, Inf)
}
if(y_bar <= null){
x_min = c(y_bar, null + (null - y_bar))
x_max = c(-Inf, Inf)
}
}
# calculate p-value
if(alternative == "greater"){ p_value <- sum(sim_dist >= y_bar) / nsim }
if(alternative == "less"){ p_value <- sum(sim_dist <= y_bar) / nsim }
if(alternative == "twosided"){
if(y_bar > null){
p_value <- min(2 * (sum(sim_dist >= y_bar) / nsim), 1)
}
if(y_bar < null){
p_value <- min(2 * (sum(sim_dist <= y_bar) / nsim), 1)
}
if(y_bar == null){ p_value <- 1 }
}
# print variable types
if(show_var_types == TRUE){
cat("Single numerical variable\n")
}
# print summary statistics
if(show_summ_stats == TRUE){
s <- sd(y)
cat(paste0("n = ", n, ", y-bar = ", round(y_bar, 4), ", s = ", round(s, 4), "\n"))
}
# print results
if(show_res == TRUE){
if(alternative == "greater"){
alt_sign <- ">"
} else if(alternative == "less"){
alt_sign <- "<"
} else {
alt_sign <- "!="
}
cat(paste0("H0: mu = ", null, "\n"))
cat(paste0("HA: mu ", alt_sign, " ", null, "\n"))
p_val_to_print <- ifelse(round(p_value, 4) == 0, "< 0.0001", round(p_value, 4))
cat(paste0("p_value = ", p_val_to_print))
}
# eda_plot
d_eda <- data.frame(y = y)
eda_plot <- ggplot2::ggplot(data = d_eda, ggplot2::aes(x = y), environment = environment()) +
ggplot2::geom_histogram(fill = "#8FDEE1", binwidth = diff(range(y)) / 20) +
ggplot2::xlab(y_name) +
ggplot2::ylab("") +
ggplot2::ggtitle("Sample Distribution") +
ggplot2::geom_vline(xintercept = y_bar, col = "#1FBEC3", lwd = 1.5)
# inf_plot
d_inf <- data.frame(sim_dist = sim_dist)
inf_plot <- ggplot2::ggplot(data = d_inf, ggplot2::aes(x = sim_dist), environment = environment()) +
ggplot2::geom_histogram(fill = "#CCCCCC", binwidth = diff(range(sim_dist)) / 20) +
ggplot2::annotate("rect", xmin = x_min, xmax = x_max, ymin = 0, ymax = Inf,
alpha = 0.3, fill = "#FABAB8") +
ggplot2::xlab("simulated means") +
ggplot2::ylab("") +
ggplot2::ggtitle("Null Distribution") +
ggplot2::geom_vline(xintercept = y_bar, color = "#F57670", lwd = 1.5)
# print plots
if(show_eda_plot & !show_inf_plot){
suppressWarnings(print(eda_plot))
}
if(!show_eda_plot & show_inf_plot){
suppressWarnings(print(inf_plot))
}
if(show_eda_plot & show_inf_plot){
suppressWarnings(gridExtra::grid.arrange(eda_plot, inf_plot, ncol = 2))
}
# return
return(list(sim_dist = sim_dist, p_value = p_value))
}