R/ht_many_mean_theo.R

#' Hypothesis testing for many means, using ANOVA
#' 
#' Helper for the `inference()` function
#' 
#' @param y Response variable, can be numerical or categorical
#' @param x Explanatory variable, categorical (optional)
#' @param null null value for a hypothesis test
#' @param alternative direction of the alternative hypothesis; "less","greater", or "twosided"
#' @param sig_level significance level, value between 0 and 1 
#' (used only for ANOVA to determine if posttests are necessary)
#' @param y_name Name of response variable as a character string (passed 
#' from inference function)
#' @param x_name Name of explanatory variable as a character string (passed 
#' from inference function)
#' @param show_var_types print variable types, set to verbose by default
#' @param show_summ_stats print summary stats, set to verbose by default
#' @param show_eda_plot print EDA plot, set to verbose by default
#' @param show_inf_plot print inference plot, set to verbose by default
#' @param show_res print results, set to verbose by default

ht_many_mean_theo <- function(y, x, null, alternative, sig_level,
                              y_name, x_name, 
                              show_var_types, show_summ_stats, show_res,
                              show_eda_plot, show_inf_plot){
  # summary stats
  ns <- by(y, x, length)
  y_bars <- by(y, x, mean)
  sds <- by(y, x, sd)
  
  # anova
  res <- anova(lm(y ~ x))
  
  # anova pieces
  terms <- c(x_name, "Residuals", "Total")
  deg_frs <- res$Df
  ss <- res$`Sum Sq`
  ms <- res$`Mean Sq`
  stat <- res$`F value`[1]
  p_value <- res$`Pr(>F)`[1]
  
  # calculate totals
  ss_tot <- sum(ss)
  ss <- c(ss, ss_tot)
  df_tot <- sum(deg_frs)
  deg_frs <- c(deg_frs, df_tot)
  
  # ss format
  ss_format <- as.character(round(ss, 4))
  
  # ms format
  ms_format <- as.character(c(round(ms, 4), NA))
  
  # stat format
  stat_format <- as.character(c(round(stat, 4), NA, NA))
  
  # p-value format
  p_value_format <- as.character(c(ifelse(round(p_value, 4) == 0, "< 0.0001", round(p_value, 4)), NA, NA))
  
  # format output
  anova_output <- data.frame(
    df = deg_frs,
    Sum_Sq = ss_format,
    Mean_Sq = ms_format,
    F = stat_format,
    p_value = p_value_format, 
    row.names = terms
  )
  
  # print variable types
  if(show_var_types == TRUE){
    n_x_levels <- length(levels(x))
    cat(paste0("Response variable: numerical\n"))
    cat(paste0("Explanatory variable: categorical (", n_x_levels, " levels) \n"))
  }
  
  # print summary statistics
  if(show_summ_stats == TRUE){
    grs <- levels(x)
    ns <- by(y, x, length)
    ybars <- round(by(y, x, mean), 4)
    sds <- round(by(y, x, sd), 4)
    for(i in 1:n_x_levels){
      cat(paste0("n_", grs[i], " = ", ns[i], ", y_bar_", grs[i], " = ", round(ybars[i], 4),
                 ", s_", grs[i], " = ", sds[i] , "\n"))
    }
    cat("\n")
  }

  # print results
  if(show_res == TRUE){
    cat("ANOVA:\n")
    print(anova_output, na.print = "", digits = 4)
    
    # post-hoc tests (if ANOVA is significant)
    if(p_value < sig_level){
      cat("\nPairwise tests - ")
      pairwise <- pairwise.t.test(y, x, p.adjust.method = "none", pool.sd = TRUE)
      cat(paste0(pairwise$method, ":\n"))
      print(broom::tidy(pairwise), digits = 4)
    }
  }
  
  # eda_plot
  d_eda <- data.frame(y = y, x = x)
  d_means <- data.frame(y_bars = as.numeric(y_bars), x = levels(x))
  
  eda_plot <- ggplot2::ggplot(data = d_eda, ggplot2::aes(x = y), environment = environment()) +
    ggplot2::geom_histogram(fill = "#8FDEE1", binwidth = diff(range(y)) / 20) +
    ggplot2::xlab(y_name) +
    ggplot2::ylab(x_name) +
    ggplot2::ggtitle("Sample Distribution") +
    ggplot2::geom_vline(data = d_means, ggplot2::aes(xintercept = y_bars), col = "#1FBEC3", lwd = 1.5) +
    ggplot2::facet_grid(x ~ .)
  
  # inf_plot
  x_max <- max(qf(0.99, df1 = deg_frs[1], df2 = deg_frs[2]), stat*1.1)
  inf_plot <- ggplot2::ggplot(data.frame(x = c(0, x_max)), ggplot2::aes(x)) +
    ggplot2::stat_function(fun = df, args = list(df1 = deg_frs[1], df2 = deg_frs[2]), color = "#999999") +
    ggplot2::annotate("rect", xmin = stat, xmax = stat+Inf, ymin = 0, ymax = Inf, 
             alpha = 0.3, fill = "#FABAB8") +
    ggplot2::ggtitle(paste0("F Distribution\n(df_G = ", deg_frs[1], ", df_E = ", deg_frs[2], ")")) +
    ggplot2::xlab("") +
    ggplot2::ylab("") +
    ggplot2::geom_vline(xintercept = stat, color = "#F57670", lwd = 1.5)
  
  # print plots
  if(show_eda_plot & !show_inf_plot){ 
    print(eda_plot)
  }
  if(!show_eda_plot & show_inf_plot){ 
    print(inf_plot)
  }
  if(show_eda_plot & show_inf_plot){
    gridExtra::grid.arrange(eda_plot, inf_plot, ncol = 2)
  }
  
  # return
  return(list(F = stat, df1 = deg_frs[1], df2 = deg_frs[2], p_value = p_value))
}