# Calculate power analysis given Linear Model effect sizes

In [1]:
library(pwr)
suppressPackageStartupMessages(library(dplyr))

In [2]:
output_file <- file.path("results", "power_analysis_cp_features_lm.tsv")

In [3]:
# Load data
lm_results_file <- file.path("results", "linear_model_cp_features.tsv")
lm_results_df <- readr::read_tsv(
    lm_results_file,
    col_types = readr::cols(.default="d", feature="c")
)

print(dim(lm_results_df))
head(lm_results_df)

[1] 1043    5


feature,r2_score,cell_count_coef,Null_coef,WT_coef
<chr>,<dbl>,<dbl>,<dbl>,<dbl>
Cytoplasm_Number_Object_Number,0.309208943,0.033454088,-0.01216966,0.01216966
Cytoplasm_AreaShape_Area,0.316174998,-0.02666059,-0.1761977,0.1761977
Cytoplasm_AreaShape_BoundingBoxArea,0.167981769,-0.022168935,-0.05959405,0.05959405
Cytoplasm_AreaShape_BoundingBoxMaximum_X,0.008340445,0.002454425,-0.13958882,0.13958882
Cytoplasm_AreaShape_BoundingBoxMaximum_Y,0.00168455,-0.003408135,0.04023927,-0.04023927
Cytoplasm_AreaShape_BoundingBoxMinimum_X,0.005920801,0.006485507,-0.08859939,0.08859939


In [4]:
# Load feature data (for calculating n)
data_dir <-file.path("..", "..", "..", "4_processing_features", "data")
cp_file <- file.path(data_dir, "nf1_sc_norm_cellprofiler.csv.gz")

cp_df <- readr::read_csv(
    cp_file,
    col_types = readr::cols(
        .default="d",
        Metadata_WellRow="c",
        Metadata_WellCol="c",
        Metadata_Well="c",
        Metadata_gene_name="c",
        Metadata_genotype="c"
    )
)

print(dim(cp_df))
head(cp_df, 3)

[1m[22mNew names:
[36m•[39m `` -> `...1`


[1]  149 1056


...1,Metadata_WellRow,Metadata_WellCol,Metadata_number_of_singlecells,Metadata_gene_name,Metadata_genotype,Metadata_ImageNumber,Metadata_Plate,Metadata_Well,Metadata_Cytoplasm_Parent_Cells,⋯,Nuclei_Texture_SumVariance_RFP_3_02_256,Nuclei_Texture_SumVariance_RFP_3_03_256,Nuclei_Texture_Variance_GFP_3_00_256,Nuclei_Texture_Variance_GFP_3_01_256,Nuclei_Texture_Variance_GFP_3_02_256,Nuclei_Texture_Variance_GFP_3_03_256,Nuclei_Texture_Variance_RFP_3_00_256,Nuclei_Texture_Variance_RFP_3_01_256,Nuclei_Texture_Variance_RFP_3_02_256,Nuclei_Texture_Variance_RFP_3_03_256
<dbl>,<chr>,<chr>,<dbl>,<chr>,<chr>,<dbl>,<dbl>,<chr>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
0,C,6,12,NF1,WT,1,1,C6,1,⋯,3.1415403,3.2022729,-0.09735552,-0.096165089,-0.0942023,-0.10645635,3.3379688,3.3505284,3.2781681,3.3103705
1,C,6,12,NF1,WT,1,1,C6,2,⋯,0.315924,0.2586328,-0.08797075,-0.069492845,-0.06553894,-0.09537677,0.3147762,0.3139198,0.3484196,0.3186928
2,C,6,12,NF1,WT,1,1,C6,3,⋯,0.2952335,0.383161,0.06525064,0.005549586,-0.01521187,-0.02908654,0.3484921,0.3339402,0.3413119,0.3479994


In [5]:
# Define constants for power analysis
n_conditions <- 2  # NF1 WT and Null
n_samples <- dim(cp_df)[1]

u <- n_conditions - 1
v <- n_samples - u - 1
sig_level <- 0.05 / dim(lm_results_df)[1]
power <- 0.8

print(c(u, v))
print(sig_level)

[1]   1 147
[1] 4.793864e-05


In [6]:
# Given all R2 values perform power analysis
all_power_results <- list()
for (cp_feature in lm_results_df$feature) {
    # Subset to the given feature lm results
    lm_result_subset_df <- lm_results_df %>%
        dplyr::filter(feature == !!cp_feature)
    
    # Pull out the estimated R2 value
    r2_val <- lm_result_subset_df %>% dplyr::pull(r2_score)
    
    # The power estimate is undefined for r2_val = 1, skip if so
    if (r2_val == 1) {
        all_power_results[[cp_feature]] <- c(cp_feature, u, v, sig_level, NULL, NULL)
        next
    }
    
    # Transform R2 score to F2 effect size
    f2_val <- r2_val / (1 - r2_val)
    
    # Calculate power, note that v contains an estimate of sample size
    power_result <- pwr.f2.test(u = u, v = NULL, f2 = f2_val, sig.level = sig_level, power = power)
    
    # Calculate required sample size from the v formula
    estimated_sample_size <- power_result$v + u + 1
    
    # Save results for future visualization
    all_power_results[[cp_feature]] <- c(cp_feature, u, v, sig_level, power, estimated_sample_size)
    
}

In [7]:
power_results_df <- do.call(rbind, all_power_results) %>% dplyr::as_tibble()

colnames(power_results_df) <- c("feature", "u", "v", "sig_level", "power", "estimated_sample_size")

# Output to file
power_results_df %>%
    readr::write_tsv(output_file)

print(dim(power_results_df))
head(power_results_df)

“number of columns of result is not a multiple of vector length (arg 65)”
“[1m[22mThe `x` argument of `as_tibble.matrix()` must have
unique column names if `.name_repair` is omitted as
of tibble 2.0.0.
[36mℹ[39m Using compatibility `.name_repair`.”


[1] 1043    6


feature,u,v,sig_level,power,estimated_sample_size
<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
Cytoplasm_Number_Object_Number,1,147,4.79386385426654e-05,0.8,62.0461740901187
Cytoplasm_AreaShape_Area,1,147,4.79386385426654e-05,0.8,60.3294079142944
Cytoplasm_AreaShape_BoundingBoxArea,1,147,4.79386385426654e-05,0.8,127.52924923438
Cytoplasm_AreaShape_BoundingBoxMaximum_X,1,147,4.79386385426654e-05,0.8,2871.24963895602
Cytoplasm_AreaShape_BoundingBoxMaximum_Y,1,147,4.79386385426654e-05,0.8,14278.4471735682
Cytoplasm_AreaShape_BoundingBoxMinimum_X,1,147,4.79386385426654e-05,0.8,4051.10195794731
