In [1]:
suppressPackageStartupMessages(library(ggplot2))
suppressPackageStartupMessages(library(dplyr))
suppressPackageStartupMessages(library(argparse))

“package ‘ggplot2’ was built under R version 4.2.3”


## Four Beta Model Vizualization

In [None]:
# define command line arguments
parser <- ArgumentParser(description = "Visualize linear modeling results")
# add arguments
parser$add_argument('--celltype', type='character', help='Cell type to visualize')

# parse arguments from command line
args <- parser$parse_args()

# define cell type
celltype <- args$celltype


In [2]:
celltype = "SHSY5Y"

In [3]:

lm_file <- file.path(paste0("./results/", celltype, "/lm_four_beta.tsv"))

lm_cp_fig <- file.path(paste0("./figures/", celltype, "/lm_four_beta.pdf"))
lm_cp_fig_abs <- file.path(paste0("./figures/", celltype, "/lm_four_beta_abs.pdf"))

# if path does not exist, create it
if (!dir.exists(file.path(paste0("./figures/", celltype)))) {
    dir.create(file.path(paste0("./figures/", celltype)))
}
     
lm_df <- readr::read_tsv(lm_file, col_types = readr::cols(.default = "d", feature ="c", inducer1__inducer1_dose__inhibitor__inhibitor_dose = "c"))
head(lm_df)

feature,r2_score,Metadata_number_of_singlecells,fourb_Treatment,fourb_Treatment_Dose,fourb_Inhibitor,fourb_Inhibitor_Dose,inducer1__inducer1_dose__inhibitor__inhibitor_dose
<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>
Cytoplasm_AreaShape_Compactness,0.0010758123,-7.546792e-05,-0.0201709707,0.0201709707,-0.0201709707,0.0201709707,media ctr__0__Media ctr__0.0
Cytoplasm_AreaShape_Eccentricity,0.0005340345,-6.444612e-05,-0.0002461108,0.0002461108,-0.0002461108,0.0002461108,media ctr__0__Media ctr__0.0
Cytoplasm_AreaShape_Extent,0.0010686948,9.424593e-05,0.0195559884,-0.0195559884,0.0195559884,-0.0195559884,media ctr__0__Media ctr__0.0
Cytoplasm_AreaShape_FormFactor,0.0006766865,7.923037e-05,0.015213815,-0.015213815,0.015213815,-0.015213815,media ctr__0__Media ctr__0.0
Cytoplasm_AreaShape_MajorAxisLength,0.001552285,-7.570263e-05,0.008414208,-0.008414208,0.008414208,-0.008414208,media ctr__0__Media ctr__0.0
Cytoplasm_AreaShape_MinorAxisLength,0.0004700079,8.941444e-06,0.0120887511,-0.0120887511,0.0120887511,-0.0120887511,media ctr__0__Media ctr__0.0


In [4]:
unique(lm_df$inducer1__inducer1_dose__inhibitor__inhibitor_dose)

In [5]:

# Arrange by absolute value coefficient
# Split out components of feature name for visualization
lm_df <- lm_df %>%
    dplyr::arrange(desc(abs(fourb_Inhibitor_Dose))) %>%
    tidyr::separate(
        feature,
        into = c(
            "compartment",
            "feature_group",
            "measurement",
            "channel",
            "parameter1",
            "parameter2"
        ),
        sep = "_",
        remove = FALSE
    ) %>%
    dplyr::mutate(channel_cleaned = channel)



“[1m[22mExpected 6 pieces. Additional pieces discarded in 7956 rows [3, 7, 9, 10, 13,
14, 16, 17, 18, 20, 23, 28, 35, 40, 52, 54, 56, 60, 66, 71, ...].”
“[1m[22mExpected 6 pieces. Missing pieces filled with `NA` in 15264 rows [1, 2, 4, 5,
6, 8, 11, 12, 15, 19, 21, 22, 24, 25, 26, 27, 29, 30, 31, 32, ...].”


In [6]:
unique(lm_df$channel)

In [7]:
unique(lm_df$channel_cleaned)

In [8]:
# Clean channel for visualization
lm_df$channel_learned <- dplyr::recode(lm_df$channel_cleaned,
        "CorrDNA" = "nuclei",
        "CorrMito" = "Mito",
        "CorrER" = "ER",
        "CorrGasdermin" = "gasdermin",
        "CorrPM" = "PM",
        .default = "other",
        .missing="other"
    )

print(dim(lm_df))
head(lm_df, 2)
unique(lm_df$channel_learned)
lm_df$abs_Metadata_number_of_singlecells <- abs(lm_df$Metadata_number_of_singlecells)
lm_df$abs_fourb_Treatment <- abs(lm_df$fourb_Treatment)
lm_df$abs_fourb_Treatment_Dose <- abs(lm_df$fourb_Treatment_Dose)
lm_df$abs_fourb_Inhibitor <- abs(lm_df$fourb_Inhibitor)
lm_df$abs_fourb_Inhibitor_Dose <- abs(lm_df$fourb_Inhibitor_Dose)

[1] 45036    16


feature,compartment,feature_group,measurement,channel,parameter1,parameter2,r2_score,Metadata_number_of_singlecells,fourb_Treatment,fourb_Treatment_Dose,fourb_Inhibitor,fourb_Inhibitor_Dose,inducer1__inducer1_dose__inhibitor__inhibitor_dose,channel_cleaned,channel_learned
<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>,<chr>,<chr>
Cytoplasm_Correlation_RWC_CorrPM_CorrMito,Cytoplasm,Correlation,RWC,CorrPM,CorrMito,,0.3742011,-0.0002044911,-0.6900744,-0.6900744,-0.6900744,-0.6900744,H2O2__100.000__Disulfiram__1.0,CorrPM,PM
Nuclei_Correlation_Overlap_CorrMito_CorrPM,Nuclei,Correlation,Overlap,CorrMito,CorrPM,,0.3394022,-0.0002052585,-0.6693642,-0.6693642,-0.6693642,-0.6693642,H2O2__100.000__Disulfiram__1.0,CorrMito,Mito


In [9]:
loop_list <- unique(lm_df$inducer1__inducer1_dose__inhibitor__inhibitor_dose)
x_list_abs <- c('abs_fourb_Treatment','abs_fourb_Treatment_Dose','abs_fourb_Inhibitor','abs_fourb_Inhibitor_Dose')
x_list <- c('fourb_Treatment','fourb_Treatment_Dose','fourb_Inhibitor','fourb_Inhibitor_Dose')

In [10]:
pdf(file=lm_cp_fig )
for (i in 1:length(loop_list)){
    df <- lm_df[lm_df$inducer1__inducer1_dose__inhibitor__inhibitor_dose == loop_list[i],]
    for (j in 1:length(x_list)){
        col = x_list[j]
        # print(x_list[j])
        # print(df[1,col])
        if (df[1,col] == 0){
            next
        } else {
            lm_fig_gg <- (
                ggplot(df, aes(x = Metadata_number_of_singlecells, y = .data[[x_list[j]]]))
                + geom_point(aes(size = r2_score, color = channel_learned), alpha = 0.7)
                + theme_bw()
                + guides(
                    color = guide_legend(title = "Channel\n(if applicable)", order = 1),
                    size = guide_legend(title = "R2 score of LM feature")
                )
                + geom_vline(xintercept = 0, linetype = "dashed", color = "red")
                + geom_hline(yintercept = 0, linetype = "dashed", color = "red")
                + geom_density2d(color="black", show.legend = FALSE)
                + ylab(paste0(x_list[j]," contribution (LM beta coefficient)"))
                + xlab("Cell count contribution (LM beta coefficient)")
                + ggtitle(paste0("How CellProfiler features contribute\nto ",loop_list[i], "\ntreatments and cell density"))
            )
            plot(lm_fig_gg)
        }
    }
}
dev.off()

In [11]:
pdf(file=lm_cp_fig_abs )
for (i in 1:length(loop_list)){
    df <- lm_df[lm_df$inducer1__inducer1_dose__inhibitor__inhibitor_dose == loop_list[i],]
    for (j in 1:length(x_list_abs)){
        lm_fig_gg <- (
                ggplot(df, aes(x = Metadata_number_of_singlecells, y = .data[[x_list[j]]]))
                + geom_point(aes(size = r2_score, color = channel_learned), alpha = 0.7)
                + theme_bw()
                + guides(
                    color = guide_legend(title = "Channel\n(if applicable)", order = 1),
                    size = guide_legend(title = "R2 score of LM feature")
                )
                + geom_vline(xintercept = 0, linetype = "dashed", color = "red")
                + geom_hline(yintercept = 0, linetype = "dashed", color = "red")
                + geom_density2d(color="black", show.legend = FALSE)
                + ylab(paste0(x_list[j]," contribution (LM beta coefficient)"))
                + xlab("Cell count contribution (LM beta coefficient)")
                + ggtitle(paste0("How CellProfiler features contribute\nto ",loop_list[i], "\ntreatments and cell density"))
            )
        plot(lm_fig_gg)    
    }
}
dev.off()

“[1m[22mComputation failed in `stat_density2d()`
Caused by error in `MASS::kde2d()`:
[33m![39m bandwidths must be strictly positive”
“no non-missing arguments to min; returning Inf”
“no non-missing arguments to max; returning -Inf”
“no non-missing arguments to max; returning -Inf”


ERROR: [1m[33mError[39m in `geom_density2d()`:[22m
[1m[22m[33m![39m Problem while computing stat.
[36mℹ[39m Error occurred in the 4th layer.
[1mCaused by error in `seq_len()`:[22m
[33m![39m argument must be coercible to non-negative integer
