In [1]:
suppressPackageStartupMessages(library(dplyr))
suppressPackageStartupMessages(library(ggplot2))
suppressPackageStartupMessages(library(cowplot))
suppressPackageStartupMessages(library(reticulate))
suppressPackageStartupMessages(library(arrow))

“package ‘ggplot2’ was built under R version 4.2.3”
“package ‘cowplot’ was built under R version 4.2.3”
“package ‘arrow’ was built under R version 4.2.3”


In [2]:
#Load data
data_dir <- file.path("../4.gene-expression-signatures/gsea_results")

results_file <- file.path(data_dir, "combined_z_matrix_gsea_results.parquet")
gsea_results_df <- arrow::read_parquet(
    results_file,
)

In [3]:
# Source the themes.R file from the utils folder
source("../utils/themes.r")

In [4]:
# Prepare the data
# Filter for the single highest ES score for each model at each dimension
max_es_df <- gsea_results_df %>%
  group_by(model, full_model_z) %>%
  summarize(max_es = max(abs(`gsea_es_score`), na.rm = TRUE)) %>%
  ungroup()

[1m[22m`summarise()` has grouped output by 'model'. You can override using the
`.groups` argument.


In [5]:
# Prepare the data
# Filter for some specific pathway the single highest ES score for each model at each dimension
path_es_df <- gsea_results_df %>%
  filter(`reactome_pathway` == "Cardiac Conduction R-HSA-5576891") %>%
  group_by(model, full_model_z) %>%
  summarize(max_es = max(abs(`gsea_es_score`), na.rm = TRUE)) %>%
  ungroup()

“no non-missing arguments to max; returning -Inf”
[1m[22m`summarise()` has grouped output by 'model'. You can override using the
`.groups` argument.


In [6]:
# Plot the data
latent_plot <- ggplot(max_es_df, aes(x = factor(full_model_z), y = log(max_es), color = model, fill = model)) +
  geom_point(size = 3, shape = 21) +  # Points on the line
  geom_smooth(aes(group = model), method = "loess", se = TRUE, size = 1, alpha = 0.1) +  # Trend line with shading 
  scale_color_manual(name = "Algorithm", values = model_colors, labels = model_labels) +
  scale_fill_manual(name = "Algorithm", values = model_colors, labels = model_labels) +
  labs(x = "Latent Dimensions", y = "Highest ES Score", title = "Highest ES Score Across All Pathways by Latent Dimension for Each Model") +
  custom_theme()

“[1m[22mUsing `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
[36mℹ[39m Please use `linewidth` instead.”


In [7]:
# Save the plot with custom dimensions
ggsave("./visualize/latent_plot.png", plot = latent_plot, width = 10, height = 8, units = "in")

[1m[22m`geom_smooth()` using formula = 'y ~ x'


In [8]:
# Plot the data
path_plot <- ggplot(path_es_df, aes(x = factor(full_model_z), y = log(max_es), color = model, fill = model)) +
  geom_point(size = 3, shape = 21) +  # Points on the line
  geom_smooth(aes(group = model), method = "loess", se = TRUE, size = 1, alpha = 0.1) +  # Trend line with shading 
  scale_color_manual(name = "Algorithm", values = model_colors, labels = model_labels) +
  scale_fill_manual(name = "Algorithm", values = model_colors, labels = model_labels) +
  labs(x = "Latent Dimensions", y = "Highest ES Score", title = "Highest ES Score for Regulation of Cardiac Conduction Pathway by Latent Dimension for Each Model") +
  theme(legend.position = "right")

In [9]:
# Save the plot with custom dimensions
ggsave("./visualize/cardiac_conduction_latent_plot.png", plot = path_plot, width = 10, height = 8, units = "in")

“[1m[22mNo shared levels found between `names(values)` of the manual scale and the
data's [32mcolour[39m values.”
“[1m[22mNo shared levels found between `names(values)` of the manual scale and the
data's [32mfill[39m values.”
