# Visualize UMAPs for only the JUMP dataset embeddings

In [1]:
suppressPackageStartupMessages(library(dplyr))
suppressPackageStartupMessages(library(ggplot2))
suppressPackageStartupMessages(library(patchwork))

# Load variables important for plotting (e.g., themes, phenotypes, etc.)
source("themes.r")

“package ‘ggplot2’ was built under R version 4.2.3”
“package ‘patchwork’ was built under R version 4.2.3”


In [2]:
# Set directory for data split
data_split_dir <- file.path("./results/Only_JUMP_all_features")

# Every UMAP file is named <umap_prefix><model name><umap_suffix>
umap_prefix <- "Only_JUMP_all_features_"
umap_suffix <- ".tsv"

# File structure in dir; keep only files carrying the expected suffix so that
# stray files in the directory are never mistaken for UMAP embeddings
umap_files <- list.files(data_split_dir, full.names = TRUE)
umap_files <- umap_files[endsWith(umap_files, umap_suffix)]
print(umap_files)

output_fig_dir <- file.path("figures/Only_JUMP_all_features")
# Make sure the figure directory exists before any plot is saved into it
dir.create(output_fig_dir, recursive = TRUE, showWarnings = FALSE)

# Define output figure paths as a named list where each model ("plate") has a
# figure output path prefix (the plot-specific suffix is appended later)
output_umap_files <- list()
for (umap_file in umap_files) {
    # Use the file name to extract plate. The prefix and suffix are stripped
    # as literal text anchored to the start/end of the name, rather than as
    # regular expressions (where "." would match any character).
    file_name <- basename(umap_file)
    plate <- file_name
    if (startsWith(plate, umap_prefix)) {
        plate <- substring(plate, nchar(umap_prefix) + 1L)
    }
    if (endsWith(plate, umap_suffix)) {
        plate <- substring(plate, 1L, nchar(plate) - nchar(umap_suffix))
    }

    output_umap_files[[plate]] <- file.path(output_fig_dir, paste0(umap_prefix, plate))
}

print(output_umap_files)

[1] "./results/Only_JUMP_all_features/Only_JUMP_all_features_final_all_features_model.tsv"            
[2] "./results/Only_JUMP_all_features/Only_JUMP_all_features_final_greg_areashape_model.tsv"          
[3] "./results/Only_JUMP_all_features/Only_JUMP_all_features_shuffled_baseline_all_features_model.tsv"
[4] "./results/Only_JUMP_all_features/Only_JUMP_all_features_shuffled_greg_areashape_model.tsv"       
$final_all_features_model
[1] "figures/Only_JUMP_all_features/Only_JUMP_all_features_final_all_features_model"

$final_greg_areashape_model
[1] "figures/Only_JUMP_all_features/Only_JUMP_all_features_final_greg_areashape_model"

$shuffled_baseline_all_features_model
[1] "figures/Only_JUMP_all_features/Only_JUMP_all_features_shuffled_baseline_all_features_model"

$shuffled_greg_areashape_model
[1] "figures/Only_JUMP_all_features/Only_JUMP_all_features_shuffled_greg_areashape_model"



## Load in data frames

In [3]:
# Load data: one data frame of UMAP coordinates + metadata per model,
# stored in a list keyed by the model ("plate") name
umap_cp_df <- list()
for (plate in names(output_umap_files)) {
    # Find the umap file associated with the plate by its exact file name.
    # A substring/regex match on the model name alone could select several
    # files when one model name is contained in another.
    expected_file_name <- paste0(umap_prefix, plate, umap_suffix)
    umap_file <- umap_files[basename(umap_files) == expected_file_name]
    if (length(umap_file) != 1) {
        stop(
            "Expected exactly one UMAP file named ", expected_file_name,
            " but found ", length(umap_file),
            call. = FALSE
        )
    }

    # Load in the umap data
    # NOTE(review): every column not listed below is parsed as double. The
    # recorded run emitted "parsing issues" warnings, presumably from
    # non-numeric metadata columns being coerced to NA -- confirm with
    # readr::problems() and add explicit types if those columns are needed.
    df <- readr::read_tsv(
        umap_file,
        col_types = readr::cols(
            .default = "d",
            "Metadata_Predicted_Class" = "c",
            "Metadata_Phenotypic_Value" = "d",
            "Metadata_model_type" = "c",
            "Metadata_Well" = "c",
            "Metadata_Plate" = "c",
            "Metadata_treatment" = "c"
        )
    ) %>%
    # Generate a new column that we will use for plotting
    # Note, we define focus_phenotypes in themes.r
    dplyr::mutate(Metadata_Plot_Label = if_else(
        Metadata_Predicted_Class %in% focus_phenotypes,
        Metadata_Predicted_Class,
        "Other"
    ))

    # Recode the predicted class into a factor using the label mapping from
    # themes.r. This happens after Metadata_Plot_Label is built, so the plot
    # label keeps the class names exactly as they appear in the file.
    df$Metadata_Predicted_Class <-
        dplyr::recode_factor(df$Metadata_Predicted_Class, !!!focus_phenotype_labels)

    # Reorder columns, move Metadata_Predicted_Class to the first position
    # (the facet plot's name repair renames whichever column comes first)
    df <- dplyr::select(df, Metadata_Predicted_Class, everything())

    # Append the data frame to the list
    umap_cp_df[[plate]] <- df 
}

# print example of loaded in file (the last model read by the loop)
head(df)

“[1m[22mOne or more parsing issues, call `problems()` on your data frame for details,
e.g.:
  dat <- vroom(...)
  problems(dat)”
“[1m[22mOne or more parsing issues, call `problems()` on your data frame for details,
e.g.:
  dat <- vroom(...)
  problems(dat)”
“[1m[22mOne or more parsing issues, call `problems()` on your data frame for details,
e.g.:
  dat <- vroom(...)
  problems(dat)”
“[1m[22mOne or more parsing issues, call `problems()` on your data frame for details,
e.g.:
  dat <- vroom(...)
  problems(dat)”


Metadata_Predicted_Class,Metadata_treatment,Metadata_Plate,Metadata_Well,Metadata_Site,Metadata_ObjectNumber_cytoplasm,Metadata_broad_sample,Metadata_solvent,Metadata_ImageNumber,Metadata_TableNumber,⋯,Metadata_gene,Metadata_target_sequence,Metadata_negcon_control_type,Metadata_model_type,Metadata_Nuclei_Location_Center_X,Metadata_Nuclei_Location_Center_Y,Metadata_Phenotypic_Value,UMAP0,UMAP1,Metadata_Plot_Label
<fct>,<chr>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>
Binuclear,compound,BR00117054,H04,1,58,,,1540,2.286199e+38,⋯,,,,shuffled,148.7466,262.9333,0.1150475,0.1643815,3.012419,Other
Polylobed,compound,BR00117054,H04,9,90,,,1548,2.283918e+38,⋯,,,,shuffled,232.5218,387.4818,0.1459852,-1.1476176,0.929445,Other
MetaphaseAlignment,compound,BR00117054,M17,6,200,,,2742,7.326611e+37,⋯,,,,shuffled,537.9539,999.6129,0.1671423,-2.6269443,-3.596078,Other
Metaphase,compound,BR00117054,G14,6,102,,,1419,5.0908489999999995e+37,⋯,,,,shuffled,448.5,482.7932,0.1767198,8.371599,-1.490912,Metaphase
Elongated,compound,BR00117054,D02,9,106,,,666,2.997026e+38,⋯,,,,shuffled,179.4531,373.2509,0.1834371,-0.6425644,-3.611323,Elongated
Hole,compound,BR00117054,M08,5,143,,,2660,1.4749880000000002e+38,⋯,,,,shuffled,216.1765,652.9412,0.194023,-2.121301,-4.552017,Other


## Create UMAP labeling all phenotypic classes

In [4]:
# Save one UMAP scatter plot per model, with points colored by every
# predicted phenotypic class (palette comes from themes.r)
for (plate in names(umap_cp_df)) {
    # Output path: the model's figure prefix plus the plot-specific suffix
    output_file <- paste0(output_umap_files[[plate]], "_all_phenotypes_UMAP.png")

    # Layers are built separately and then added onto the base plot
    point_layer <- geom_point(
        aes(color = Metadata_Predicted_Class),
        size = 0.4,
        alpha = 0.5
    )
    color_scale <- scale_color_manual(
        name = "Phenotypes",
        values = all_phenotype_class_colors
    )

    phenotype_gg <- ggplot(umap_cp_df[[plate]], aes(x = UMAP0, y = UMAP1)) +
        point_layer +
        theme_bw() +
        color_scale

    ggsave(
        filename = output_file,
        plot = phenotype_gg,
        dpi = 500,
        height = 6,
        width = 8
    )
}

## Create UMAP labeling focus phenotypic classes

In [None]:
# Save one UMAP scatter plot per model that highlights only the focus
# phenotypes
for (plate in names(umap_cp_df)) {
    # Focus phenotypic class UMAP file path
    output_file <- output_umap_files[[plate]]
    output_file <- paste0(output_file, "_focused_phenotypes_UMAP.png")

    # UMAP labelled with focus phenotypic classes.
    # Color by Metadata_Plot_Label (focus phenotype name, or "Other" for every
    # remaining class) rather than Metadata_Predicted_Class: the plot label is
    # the column built for plotting when the data was loaded, and it is the
    # column the faceted focus-phenotype plot pairs with focus_phenotype_colors.
    # NOTE(review): assumes focus_phenotype_colors is keyed by the names in
    # focus_phenotypes -- confirm in themes.r. Values without a palette entry
    # (e.g. "Other", if it has none) fall back to the scale's NA color.
    phenotype_gg <- (
        ggplot(umap_cp_df[[plate]], aes(x = UMAP0, y = UMAP1))
        + geom_point(
            aes(color = Metadata_Plot_Label), size = 0.4, alpha = 0.5
        )
        + theme_bw()
        + scale_color_manual(
            name = "Phenotypes",
            values = focus_phenotype_colors
        )
    )

    ggsave(output_file, phenotype_gg, dpi = 500, height = 6, width = 8)
}

## Create UMAP labeling treatment

Treatments are CRISPR, ORF, or compound. We only need to create one of these figures, rather than one per model, because the treatment labels depend on the data and not on a model's predicted probabilities.

In [None]:
# The treatment plot is made from a single model's data frame, and its file
# name carries only the data split (no model name)
desired_plate <- "final_greg_areashape_model"
output_file_name <- "Only_JUMP_all_features"

# Nothing is plotted when the chosen model was not loaded
if (desired_plate %in% names(umap_cp_df)) {
    plate <- desired_plate

    # Destination of the treatment-colored UMAP
    output_file <- file.path(
        output_fig_dir,
        paste0(output_file_name, "_treatment_UMAP.png")
    )

    # Layers are built separately and then added onto the base plot
    treatment_points <- geom_point(
        aes(color = Metadata_treatment),
        size = 0.4,
        alpha = 0.5
    )
    treatment_scale <- scale_color_manual(
        name = "Treatment",
        values = treatment_colors
    )

    treatment_gg <- ggplot(umap_cp_df[[plate]], aes(x = UMAP0, y = UMAP1)) +
        treatment_points +
        theme_bw() +
        treatment_scale

    ggsave(
        filename = output_file,
        plot = treatment_gg,
        dpi = 500,
        height = 6,
        width = 8
    )
}

## Create UMAP labeling plate

We only need to create one of these figures, rather than one per model, because the plate labels depend on the data and not on a model's predicted probabilities.

In [None]:
# Plot the UMAP colored by plate, using only the model chosen in
# desired_plate (set, together with output_file_name, in the treatment cell)
if (desired_plate %in% names(umap_cp_df)) {
    plate <- desired_plate

    # Destination of the plate-colored UMAP
    output_file <- file.path(
        output_fig_dir,
        paste0(output_file_name, "_plate_UMAP.png")
    )

    # Layers are built separately and then added onto the base plot
    plate_points <- geom_point(
        aes(color = Metadata_Plate),
        size = 0.4,
        alpha = 0.5
    )
    plate_scale <- scale_color_manual(
        name = "Plate",
        values = plate_colors
    )

    plate_gg <- ggplot(umap_cp_df[[plate]], aes(x = UMAP0, y = UMAP1)) +
        plate_points +
        theme_bw() +
        plate_scale +
        # Remove the legend since there are 51 plates (too many to fit)
        theme(legend.position = "none")

    ggsave(
        filename = output_file,
        plot = plate_gg,
        dpi = 500,
        height = 6,
        width = 8
    )
}

## Create faceted UMAP of only the focus phenotypes, with one facet per phenotype

In [None]:
#' Custom function for name repair
#'
#' Appends "_original" to the first column name and leaves every other name
#' untouched.
#'
#' @param names Character vector of column names.
#' @return A character vector the same length as `names`, with the first
#'   element suffixed by "_original". An empty input is returned unchanged.
name_repair_function <- function(names) {
  # Guard: indexing element 1 of an empty vector would create "NA_original"
  if (length(names) == 0) {
    return(names)
  }
  names[1] <- paste0(names[1], "_original")
  names
}

# Save one faceted UMAP per model: one panel per focus phenotype, with the
# cells of all focus phenotypes drawn in grey behind each panel
for (plate in names(umap_cp_df)) {
    # Focus phenotypic class/data set facet UMAP file path
    output_file <- output_umap_files[[plate]]
    output_file <- paste0(output_file, "_facet_focus_phenotype_UMAP.png")

    # Keep only cells predicted as a focus phenotype. Filter on
    # Metadata_Plot_Label because it holds the class names as read from the
    # file, whereas Metadata_Predicted_Class was recoded with
    # focus_phenotype_labels at load time and would no longer match
    # focus_phenotypes if any label differs from its class name. The same
    # rows feed both the grey background and the colored foreground.
    umap_focus_df <- umap_cp_df[[plate]] %>%
        dplyr::filter(Metadata_Plot_Label %in% focus_phenotypes)

    # add grey points to each facet by duplicating the UMAP coords.
    # The name repair renames the first column (Metadata_Predicted_Class was
    # moved to the first position at load time), so the background data no
    # longer carries the faceting variable and ggplot2 repeats that layer in
    # every panel.
    df_background <- tidyr::crossing(
        umap_focus_df,
        .name_repair = name_repair_function
    )

    # Facet UMAP labelling phenotype and data set
    umap_facet_phenotype_gg <- (
        ggplot(
            umap_focus_df,
            aes(x = UMAP0, y = UMAP1)
        )
        + geom_point(
            data = df_background,
            color = "lightgray",
            size = 0.1,
            alpha = 0.4
        )
        + geom_point(
            aes(color = Metadata_Plot_Label),
            size = 0.1
        )

        + facet_grid("~Metadata_Predicted_Class")
        + theme_bw()
        + phenotypic_ggplot_theme
        + guides(
            color = guide_legend(
                # Enlarge legend keys; the plotted points are too small to read
                override.aes = list(size = 2)
            )
        )
        + labs(x = "UMAP0", y = "UMAP1")
        + scale_color_manual(
            "Phenotype",
            values = focus_phenotype_colors,
            labels = focus_phenotype_labels
        )
    )

    ggsave(output_file, umap_facet_phenotype_gg, dpi = 500, height = 4, width = 10)

}