Source: https://github.com/UCSC-Treehouse/rank_drugs_for_a_sample

Sample input: 4.0.json
References input: target-drug_relationships_2020-02-27.tsv

Outputs:
-  drugs_by_percentile_SAMPLEID.png
-  drug-relevant_expression_info_SAMPLEID.tsv

In [None]:
library(tidyverse)
library(knitr)
library(jsonlite)

# Read the run configuration. Input paths, output paths and the sample id used
# by the rest of this notebook are all looked up in this list.
# NOTE: `c` is a data object here; calls such as c("a", "b") still resolve to
# base::c() because R skips non-function bindings when looking up a function.
c <- read_json("conf.json")

# Sample being processed; echoed so it is visible in the notebook output.
sample_id <- c$sample_id
print(paste0("Running on sample: ", sample_id))

# Path of the log file that and_log() appends to.
logpath <- c$info$logging_config$filename

# Append `text` to the log file at `logpath` (a global set from the config)
# and hand `text` back unchanged, so a call can be wrapped in print().
and_log <- function(text) {
  write(text, file = logpath, append = TRUE)
  text
}

# Accumulates the values that are written to this step's output JSON.
j <- list()

# Record the start of this step in the log file, then echo the same line.
start_message <- and_log("8.75: Running Rank Drugs for a Sample...")
print(start_message)

In [None]:
# Inputs for the ranking:
#   * the target/drug relationship table, every column read as character
#   * the outlier results stored in 4.0.json, reshaped into one data frame

target_drug_relationships <- read_tsv(
  c$ref_file$target_drug_relationships,
  col_types = cols(.default = col_character())
)

data_json <- read_json(c$json$"4.0")$outlier_results

# Turn each entry of outlier_results into a two-column tibble: the names of
# the entry's elements (Gene) and their values as text, with the value column
# named after the entry itself.
data_list <- map(names(data_json), function(this_colname) {
  this_data <- data_json[[this_colname]]
  this_result <- tibble(
    Gene = names(this_data),
    Value = as.character(this_data)
  )
  names(this_result)[2] <- this_colname
  this_result
})

# Merge the per-entry tibbles on Gene. Because these are left joins, the first
# entry determines which genes are kept.
raw_outliers <- reduce(data_list, left_join, by = "Gene")
raw_outliers$pc_percentile <- as.numeric(raw_outliers$pc_percentile)

In [None]:
# Columns in the results, in order of printing
desired_column_list <- c("Gene", "drug", "sample", "is_top_5",
                         "pc_low", "pc_median", "pc_high",
                         "pc_outlier", "pd_outlier", "pc_percentile")


# Rank the results sorted by the highest pancan percentile for each drug.
# Steps:
#   1. keep only genes that appear as a `target` in the relationship table
#      (one output row per gene/drug pair)
#   2. keep the desired columns, in printing order
#   3. convert the expression columns to numbers rounded to 2 decimals
#   4. fill missing pc_outlier / is_top_5 values with placeholder text
#   5. attach each drug's highest percentile and sort by it, then by drug,
#      then by the gene's own percentile
#   6. build the "Gene (drug)" label as a factor whose levels are in reverse
#      row order
outlier_results <- raw_outliers %>%
  inner_join(target_drug_relationships, by = c("Gene" = "target")) %>%
  # all_of() states explicitly that desired_column_list is an external
  # character vector rather than a column of the data; it errors if any of
  # the listed columns is missing.
  select(all_of(desired_column_list)) %>%
  mutate_at(vars(sample, pc_low, pc_median, pc_high),
            function(x) round(as.numeric(x), 2)) %>%
  replace_na(list(pc_outlier = "non outlier", is_top_5 = "")) %>%
  group_by(drug) %>%
  mutate(max_pctl_for_drug = max(pc_percentile)) %>%
  ungroup() %>%
  arrange(desc(max_pctl_for_drug), drug, desc(pc_percentile)) %>%
  # Build the label once, then turn it into a factor with reversed levels.
  mutate(gene_drug = paste0(Gene, " (", drug, ")"),
         gene_drug = factor(gene_drug, levels = rev(unique(gene_drug))))

In [None]:
# Dot plot with one row per "Gene (drug)" label, placed at the gene's
# pan-cancer percentile. Point shape encodes pc_outlier and point colour
# encodes is_top_5.
this_title <- paste0("pancan percentile in ", sample_id)
ggplot(data = outlier_results) +
  geom_point(
    mapping = aes(x = pc_percentile, y = gene_drug,
                  shape = pc_outlier, color = is_top_5)
  ) +
  labs(
    x = "pan cancer percentile of expression",
    y = "",
    title = this_title
  ) +
  scale_color_brewer(palette = "Set1")

In [None]:
# Write output files & save the uri-encoded PNG and drug-relevant expression
# table to the json
print(paste0("Saving plot to: ", c$file$drugs_by_percentile_plot))
print(paste0("Saving table to: ", c$file$drug_relevant_expression))

# No plot argument is given, so ggsave() falls back to its default,
# the last plot.
ggsave(c$file$drugs_by_percentile_plot)


# Drop the plotting-only helper columns and blank out the "non outlier"
# placeholder again before the table is written.
drug_relevant_expression_table <- outlier_results %>%
  select(-gene_drug, -max_pctl_for_drug) %>%
  mutate(pc_outlier = gsub("non outlier", "", pc_outlier))

write_tsv(drug_relevant_expression_table, c$file$drug_relevant_expression)
j$drug_relevant_expression_table <- drug_relevant_expression_table


# Also make a lower resolution image for the summary.
# The PNG is only needed long enough to be URI-encoded, so it is removed in
# `finally`; this keeps the working directory clean even when saving or
# encoding fails.
tmp_png <- tempfile(tmpdir = ".", fileext = ".png")
print(paste0("Low res plot temporarily saved to ", tmp_png))
tryCatch(
  {
    ggsave(tmp_png, dpi = 150)
    j$drugs_by_percentile_plot <- image_uri(tmp_png)
  },
  finally = {
    # The file may not exist if ggsave() itself failed.
    if (file.exists(tmp_png)) {
      file.remove(tmp_png)
    }
  }
)

j$expression_table_key_order <- desired_column_list

# use auto_unbox: saves the plot as a bare string rather than 1-element array
write_json(j, c$json$"8.75", pretty = TRUE, auto_unbox = TRUE)

print("Done!")