Generate Isoform report for Summary.html #212
Wrapper for isoform_report.R, based on the following version with slight modifications:
https://github.com/UCSC-Treehouse/isoform_report/blob/ca339b96e0a9dda1281b0b6b064af29dfd4a3e70/isoform_report.R

Dependencies - the following R libraries are required to run this script:
 - dplyr >= 0.8.0
 - tidyverse >= 1.3.0
 - treemapify (2.5.3)
 - Sushi (1.12.0)

Sub-dependency notes
 - Sushi requires BiocManager (1.30.10) to install
 - tidyverse 1.3.0 depends on vctrs; as of 2020/01/15 the latest vctrs (0.2.1) fails to install, so vctrs 0.2.0 is used instead.

Outputs:
Output files are placed into an isoform-report subdirectory. For each gene:
 - {gene}\_isoform_expression.tsv
 - Expressed\_{gene}\_isoforms\_in\_{sample_id}.pdf
 - Expressed\_{gene}\_isoforms\_in\_{sample_id}.png
 - Frequency\_of\_{gene}\_transcript\_biotypes\_in\_{sample_id}.png


In [None]:
import os
import glob
import json
import logging
import base64
import pandas as pd
from PIL import Image
from string import Template
from collections import OrderedDict
from distutils.version import LooseVersion


In [None]:
# Read the run configuration from conf.json; `c` holds the parsed settings.
with open("conf.json", "r") as conf_handle:
    c = json.load(conf_handle)

sample_id = c["sample_id"]
print("Running on sample: {}".format(sample_id))

# Logging is configured from the keyword arguments stored in the config.
logging_kwargs = c["info"]["logging_config"]
logging.basicConfig(**logging_kwargs)

def and_log(s):
    """Log `s` at INFO level and return it unchanged.

    Returning the message lets a caller log it and use it (e.g. print it)
    in a single expression.
    """
    logging.info(s)
    return s

# Collects the results produced by this notebook.
j = {}

def load_notebook_output(notebook_num):
    """Return the parsed JSON output of an earlier notebook.

    The output is read from "<notebook_num>.json". If that file cannot be
    opened or read (IOError), a message is printed and an empty dict is
    returned instead.
    """
    outputfile = "{}.json".format(notebook_num)
    try:
        with open(outputfile, "r") as handle:
            return json.load(handle)
    except IOError:
        print("Error! Couldn't find output of previous notebook at {}".format(outputfile))
    return {}
    
# Announce the notebook in both the log and the cell output. The call form of
# print behaves the same under Python 2 and Python 3, unlike the print
# statement, and matches the other print(...) calls in this notebook.
print(and_log("8.25: Isoform Report"))
notebook_name = "8.25"

In [None]:
# Locate rsem-isoforms.results

def locate_in_secondary(pipeline_name, internal_path):
    """Build the path to a file inside the newest matching pipeline directory.

    pipeline_name -- glob-able basedir within secondary (format = PIPELINE-NAME-*)
    internal_path -- path to the file within that pipeline directory

    Returns internal_path joined onto the last match when the matches are
    sorted with LooseVersion, or False when nothing in the secondary directory
    matches. The returned path itself is not checked for existence.
    """
    pattern = os.path.join(c["dir"]["secondary"], pipeline_name)
    candidates = sorted(glob.glob(pattern), key=LooseVersion)
    if not candidates:
        return False
    # With several versions present, the last one is the most recent.
    return os.path.join(candidates[-1], internal_path)

rsem_isoforms_path = locate_in_secondary(
    "ucsc_cgl-rnaseq-cgl-pipeline-*", os.path.join("RSEM", "rsem_isoforms.results"))
# We'll only expect results if the file can be read. locate_in_secondary
# returns False when no pipeline directory matches, and os.access raises a
# TypeError when handed False, so check that a path was found first.
j["found_isoforms_file"] = (
    bool(rsem_isoforms_path) and os.access(rsem_isoforms_path, os.R_OK))

# Genes of interest: the notebook 8 leads whose assay is druggableUpOutlier,
# kept both as a list and as a comma-separated string.
nb8_results = load_notebook_output("8")
leads = nb8_results["automated_leads_identified"]
genes_of_interest_list = [gene for (lead, gene) in leads["results"].items()
                          if leads["assay"][lead] == "druggableUpOutlier"]
genes_of_interest = ",".join(genes_of_interest_list)

In [None]:
# Parameters for Rscript: short names that the Rscript cell magic expands as
# "$name" arguments, listed in the order the script receives them.
ref_files = c["ref_file"]
sid = sample_id
genes = genes_of_interest
enshugo = ref_files["ensembl_hugo_mapping_file"]
rir = rsem_isoforms_path
gtf = ref_files["treehouse_druggable_gencode"]  # Treehouse_druggable_gencode.v23.annotation.gtf.gz
outdir = c["dir"]["isoform_report_plots_dir"]  # isoform-report

print("Making isoform report for genes {}".format(genes))

In [None]:
# Create the output directory, tolerating the case where it is already there.
try:
    os.makedirs(outdir)
except OSError as e:
    print("Found error, but perhaps the dir simply already exists?\nError: {}".format(e))
    # Only an already-present directory is acceptable; re-raise anything else.
    if not os.path.isdir(outdir):
        raise
else:
    print("Made output dir {}".format(outdir))

Run the Rscript. If there is no rsem_isoforms file, the script fails to generate its output files; the missing files are caught in the following steps.


In [None]:
%%script Rscript - "$sid" "$genes" "$enshugo" "$rir" "$gtf" "$outdir"

# NOTE(review): `params` is not referenced again in the code visible here;
# presumably a leftover from the R Markdown source - confirm before removing.
params <- list(date = structure(18249, class = "Date"))

library(tidyverse)
library(Sushi)
library(knitr)
library(treemapify)
library(RColorBrewer)

# TRUE when the code is being knitted or run in an interactive session.
f_RMD <- isTRUE(getOption("knitr.in.progress")) || interactive()

if (f_RMD) {
  # Parameters when run interactively or knitted.
  genes_of_interest_hugo <- c("KIT", "PDGFRA")
  sample_of_interest <- "TH34_1349_S02"
  EnsGeneID_Hugo_Observed_Conversions_file <- "EnsGeneID_Hugo_Observed_Conversions.txt"
  rsem_isoforms.results_file <- "rsem_isoforms.results"
  gtf_file <- "Treehouse_druggable_gencode.v23.annotation.gtf.gz"
  output_dir <- "isoform-report"
} else {
  # Parameters when run as a script, taken from the command line in order.
  args <- commandArgs(TRUE)

  sample_of_interest <- args[1]
  genes_of_interest_hugo <- unlist(strsplit(args[2], ","))
  EnsGeneID_Hugo_Observed_Conversions_file <- args[3] # "EnsGeneID_Hugo_Observed_Conversions.txt"
  rsem_isoforms.results_file <- args[4] # "rsem_isoforms.results"
  gtf_file <- args[5] # "Treehouse_druggable_gencode.v23.annotation.gtf.gz"
  output_dir <- args[6] # "isoform-report"
}

# Echo the requested genes and sample in every run mode.
print(paste(genes_of_interest_hugo, collapse = "-"))
print(sample_of_interest)


# Load the Ensembl gene ID <-> HUGO name table, dropping rows with missing
# values.
ens_hugo_conversions <- read_tsv(EnsGeneID_Hugo_Observed_Conversions_file) %>%  na.omit

# Ensembl gene IDs whose HUGO name is among the requested genes.
genes_of_interest_ensembl <- ens_hugo_conversions$EnsGeneID[
  ens_hugo_conversions$HugoID %in% genes_of_interest_hugo]

# Isoform-level results, restricted to the genes of interest.
iso_results <- read_tsv(rsem_isoforms.results_file)

these_genes_iso_results <- iso_results %>% filter(gene_id %in% genes_of_interest_ensembl)

# Column names supplied for the annotation file, which is read without a
# header row.
gtf_colnames <- c("seqname", "source", "feature", "start", "end", "score", "strand", "frame", "attribute")

# gtf_file <- "gencode.v23.annotation.gtf.gz"
# gtf_file <- "Treehouse_druggable_gencode.v23.annotation.gtf.gz"

# Read the annotation, skipping lines that start with "#".
gencode_v23 <- read_tsv(gtf_file, comment = "#", col_names = gtf_colnames)

# Extract gene_id and transcript_id from the free-text attribute column and
# keep only features belonging to the genes of interest.
#  - gene_id: strip the leading `gene_id "` prefix, then drop everything from
#    the next double quote to the end of the string.
#    NOTE(review): the inner pattern has no capture group, so its "\\1"
#    replacement presumably acts as an empty string - confirm.
#  - transcript_id: the quoted value that follows `transcript_id`.
#  - feature_length: end - start; not used elsewhere in the code visible here.
gencode_v23_these_genes <- gencode_v23 %>%
  mutate(gene_id = gsub("\".*$", "", 
                        gsub("^gene_id \"", "\\1", attribute)),
         transcript_id = gsub("^.*transcript_id \"([A-Z0-9\\.]*)\".*$", 
                              "\\1", attribute),
         feature_length = end - start
  )  %>%
  filter(gene_id %in% genes_of_interest_ensembl)


# Transcript-level annotation rows only.
gencode_v23_these_genes_transcripts <- gencode_v23_these_genes %>%
  filter(feature == "transcript")

KVsep <- fixed("; ")  #key-value separator
Vsep <- fixed(" ")     #value separator

# Split each transcript's attribute string into key/value pairs and spread
# them into one column per key, giving one row per transcript.
# NOTE(review): at the `tag` filter the values still carry their surrounding
# double quotes (they are stripped two steps later), so the comparison with
# "basic" looks like it can never match and every tag row is dropped - confirm
# whether a `tag` column was intended to survive.
gencode_v23_these_genes_transcript_minutia <-  gencode_v23_these_genes_transcripts %>% 
  mutate(KVpairs = str_split(attribute, KVsep)) %>%
  unnest(KVpairs) %>%
  separate(KVpairs, into = c("key", "value"), Vsep) %>%
  filter( !(key == "tag" & value !="basic"))  %>% # keep tag only if basic
  filter(! key %in% c("transcript_id", "gene_id")) %>% # these value were already extracted
  mutate(value = gsub("\"", "", value)) %>%
  spread(key, value)


# Attach the per-transcript annotation to the isoform results (the
# annotation's own gene_id column is dropped before the join), then turn
# transcript_id into a factor whose levels are ordered by IsoPct.
these_genes_iso_results_anno <- these_genes_iso_results %>%
  left_join(gencode_v23_these_genes_transcript_minutia %>% 
             dplyr::select (-gene_id), 
            by="transcript_id") %>%
  mutate(transcript_id = fct_reorder(transcript_id, IsoPct))

# Keep the n_transcripts_to_analyze transcripts with the highest IsoPct for
# each gene. Ranking all requested genes together would let a few genes use up
# the whole quota, leaving other genes with no transcripts to plot, so the
# selection is made within each gene_id group. top_n() keeps ties, so a gene
# may contribute more rows than n_transcripts_to_analyze.
n_transcripts_to_analyze <- 10
top_iso_results_anno <- these_genes_iso_results_anno %>%
  group_by(gene_id) %>%
  top_n(n_transcripts_to_analyze, IsoPct) %>%
  ungroup()
  



# Error message recorded from an earlier run, kept for reference:
# Error in .f(.x[[i]], ...) : object 'transcript_type' not found
# Calls: %>% ... <Anonymous> -> vars_select_eval -> map_if -> map -> .f
#Execution halted
# NOTE(review): this appears to come from the select() of transcript_type
# below, presumably when the annotation join yielded no such column - confirm.

# Exon and UTR features of the genes of interest, joined to the expression
# values of the top transcripts and renamed into the columns that plot_gene
# and multi_plot_gene read (chrom, start, stop, gene, score, strand, type,
# ...). score is set to IsoPct, transcript_label is "<transcript_name>
# (<IsoPct>%)", and rows are sorted by descending IsoPct. Features whose
# transcript is not in top_iso_results_anno get NA for the joined columns.
exon_locations <- gencode_v23_these_genes %>%
  filter(feature %in% c("exon", "UTR")) %>%
  left_join(top_iso_results_anno %>% 
              dplyr::select(transcript_id, IsoPct, TPM, transcript_type, transcript_name, hugo_id = gene_name),
            by=c("transcript_id")) %>%
  mutate(score = IsoPct,
         transcript_label = paste0(transcript_name, " (",
                                   #IsoPct, "%, ", transcript_type, ")")) %>%
                                   IsoPct, "%)")) %>%
  dplyr::select(chrom = seqname, start, stop = end, 
                gene = transcript_id, score, 
                strand, type = feature, IsoPct, TPM, transcript_label, transcript_name, hugo_id) %>%
  arrange(desc(IsoPct))



# Draw the transcript structures in `submitted_bed_data` on the current
# graphics device using plotGenes, then add a genome axis and titles.
#
# submitted_bed_data: table with (at least) chrom, start, stop and score
#                     columns; it is converted to a data.frame first.
# buffer_size:        padding added on each side of the plotted region.
# plot_title:         main title of the plot.
plot_gene <- function(submitted_bed_data, buffer_size = 5e2, plot_title = "") {
  bed_data <- data.frame(submitted_bed_data)
  chrom <- bed_data$chrom[1]
  chromstart <- min(bed_data$start) - buffer_size
  chromend <- max(bed_data$stop) + buffer_size

  # Only colorby if there is more than one unique score or it will crash.
  # Alternatives tried previously and left disabled:
  #   colorby = log10(bed_data$score + 0.001)
  #   colorbycol = SushiColors(5), colorbyrange = c(0, 1.0)
  if (length(unique(bed_data$score)) != 1) {
    plotGenes(bed_data, chrom, chromstart, chromend,
              colorby = bed_data$score,
              labeltext = TRUE, maxrows = 50, height = 0.4,
              plotgenetype = "box", packrow = FALSE)
  } else {
    plotGenes(bed_data, chrom, chromstart, chromend,
              labeltext = TRUE, maxrows = 50, height = 0.4,
              plotgenetype = "box", packrow = FALSE)
  }

  labelgenome(chrom, chromstart, chromend, n = 3, scale = "Mb")
  # note: add legend has to be hand-placed for each plot, so I've omitted it here
  title(main = plot_title, sub = "Colored by isoform percent, also reported in label")
}



# Plot the transcripts of one gene, titled
# "Expressed <hugo_id> isoforms in <sample_of_interest>".
#
# bed_data: rows for a single gene; the gene name is taken from the first
#           hugo_id value.
#
# When knitting or interactive (f_RMD), the plot goes to the current device.
# Otherwise it is written into output_dir twice, with the title (spaces
# replaced by underscores) as the base file name:
#   - PDF: small file size, can be endlessly enlarged, inconvenient to embed
#     in html
#   - high res PNG: large file size, convenient to embed in html
multi_plot_gene <- function(bed_data) {
  this_title <- paste("Expressed", bed_data$hugo_id[1], "isoforms in", sample_of_interest)
  base_filename <- gsub(" ", "_", this_title)

  if (f_RMD) {
    plot_gene(bed_data, plot_title = this_title)
  } else {
    pdf(file = file.path(output_dir, paste0(base_filename, ".pdf")),
        width = 8, height = 4)
    plot_gene(bed_data, plot_title = this_title)
    dev.off()

    png(filename = file.path(output_dir, paste0(base_filename, ".png")),
        width = 12, height = 6, units = "in", res = 600)
    plot_gene(bed_data, plot_title = this_title)
    dev.off()
  }
}


# Features of transcripts with non-zero TPM, using the descriptive
# transcript_label as the `gene` column.
expressed_transcripts <- exon_locations %>%
  dplyr::filter(TPM > 0) %>%
  mutate(gene = transcript_label)

# Plot each gene's expressed transcripts separately.
lapply(
  group_split(group_by(expressed_transcripts, hugo_id)),
  multi_plot_gene
)



# Give each listed transcript biotype its own colour from the "Set1" palette.
transcript_biotypes <- c(
  "protein_coding",
  "processed_transcript",
  "retained_intron",
  "processed_pseudogene",
  "nonsense_mediated_decay",
  "transcribed_processed_pseudogene"
)
biotype_color_codes <- tibble(
  transcript_biotype = transcript_biotypes,
  color_code = brewer.pal(length(transcript_biotypes), "Set1")
)

# Named vector (biotype -> colour) used as the manual fill scale values.
transcript_biotype_colors <- setNames(
  biotype_color_codes$color_code,
  biotype_color_codes$transcript_biotype
)



# Save a treemap of the summed isoform percentage per transcript biotype for
# one gene.
#
# this_gene_iso_results_anno: annotated isoform results for a single gene;
#   the transcript_type, IsoPct and gene_name columns are read.
#
# Writes "Frequency_of_<gene_name>_transcript_biotypes_in_<sample>.png"
# (5 x 5) into output_dir and returns the value of ggsave().
plot_biotype_frequency <- function(this_gene_iso_results_anno) {
  biotype_freq <- this_gene_iso_results_anno %>%
    group_by(transcript_type) %>%
    summarize(total_isoform_pct_for_type = sum(IsoPct)) %>%
    mutate(biotype_label = paste0(transcript_type, " (", total_isoform_pct_for_type, "%)"))

  this_title <- paste("Frequency of", this_gene_iso_results_anno$gene_name[1],
                      "transcript biotypes in", sample_of_interest)
  base_filename <- gsub(" ", "_", this_title)

  biotype_treemap <- ggplot(biotype_freq,
                            aes(fill = transcript_type,
                                area = total_isoform_pct_for_type,
                                label = biotype_label)) +
    geom_treemap() +
    geom_treemap_text(colour = "white",
                      place = "centre") +
    labs(title = this_title) +
    scale_fill_manual(values = transcript_biotype_colors)

  # Hand the plot to ggsave() explicitly instead of relying on its default of
  # whatever plot was most recently created, so the saved file cannot pick up
  # a different plot.
  ggsave(file.path(output_dir, paste0(base_filename, ".png")),
         plot = biotype_treemap, width = 5, height = 5)
}



# One biotype treemap per gene.
lapply(
  group_split(group_by(these_genes_iso_results_anno, gene_name)),
  plot_biotype_frequency
)

# Expressed isoforms (TPM > 0) with log2(TPM + 1) rounded to two decimals,
# sorted by descending IsoPct and grouped by gene for per-gene output.
output_table <- these_genes_iso_results_anno %>%
  dplyr::filter(TPM > 0) %>%
  mutate(log2TPM1 = round(log2(TPM + 1), 2)) %>%
  dplyr::select(transcript_name, length, log2TPM1, strand, IsoPct,
                transcript_type, transcript_id, gene_name) %>%
  arrange(desc(IsoPct)) %>%
  group_by(gene_name)

list_of_output_tables <- group_split(output_table)

# When knitting or interactive, show each gene's table in the document.
if (f_RMD) {
  for (gene_table in list_of_output_tables) {
    print(kable(x = gene_table))
  }
}

# Write one "<gene_name>_isoform_expression.tsv" per gene; the result is
# assigned so nothing is printed.
for_silent_output <- lapply(list_of_output_tables, function(gene_table) {
  write_tsv(
    gene_table,
    file.path(output_dir, paste0(gene_table$gene_name[1], "_isoform_expression.tsv"))
  )
})


In [None]:
def image_to_json(path, filename):
    """Return a file's contents base64-encoded as text, or False if unreadable.

    path     -- directory containing the file
    filename -- name of the file within path

    The encoded data is decoded to a text string so that the result can be
    stored in a JSON document under Python 3 as well as Python 2 (on Python 3
    b64encode returns bytes, which json cannot serialize). If the file cannot
    be opened or read (IOError), a message is printed and False is returned.
    """
    try:
        with open(os.path.join(path, filename), "rb") as f:
            return base64.b64encode(f.read()).decode("ascii")
    except IOError:
        print("Couldn't read {}/{}; skipping".format(path, filename))
        return False
        
def downscale_png(png_path):
    """Shrink the PNG at png_path in place and report whether that worked.

    The image is opened, passed through thumbnail((1080, 540)) and saved back
    to the same path as PNG. Returns True on success; on IOError a message is
    printed and False is returned.
    """
    try:
        image = Image.open(png_path)
        image.thumbnail((1080, 540))
        image.save(png_path, "PNG")
    except IOError:
        print("Couldn't resize image at {}; skipping".format(png_path))
        return False
    return True

# File name patterns for the per-gene outputs of the R script.
# (An Expressed_${gene}_isoforms_in_${sample_id}.pdf is generated as well.)
expression_tsv = Template("${gene}_isoform_expression.tsv")
expressed_isoforms_png = Template("Expressed_${gene}_isoforms_in_${sample_id}.png")
frequency_png = Template("Frequency_of_${gene}_transcript_biotypes_in_${sample_id}.png")

In [None]:
# Downscale expressed isoform PNGs and import PNGs and table into JSON file
# any file not found will be imported as False

j["isoform_results"] = {}
for gene in genes_of_interest_list:
    gene_results = {}
    j["isoform_results"][gene] = gene_results

    expressed_png_name = expressed_isoforms_png.substitute(gene=gene, sample_id=sample_id)
    biotypes_png_name = frequency_png.substitute(gene=gene, sample_id=sample_id)

    # Only the expressed-isoforms PNG is downscaled before being loaded.
    downscale_png(os.path.join(outdir, expressed_png_name))

    # Load images into JSON - will be False if file not found
    gene_results["expressed_isoforms_img_data"] = image_to_json(outdir, expressed_png_name)
    gene_results["transcript_biotypes_img_data"] = image_to_json(outdir, biotypes_png_name)

    # Load the expression table as a list of ordered records, with every
    # value kept as a string.
    isoform_table_file = os.path.join(outdir, expression_tsv.substitute(gene=gene))
    try:
        table_frame = pd.read_csv(isoform_table_file, delimiter="\t", dtype="str",
                                  na_filter=False)
        isoform_table_json = json.loads(table_frame.to_json(orient="records"),
                                        object_pairs_hook=OrderedDict)
    except IOError:
        print("Couldn't read {}; skipping".format(isoform_table_file))
        isoform_table_json = False
    gene_results["isoform_table"] = isoform_table_json

# Store order of isoform keys for proper display in summary
j["isoform_table_key_order"] = [
    "transcript_name",
    "length",
    "log2TPM1",
    "strand",
    "IsoPct",
    "transcript_type",
    "transcript_id",
    "gene_name",
]

In [None]:
# Write everything collected in j to "<notebook_name>.json".
output_json = "{}.json".format(notebook_name)
with open(output_json, "w") as jsonfile:
    json.dump(j, jsonfile, indent=2)

print("Done.")