In [None]:
# This notebook creates conf.json, a JSON configuration file containing all the paths
# needed to analyze a sample, along with the sample, cohort, and Tumormap details and settings.
import os
import json
import errno
import logging
import hashlib # for md5 sums
import time
import pandas as pd

configuration_file_name = "conf.json"

# Reuse the basic conf info from a conf.json already present in the working directory.
# When there is none, start a fresh conf whose sample ID comes from the environment.
try:
    with open(configuration_file_name, "r") as existing_conf:
        c = json.load(existing_conf)
except IOError:
    # os.getenv yields either a string or None; None is replaced by a test value below.
    c = {
        "dir": {},
        "sample_id": os.getenv("TREEHOUSE_SAMPLE_ID"),
    }

# The sample ID stored in the conf must be a non-empty string.
# A missing or empty value is reported and swapped for a test sample ID.
if not c["sample_id"]:
    print("Sample ID can't be {}; setting it to a test value.".format(c["sample_id"]))
    c["sample_id"] = "TEST_SAMPLE_1"

print("Creating configuration file for sample {}".format(c["sample_id"]))

In [None]:
# Settings

# Keep an "info" section loaded from a previous conf (its git hash is checked later);
# only create it when absent. The "file" section is always rebuilt from scratch.
c.setdefault("info", {})
c["file"] = {}

# Analysis parameters, added to "info" in this order:
# - iqr_multiplier: interquartile range multiplier
# - proportion_unexpressed_filter_cutoff: for the expression filter, the proportion
#   that should have expression=0 for a gene to be dropped
# - variance_filter_cutoff: for the variance filter, the fraction of genes
#   (sorted by variance) to drop
c["info"].update([
    ("iqr_multiplier", 1.5),
    ("proportion_unexpressed_filter_cutoff", 0.8),
    ("variance_filter_cutoff", 0.2),
])

# Error messages surrounded by this delimiter will be printed in the console
c["error_delim"] = "(PRINTTHIS)"
c["info"]

In [None]:
# Paths
# Every path (to a dir or a file) that is used in more than one notebook should be defined here.

#### Base Paths ###

# Cohort dir
c["dir"]["cohort"] = os.path.join(os.sep, "work", "cohort")

# Reference files dir
c["dir"]["ref"] = os.path.join(os.sep, "work", "references")

### Sample-specific paths

c["dir"]["sample"] = "."  # use the CWD

# Where this configuration itself gets written
c["file"]["conf"] = os.path.join(c["dir"]["sample"], configuration_file_name)

### Secondary output for this sample
c["dir"]["secondary"] = os.path.join(os.sep, "work", "inputs", c["sample_id"], "secondary")

# Temporary dir for intra-notebook tmp files, eg, to be passed to R scripts
c["dir"]["temp"] = "."  # Testing : use the CWD?

## Original input file from rna-seq output
c["file"]["rsem_genes.results"] = os.path.join(c["dir"]["sample"], "rsem_genes.results")

# Log file for errors etc.
c["file"]["log"] = os.path.join(c["dir"]["sample"], "log.txt")

# Failure flag--if this json file is present, the tertiary has failed
# (but was not halted). Use this for a QC failure, for example.
# The format of the file is "reason": { dict with each step number as the key }, eg:
# { "reason" : { "2.0": "matches existing sample", "4.0": "not enough genes" } }
c["file"]["flag_analysis_failed"] = os.path.join(c["dir"]["sample"], "ANALYSIS_FAILED.json")

# The two literals are joined by implicit concatenation before .format() is applied,
# so the space separating the two clauses has to be part of the first literal.
print("If the tertiary output fails QC checks, "
      "check the {} file for details".format(c["file"]["flag_analysis_failed"]))


In [None]:
# Sanity check:
# Confirm that input dirs (secondary, references, cohort) are a) present and b) readonly.
# Raises IOError, with the message wrapped in the error delimiter, on the first directory
# that is missing or writable.
# The loop variable is not named `dir`: cell variables live in the shared notebook namespace,
# so that name would shadow the `dir` builtin for every later cell.

readonly_dirs = [c["dir"]["cohort"], c["dir"]["secondary"], c["dir"]["ref"]]
for readonly_dir in readonly_dirs:
    if not os.path.isdir(readonly_dir):
        raise IOError(c["error_delim"] + "Required directory {} isn't present.\n".format(readonly_dir) +
                      "Please mount this as a :ro volume to continue." + c["error_delim"])
    if os.access(readonly_dir, os.W_OK):
        raise IOError(c["error_delim"] +
                      "Volume {} is mounted read-write; refusing to continue.\n".format(readonly_dir) +
                      "Please mount this volume with :ro to enforce read-only." + c["error_delim"])

In [None]:
# Logging configuration: kept in the conf so that later notebooks can reuse it,
# and applied here to this notebook's root logger.
c["info"]["logging_config"] = {
    "filename": c["file"]["log"],
    "level": logging.INFO,
    "format": '%(message)s',
}
logging.basicConfig(**c["info"]["logging_config"])

In [None]:
# Logging: report header followed by the start time of this run
report_header = "Tertiary Analysis Results\n-------------------------"
logging.info(report_header)

now = time.strftime("%Y/%m/%d %H:%M %Z")
started_message = "Started: {}".format(now)
logging.info(started_message)

In [None]:
### Cohort-specific paths - files under c["cohort"]
c["cohort"] = {}
cohort_dir = c["dir"]["cohort"]

# Clinical data for cohort
c["dir"]["cohort_clinical"] = os.path.join(cohort_dir, "clinical")

# hd5 file containing the expression data for this cohort.
# Used for outlier analysis and, by default, for Tumormap placement.
# Not expression & variance filtered.
c["cohort"]["expression_hd5"] = os.path.join(cohort_dir, "cohort.hd5")

# Cohort percentiles file, for adding the percentile column to outlier analysis
c["cohort"]["percentiles"] = os.path.join(cohort_dir, "percentiles.hd5")

# All-by-all correlation matrix - for step 2.2
c["cohort"]["all_by_all_tsv"] = os.path.join(cohort_dir, "all_by_all_correlations.tsv")

# Essential clinical data - for step 2.6
c["cohort"]["essential_clinical"] = os.path.join(cohort_dir, "clinical.tsv")

# Compendium zero threshold, if the cohort provides one (a compendium v3 specific thing).
# It stays a string because it is only written out to json; "0" is used when the file
# cannot be read.
zero_threshold_file = os.path.join(cohort_dir, "cohort.zero.threshold.value.txt")
try:
    with open(zero_threshold_file, "r") as zt:
        zero_threshold = zt.read().rstrip()
except IOError:
    zero_threshold = "0"
c["info"]["cohort_zero_threshold"] = zero_threshold

In [None]:
# Sample-specific info from sample_info.json
# This replaces diagnosed_disease.txt and id_on_tumormap.txt
# The "alias", "disease" and "rollup" entries are read below; a falsy value means "not provided".
sample_info_json_file = os.path.join(c["dir"]["sample"], "sample_info.json")
try:
    with open(sample_info_json_file, "r") as f:
        sample_info = json.load(f)
        print("Loading sample info from json...")
except IOError as e:  # no json file?? Should always be present...
    print("Couldn't load sample_info.json: {}".format(e))
    raise

# The self-sample, if it already exists on the tumormap, may have a different ID than the ID used in
# processing. Set the ID that tumormap looks for here.
if sample_info["alias"]:
    c["info"]["id_for_tumormap"] = sample_info["alias"]
else:
    c["info"]["id_for_tumormap"] = c["sample_id"]

tid_str = "Expecting this sample to appear on Tumormap as '{}'".format(c["info"]["id_for_tumormap"])
print(tid_str)
logging.info(tid_str)

# Diagnosed disease, if available.
if sample_info["disease"]:
    c["info"]["disease"] = sample_info["disease"]
    print("Found a diagnosis of {} for this sample.".format(c["info"]["disease"]))
else:
    c["info"]["disease"] = None

# Roll-up cohort, if available.
# Check that all of the provided roll-up cohort samples are present in the cohort clinical file;
# raise IOError (wrapped in the error delimiter) listing any that are not.
if sample_info["rollup"]:
    rollup_samples = set(sample_info["rollup"])
    cohort_samples = set(pd.read_csv(c["cohort"]["essential_clinical"], sep="\t")["th_sampleid"])
    extra_samples_in_rollup = rollup_samples - cohort_samples
    if extra_samples_in_rollup:
        raise IOError(c["error_delim"] +
                      "Error: Found samples in the rollup cohort that are not in the compendium:\n" +
                      "{}".format(extra_samples_in_rollup) + c["error_delim"])
    # Stored sorted so the conf content does not depend on set iteration order
    c["info"]["rollup"] = sorted(rollup_samples)
else:
    c["info"]["rollup"] = None
        

In [None]:
# Tumormap placement-specific paths & info
# For step 2.0

c["dir"]["tumormap"] = os.path.join(c["dir"]["cohort"], "tumormap")


def _in_tumormap_dir(filename):
    """Return the path of `filename` inside the tumormap directory."""
    return os.path.join(c["dir"]["tumormap"], filename)


# All files live in the 'tumormap' subdirectory of the reference cohort.
# To use an alternate tumormap background cohort, mount a docker volume that aliases that subdirectory.
# Either or both of the background hdf or tsv files may be present; a tsv will be used in preference to hdf.
# If the HDF is used, its genes will be filtered by the gene filter file. If the TSV is used, it will not
# be filtered; the expectation is that expression-variance filters have already been applied.

c["tumormap"] = {}
c["tumormap"]["background_hdf"] = _in_tumormap_dir("tumormap_expression.hd5")
c["tumormap"]["background_tsv"] = _in_tumormap_dir("tumormap_expression.tsv")

# Info for tumormap.
# Should contain keys: url, name, expression_md5, mcs_similarity_threshold
tumormap_info_file = _in_tumormap_dir("tumormap_info.json")
with open(tumormap_info_file, "r") as f:
    c["tumormap"]["info"] = json.load(f)

# Tumormap-specific list of clinical data. Must have at minimum:
# - sample column labeled "th_sampleid"
# - disease column labeled "disease"
# - age at diagnosis column labeled "age_at_dx"
# This list contains the samples on the tumormap (which may be a superset or subset of those
# in the outlier cohort). It's used to determine the pan-disease cohort based on tumormap placement.
c["tumormap"]["essential_clinical"] = _in_tumormap_dir("clinical.tsv")

# Expression & variance filters were applied to tumormap_expression.hd5, and the list of genes that
# passed the filters was retained. Thus, when we use the (unfiltered) tumormap_expression.hd5 cohort
# for tumormap placement, we can recreate the filter results.
# proportion unexpressed 0.8, variance cutoff 0.2
c["tumormap"]["filtered_genes_to_keep"] = _in_tumormap_dir("filtered_genes_to_keep.tsv")

# Euclidean positions of the cohort, for visual tumormap placement
c["tumormap"]["xy_coords"] = _in_tumormap_dir("assignments0.tab")

# Tumormap disease-to-color json file (optional)
c["tumormap"]["disease_color_map"] = _in_tumormap_dir("tumormap_disease_colors.json")

In [None]:
# Info for the compendium, loaded from compendium_info.json in the cohort dir.
# Should contain keys:
# - description: free text/HTML for the summary document
# - name
# - expression_md5: the md5sum of the cohort.hd5 file
# - mcs_similarity_threshold: the threshold for when an MCS is sufficiently similar (95% percentile)
compendium_info_file = os.path.join(c["dir"]["cohort"], "compendium_info.json")
with open(compendium_info_file, "r") as f:
    c["cohort"]["info"] = json.load(f)

In [None]:
# Intermediate outputs, reference files and final report paths


def _in_sample_dir(*parts):
    """Join the given path components onto the sample directory."""
    return os.path.join(c["dir"]["sample"], *parts)


def _in_ref_dir(*parts):
    """Join the given path components onto the references directory."""
    return os.path.join(c["dir"]["ref"], *parts)


# 1 - convert tpm hugo

# INPUT single-column TPM file with non-unique hugo names
c["file"]["tpm_hugo"] = _in_sample_dir("rsem.genes.tpm.hugo.tab")

# OUTPUT TPM hugo file normalized with log2(n+1) and unique hugo names
# This is the canonical "sample expression" file
c["file"]["tpm_hugo_norm_uniq"] = _in_sample_dir("rsem.genes.tpm.hugo.log2plus1.dedupe.tab")

# REFERENCE

# Reference files that are constant across submissions
c["ref_file"] = {}

# Ensembl IDs, 1 per line, first line is gene_id
c["ref_file"]["ensembl_id_list"] = _in_ref_dir("ensembl_ids.txt")

# Header entries from rsem_genes.results, 1 per line, first line is gene_id
c["ref_file"]["rsem_genes.results_header"] = _in_ref_dir("rsem_genes.results.header.txt")

# First column: Hugo, ensembl_hugo_NA_key (ie, "NA") if none. Second column: ensembl ID
c["ref_file"]["ensembl_hugo_mapping_file"] = _in_ref_dir("EnsGeneID_Hugo_Observed_Conversions.txt")

# col 1: Sample ID. col 2: comma-separated list of mutations in that sample
c["ref_file"]["TCGA_non_silent_cancer_mutations_by_sample"] = _in_ref_dir(
    "TH_THR_and_TARGET_NonSilentMutationsInCancerGenesBySample_2019-03-07_03.59.59PM.txt")

# Input pathway file msigdb.v5.2.symbols.gmt
c["ref_file"]["msigdb_pathway_file"] = _in_ref_dir("msigdb.v5.2.symbols.gmt")

# col 1: Pathway name (string); col 2: blank or pathway group. No header line.
c["ref_file"]["curated_pathways_druggable_genes"] = _in_ref_dir(
    "tertiary-references", "curatedPathwaysContainingFDA_druggableGenes.txt")

# col 1: gene (hugo symbol); col 2: group (string)
c["ref_file"]["druggable_genes_by_category"] = _in_ref_dir(
    "tertiary-references", "treehouseDruggableGenes_2019-06-12.txt")

# qc_reporting reference ranges
c["ref_file"]["reference_ranges"] = _in_ref_dir(
    "qc_report", "QC_reference_ranges_2019_04_23.tsv")

# qc_reporting rin score
c["ref_file"]["redcap_download_rinscore"] = _in_ref_dir(
    "qc_report", "redcap_combined_sample_donor_data_downloaded_via_API_2019-02-06_06.19.14PM.txt")

# The expression placed against tumormap is normally the standard n-of-1 expression file
# generated by step 1. Sometimes an alternate expression file has to be used instead
# (eg if it is batch corrected). A file present at this path in the sample dir means that
# it should be used as the expression to place against tumormap; otherwise the processed
# sample expression file is used.
c["file"]["alternate_tumormap_expression"] = _in_sample_dir("alternate_expression_for_tumormap.tsv")

# 2.0 output : TSV for tumormap attribute of sample->compendium similarity
# (a bare file name, not joined onto the sample dir)
c["file"]["sample_vs_compendium_tumormap_attribute"] = "Correlations_{}_vs_{}.tsv".format(
    c["sample_id"], c["cohort"]["info"]["name"])

# 2.5 - tumormap report details - report file
c["file"]["tumormap_report"] = _in_sample_dir("{}.tumormap_report.txt".format(c["sample_id"]))

# 2.6 - Most similar samples mutation report

# 3 - make thresholds

# 4.0 - outlier analysis
c["file"]["outlier_results"] = _in_sample_dir("outlier_results_{}".format(c["sample_id"]))

# 4.5 - expression plots. Output folder for the pngs.
c["dir"]["gene_expression_plots_dir"] = _in_sample_dir("expression_plots")

# 5 - gsea dgidb - pathway and gene files for gsea investigate gene sets

gsea_dir = "GSEA"

# Dictionary: keys are hugo symbols, values are (dgidb found gene name, [list of drugs])
# see analysis-methods for creation scripts
c["ref_file"]["dgidb_genes_and_drugs"] = _in_ref_dir("HugoGenesVsDGIdbDrugs.json")

# Pathways with corresponding gene lists
c["ref_file"]["h_symbols_pathway"] = _in_ref_dir(gsea_dir, "h.all.v6.1.symbols.gmt")
c["ref_file"]["c2_cp_symbols_pathway"] = _in_ref_dir(gsea_dir, "c2.cp.v6.1.symbols.gmt")
c["ref_file"]["msigdb_symbols_pathway"] = _in_ref_dir(gsea_dir, "msigdb.v6.1.symbols.gmt")

# Pathway Descriptions
c["ref_file"]["h_symbols_pathway_description"] = _in_ref_dir(
    gsea_dir, "h.all.v6.1.symbols.DESCRIPTIONS.gmt")
c["ref_file"]["c2_cp_symbols_pathway_description"] = _in_ref_dir(
    gsea_dir, "c2.cp.v6.1.symbols.DESCRIPTIONS.gmt")

# Gene Descriptions
c["ref_file"]["hgnc_gene_description"] = _in_ref_dir(
    gsea_dir, "HGNC_gene_descriptions_2018-11-30.tsv")

# 7 - pathway xls: the output files of step 7, all in the sample dir
c["file"]["7_out"] = {}
step_7_outputs = [
    ("all_gene_aggregation", "allGeneAggregation.txt"),
    ("druggable_gene_aggregation", "druggableGeneAggregation.txt"),
    ("gene_set_aggregation", "GeneSetAggregation.txt"),
    ("gene_set_details_per_list", "GeneSetDetailsPerList.txt"),
    ("gsea_dgidb_output_excel", "{}_gsea_dgidb_output.xlsx".format(c["sample_id"])),
]
for output_key, output_filename in step_7_outputs:
    c["file"]["7_out"][output_key] = _in_sample_dir(output_filename)

# Step 8 - generate automated leads
c["file"]["automated_leads_identified"] = _in_sample_dir("automatedLeadsIdentified.tsv")

# Step 9 - generate report
# input - templates to use
# output - final html and files for summary and slides
c["info"]["summary_template_name"] = "summary.template"
c["file"]["summary_html"] = _in_sample_dir("Summary.html")
c["info"]["slides_template_name"] = "slides.template"
c["file"]["slides_html"] = _in_sample_dir("Slides.html")

In [None]:
# Paths to the JSON file of each step: "<step>.json" in the sample dir, keyed by step number
step_numbers = ["1", "2.0", "2.2", "2.5", "2.6", "3", "4.0", "4.25", "4.5", "5", "7", "8", "8.5"]

c["json"] = {}
for step_number in step_numbers:
    c["json"][step_number] = os.path.join(c["dir"]["sample"], "{}.json".format(step_number))

In [None]:
# Calculate the md5 for the original rsem_genes.results file and store its hex digest in the conf.
# The file is read in binary mode: hash objects only accept bytes, so feeding them lines
# read in text mode raises a TypeError on Python 3. Reading fixed-size chunks also keeps
# memory use bounded whatever the file looks like.
sample_md5 = hashlib.md5()
# TODO - capture error on file nonexistence and fail analysis
with open(c["file"]["rsem_genes.results"], "rb") as f:
    # iter() with a sentinel keeps calling f.read until it returns b"" at end of file
    for chunk in iter(lambda: f.read(1024 * 1024), b""):
        sample_md5.update(chunk)
c["info"]["rsem_genes.results_md5"] = sample_md5.hexdigest()

In [None]:
# Protocol version info : log and add to conf
# If /app isn't a git dir, see if we have an existing conf item with the git dir
# If we can't do that either (running manually from scratch?), mark as unknown.
# NOTE TODO: This is hardcoded to check /app ; it will be inaccurate if not running
# from inside docker. Not sure of best way to find "canonical" git dir for a run.
#
# The `!git ...` lines are IPython shell captures, so this cell only runs under IPython/Jupyter.
# `.n` on a captured result is read here as the command output in a single string
# (NOTE(review): per IPython's SList, lines joined by newlines - confirm).

# Ask git whether /app is inside a work tree; anything other than "true" takes the fallback branch
in_git_dir=!git -C /app rev-parse --is-inside-work-tree
if in_git_dir.n == "true":
    # Hash of the currently checked-out commit
    git_hash = !git -C /app rev-parse HEAD
    git_hash = git_hash.n
    # URL of the "origin" remote, with a trailing ".git" stripped so it can be extended below
    git_base_url = !git -C /app config --get remote.origin.url
    git_base_url = git_base_url.n
    if git_base_url.endswith('.git'):
        git_base_url = git_base_url[:-4]
    # The URL is built from the `git describe --tags` output, not from the commit hash
    git_tag = !git -C /app describe --tags
    git_url = "{}/tree/{}".format(git_base_url, git_tag.n)
else:
    try:
        # Not a git dir: reuse the hash and url kept from a previously loaded conf, marked as modified
        git_hash="CUSTOM; modified from {}".format(c["info"]["git_hash"])
        git_url=c["info"]["git_url"]
    except KeyError:
        # No earlier git info in the conf either: mark as unknown and point at the protocol repository
        git_hash="CUSTOM"
        git_url="https://github.com/UCSC-Treehouse/protocol"
    
logging.info("Protocol git hash: {}".format(git_hash))
logging.info("This version: {}".format(git_url))

# Record the version info in the conf
c["info"]["git_hash"] = git_hash
c["info"]["git_url"] = git_url

In [None]:
# Secondary quality-control results
# JSON file as generated / copied by run.py. Keys are:
# input uniqMappedNonDupeReadCount estExonicUniqMappedNonDupeReadCount qc
# When the file cannot be opened, an empty dict is stored instead.

try:
    with open("bam_umend_qc.json", "r") as qc_file:
        secondary_qc = json.load(qc_file)
except IOError:
    secondary_qc = {}
c["info"]["secondary_qc"] = secondary_qc

print(c["info"]["secondary_qc"])

In [None]:
# Logging: cohort version info - names first, then the md5sums of the inputs
cohort_info = c["cohort"]["info"]
tumormap_info = c["tumormap"]["info"]

logging.info("Outlier compendium: {}".format(cohort_info["name"]))
logging.info("Tumormap background cohort: {}".format(tumormap_info["name"]))

sample_md5_message = "rsem_genes.results md5sum: {}".format(c["info"]["rsem_genes.results_md5"])
logging.info(sample_md5_message)
tumormap_md5_message = "Tumormap compendium md5sum (may be same as outlier): {}".format(
    tumormap_info["expression_md5"])
logging.info(tumormap_md5_message)
cohort_md5_message = "Outlier analysis compendium (cohort.hd5) md5sum: {}".format(
    cohort_info["expression_md5"])
logging.info(cohort_md5_message)

In [None]:
# Print the conf: as the last expression of the cell, the complete conf dict is shown
# in the cell output so it can be reviewed before it is written to disk.
c

In [None]:
# Write the conf file as indented JSON to the path recorded in the conf itself
conf_path = c["file"]["conf"]
with open(conf_path, "w") as conf_out:
    json.dump(c, conf_out, indent=2)

print("Wrote json conf to {}".format(conf_path))
print("Done!")