
Generate Threshold files
March 1, 2017

Based on : MakeThresholdsAndExprVarFilteredList-pancan-and-pandisease.v3.ipynb

Create PanCancer & PanDisease threshold files for a single sample, using the cohort sample lists
generated in steps 2.0 and 2.2.

Also creates: self-disease, first-degree MCS, and first-and-second-degree MCS thresholds.

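If a cohort's thresholds can't be created (for example, the cohort has too few samples), its threshold entry is written as an empty object and its filtered-gene list as an empty list.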

In [None]:
import os
import pandas as pd
import json
import requests
import numpy as np
import math
import csv
import logging
import errno

# Setup: load conf, retrieve sample ID, logging
# conf.json is read from the current working directory. The parsed settings
# are kept in the module-level name `c`, which the rest of this notebook
# reads for file paths and parameters.
with open("conf.json","r") as conf:
    c=json.load(conf)
sample_id = c["sample_id"]    
print("Running on sample: {}".format(sample_id))

# The "logging_config" mapping from the config is expanded into keyword
# arguments for logging.basicConfig, so log destination/level/format are
# whatever conf.json specifies.
logging.basicConfig(**c["info"]["logging_config"])
logging.info("\n3: Generate Threshold Files")
def and_log(s):
    """Log `s` at INFO level via the root logger and return it unchanged.

    Returning the argument lets a message be logged and used (e.g. printed
    or stored) in a single expression.
    """
    logging.info(s)
    return s

# If the analysis failed, create (if necessary) the flag file and
# add to it the reason it failed; increase max fail level if necessary.
def mark_analysis_failed(text, level):
    """Record a failure reason for this step in the analysis-failed flag file.

    The flag file (path: c["file"]["flag_analysis_failed"]) is a JSON object
    with two keys:
      "reason"   - mapping from step name to accumulated failure text
      "maxlevel" - the highest failure level seen so far, stored as a string

    Parameters:
      text  - failure text; appended to any text already stored under
              reason key "3" (this step).
      level - integer failure level; "maxlevel" is raised to it when it
              exceeds the stored value.

    Raises:
      IOError - if the flag file cannot be read for any reason other than
                not existing yet, or cannot be written.
    """
    flag_path = c["file"]["flag_analysis_failed"]
    try:
        with open(flag_path, "r") as jf:
            failed_json = json.load(jf)
    except IOError as e:
        # Only "file does not exist" means "start a fresh record";
        # any other I/O problem is a real error and must propagate.
        if e.errno != errno.ENOENT:
            raise
        failed_json = {"reason": {}, "maxlevel": str(level)}

    if int(failed_json["maxlevel"]) < level:
        failed_json["maxlevel"] = str(level)

    # Append to this step's existing reason text, or start it.
    reasons = failed_json["reason"]
    reasons["3"] = reasons.get("3", "") + text

    with open(flag_path, "w") as jf:
        json.dump(failed_json, jf, indent=2)

#### Configuration ####

# Multiplier applied to the interquartile range when computing the high and
# low thresholds (passed to make_thresholds).
iqr_multiplier=c["info"]["iqr_multiplier"]
# Expression filter cutoff, as a fraction of the cohort's samples: a gene is
# kept only when the number of samples in which it is at or below
# zero_threshold is strictly less than (number of samples * this fraction).
proportion_unexpressed = c["info"]["proportion_unexpressed_filter_cutoff"]
# Drop this proportion of the lowest-varying genes
filter_level = c["info"]["variance_filter_cutoff"]

# IMPORTANT
# What number is considered zero for the purposes of the "0 expression" cutoff?
# This - or anything below - will count as 0.
# See https://github.com/UCSC-Treehouse/operations/issues/25#issuecomment-268610013
# The value now comes from the config; a previously hard-coded value was 0.0001443.
zero_threshold = np.float64(c["info"]["cohort_zero_threshold"])
print("Threshold Zero Value for this cohort:{}".format(zero_threshold))

# Input requires steps: 2.0, 2.2
# The output JSON of each earlier step is loaded whole; the cohort sample
# lists are pulled out of them further down.
with open(c["json"]["2.0"],"r") as jf:
        json_2pt0 = json.load(jf)
        
with open(c["json"]["2.2"],"r") as jf:
        json_2pt2 = json.load(jf)
        
# Output json: accumulates this step's results and is written to
# c["json"]["3"] at the end of the notebook.
j = {}

In [None]:
# Functions

# Takes : pandas dataframe, float iqr_multiplier
# returns a pandas dataframe - columns high, median, low
def make_thresholds(dataframe, iqr_mult):
    """Compute per-row outlier thresholds from the row's quartiles.

    Quantiles are taken along axis=1, i.e. across the columns of each row.

    Parameters:
      dataframe - pandas DataFrame of numeric values.
      iqr_mult  - how many interquartile ranges beyond the outer quartiles
                  the high/low thresholds sit.

    Returns:
      DataFrame indexed like `dataframe`, with columns
        "high"   = 75% quantile + iqr_mult * IQR
        "median" = 50% quantile
        "low"    = 25% quantile - iqr_mult * IQR
      where IQR is the 75% quantile minus the 25% quantile.
    """
    # One row per requested quantile (25%, 50%, 75%), one column per input row.
    quartiles = dataframe.quantile(q=[0.25, 0.5, 0.75], axis=1)
    lower_quartile = quartiles.loc[0.25]
    middle = quartiles.loc[0.50]
    upper_quartile = quartiles.loc[0.75]

    # Distance from each outer quartile to its threshold.
    margin = iqr_mult * (upper_quartile - lower_quartile)

    return pd.concat(
        {
            "high": upper_quartile + margin,
            "median": middle,
            "low": lower_quartile - margin,
        },
        axis=1)

# Takes : dataframe, float proportion unexpressed, float filter level
# float zero_threshold -- anything <= to this value is considered zero
# returns a pandas Series whose index are the genes to KEEP
def make_expr_var_filters(dataframe, proportion_unexpressed, filter_level, zero_threshold):
    """Select the rows (genes) that pass the expression and variance filters.

    Parameters:
      dataframe              - pandas DataFrame; each row is filtered on its
                               values across the columns.
      proportion_unexpressed - fraction of the columns; a row is kept only
                               when its count of values <= zero_threshold is
                               strictly less than (column count * fraction).
      filter_level           - fraction of the expression-filtered rows to
                               discard, lowest spread first (the number
                               discarded is rounded up).
      zero_threshold         - values at or below this count as zero.

    Returns:
      pandas Series of the per-row spread (as computed by applying np.std
      row-wise) for the surviving rows, largest first. Its index holds the
      rows to KEEP.

    Prints the number of rows remaining after each filter.
    """
    ### Expression Variance Filters

    # A row must have strictly fewer "zero" values than this to be kept.
    max_ok_zeroes = len(dataframe.columns) * proportion_unexpressed

    # Count the at-or-below-threshold values of every row in one vectorized
    # pass. NaN compares False, so missing values are not counted as zero.
    zero_counts = (dataframe <= zero_threshold).sum(axis=1)
    sufficiently_expressed = zero_counts < max_ok_zeroes

    # Next, do variance filtering
    expression_filtered_compendium = dataframe[sufficiently_expressed]
    print("{} genes remain after expression filter".format(
        len(expression_filtered_compendium)))

    # Get the standard deviation of each remaining row.
    # NOTE(review): kept as apply(np.std) on purpose - whether this yields the
    # population or the sample standard deviation depends on how the installed
    # pandas dispatches np.std; confirm before swapping in DataFrame.std.
    variance = expression_filtered_compendium.apply(np.std, axis=1)
    cut_proportion = int(math.ceil(len(variance) * filter_level))
    keep_proportion = len(variance) - cut_proportion
    expression_and_variance_filtered = variance.nlargest(keep_proportion)
    print("{} genes remain after variance filter".format(
        len(expression_and_variance_filtered)))
    return expression_and_variance_filtered

In [None]:
%%time

# Load the expression
# The cohort expression table is read from the HDF5 file named in the config.
# Later cells select its columns by sample ID and compute statistics across
# each row.

# If this fails with 'no module named tables', uncomment and rerun:
#!pip2 install --quiet tables

expression = pd.read_hdf(c["cohort"]["expression_hd5"])

In [None]:
# Combine the sample lists from the 2.0 and 2.2 jsons.
# Keys are the short cohort names used throughout the rest of this notebook
# (and as prefixes of the output JSON keys); values are the sample lists
# taken from the earlier steps' output.
sample_lists ={
    # From step 2.0
    "pancan": json_2pt0["pancan_samples"],
    "pandis": json_2pt0["pandisease_samples"],
    "first_degree": json_2pt0["first_degree_mcs_cohort"],
    # From step 2.2
    "first_and_second_degree": json_2pt2["first_and_second_degree_mcs_cohort"],
    "nof1_disease": json_2pt2["diagnosed_disease_cohort"]
}

In [None]:
%%time

# Filter the expression by the sample lists. Abort if there's any sample in the list
# that isn't found in the expression - this shouldn't be allowed to ever happen

# Drop cohorts with fewer than a threshold (currently 20) samples

MINIMUM_DIAGNOSIS_COHORT_THRESHOLD = 20

# cohort name -> expression columns for that cohort's samples,
# or an empty list when the cohort was skipped for being too small.
expression_lists = {}
# Names of the cohorts skipped for being too small; reported further down.
cohorts_without_samples = []
for listname, samples in sample_lists.items():
    if len(samples) < MINIMUM_DIAGNOSIS_COHORT_THRESHOLD:
        print("{} found {} samples (minimum is {}). Skipping this cohort.".format(
            listname,
            len(samples),
            MINIMUM_DIAGNOSIS_COHORT_THRESHOLD))
        expression_lists[listname] = []
        cohorts_without_samples.append(listname)
    elif len(expression.columns.intersection(samples)) != len(samples):
        # Format the message BEFORE printing it: calling .format on the
        # result of print(...) only works when print is a statement.
        print("Error! At least one sample from the {} list was not found in the expression table!".format(listname))
        # NOTE(review): KeyboardInterrupt is used here to abort the notebook
        # run; the exception type is left unchanged in case the runner relies on it.
        raise KeyboardInterrupt
    else:
        expression_lists[listname] = expression.loc[:, samples]

In [None]:
# Make some alerts / errors if any cohorts are missing samples.

# cohort name -> (human-readable cohort description, likely explanation)
error_reasons = {
    "pancan":("pancancer", "This should never happen!"),
    "pandis": ("disease of top6 MCS above threshold", 
               "This is typically because there were too few samples that passed the " +
               "95% similarity threshold."),
    "first_degree": ("first degree MCS",
                     "This is typically because there were too few samples that passed the " +
                     "95% similarity threshold."),
    "first_and_second_degree": ("first and second degree MCS",
                     "This is typically because there were too few samples that passed the " +
                     "95% similarity threshold."),
    "nof1_disease": ("focus sample's disease", "This is typically because there was no disease provided " +
                     "or there were too few samples with the focus sample's disease")
}

# One failure entry per skipped cohort. A missing pancan cohort is recorded
# at level 4; any other missing cohort at level 2.
for cohort in cohorts_without_samples:
    longname, explanation = error_reasons[cohort]
    text = "<br/>ERROR: There are no samples in the {} cohort ({}). {}".format(longname, cohort, explanation)
    if cohort == "pancan":
        mark_analysis_failed(text, 4)
    else:
        mark_analysis_failed(text, 2)
    print(text)

# Additional summary, recorded at level 4, keyed by how many cohorts were
# skipped. Only counts of exactly 2, 3 or 4 have a summary message.
summary_by_missing_count = {
    4: ("<br/>ERROR: In total, there were 4 cohorts with insufficent samples. There are no outliers "
        "from personalized cohorts for this sample, although there may be pan-cancer outliers."),
    3: ("<br/>ERROR: In total, there were 3 cohorts with insufficent samples. No consensus outliers "
        "are available, although there may be pan-cancer outliers. Outliers for the single "
        "personalized cohort can be found in the outliers file but were not further analyzed. "),
    2: ("<br/>ERROR: In total, there were 2 cohorts with insufficient samples.<br/>"
        "Any outliers from the remaining cohorts are not consensus outliers, but are reported "
        "in case a more weakly-supported finding is desired. Only consider  leads based on "
        "these results if you thoroughly understand the weaknesses of the data and report the "
        "weakness to the clinician."),
}

summary = summary_by_missing_count.get(len(cohorts_without_samples))
if summary is not None:
    text = summary
    mark_analysis_failed(text, 4)
    print(text)

In [None]:
%%time

# Get the thresholds and filtered genes. Skip empty cohorts.
# Both dicts are keyed by cohort name. A skipped cohort (stored upstream as an
# empty list) gets an empty list in each, so later cells can test len().
thresholds = {}
filters = {}
for listname, expression_values in expression_lists.items():
    if len(expression_values):
        thresholds[listname] = make_thresholds(expression_values, iqr_multiplier)
        filters[listname] = make_expr_var_filters(
            expression_values, proportion_unexpressed, filter_level, zero_threshold)
    else:
        thresholds[listname] = []
        filters[listname] = []

In [None]:
# Format thresholds for output. For every cohort name two keys are added to
# the output json `j`:
#   <cohort>_thresholds     - thresholds table, each number rendered as a
#                             string with up to 15 significant digits;
#                             an empty object when the cohort has none
#   <cohort>_filtered_genes - list of the index labels that passed the
#                             filters; an empty list when the cohort has none
# i.e. pancan_*, pandis_*, first_degree_*, first_and_second_degree_*,
# nof1_disease_*.

for listname in sample_lists:
    threshold_key = "{}_thresholds".format(listname)
    filtered_key = "{}_filtered_genes".format(listname)

    cohort_thresholds = thresholds[listname]
    if len(cohort_thresholds) == 0:
        j[threshold_key] = {}
    else:
        # Stringify first, then round-trip through to_json to obtain plain
        # nested dicts keyed by column.
        threshold_stringdata = cohort_thresholds.applymap(lambda x: "%.15g" % x)
        j[threshold_key] = json.loads(threshold_stringdata.to_json(orient='columns'))

    cohort_filter = filters[listname]
    if len(cohort_filter) == 0:
        j[filtered_key] = []
    else:
        j[filtered_key] = cohort_filter.index.values.tolist()

In [None]:
# Write output
# The accumulated results in `j` are written as indented JSON to the path
# configured for this step (c["json"]["3"]).
with open(c["json"]["3"], "w") as jsonfile:
    json.dump(j, jsonfile, indent=2)
    
print("Done.")