In addition to the pan-disease and first-degree-MSS cohorts, generate two personalized cohorts to determine the N-of-1 sample's outliers.

Cohorts are:
- N-of-1 diagnosed disease (when available)
- First and second degree most similar samples 

In [None]:
import os
import json
import numpy as np
import pandas as pd
import errno
import logging

# Setup, in order: read the run configuration, pull out the N-of-1 sample ID,
# then configure logging from that same configuration.
with open("conf.json", "r") as conf_file:
    c = json.load(conf_file)

sample_id = c["sample_id"]
print("Running on sample: {}".format(sample_id))

logging_settings = c["info"]["logging_config"]
logging.basicConfig(**logging_settings)
logging.info("\n2.2: Generate Additional Cohorts")
def and_log(s):
    """Log ``s`` at INFO level and return it unchanged.

    Returning the argument lets a message be logged and used (for example
    printed) in a single expression.
    """
    logging.info(s)
    return s

def mark_analysis_failed(text, level):
    """Record in the analysis-failed flag file that step 2.2 hit a problem.

    The flag file lives at ``c["file"]["flag_analysis_failed"]``. If it does
    not exist yet it is created; otherwise its contents are updated in place.

    text:  reason for the failure. It is appended verbatim (no separator is
           inserted) to any reason already stored under the "2.2" key.
    level: integer failure level. The file's "maxlevel" (stored as a string)
           is raised to this value when this value is higher.

    Raises IOError if the flag file exists but cannot be read.
    """
    flag_path = c["file"]["flag_analysis_failed"]
    try:
        with open(flag_path, "r") as jf:
            failed_json = json.load(jf)
    except IOError as e:
        # A missing flag file only means nothing has been flagged so far;
        # any other read error is a real problem and is propagated.
        if e.errno != errno.ENOENT:
            raise
        failed_json = {"reason": {}, "maxlevel": str(level)}

    if int(failed_json["maxlevel"]) < level:
        failed_json["maxlevel"] = str(level)

    reasons = failed_json["reason"]
    reasons["2.2"] = reasons.get("2.2", "") + text

    with open(flag_path, "w") as jf:
        json.dump(failed_json, jf, indent=2)


# Collects the values this step produces.
j = {}

# Input requires steps: 2.0
with open(c["json"]["2.0"], "r") as jf:
    json_2pt0 = json.load(jf)

# ID used for this sample on the compendium; it may differ from sample_id.
compendium_sample_alias = c["info"]["id_for_tumormap"]

if compendium_sample_alias != sample_id:
    print("Using the alias '{}' when searching for this sample on the compendium.".format(
        compendium_sample_alias))

First and second degree most correlated sample cohort.

Use the all-by-all matrix to get the second degree most correlated samples.
For each first degree sample S, get all samples which have a correlation to S of the similarity threshold or greater. 

Note that this includes S itself (with a correlation of 1), so we get all the first-degree samples as we aggregate the second-degree ones.

We use the cohort correlation threshold rather than tumormap as we are only working with cohort samples for these second degree samples.

In [None]:
%%time

# Load the all-by-all matrix from its TSV; the first column becomes the row index.
all_by_all = pd.read_csv(
    c["cohort"]["all_by_all_tsv"], sep="\t", index_col=0)

In [None]:
threshold = float(c["cohort"]["info"]["mcs_similarity_threshold"])
first_and_second_degree_samples = set()

# The first-degree samples are retrieved from the tumormap cohort, so nothing
# guarantees that each of them is a column of the all-by-all matrix. Check up
# front and fail with a KeyError that names every missing sample, instead of
# crashing part-way through the loop on a bare sample ID.
# TODO: decide how first-degree samples absent from the matrix should be handled.
missing_first_degree = [
    first for first in json_2pt0['first_degree_mcs_cohort']
    if first not in all_by_all.columns]
if missing_first_degree:
    raise KeyError(
        "First-degree samples not present in the all-by-all matrix: {}".format(
            ", ".join(str(name) for name in missing_first_degree)))

for first in json_2pt0['first_degree_mcs_cohort']:
    # Row labels whose correlation to this first-degree sample meets the threshold.
    firsts_mcs = all_by_all[all_by_all[first] >= threshold].index
    first_and_second_degree_samples.update(firsts_mcs)
    # The count is reduced by one on the assumption that `first` matches itself.
    print("First-degree sample {} has {} second-degree MCS above threshold".format(first, len(firsts_mcs) - 1))

j["first_and_second_degree_mcs_cohort"] = sorted(first_and_second_degree_samples)

Cohort: diagnosed disease.

Retrieve the N-of-1 sample's disease (when provided) and get all background samples that match it.

Then, when a roll-up cohort is provided, use that in preference to any diagnosis cohort; otherwise, use the collected samples as the N-of-1 disease cohort. (The N-of-1 sample, when present, is dropped from the cohort in a subsequent step.)

In [None]:
# Clinical table for the cohort. pandas' default NA strings are switched off,
# so '_' is the only value read as missing.
cohort_diseases = pd.read_csv(
    c["cohort"]["essential_clinical"],
    sep="\t", keep_default_na=False, na_values=['_'])

# If there's a disease, get the samples with that disease
if c["info"]["disease"]:
    same_disease = cohort_diseases[
        cohort_diseases["disease"] == c["info"]["disease"]]
    samples_same_disease = sorted(same_disease["th_sampleid"])
else:
    print("No disease was found for {}.".format(sample_id))
    samples_same_disease = []

# Get count of same-disease samples for future work (INCLUDING n-of-1 sample if present)
# this may be different from len of diagnosed disease cohort if a roll-up cohort is being used.
j["count_of_samples_same_disease"] = str(len(samples_same_disease))

# Then, figure out what the diagnosed disease cohort is.
# It's either the roll-up cohort, the samples w same disease, or empty.
if c["info"]["rollup"]:
    j["diagnosed_disease_cohort"] = sorted(c["info"]["rollup"])
    print("Using provided roll-up cohort with {} samples.".format(len(j["diagnosed_disease_cohort"])))

elif samples_same_disease:
    j["diagnosed_disease_cohort"] = samples_same_disease
    print("Found disease '{}'. Generating cohort with {} samples.".format(
        c["info"]["disease"], len(j["diagnosed_disease_cohort"])))

else:
    # The two literals are joined by implicit concatenation, so the space
    # between the sentences has to be written explicitly.
    print("No samples were found matching the focus sample's disease, and a roll-up cohort was not provided. "
          "The diagnosed-disease cohort will be empty.")
    j["diagnosed_disease_cohort"] = []

Next, drop the N-of-1 self-sample, as identified by id (or alias on the compendium if present), from these cohorts.

If the diagnosed disease cohort now has fewer than 20 samples, we'll omit it as being insufficient. (Roll-up cohorts with fewer than 20 samples are NOT omitted, but the analyst is alerted.)

In [None]:
MINIMUM_DIAGNOSIS_COHORT_THRESHOLD = 20

# Take the N-of-1 sample (under its compendium alias) out of both cohorts,
# remembering for the report below whether it was actually there.
if compendium_sample_alias in j["diagnosed_disease_cohort"]:
    j["diagnosed_disease_cohort"].remove(compendium_sample_alias)
    ddc_str = "removed from"
else:
    ddc_str = "not found in"

if compendium_sample_alias in j["first_and_second_degree_mcs_cohort"]:
    j["first_and_second_degree_mcs_cohort"].remove(compendium_sample_alias)
    fsd_str = "removed from"
else:
    fsd_str = "not found in"

print("N-of-1 sample {} was {} the diagnosed-disease cohort [Alias: {}]".format(
    sample_id, ddc_str, compendium_sample_alias))

print("N-of-1 sample {} was {} the first-and-second-degree-MCS cohort [Alias {}]".format(
    sample_id, fsd_str, compendium_sample_alias))

# Size check on the diagnosed-disease cohort. An undersized roll-up cohort is
# kept but flagged at level 1; an undersized disease cohort is flagged at
# level 4 and emptied.
diagnosis_cohort_size = len(j["diagnosed_disease_cohort"])

if diagnosis_cohort_size >= MINIMUM_DIAGNOSIS_COHORT_THRESHOLD:
    print("\nBuilt a diagnosed-disease cohort with {} sample IDs.".format(diagnosis_cohort_size))
elif c["info"]["rollup"]:
    rollup_cohort_too_small_message = (
        "The provided roll-up cohort has {} samples; this is below the minimum size of {}. "
        "This cohort has been used in analysis but its size may be too small to provide meaningful results."
    ).format(diagnosis_cohort_size, MINIMUM_DIAGNOSIS_COHORT_THRESHOLD)
    mark_analysis_failed(rollup_cohort_too_small_message, 1)
    print(rollup_cohort_too_small_message)
else:
    diagnosis_cohort_too_small_message = (
        "The N-of-1 disease cohort has {} samples; this is below the minimum size of {}. "
        "This cohort was omitted. Provide a roll-up cohort in the manifest.tsv for this sample and rerun."
    ).format(diagnosis_cohort_size, MINIMUM_DIAGNOSIS_COHORT_THRESHOLD)
    mark_analysis_failed(diagnosis_cohort_too_small_message, 4)
    print(diagnosis_cohort_too_small_message)
    j["diagnosed_disease_cohort"] = []

print("Built a first-and-second-degree-MCS cohort with {} sample IDs.".format(
    len(j["first_and_second_degree_mcs_cohort"])))

Create the tumormap legend for step 9: Take the union of all samples in all personalized cohorts, get all diseases, and add those to the legend.

Also store the full legend.


In [None]:
# Personalized cohorts:
all_cohorts = [
    #json_2pt0['first_degree_mcs_cohort'], # this is redundant with the 1st-and-2nd cohort in this context
    json_2pt0["pandisease_samples"],
    j["first_and_second_degree_mcs_cohort"],
    j["diagnosed_disease_cohort"]
]
# Union of the sample IDs of every personalized cohort.
all_neighbors = set().union(*all_cohorts)

diseases_df = cohort_diseases[
    cohort_diseases["th_sampleid"].isin(all_neighbors)][["th_sampleid", "disease"]]
# Distinct diseases among those samples, ignoring empty disease strings.
all_found_diseases = [
    disease for disease in diseases_df["disease"].unique().tolist()
    if disease != ""]

# Colors for just the diseases found above; stays empty if no color map can be read.
j["mss_disease_colors"] = {}

color_map_path = c["tumormap"]["disease_color_map"]
try:
    with open(color_map_path, "r") as f:
        disease_colors = json.load(f)
except IOError as e:
    # Carry on with an empty legend, but say so rather than failing silently.
    print("Could not read disease color map {} ({}) - the tumormap legend will be empty.".format(
        color_map_path, e))
    j["tumormap_legend"] = {}
else:
    # Store the full legend as well as the per-disease subset.
    j["tumormap_legend"] = disease_colors
    for disease_name in all_found_diseases:
        try:
            j["mss_disease_colors"][disease_name] = disease_colors[disease_name]
        except KeyError:
            print("Didn't find {} in map {} - skipping!".format(disease_name, color_map_path))

Get disease counts for each of the personalized cohorts.

In [None]:
# Cohort name -> sample IDs, for every cohort whose disease make-up is reported.
all_cohorts = {
    "first_degree_mcs_cohort": json_2pt0['first_degree_mcs_cohort'],
    "pandisease_samples": json_2pt0["pandisease_samples"],
    "first_and_second_degree_mcs_cohort": j["first_and_second_degree_mcs_cohort"],
    "diagnosed_disease_cohort": j["diagnosed_disease_cohort"],
}
j["personalized_cohort_counts"] = {}

for cohort_name, cohort_members in all_cohorts.items():
    # Disease entries of the clinical rows whose sample ID is in this cohort.
    in_cohort = cohort_diseases["th_sampleid"].isin(cohort_members)
    cohort_disease_items = cohort_diseases.loc[in_cohort, "disease"]
    disease_names, disease_counts = np.unique(cohort_disease_items, return_counts=True)
    j["personalized_cohort_counts"][cohort_name] = {
        "total": len(cohort_disease_items),
        "diseases": dict(zip(disease_names, disease_counts)),
    }

j["personalized_cohort_counts"]

In [None]:
# json output.
# Keys written by this notebook:
# first_and_second_degree_mcs_cohort: array of sample IDs
# diagnosed_disease_cohort: array of sample IDs (may be empty)
# count_of_samples_same_disease: count, stored as a string
# mss_disease_colors: disease name -> entry from the disease color map
# tumormap_legend: the full disease color map (empty if it could not be read)
# personalized_cohort_counts: per cohort, "total" and per-disease counts
output_path = c["json"]["2.2"]
with open(output_path, "w") as jsonfile:
    json.dump(j, jsonfile, indent=2)

print("Done!")