Create the standard "Tumormap Report", including links to TumorMap URLs.
The report text is printed in this script's output as well as appended to the tumormap report file.


Modified from:
report_on_local_neighborhoods.py by Yulia Newton

md5 of original script:
ac61f51b7dee830df54d8d59608c1c45  report_on_local_neighborhoods.py

#### Inputs
Depends on steps:
* 2.0
    - json tumormap_results



#### Outputs

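JSON results include the keys below (the centroid and URL keys are written only when the coordinates and the TumorMap URL they depend on are available):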

 - calculated_nof1_url - string
 - calculated_nof1_and_mcs_url - string
 - most_similar_samples - array of sample IDs
 - mcs_threshold_status - dict - sample id : string - blank (passed threshold), or a tab followed by "(failed correlation threshold)" or "(pivot sample)"
 - mcs_above_threshold_url - string
 - attribute_info - array of strings - the raw info from clinical files
 - centroid_y - float
 - centroid_x - float
 - pivot_sample - string - pivot sample ID
 - nof1_original_url - string
 - mcs_only_url - string 
 - median_local_neighborhood_similarity - float
 - mcs_clinical_data - dict - sample id : { clin key : value, } - only from clinical.tsv


In [None]:
import optparse, sys, os
import operator
import numpy
import json
import logging
import csv

# Setup: read the run configuration, pick out the sample ID, and configure logging.
with open("conf.json", "r") as conf_file:
    c = json.loads(conf_file.read())
sample_id = c["sample_id"]
print("Running on sample: {}".format(sample_id))

# The logging keyword arguments come straight from the configuration file.
logging_options = c["info"]["logging_config"]
logging.basicConfig(**logging_options)
logging.info("\n2.5: Generate Tumormap Report")
def and_log(s):
    """Log s at INFO level on the root logger and return it unchanged.

    Lets a message be logged and used (e.g. printed) in a single expression.
    """
    logging.log(logging.INFO, s)
    return s

# Input: the JSON results written by step 2.0
with open(c["json"]["2.0"], "r") as step_2pt0_file:
    json_2pt0 = json.loads(step_2pt0_file.read())

# Results of this step, accumulated key by key
j = dict()

In [None]:
# Set up the "printreport" function.
# This will print the passed text, as well as append it to the tumormap report.
def create_print_append(outfile):
    """Return a function that appends text to outfile and also prints it.

    outfile -- path of the report file; it is opened in append mode on every
               call, so it is created on first use and never truncated.

    The returned function takes one argument (any object; it is converted with
    str()), writes it plus a newline to outfile, then prints it to stdout.
    """
    def print_append(text):
        # Avoid the Python-2-only print statement so this runs on Python 2 and 3.
        with open(outfile, "a") as out:
            out.write(str(text) + "\n")
        print(text)
    return print_append

# Bind the reporter to the tumormap report file named in the configuration.
report_path = c["file"]["tumormap_report"]
print_report = create_print_append(report_path)

In [None]:
# (not referenced again in the visible cells)
tumormap_report_text = list()

# Inputs: the pivot sample and the path of the map coordinates file
in_sample = sample_id
in_euclidean_positions = c["tumormap"]["xy_coords"]

# Every regular file in the cohort clinical data dir, in sorted name order
clinical_dir = c["dir"]["cohort_clinical"]
candidate_paths = [
    os.path.join(clinical_dir, entry) for entry in sorted(os.listdir(clinical_dir))
]
attribute_files = [path for path in candidate_paths if os.path.isfile(path)]

In [None]:
# Read the neighborhood from the step 2.0 results: a dict keyed by sample ID whose
# values are the similarity scores summarized and reported below.
n = json_2pt0["tumormap_results"]

j["pivot_sample"] = in_sample
# Materialize real lists: on Python 3, keys()/values() return views, which are not
# JSON-serializable and which numpy.median cannot reduce. Same result on Python 2.
j["most_similar_samples"] = list(n)
j["median_local_neighborhood_similarity"] = numpy.median(list(n.values()))

print_report( "Pivot sample: {}".format(j["pivot_sample"]) )
print_report( "Pivot neighbors: {}".format(", ".join(j["most_similar_samples"])))
print_report( "Median local neighborhood similarity: {}".format(str(j["median_local_neighborhood_similarity"])))

In [None]:
# Position on the tumormap, if a coordinates file is available.
#
# Scans the tab-separated coordinates file (sample id, x, y per line), gathers the
# positions of the most similar samples, places the pivot sample at the median of
# its neighbors' positions, and reports a TumorMap URL for each view of the result.

def _tumormap_node_url(base_url, ids, xs, ys):
    """Return base_url with node, x and y query parameters appended.

    ids, xs and ys are parallel lists of strings; each list is comma-joined, e.g.
    node=Sample1,Sample2,Sample3&x=123,456,789&y=111,222,333
    """
    return "{}&node={}&x={}&y={}".format(
        base_url, ",".join(ids), ",".join(xs), ",".join(ys))


self_coords = []
neighbor_coords = []
mcs_above_threshold_coords = { "ids":[], "xcoords":[], "ycoords":[] }

if os.path.exists(in_euclidean_positions):
    x_pos = []
    y_pos = []
    # The handle is deliberately not called `input`, so the builtin stays intact.
    with open(in_euclidean_positions, 'r') as coords_file:
        for line in coords_file:
            line_elems = line.strip().split("\t")
            # Don't crash horribly if we get a line that's missing fields--just skip it
            if len(line_elems) < 3:
                continue
            # Get the MCS above threshold coords (will never include self-sample)
            # Direct dict membership: no per-line key list is built.
            if line_elems[0] in json_2pt0["tumormap_results_above_threshold"]:
                mcs_above_threshold_coords["ids"].append(line_elems[0])
                mcs_above_threshold_coords["xcoords"].append(line_elems[1])
                mcs_above_threshold_coords["ycoords"].append(line_elems[2])
            if line_elems[0] in n:
                # If we encounter the self-sample, don't add it to x_pos / y_pos as it
                # should not contribute to the centroid; but do store it for display
                if line_elems[0] == c["info"]["id_for_tumormap"]:
                    self_coords = [line_elems[1], line_elems[2]]
                else:
                    x_pos.append(float(line_elems[1]))
                    y_pos.append(float(line_elems[2]))
                    # Also store the neighbor info for display
                    neighbor_coords.append(line_elems)

    # The with-block has already closed the file; no explicit close() is needed.
    # Sanity check kept from the original, including its exception type.
    if len(x_pos) != len(y_pos):
        raise KeyboardInterrupt("ERROR: number of x positions does not match number of y positions in the neighbors")

    if not x_pos:
        print_report( "No pivot position or TumorMap URL available: neighbors not found in coordinates file.")
    else:
        # The pivot is placed at the median of its neighbors' coordinates.
        j["centroid_x"] = numpy.median(x_pos)
        j["centroid_y"] = numpy.median(y_pos)
        print_report( "Pivot position in the map: ("+str(j["centroid_x"])+", "+str(j["centroid_y"])+")")

        tumormap_url = c["tumormap"]["info"]["url"]
        if not tumormap_url:
            print_report( "No TumorMap URL available.")
        else:
            # URL lists start with the pivot at its calculated position,
            # followed by every neighbor found in the coordinates file.
            neighbor_url_ids = [in_sample]
            neighbor_url_x = [str(j["centroid_x"])]
            neighbor_url_y = [str(j["centroid_y"])]

            j["calculated_nof1_url"] = _tumormap_node_url(
                tumormap_url, neighbor_url_ids, neighbor_url_x, neighbor_url_y)
            print_report( "URL - Calculated:" )
            print_report(j["calculated_nof1_url"])

            # Also print URL with MCS
            for neighbor in neighbor_coords:
                neighbor_url_ids.append(neighbor[0])
                neighbor_url_x.append(neighbor[1])
                neighbor_url_y.append(neighbor[2])

            j["calculated_nof1_and_mcs_url"] = _tumormap_node_url(
                tumormap_url, neighbor_url_ids, neighbor_url_x, neighbor_url_y)
            print_report( "URL - Sample and Most Similar Samples:" )
            print_report( j["calculated_nof1_and_mcs_url"] )

            # Also print the url for JUST the MCS (drop the leading pivot entry);
            # guaranteed to be at least one (see above, No pivot position...)
            j["mcs_only_url"] = _tumormap_node_url(
                tumormap_url, neighbor_url_ids[1:], neighbor_url_x[1:], neighbor_url_y[1:])
            print_report( "URL - Most Correlated Samples only:" )
            print_report(j["mcs_only_url"])

            # Report for just the MCS above threshold
            j["mcs_above_threshold_url"] = _tumormap_node_url(
                tumormap_url,
                mcs_above_threshold_coords["ids"],
                mcs_above_threshold_coords["xcoords"],
                mcs_above_threshold_coords["ycoords"])
            print_report("URL - Most Correlated Samples Above Threshold:")
            print_report(j["mcs_above_threshold_url"])

            # If the sample was already on the map, print a URL for that too
            if self_coords:
                j["nof1_original_url"] = _tumormap_node_url(
                    tumormap_url, [in_sample], [self_coords[0]], [self_coords[1]])
                print_report( "URL - Original Placement:" )
                print_report(j["nof1_original_url"])
else:
    print_report( "No pivot position or TumorMap URL available: coordinates file not present." )

In [None]:
# Report each neighbor's similarity (two decimal places), highest first, marking
# the pivot sample and any neighbor that failed the correlation threshold; the
# same marker is stored per sample. Then report the sample disease counts.

j["mcs_threshold_status"] = {}

print_report("\nSimilarity with individual neighbors (in order from highest to lowest correlation):")

ranked_neighbors = sorted(n, key=lambda sample: n.get(sample), reverse=True)
for k in ranked_neighbors:
    if k in json_2pt0["tumormap_results_above_threshold"]:
        status = ""
    elif k == c["info"]["id_for_tumormap"]:
        status = "\t(pivot sample)"
    else:
        status = "\t(failed correlation threshold)"
    j["mcs_threshold_status"][k] = status
    print_report("{}\t{:.2f}{}".format(k, n[k], status))

print_report("\n")
print_report(json_2pt0["sample_disease_counts"])

In [None]:
# Print attribute info: for every clinical attribute file, report its header line
# plus each row whose first tab-separated field is one of the neighbor sample IDs.
# The same lines are collected, in order, under j["attribute_info"].

j["attribute_info"] = []

for attribute_path in attribute_files:
    print_report( "\n" )
    # The handle is deliberately not called `input`, so the builtin stays intact.
    with open(attribute_path, 'r') as attribute_file:
        for line_num, raw_line in enumerate(attribute_file):
            line = raw_line.replace("\n", "")

            # Line 0 is the header and is always kept; other rows only for neighbors.
            if line_num == 0 or line.split("\t")[0] in n:
                print_report( line )
                j["attribute_info"].append(line)

Get the essential clinical info into the json for easy parsing downstream

In [None]:
# Keep the essential clinical row of every sample that is in the neighborhood,
# keyed by its th_sampleid.
j["mcs_clinical_data"] = {}
allsid = []  # (not referenced again in the visible cells)
with open(c["cohort"]["essential_clinical"], "r") as clinical_file:
    for row in csv.DictReader(clinical_file, dialect="excel-tab"):
        row_id = row["th_sampleid"]
        if row_id in n:
            j["mcs_clinical_data"][row_id] = row

# Bare expression: shows the collected data when this is the last line of a cell
j["mcs_clinical_data"]

In [None]:
# Write the accumulated results for this step to its configured JSON file.
results_path = c["json"]["2.5"]
with open(results_path, "w") as results_file:
    json.dump(j, results_file, indent=2)

print("Done!")