# Plot a Single Gene's Expression vs a Cohort Distribution

For all druggable up outliers in this sample, generate a histogram and box-and-whisker plot of the cohort's expression (both pan-cancer and pan-disease), and locate the sample expression on it.

Creates a pan-cancer and a pan-disease png image output for each gene.

Note:
This step differs from most other steps in that it does not have a single predictable output file.
Instead, it has a predictable output folder; all png images will be captured in that subfolder of the sample dir.

In [None]:
import os
import json
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import base64
import logging

# Setup: read the run configuration, pick out the sample ID, and start logging
with open("conf.json", "r") as conf:
    c = json.loads(conf.read())

sample_id = c["sample_id"]
print("Running on sample: {}".format(sample_id))

# The logging settings are taken verbatim from the configuration file
logging_config = c["info"]["logging_config"]
logging.basicConfig(**logging_config)
logging.info("\n4.5: Generate Expression Plots vs Cohort Distribution")
def and_log(s):
    """Write s to the root logger at INFO level and return it unchanged."""
    logging.log(logging.INFO, s)
    return s
    
# Inputs come from the JSON results of steps 1.0, 2.0 and 4.0
with open(c["json"]["1"], "r") as jf:
    step_1_expression = json.load(jf)["tpm_hugo_norm_uniq"]
sample_exp_allgenes = pd.DataFrame.from_dict(
    step_1_expression, orient="columns", dtype="float64")

# Match the layout of the original read_csv dataframe: copy the index labels
# into a "Gene" column, then replace the index with plain row numbers
sample_exp_allgenes["Gene"] = sample_exp_allgenes.index
sample_exp_allgenes = sample_exp_allgenes.reset_index(drop=True)

with open(c["json"]["2.0"], "r") as jf:
    json_2pt0 = json.load(jf)

with open(c["json"]["4.0"], "r") as jf:
    step_4_outliers = json.load(jf)["outlier_results"]
outliers = pd.DataFrame.from_dict(
    step_4_outliers, orient="columns", dtype="float64")

# Folder that will receive the png images
output_dir = c["dir"]["gene_expression_plots_dir"]

# Results of this step, written out as JSON at the end
j = dict()

In [None]:
# Collect the up outliers from either cohort and keep only the druggable ones

is_pancancer_up = outliers["pc_outlier"] == "pc_up"
is_pandisease_up = outliers["pd_outlier"] == "pd_up"
pancancer_up_genes = outliers.loc[is_pancancer_up].index
pandisease_up_genes = outliers.loc[is_pandisease_up].index

druggable_table = pd.read_csv(
    c["ref_file"]["druggable_genes_by_category"], delimiter="\t")
druggable_genes = set(druggable_table["gene"])

# Up in at least one cohort, and present in the druggable gene list
all_up_genes = set(pancancer_up_genes) | set(pandisease_up_genes)
druggable_up_genes = all_up_genes & druggable_genes

print("Plotting the following {} druggable up outliers:".format(len(druggable_up_genes)))
print(", ".join(druggable_up_genes))

In [None]:
# Look up this sample's expression value for each druggable up outlier.

gene_exp = dict.fromkeys(druggable_up_genes)
for gene in gene_exp:
    # Rows of the sample table belonging to this gene; the first match is used
    is_this_gene = sample_exp_allgenes["Gene"] == gene
    sample_value = sample_exp_allgenes.loc[is_this_gene, sample_id].values[0]
    gene_exp[gene] = {"sample_expression": sample_value}

In [None]:
%%time

# Read the full cohort expression table from the HDF5 file named in the config
cohort_expression_path = c["cohort"]["expression_hd5"]
full_cohort_exp = pd.read_hdf(cohort_expression_path)

In [None]:
%%time

# Set up the expression cohorts - excerpt the full expression by each sample list
# Then reorder the pancancer cohort to match the original sample column ordering.
# dropna removes any samples that were removed when slicing by the pancan_samples.
# This is necessary for exact png match to previous version (differences were visually imperceptible).
#
# NOTE(review): the .loc[:, full_cohort_exp.columns] step appears to rely on pandas
# filling labels that are absent from the pancancer slice with NaN columns; newer
# pandas releases raise KeyError for missing labels instead - confirm the pinned version.
# NOTE(review): dropna(axis=1) drops every column that contains a NaN, so a pancancer
# sample with a missing value for any gene would also be removed - confirm this is intended.

pd_samples = pd.Index(json_2pt0["pandisease_samples"])
pd_cohort_exp = full_cohort_exp[pd_samples]

pc_cohort_exp=full_cohort_exp[pd.Index(json_2pt0["pancan_samples"])].loc[
    :,full_cohort_exp.columns].dropna(axis=1)

In [None]:
# Attach each gene's row of the pancancer and pandisease cohort tables
for gene, gene_record in gene_exp.items():
    gene_record["pc_cohort_exp"] = pc_cohort_exp.loc[gene]
    gene_record["pd_cohort_exp"] = pd_cohort_exp.loc[gene]

In [None]:
%matplotlib inline


In [None]:
# Get pandisease sample count breakdown and generate text for the heading
sample_diseases = pd.read_csv(c["cohort"]["essential_clinical"], delimiter="\t", index_col="th_sampleid")
pd_disease_breakdown = sample_diseases.filter(
    items=pd_samples, axis=0).groupby("disease").size().to_frame(name="count")
pd_items = pd_disease_breakdown.apply(lambda x: "{} (n={})".format(x.name, x["count"]), axis=1)

# Work from a plain list: pd_items is labelled by disease name, so integer
# lookups such as pd_items[0] depend on pandas' positional fallback.
# An empty breakdown yields no items (and therefore an empty heading text).
pd_item_list = list(pd_items) if len(pd_items) else []

# Format text: three comma-separated items to a line, in their original order.
# Slicing in steps of three also covers a trailing group of one or two items,
# so the leftovers no longer need to be re-attached by hand (the previous code
# re-attached a trailing pair in reversed order).
pd_text = ",\n".join(
    ", ".join(pd_item_list[start:start + 3])
    for start in range(0, len(pd_item_list), 3))

In [None]:
# IQR thresholds for drawing outlier area
def iqr_thresholds(series, iqr_multiplier):
    """Compute outlier cut-offs for a series from its interquartile range.

    Parameters:
        series: object with a pandas-style ``quantile`` method (e.g. a Series).
        iqr_multiplier: how many interquartile ranges to extend past each quartile.

    Returns a dict with:
        "high":   the 0.75 quantile plus the spread,
        "low":    the 0.25 quantile minus the spread,
        "spread": iqr_multiplier times the interquartile range.
    """
    quartiles = series.quantile(q=[0.25, 0.75])
    first_quartile = quartiles.loc[0.25]
    third_quartile = quartiles.loc[0.75]
    iqr_spread = iqr_multiplier * (third_quartile - first_quartile)

    result = {}
    result["high"] = third_quartile + iqr_spread
    result["low"] = first_quartile - iqr_spread
    result["spread"] = iqr_spread
    return result

In [None]:
# Generate a plot for one gene

# Module-level values read by plot()
SAMPLE_ID = sample_id  # shown in every plot title
PD_TEXT = pd_text  # disease breakdown appended to pandisease titles
IQR_MULTIPLIER = c["info"]["iqr_multiplier"]  # whisker length / outlier cut-off

def plot(gene_name,
         is_pandisease,
         cohort_expression,
         sample_expression,
         output_path
        ):
    """Draw one gene's cohort expression distribution and save it as an image.

    The figure stacks a horizontal box-and-whisker plot above a 30-bin
    histogram of the cohort expression (sharing one x axis), shades the
    up-outlier zone in yellow on both, and marks the sample's expression
    with a red vertical line on the histogram.

    Reads the module-level constants SAMPLE_ID and PD_TEXT (for the title)
    and IQR_MULTIPLIER (for the whiskers and the outlier threshold).

    Parameters:
        gene_name: gene label used in the title.
        is_pandisease: True titles the plot as the pandisease cohort (with the
            PD_TEXT breakdown); False titles it as the pancancer cohort.
        cohort_expression: cohort expression values; must be accepted by
            boxplot/hist and by iqr_thresholds (i.e. offer ``quantile``).
        sample_expression: the sample's expression value for this gene.
        output_path: destination handed to savefig.

    Returns None.

    NOTE(review): the figure is left open after saving, presumably so the
    inline backend can display it - confirm before adding an explicit close.
    """
    pd_or_pc_text = "pandisease cohort\n{}".format(PD_TEXT) if is_pandisease else "pancancer cohort"

    figure, axes = plt.subplots()

    plt.subplots_adjust(wspace=0, hspace=0) # Remove space between subplots

    # Set up axes: the outer axes only provides the frame, so hide its ticks
    axes.get_xaxis().set_visible(False)
    axes.get_yaxis().set_visible(False)

    boxplot_axis = figure.add_subplot(211, frameon=False)
    boxplot_axis.get_xaxis().set_visible(False)
    boxplot_axis.get_yaxis().set_visible(False)

    hist_axis = figure.add_subplot(212, sharex=boxplot_axis, frameon=False)
    # Booleans rather than the strings 'off': newer Matplotlib releases no
    # longer interpret 'off', and a non-empty string would switch ticks on.
    hist_axis.tick_params(top=False, right=False)

    # Title
    figure_title = "{} expression in {} and {}".format(gene_name, SAMPLE_ID, pd_or_pc_text)
    boxplot_axis.set_title(figure_title, fontsize=10)

    # boxplot customization
    whisker_style = dict(linestyle="-", color="black")
    box_style = dict(linestyle="-", color="black")
    median_style = dict(linestyle="-", linewidth=2, color="black")

    # make the box-and-whisker plot
    boxplot_axis.boxplot(cohort_expression, whis=IQR_MULTIPLIER,
                         widths=0.5,
                         showfliers=True, # Show a + for any Outliers in the cohort
                         showcaps=False, vert=False,
                         whiskerprops=whisker_style,
                         boxprops=box_style,
                         medianprops=median_style)

    # make the histogram
    # arbitrarily chosen number of bins
    hist_axis.hist(cohort_expression, bins=30, color="#cccccc")

    # Set main x axis width of the plot
    (plot_min, plot_max) = boxplot_axis.get_xlim()

    # if sample expression is below the minimum, push it down and add padding
    if sample_expression < plot_min:
        plot_min = sample_expression - ((plot_max - sample_expression) * 0.05)

    # To find the max x, need to calculate the outlier zones first.

    ### Draw the up outlier zone

    # Start at the high outlier threshold
    # Minimum width is 1/2 the iqr spread.
    # If the sample expression is an up outlier, extend width beyond
    # that to include the expression plus a small margin.
    # If that doesn't go up to the pre-existing plot edge, extend it
    # outwards to meet that edge
    thresholds = iqr_thresholds(cohort_expression, IQR_MULTIPLIER)

    width_from_thresholds = thresholds["spread"]/2
    width_from_sample = (sample_expression - thresholds["high"]) * 1.05

    outlier_zone_width = max(width_from_thresholds, width_from_sample)

    # Enlarge the outlier zone to meet plot if necessary
    if (thresholds["high"] + outlier_zone_width) < plot_max:
        outlier_zone_width = plot_max - thresholds["high"]

    # Then enlarge plot to meet outlier zone if necessary
    plot_max = max(plot_max, thresholds["high"] + outlier_zone_width)

    # Set the overall axes
    boxplot_axis.set_xlim([plot_min, plot_max])

    # draw outlier zone - boxplot
    boxplot_axis.add_patch(mpatches.Rectangle(
        (thresholds["high"], 0), #XY of lower left corner
        outlier_zone_width, # width
        max(boxplot_axis.get_ylim()) - 0.01, # height - don't run into top border
        ec="#ffaa00",
        fc="#ffff00",
        alpha=0.5,
        linewidth=0
    ))

    # draw Outlier zone - histogram
    hist_axis.add_patch(mpatches.Rectangle(
        (thresholds["high"], 0), #XY of lower left corner
        outlier_zone_width, # width
        max(hist_axis.get_ylim()), # height
        ec="#ffaa00",
        fc="#ffff00",
        alpha=0.5,
        linewidth=0
    ))

    # Draw the sample expression
    hist_axis.add_patch(mpatches.Rectangle(
        (sample_expression, 0), # xy
        0, # width - just a line
        max(hist_axis.get_ylim()), # height,
        linewidth=4,
        ec="#ff0000"
    ))

    # Save the output as a png; save this figure explicitly rather than
    # whichever figure pyplot currently considers active
    figure.savefig(output_path, bbox_inches='tight')

In [None]:
# Make sure the output dir is there; finding it already present is acceptable
try:
    os.makedirs(output_dir)
except OSError as makedirs_error:
    print("Found error, but perhaps the dir simply already exists?\nError: {}".format(makedirs_error))
    # Only swallow the error when the path really is an existing directory
    if os.path.isdir(output_dir):
        pass
    else:
        raise

In [None]:
# Plot the pancancer & pandisease expression for these genes, and store each
# plot's file location plus its base64-encoded image data in the json dict.

j["gene_plots"] = {}

for gene in gene_exp:
    # Entries for this gene; stays empty if neither cohort could be plotted
    plots_for_gene = {}

    # For each of the PC and PD cohort, plot them if we have expression for them
    if len(gene_exp[gene]["pc_cohort_exp"]):
        pc_out = os.path.join(output_dir, "{}_pancancer.png".format(gene))
        plot(
            gene,
            False,
            gene_exp[gene]["pc_cohort_exp"],
            gene_exp[gene]["sample_expression"],
            pc_out
        )
        # format() belongs on the string: print(...).format(...) only worked
        # as a Python 2 print statement and fails when print is a function
        print("Plotted {} vs pan-cancer expression to : {}".format(gene, pc_out))
        plots_for_gene["pancancer_filepath"] = pc_out
        with open(pc_out, "rb") as f:
            # Decode so the value is text (JSON cannot serialise bytes)
            plots_for_gene["pancancer_img_data"] = base64.b64encode(f.read()).decode("ascii")
    else:
        print("Skipping pancancer plot - no samples in the cohort")

    if len(gene_exp[gene]["pd_cohort_exp"]):
        pd_out = os.path.join(output_dir, "{}_pandisease.png".format(gene))
        plot(
            gene,
            True,
            gene_exp[gene]["pd_cohort_exp"],
            gene_exp[gene]["sample_expression"],
            pd_out
        )
        print("Plotted {} vs pan-disease expression to : {}".format(gene, pd_out))
        plots_for_gene["pandisease_filepath"] = pd_out
        with open(pd_out, "rb") as f:
            plots_for_gene["pandisease_img_data"] = base64.b64encode(f.read()).decode("ascii")
    else:
        print("Skipping pandisease plot - no samples in the cohort")

    # Only genes with at least one plot get a gene_plots item
    if plots_for_gene:
        j["gene_plots"][gene] = plots_for_gene


In [None]:
# Write everything collected in j to this step's JSON output file
step_output_path = c["json"]["4.5"]
with open(step_output_path, "w") as jsonfile:
    json.dump(j, jsonfile, indent=2)

print("Done!")