# Analyze index size and build times

### load data first

In [171]:
import pickle


# Keys used to address the benchmark results stored in `data` (loaded below).
RESO = ["100000", "10000", "1000"]
SUBSAMPLES = ["1", "0.1", "0.01", "0.001", "0.0001"]
ORGANISMS = ["drosophila_m", "e_coli", "s_cerevisiae", "t_brucei"] # "human"
# Organism whose subsample series is plotted.
ORGA_SUBSAMPLE = "t_brucei"
# Subsample key used when organisms are compared with each other.
SUBSAMPLE_ORGA = "even"
# Key of the build that was run without extra index parameters.
NO_PARAMS = ""
# PARAMS[i] is the key in `data` for the parameter combination
# PARAMS_TUPLE[i]; the tuples are used as categorical x-axis factors.
PARAMS = ["", ".-q", ".-c", ".-s", ".-q.-c", ".-q.-s", ".-c.-s", ".-q.-c.-s"]
PARAMS_TUPLE = [
    ("+", "+", "+"), ("-q", "+", "+"), ("+", "-c", "+"), ("+", "+", "-s"),
    ("-q", "-c", "+"), ("-q", "+", "-s"), ("+", "-c", "-s"), ("-q", "-c", "-s"),
]

# Sort both lists together by the tuple so they stay aligned index by index.
zip_list = sorted(zip(PARAMS, PARAMS_TUPLE), key=lambda pair: pair[1])
PARAMS = [suffix for suffix, _flags in zip_list]
PARAMS_TUPLE = [flags for _suffix, flags in zip_list]

# NOTE: pickle can execute arbitrary code on load; only open trusted files.
with open("index_sizes_data.pickle", "rb") as in_file:
    data = pickle.load(in_file)

# Order the organisms by ascending genome size (in place).
ORGANISMS.sort(key=lambda orga: data[orga]["genome_size"])
print(ORGANISMS)

['e_coli', 's_cerevisiae', 't_brucei', 'drosophila_m']


### bokeh imports

In [159]:

from bokeh.plotting import figure
from bokeh.palettes import viridis
from bokeh.io import show, output_notebook
from bokeh.models import ColumnDataSource, HoverTool, FactorRange, LabelSet, Whisker
from bokeh.models.tickers import FixedTicker
from bokeh.transform import jitter
from bokeh.layouts import column, row, gridplot
from math import pi

# Colours for the plotted series, indexed by the position of the resolution in
# RESO (the values look like the Okabe-Ito colour-blind-safe palette - confirm).
COLOR_PALETTE = ["#0072B2", "#D55E00", "#009E73", "#E69F00", "#CC79A7", "#56B4E9", "#F0E442"]
# Scatter marker names; the plotting functions cycle through them via modulo.
SCATTER_PALETTE = ["x", "cross", "circle", "dash"]

# Send bokeh output to the notebook.
output_notebook()

### plot index size & build time by organism

In [160]:
def conf_x_axis_ticker(f, x_is_genome_size):
    """Place one labelled x-axis tick per organism on figure ``f``.

    The tick position is the organism's genome size when ``x_is_genome_size``
    is true, otherwise its number of unique interactions in the
    SUBSAMPLE_ORGA subsample. The label always shows the organism name
    followed by both values in abbreviated form.
    """
    def abbreviate(value, suffixes):
        # suffixes = (below one thousand, thousands, millions)
        value = int(value)
        if value >= 1000000:
            return str(round(value / 1000000, 2)) + suffixes[2]
        if value >= 1000:
            return str(round(value / 1000, 2)) + suffixes[1]
        return str(value) + suffixes[0]

    def genome_size_of(orga):
        return data[orga]["genome_size"]

    def interactions_of(orga):
        return data[orga][SUBSAMPLE_ORGA]["num_unique_interactions"]

    position_of = genome_size_of if x_is_genome_size else interactions_of

    f.xaxis.ticker = FixedTicker(ticks=[position_of(orga) for orga in ORGANISMS])

    labels = {}
    for orga in ORGANISMS:
        size_text = abbreviate(genome_size_of(orga), ("bp", "kbp", "mbp"))
        count_text = abbreviate(interactions_of(orga), ("", "k", "m"))
        labels[position_of(orga)] = f"{orga} ({size_text}; {count_text})"
    f.xaxis.major_label_overrides = labels


def plot_by_genome_size(key="index_size", parse_key=lambda x: x, conf_extra=lambda x: x, x_is_genome_size=True):
    """Plot ``key`` for every organism, with one series per minimal resolution.

    Values are read from ``data[orga][SUBSAMPLE_ORGA][NO_PARAMS][reso]``.
    A build whose "build_status" is not "OK" contributes NaN for both
    coordinates, so no point is plotted for it.

    Args:
        key: name of the per-build entry to plot; also used as y-axis label.
        parse_key: callable converting the stored entry into a number.
        conf_extra: callable that receives the figure for extra configuration
            (e.g. y-axis ticks).
        x_is_genome_size: if true the x axis is the genome size, otherwise
            the number of unique interactions of the SUBSAMPLE_ORGA subsample.
    """
    f = figure(
        x_axis_type="log",
        y_axis_type="log",
        x_axis_label="genome size" if x_is_genome_size else "number of unique interactions",
        y_axis_label=key,
    )

    conf_x_axis_ticker(f, x_is_genome_size)
    f.xaxis.major_label_orientation = pi/4
    conf_extra(f)

    nan = float("NaN")
    for idx, reso in enumerate(RESO):
        xs = []
        ys = []
        for orga in ORGANISMS:
            sample = data[orga][SUBSAMPLE_ORGA]
            # Same lookup for x and y: the build status lives below NO_PARAMS.
            build = sample[NO_PARAMS][reso]
            if build["build_status"] != "OK":
                xs.append(nan)
                ys.append(nan)
                continue
            xs.append(data[orga]["genome_size"] if x_is_genome_size
                      else sample["num_unique_interactions"])
            ys.append(parse_key(build[key]))

        color = COLOR_PALETTE[idx % len(COLOR_PALETTE)]
        f.scatter(xs, ys, marker=SCATTER_PALETTE[idx % len(SCATTER_PALETTE)], line_color=color,
                  legend_label=reso, size=10, fill_color=None)
        f.line(xs, ys, color=color, legend_label=reso)

    f.legend.title = "minimal resolution"
    f.legend.location = "bottom_right"
    show(f)

def conf_y_axis_bytes(f):
    """Give the y axis of figure ``f`` fixed ticks labelled as data sizes.

    Axis values are treated as multiples of one kb (KB == 1), with a factor
    of 1000 between kb, mb, gb and tb.
    """
    KB = 1
    MB = 1000 * KB
    GB = 1000 * MB
    TB = 1000 * GB

    # Nine minor ticks per decade, for three decades above each base unit.
    # 1 and KB are the same value, so the first three decades appear twice.
    minor_ticks = []
    for base in [1, KB, MB, GB, TB]:
        for step in (base, 10 * base, 100 * base):
            minor_ticks.extend(range(step, 10 * step, step))

    major_ticks = [KB, MB, 10*MB, 100*MB, GB, 10*GB, 100*GB, TB]
    f.yaxis.ticker = FixedTicker(ticks=major_ticks, minor_ticks=minor_ticks)

    labels = {
        KB: "1 kb",
        MB: "1 mb",
        10*MB: "10 mb",
        100*MB: "100 mb",
        GB: "1 gb",
        10*GB: "10 gb",
        100*GB: "100 gb",
        TB: "1 tb",
        10*TB: "10 tb",
        100*TB: "100 tb",
    }
    f.yaxis.major_label_overrides = labels

# Index size (the default key) per organism, with genome size on the x axis.
plot_by_genome_size(conf_extra=conf_y_axis_bytes)
# Variant with the number of unique interactions on the x axis (disabled).
#plot_by_genome_size(conf_extra=conf_y_axis_bytes, x_is_genome_size=False)

In [161]:
def conf_y_axis_time(f):
    """Give the y axis of figure ``f`` fixed ticks labelled as durations.

    Axis values are treated as seconds. Major ticks sit at 1 s, 10 s, 1 min,
    10 min, 1 hour and 1 day; minor ticks subdivide the intervals in between.
    """
    MIN = 60
    HOUR = 60 * MIN
    DAY = 24 * HOUR

    minor_ticks = [
        # 1..9 seconds; starts at 1 because 0 has no position on a log axis.
        *range(1, 10),
        *range(10, MIN, 10),             # 10..50 seconds
        *range(2*MIN, 10*MIN, MIN),      # 2..9 minutes
        *range(20*MIN, HOUR, 10*MIN),    # 20..50 minutes
        *range(2*HOUR, DAY, HOUR),       # 2..23 hours
    ]
    f.yaxis.ticker = FixedTicker(ticks=[1, 10, MIN, 10*MIN, HOUR, DAY],
                                 minor_ticks=minor_ticks)
    f.yaxis.major_label_overrides = {1: "1 sec", 10: "10 sec", MIN: "1 min", 10*MIN: "10 min", HOUR: "1 hour", DAY: "1 day"}


def parse_time(s):
    """Convert a duration string such as "1m30.5s" into seconds.

    The text before the single "m" is the number of minutes; the text after
    it is the number of seconds followed by one trailing character, which is
    dropped. Raises ValueError if ``s`` does not contain exactly one "m" or
    if either part is not a valid float.
    """
    minute_text, second_text = s.split("m")
    total = float(minute_text) * 60
    total += float(second_text[:-1])
    return total

# Index build time per organism; parse_time converts the stored text to seconds.
plot_by_genome_size(key="index_build_time", parse_key=parse_time, conf_extra=conf_y_axis_time)

In [162]:
def plot_by_genome_size(key="index_size", parse_key=lambda x: x, conf_extra=conf_y_axis_bytes):
    """Plot ``key`` for each subsample of ORGA_SUBSAMPLE, one series per resolution.

    The x axis is the number of unique interactions of each subsample in
    SUBSAMPLES. Builds whose "build_status" is not "OK" get a NaN y value.

    NOTE: this function shares its name with the per-organism
    ``plot_by_genome_size`` defined earlier in this file and replaces it once
    this definition has been executed.

    Args:
        key: name of the per-build entry to plot; also used as y-axis label.
        parse_key: callable converting the stored entry into a number.
        conf_extra: callable that receives the figure to configure the y axis.
    """
    def short_count(count):
        count = int(count)
        if count >= 1000000:
            return str(round(count / 1000000, 2)) + "m"
        if count >= 1000:
            return str(round(count / 1000, 2)) + "k"
        return str(count)

    def percent_text(samples):
        # Whole percentages are shown as integers, smaller ones as floats.
        if float(samples) >= 0.01:
            return str(int(float(samples)*100))
        return str(float(samples)*100)

    def interactions_of(samples):
        return data[ORGA_SUBSAMPLE][samples]["num_unique_interactions"]

    f = figure(
        x_axis_type="log",
        y_axis_type="log",
        x_axis_label="number of unique interactions",
        y_axis_label=key,
    )
    conf_extra(f)

    f.xaxis.ticker = FixedTicker(ticks=[interactions_of(samples) for samples in SUBSAMPLES])
    tick_labels = {}
    for samples in SUBSAMPLES:
        count = interactions_of(samples)
        tick_labels[count] = percent_text(samples) + "%: " + short_count(count)
    f.xaxis.major_label_overrides = tick_labels
    f.xaxis.major_label_orientation = pi/4

    for idx, reso in enumerate(RESO):
        xs = [interactions_of(samples) for samples in SUBSAMPLES]
        ys = []
        for samples in SUBSAMPLES:
            build = data[ORGA_SUBSAMPLE][samples][NO_PARAMS][reso]
            if build["build_status"] == "OK":
                ys.append(parse_key(build[key]))
            else:
                ys.append(float("NaN"))

        series_color = COLOR_PALETTE[idx]
        f.scatter(xs, ys, marker=SCATTER_PALETTE[idx % len(SCATTER_PALETTE)], line_color=series_color,
                  legend_label=reso, size=10, fill_color=None)
        f.line(xs, ys, color=series_color, legend_label=reso)

    f.legend.title = "minimal resolution"
    f.legend.location = "bottom_right"
    show(f)
# Index size, then index build time, across the subsamples of ORGA_SUBSAMPLE.
plot_by_genome_size()
plot_by_genome_size("index_build_time", parse_time, conf_extra=conf_y_axis_time)


In [180]:
def plot_by_params(key="index_size", parse_key=lambda x: x, conf_extra=conf_y_axis_bytes):
    """Draw a grouped bar chart of ``key`` per index parameter combination.

    Values are read from ``data[ORGA_SUBSAMPLE][SUBSAMPLE_ORGA][params][reso]``
    for every entry of PARAMS and RESO. Builds whose "build_status" is not
    "OK" are left out. Within each parameter combination the bars of the
    individual resolutions are placed side by side.

    Args:
        key: name of the per-build entry to plot; also used as y-axis label.
        parse_key: callable converting the stored entry into a number.
        conf_extra: callable that receives the figure to configure the y axis.
    """
    f = figure(x_axis_label="index parameters", y_axis_type="log",
               y_axis_label=key, x_range=FactorRange(*PARAMS_TUPLE))
    conf_extra(f)

    bar_width = 1 / len(RESO)
    for idx, reso in enumerate(RESO):
        # Centre the group of bars on the category: without an offset every
        # resolution would be drawn at the same position and hide the others.
        offset = (idx - (len(RESO) - 1) / 2) * bar_width

        xs = []
        ys = []
        for params_tuple, params in zip(PARAMS_TUPLE, PARAMS):
            build = data[ORGA_SUBSAMPLE][SUBSAMPLE_ORGA][params][reso]
            if build["build_status"] != "OK":
                continue
            # A trailing number on a categorical factor is read by bokeh as
            # an offset from the centre of that category.
            xs.append((*params_tuple, offset))
            ys.append(parse_key(build[key]))

        f.vbar(x=xs, top=ys, bottom=1, width=bar_width,
               color=COLOR_PALETTE[idx % len(COLOR_PALETTE)], legend_label=reso)

    # Flip the y axis so the bars extend downwards from the top of the plot.
    f.y_range.flipped = True
    f.legend.title = "minimal resolution"
    f.legend.location = "top_right"
    show(f)

# Index size, then index build time, for every index parameter combination.
plot_by_params()
plot_by_params("index_build_time", parse_time, conf_extra=conf_y_axis_time)