# Analyze index size and build times

### load data first

In [151]:
import pickle


RESO = ["5000", "10000", "50000"]
SUBSAMPLES = ["1", "0.1", "0.01", "0.001", "0.0001"]
SUBSAMPLES_2 = ["1", "0.8", "0.6", "0.4", "0.2"]
ORGANISMS = ["drosophila_m", "e_coli", "s_cerevisiae", "t_brucei", "human"]
ORGA_SUBSAMPLE = "t_brucei"
SUBSAMPLE_ORGA = ["even_1", "even_2"]
NO_PARAMS =  ["", "-c.-m", "-q.-c", "-q.-c.-m"] 
PARAMS = ["", "-q", "-c", "-m", "-q.-c", "-q.-m", "-c.-m", "-q.-c.-m"]
PARAMS_TUPLE = [
    ("+c", "+q", "+m"), 
    ("+c", "-q", "+m"), 
    ("-c", "+q", "+m"), 
    ("+c", "+q", "-m"), 
    ("-c", "-q", "+m"), 
    ("+c", "-q", "-m"), 
    ("-c", "+q", "-m"), 
    ("-c", "-q", "-m")]

zip_list = list(zip(PARAMS, PARAMS_TUPLE))
zip_list.sort(key=lambda x: x[1], reverse=True)
PARAMS = [x for x, y in zip_list]
PARAMS_TUPLE = [y for x, y in zip_list]

with open("index_sizes_data.pickle", "rb") as in_file:
    data = pickle.load(in_file)

ORGANISMS.sort(key=lambda x: data[x]["genome_size"])
print(ORGANISMS)

['e_coli', 's_cerevisiae', 't_brucei', 'drosophila_m', 'human']


### bokeh imports

In [152]:

from bokeh.plotting import figure
from bokeh.palettes import viridis
from bokeh.io import show, output_notebook
from bokeh.models import ColumnDataSource, HoverTool, FactorRange, LabelSet, Whisker
from bokeh.models.tickers import FixedTicker
from bokeh.transform import jitter
from bokeh.layouts import column, row, gridplot
from math import pi
from bokeh.layouts import row

COLOR_PALETTE = ["#0072B2", "#D55E00", "#009E73", "#E69F00", "#CC79A7", "#56B4E9", "#F0E442"]
SCATTER_PALETTE = ["x", "circle", "cross", "dash"]

output_notebook()

### plot index size & build time by organism

In [153]:
def conf_x_axis_ticker(f, subsample_orga):
    def to_readable_genome_siz(x):
        x = int(x)
        if x < 1000:
            return str(x) + "bp"
        elif x < 1000000:
            return str(round(x / 1000, 2)) + "kbp"
        else:
            return str(round(x / 1000000, 2)) + "mbp"
    def to_readable_num_interactions(x):
        x = int(x)
        if x < 1000:
            return str(x)
        elif x < 1000000:
            return str(round(x / 1000, 2)) + "k"
        else:
            return str(round(x / 1000000, 2)) + "m"
        
    f.xaxis.ticker = FixedTicker(ticks=[data[orga]["genome_size"] for orga in ORGANISMS])
    # + [430 * 1000 * 1000, 1000 * 1000 * 1000, 1412 * 1000 * 1000, 2800 * 1000 * 1000])
    f.xaxis.major_label_overrides = {data[orga]["genome_size"]: orga + " (" + to_readable_genome_siz(data[orga]["genome_size"]) + "; " + to_readable_num_interactions(data[orga][subsample_orga]["num_unique_interactions"]) + ")" for orga in ORGANISMS} 
    #| {
    #    430 * 1000 * 1000: "rice 430mbp",
    #    1000 * 1000 * 1000: "chicken 1000mbp",
    #    1412 * 1000 * 1000: "zebrafish 1412mbp",
    #}


def plot_by_genome_size(key="index_size", parse_key=lambda x: x, conf_extra=lambda x: x,
                        subsample_orga=SUBSAMPLE_ORGA[0], y_range=(1000, 100000000)):
    fl = []
    for idx2, params in enumerate(NO_PARAMS):
        f = figure(x_axis_type="log", y_axis_type="log", x_axis_label="genome size", y_axis_label=key, title="params: " + params + " | subsample: " + subsample_orga, y_range=y_range)

        conf_x_axis_ticker(f, subsample_orga)
        f.xaxis.major_label_orientation = pi/4
        conf_extra(f)

        for idx, reso in enumerate(RESO):
            xs = [data[orga]["genome_size"] if data[orga][subsample_orga][params][reso]["build_status"] == "OK" else float("NaN") \
                for orga in ORGANISMS]
            ys = [parse_key(data[orga][subsample_orga][params][reso][key]) if data[orga][subsample_orga][params][reso]["build_status"] == "OK" else float("NaN") \
                for orga in ORGANISMS]
            
            legend_label=str(int(reso)//1000) + "kb"
            f.scatter(xs, ys, marker=SCATTER_PALETTE[idx], line_color=COLOR_PALETTE[idx],
                    legend_label=legend_label, size=10, fill_color=None)
            f.line(xs, ys, color=COLOR_PALETTE[idx], legend_label=legend_label)

        f.legend.title = "min resp"
        f.legend.location = "bottom_right"
        fl.append(f)
    show(row(fl), notebook_handle=True)

def conf_axis_bytes(f, y_axis=True, only_major=False):
    KB = 1
    MB = KB * 1000
    GB = MB * 1000
    TB = GB * 1000
    ticks = [KB, MB, 10*MB, 100*MB, GB, 10*GB, 100*GB, TB]
    minor_ticks = [x for t in [1, KB, MB, GB, TB] for x in [*range(t, t * 10, t), 
                                                            *range(10*t, t * 100, 10*t), 
                                                            *range(100*t, t * 1000, 100*t)]]
    tick = FixedTicker(ticks=ticks if not only_major else minor_ticks, 
                       minor_ticks=minor_ticks if not only_major else [])
    if only_major:
        label_overrides = {
            x*t: str(x) + n for n, t in zip(["", " kb", " mb", " gb", " tb"], [1, KB, MB, GB, TB]) for x in [
                                                    *range(10), 
                                                    *range(10, 100, 10), 
                                                    *range(100, 1000, 100)]
        }
    else:
        label_overrides = {KB: "1 kb", MB: "1 mb", 10*MB: "10 mb", 100*MB: "100 mb", GB: "1 gb", 
                                     10*GB: "10 gb", 100*GB: "100 gb", TB: "1 tb", 10*TB: "10 tb", 100*TB: "100 tb"}
    if y_axis:
        f.yaxis.ticker = tick
        f.yaxis.major_label_overrides = label_overrides
    else:
        f.xaxis.ticker = tick
        f.xaxis.major_label_overrides = label_overrides

def conf_y_axis_bytes(f):
    conf_axis_bytes(f, y_axis=True)
def conf_y_axis_bytes_detailed(f):
    conf_axis_bytes(f, y_axis=True, only_major=True)
def conf_x_axis_bytes(f):
    conf_axis_bytes(f, y_axis=False)

plot_by_genome_size(conf_extra=conf_y_axis_bytes, subsample_orga="even_1")
plot_by_genome_size(conf_extra=conf_y_axis_bytes, subsample_orga="even_2")

In [154]:
def conf_y_axis_time(f):
    MIN = 60
    HOUR = 60 * MIN
    DAY = 24 * HOUR
    f.yaxis.ticker = FixedTicker(ticks=[1, 10, MIN, 10*MIN, HOUR, 10*HOUR, DAY], 
                                minor_ticks=[*range(9), *range(10, MIN, 10), *range(2*MIN, 10*MIN, MIN), 
                                             *range(20*MIN, HOUR, 10*MIN), *range(2*HOUR, DAY, HOUR)])
    f.yaxis.major_label_overrides = {1: "1 sec", 10: "10 sec", MIN: "1 min", 10*MIN: "10 min", HOUR: "1 hour", 10*HOUR: "10 hours", DAY: "1 day"}
def conf_y_axis_time_detailed(f):
    MIN = 60
    HOUR = 60 * MIN
    DAY = 24 * HOUR
    f.yaxis.ticker = FixedTicker(ticks=[*range(9), *range(10, MIN, 10), *range(MIN, 10*MIN, MIN), 
                                             *range(10*MIN, HOUR, 10*MIN), *range(1*HOUR, DAY, HOUR)])
    f.yaxis.major_label_overrides = {x: str(x) + " sec" for x in [*range(10), *range(10, MIN, 10)]} | \
                                    {x*60: str(x) + " min" for x in [*range(10), *range(10, MIN, 10)]} | \
                                    {x*60*60: str(x) + " hour" for x in [*range(24)]} | \
                                    {x*60*60*24: str(x) + " day" for x in [*range(10)]}


def parse_time(s):
    mins, secs = s.split("m")
    secs = secs[:-1]
    return float(mins) * 60 + float(secs)

plot_by_genome_size(key="index_build_time", parse_key=parse_time, conf_extra=conf_y_axis_time,  subsample_orga="even_1",
                    y_range=(1, 60*60*24))
plot_by_genome_size(key="index_build_time", parse_key=parse_time, conf_extra=conf_y_axis_time,  subsample_orga="even_2",
                    y_range=(1, 60*60*24))

In [155]:
def to_readable_num_interactions(x):
    x = int(x)
    if x < 1000:
        return str(x)
    elif x < 1000000:
        return str(round(x / 1000, 2)) + "k"
    else:
        return str(round(x / 1000000, 2)) + "m"

def plot_by_genome_size(key="index_size", parse_key=lambda x: x, conf_extra=conf_y_axis_bytes):
    fl = []
    for idx2, params in enumerate(NO_PARAMS):
        f = figure(x_axis_type="log", y_axis_type="log", x_axis_label="number of unique interactions", 
                    y_axis_label=key, title="params: "+ params)
        conf_extra(f)
            
        f.xaxis.ticker = FixedTicker(ticks=[data[ORGA_SUBSAMPLE][samples]["num_unique_interactions"] for samples in SUBSAMPLES])
        f.xaxis.major_label_overrides = {data[ORGA_SUBSAMPLE][samples]["num_unique_interactions"]: str(int(float(samples)*100) if float(samples) >= 0.01 else float(samples)*100) + "%: " + to_readable_num_interactions(data[ORGA_SUBSAMPLE][samples]["num_unique_interactions"]) for samples in SUBSAMPLES}
        f.xaxis.major_label_orientation = pi/4


        for idx, reso in enumerate(RESO):
                xs = [data[ORGA_SUBSAMPLE][samples]["num_unique_interactions"] for samples in SUBSAMPLES]
                ys = [parse_key(data[ORGA_SUBSAMPLE][samples][params][reso][key]) if data[ORGA_SUBSAMPLE][samples][params][reso]["build_status"] == "OK" else float("NaN") \
                    for samples in SUBSAMPLES]
                
                legend_label=str(int(reso)//1000) + "kb "
                f.scatter(xs, ys, marker=SCATTER_PALETTE[idx % len(SCATTER_PALETTE)], line_color=COLOR_PALETTE[idx],
                        legend_label=legend_label, size=10, fill_color=None)
                f.line(xs, ys, color=COLOR_PALETTE[idx], legend_label=legend_label)

        f.legend.title = "minimal resolution"
        f.legend.location = "top_left"
        fl.append(f)
    show(row(fl), notebook_handle=True)
plot_by_genome_size()
plot_by_genome_size("index_build_time", parse_time, conf_extra=conf_y_axis_time)


In [158]:
def plot_by_params(key="index_size", parse_key=lambda x: x, conf_extra=conf_y_axis_bytes_detailed,
                   y_range=(100000, 2000000), subsample_orga=SUBSAMPLE_ORGA[0]):
    fl = []

    space = .75
    for idx, reso in enumerate(RESO[::-1]):
        xs = [list(params_tuple) for params_tuple, params in zip(PARAMS_TUPLE, PARAMS) if data[ORGA_SUBSAMPLE][subsample_orga][params][reso]["build_status"] == "OK"]
        ys = [float(parse_key(data[ORGA_SUBSAMPLE][subsample_orga][params][reso][key])) for params in PARAMS if data [ORGA_SUBSAMPLE][subsample_orga][params][reso]["build_status"] == "OK"]

        f = figure(x_axis_label="index parameters", y_axis_type="log",
                    y_axis_label=key, x_range=FactorRange(*PARAMS_TUPLE),
                    title="min res: " +str(int(reso)//1000) + "kb | subsample:" + subsample_orga,
                    width=300, height=300, y_range=y_range)
        #if len(fl) > 0:
        #    f.y_range = fl[0].y_range
        conf_extra(f)
        
        f.vbar(x=xs, top=ys, bottom=1, color=COLOR_PALETTE[idx], width=0.8)

        #f.y_range.flipped = True 
        fl.append(f)
    show(row(fl), notebook_handle=True)

plot_by_params()
plot_by_params("index_build_time", parse_time, conf_extra=conf_y_axis_time_detailed, y_range=(60, 60*5))
plot_by_params(subsample_orga=SUBSAMPLE_ORGA[1])

In [None]:
EXPORT = "export"
PROCESS = "process"
QUERY = "query"
GROUP_LIST = [
    PROCESS,
    EXPORT,
    QUERY,
]

GROUPS = {
    "bin_size": PROCESS,
    "render_area": PROCESS,
    "active_chroms": PROCESS,
    "axis_coords": PROCESS,
    "symmetry_setting": PROCESS,
    "bin_coords": PROCESS,
    "decay_coords": PROCESS,
    "flat_decay": PROCESS,
    "intersection_type_setting": PROCESS,
    "active_replicates_setting": PROCESS,
    "active_coverage": PROCESS,
    "coverage_values": QUERY,
    "bin_values": QUERY,
    "decay_values": QUERY,
    "flat_bins": PROCESS,
    "in_group_setting": PROCESS,
    "normalized_bins": PROCESS,
    "dist_dep_dec_normalized_bins": PROCESS,
    "decay_cds": EXPORT,
    "color_palette": PROCESS,
    "between_group_setting": PROCESS,
    "combined_bins": PROCESS,
    "colored_bins": EXPORT,
    "active_annotation": PROCESS,
    "annotation_values": QUERY,
    "annotation_cds": EXPORT,
    "annotation_color_palette": EXPORT,
    "active_annotation_cds": EXPORT,
    "heatmap_cds": EXPORT,
    "heatmap_export": EXPORT,
    "track_export": EXPORT,
    "scaled_bins": PROCESS,
    "ticks": PROCESS,
    "coverage_tracks": PROCESS,
    "rendered_palette": PROCESS,
    "anno_filters": PROCESS,
    "longest_common_substring": PROCESS,
    "canvas_size": PROCESS,
    "mapping_quality_setting": PROCESS,
    "directionality_setting": PROCESS,
    "ranked_slices_cds": EXPORT,
    "grid_seq_coverage": PROCESS,
    "radicl_coverage": PROCESS,
    "rna_associated_genes_filter": PROCESS,
    "rna_associated_background": PROCESS,
    "grid_seq_samples": PROCESS,
    "radicl_seq_samples": PROCESS,
    "dataset_id_per_repl": PROCESS,
    "active_chroms_length": PROCESS,
    "virtual4c_coords": PROCESS,
    "flat_4c": PROCESS,
    "ice_coords": PROCESS,
}

def avg_runtime_by_group(d):
    ret = {x: 0 for x in GROUP_LIST}
    for key, value in d.items():
        if GROUPS[key] in ret:
            #value.sort()
            #ret[GROUPS[key]] += value[len(value)//2]
            ret[GROUPS[key]] += sum(value) / len(value)
    return ret

def total_runtimes(d, percentile_to_exclude=0.1):
    runtimes = [sum(l) for l in zip(*d.values())]
    runtimes.sort()
    runtimes = runtimes[int(len(runtimes)*(percentile_to_exclude)):int(len(runtimes)*(1-percentile_to_exclude))]
    return runtimes

def plot_runtime_orgas(data, subsample=SUBSAMPLE_ORGA, params=NO_PARAMS[0], reso=RESO[2]):
    f = figure(x_axis_label="genome size", y_axis_label="runtime (ms)", title=subsample + " " + params + " " + reso,
               x_axis_type="log")

    conf_x_axis_ticker(f, True)
    f.xaxis.major_label_orientation = pi/4

    def pick_data(orga):
        return data[orga][subsample][params][reso]

    orga_list = [orga for orga in ORGANISMS if pick_data(orga)["build_status"] == "OK" and pick_data(orga)["query_status"] == "OK"]

    avg_runtimes = [avg_runtime_by_group(pick_data(orga)["query_times"]) for orga in orga_list]

    for idx, group in list(enumerate(GROUP_LIST))[::-1]:
        xs = [data[orga]["genome_size"] for orga in orga_list]
        ys1 = [sum(x[g] for g in GROUP_LIST[:idx])/1000 for x in avg_runtimes]
        ys2 = [sum(x[g] for g in GROUP_LIST[:idx+1])/1000 for x in avg_runtimes]
        
        f.varea(xs, ys1, ys2, color=COLOR_PALETTE[idx], legend_label=group)

    f.legend.title = "min res | params"
    f.legend.location = "bottom_right"
    show(f)

for reso in RESO:
    for params in NO_PARAMS:
        plot_runtime_orgas(data, params=params, reso=reso)

KeyError: True

In [None]:
def plot_runtime_subsample(data):
    f = figure(x_axis_label="number of unique interactions", y_axis_label="runtime (ms)")
    
    def pick_data(samples):
        return data[ORGA_SUBSAMPLE][samples][NO_PARAMS[0]][RESO[2]]

    f.xaxis.ticker = FixedTicker(ticks=[data[ORGA_SUBSAMPLE][samples]["num_unique_interactions"] for samples in SUBSAMPLES_2])
    f.xaxis.major_label_overrides = {data[ORGA_SUBSAMPLE][samples]["num_unique_interactions"]: str(int(float(samples)*100) if float(samples) >= 0.01 else float(samples)*100) + "%: " + to_readable_num_interactions(data[ORGA_SUBSAMPLE][samples]["num_unique_interactions"]) for samples in SUBSAMPLES_2}
    f.xaxis.major_label_orientation = pi/4


    samples_list = [samples for samples in SUBSAMPLES_2 if pick_data(samples)["build_status"] == "OK" and pick_data(samples)["query_status"] == "OK"]

    avg_runtimes = [avg_runtime_by_group(pick_data(samples)["query_times"]) for samples in samples_list]

    for idx, group in list(enumerate(GROUP_LIST))[::-1]:
        xs = [data[ORGA_SUBSAMPLE][samples]["num_unique_interactions"] for samples in SUBSAMPLES_2]
        ys1 = [sum(x[g] for g in GROUP_LIST[:idx])/1000 for x in avg_runtimes]
        ys2 = [sum(x[g] for g in GROUP_LIST[:idx+1])/1000 for x in avg_runtimes]
        
        f.varea(xs, ys1, ys2, color=COLOR_PALETTE[idx], legend_label=group)

    for samples in SUBSAMPLES_2:
        ys = [x/1000 for x in total_runtimes(pick_data(samples)["query_times"], 0)]
        xs = [data[ORGA_SUBSAMPLE][samples]["num_unique_interactions"]] * len(ys)

        f.x(xs, ys, color="black", alpha=0.1, size=10)

    f.legend.title = "min res | params"
    f.legend.location = "bottom_right"
    show(f)

plot_runtime_subsample(data)



KeyError: 'query_times'

In [None]:
def plot_runtime_dimensions(data):
    f = figure(x_range=FactorRange(*PARAMS_TUPLE), x_axis_label="index parameters", y_axis_label="runtime (ms)")
    
    def pick_data(params):
        return data[ORGA_SUBSAMPLE][SUBSAMPLE_ORGA][params][RESO[2]]

    avg_runtimes = [avg_runtime_by_group(pick_data(params)["query_times"]) for params in PARAMS if pick_data(params)["build_status"] == "OK" and pick_data(params)["query_status"] == "OK"]

    for idx, group in list(enumerate(GROUP_LIST))[::-1]:
        xs = [params for key, params in zip(PARAMS, PARAMS_TUPLE) if pick_data(key)["build_status"] == "OK" and pick_data(key)["query_status"] == "OK"]
        ys1 = [sum(x[g] for g in GROUP_LIST[:idx])/1000 for x in avg_runtimes]
        ys2 = [sum(x[g] for g in GROUP_LIST[:idx+1])/1000 for x in avg_runtimes]
        
        f.vbar(x=xs, top=ys2, bottom=ys1, color=COLOR_PALETTE[idx], width=0.9, legend_label=group)

    f.legend.title = "min res | params"
    f.legend.location = "bottom_right"
    show(f)

plot_runtime_dimensions(data)

In [None]:
def plot_speed_by_index_size(organisms=ORGANISMS, params=NO_PARAMS, resos=RESO, c_idx=0, s_idx=1, w_idx=2):
    f = figure(x_axis_type="log", y_axis_type="log", x_axis_label="index size (MB)", y_axis_label="query time (ms)")
    #conf_y_axis_time(f)
    conf_x_axis_bytes(f)

    for idx, orga in enumerate(organisms):
        for idx_2, param in enumerate(params):
            xs = []
            ys = []
            for idx_3, reso in enumerate(resos):
                idxs = [idx, idx_2, idx_3]
                idx_c = idxs[c_idx]
                idx_s = idxs[s_idx]
                if data[orga][SUBSAMPLE_ORGA][param][reso]["build_status"] == "OK" and data[orga][SUBSAMPLE_ORGA][param][reso]["query_status"] == "OK":
                    x = data[orga][SUBSAMPLE_ORGA][param][reso]["index_size"]
                    #y = sum(avg_runtime_by_group(data[orga][SUBSAMPLE_ORGA][param][reso]["query_times"]).values())/1000
                    y = [x / 1000 for x in total_runtimes(data[orga][SUBSAMPLE_ORGA][param][reso]["query_times"])]
                    x = [x] *len(y)
                    xs.append(x)
                    ys.append(y)
                    size = idxs[w_idx] * 2 + 8
                    label = ""
                    if 0 in [c_idx, s_idx]:
                        label += orga + " "
                    if 1 in [c_idx, s_idx]:
                        label += param + " "
                    if 2 in [c_idx, s_idx]:
                        label += str(int(reso)//1000) + "kb"
                    f.scatter(marker=SCATTER_PALETTE[idx_s], x=x, y=y, 
                              fill_color=None, line_color=COLOR_PALETTE[idx_c % len(COLOR_PALETTE)], legend_label=label, 
                              size=size)
            #f.line(xs, ys, color=COLOR_PALETTE[idx], legend_label=orga + " " + params)
    f.legend.location = "bottom_right"
    show(f)

plot_speed_by_index_size()
plot_speed_by_index_size(organisms=[ORGA_SUBSAMPLE], params=PARAMS, c_idx=2, s_idx=2, w_idx=1)