# Test normalizations
## basic settings first

In [9]:
import libsmoother

from bokeh.plotting import figure
from bokeh.palettes import viridis
from bokeh.io import show, output_notebook

import json
import sys
import pickle

# Direct Bokeh's show() output into the notebook's output cells.
output_notebook()

# Length units, expressed in base pairs.
KBP = 10**3
MBP = KBP * 1000

# Parameter grid for the comparison below.
# Wider window sweep (currently disabled): [500 * KBP, 1 * MBP, 5 * MBP]
WINDOW_SIZES = [KBP * 500]
BIN_SIZES = [KBP * 50, KBP * 100]
# Finer sample sweep (currently disabled): [2**x for x in range(16)]
NUM_SAMPLES = [10**exponent for exponent in range(5)]
# Number of windows queried along each axis.
n_windows = 5

## define basic evaluation functions

In [10]:
import json
import sys

def lib_sps_print(s):
    """No-op callback: ignores ``s`` and returns ``None``.

    Passed to the quarry query methods in this notebook wherever they
    expect a callable (presumably their print/log sink - TODO confirm).
    """
    return None

def conf_quarry_basic(quarry):
    """Load libsmoother's default settings into ``quarry`` and apply baseline overrides.

    The default JSON shipped with libsmoother replaces the whole
    ``["settings"]`` subtree; afterwards a fixed list of filter, interface
    and normalization values is written on top of it, in the order listed.

    Args:
        quarry: object exposing ``set_value(path, value)``.
    """
    with libsmoother.open_default_json() as defaults_handle:
        quarry.set_value(["settings"], json.load(defaults_handle))

    # (path below "settings", value) pairs, applied in this exact order.
    overrides = (
        (["filters", "symmetry"], "mirror"),
        (["filters", "cut_off_bin"], "smaller"),
        (["filters", "show_contig_smaller_than_bin"], True),
        (["interface", "fixed_bin_size"], True),
        (["interface", "add_draw_area", "val"], 0),
        (["normalization", "scale"], "dont"),
        (["normalization", "log_base"], 0),
    )
    for sub_path, value in overrides:
        quarry.set_value(["settings"] + sub_path, value)

def conf_quarry_data(quarry):
    """Switch ``quarry`` to raw-value output: scale ``"dont"`` and log base ``0``.

    Args:
        quarry: object exposing ``set_value(path, value)``.
    """
    raw_settings = (
        (["settings", "normalization", "scale"], "dont"),
        (["settings", "normalization", "log_base"], 0),
    )
    for path, value in raw_settings:
        quarry.set_value(path, value)

def conf_quarry_heatmap(quarry):
    """Switch ``quarry`` to heatmap output: scale ``"max"`` and log base ``10``.

    Args:
        quarry: object exposing ``set_value(path, value)``.
    """
    heatmap_settings = (
        (["settings", "normalization", "scale"], "max"),
        (["settings", "normalization", "log_base"], 10),
    )
    for path, value in heatmap_settings:
        quarry.set_value(path, value)


def set_bin_size(quarry, bin_size):
    """Set the fixed bin size on both axes, converted to index units.

    ``bin_size`` is divided by the value stored under ``["dividend"]`` in
    ``quarry`` (integer division, clamped to at least 1). A warning is
    written to stderr when the division is not exact and when the dividend
    exceeds ``bin_size``.

    Args:
        quarry: object exposing ``get_value(path)`` and ``set_value(path, value)``.
        bin_size: requested bin size, in the same unit as the dividend.
    """
    dividend = quarry.get_value(["dividend"])
    quotient, remainder = divmod(bin_size, dividend)
    if remainder != 0:
        print("WARNING: uneven division by index dividend", file=sys.stderr)
    if dividend > bin_size:
        print("WARNING: dividend larger than value", file=sys.stderr)

    # Never let the bin size collapse to zero index units.
    index_bin_size = quotient if quotient > 1 else 1
    for axis_key in ("fixed_bin_size_x", "fixed_bin_size_y"):
        quarry.set_value(["settings", "interface", axis_key, "val"], index_bin_size)

def tsv_to_ret(tsv):
    """Split exported rows into sorted keys and their matching values.

    Each row is split into ``(row[:-1], row[-1])``; the pairs are sorted
    and returned as two parallel lists.

    Args:
        tsv: iterable of sliceable rows whose last element is the value.

    Returns:
        ``(keys, values)`` - two lists of equal length, in sorted pair order.
    """
    pairs = sorted((row[:-1], row[-1]) for row in tsv)
    keys = []
    values = []
    for key, value in pairs:
        keys.append(key)
        values.append(value)
    return keys, values

def quarry_whole_window(quarry, window_size, n_windows):
    """Query the full ``n_windows * window_size`` region in a single request.

    The visible area starts at 0 on both axes and ends at
    ``window_size * n_windows``, capped by the canvas size. The region is
    exported once with the raw-data settings and once rendered with the
    heatmap settings.

    Args:
        quarry: libsmoother quarry to configure and query.
        window_size: edge length of one window.
        n_windows: number of windows along each axis.

    Returns:
        ``(tsv, heatmap)`` where ``tsv`` is the ``tsv_to_ret`` result of the
        export and ``heatmap`` is whatever ``quarry.get_heatmap`` returns.
    """
    canvas_x, canvas_y = quarry.get_canvas_size(lib_sps_print)
    requested_extent = window_size * n_windows

    area_axes = (
        ("x_start", "x_end", canvas_x),
        ("y_start", "y_end", canvas_y),
    )
    for start_key, end_key, canvas_len in area_axes:
        quarry.set_value(["area", start_key], 0)
        quarry.set_value(["area", end_key], min(requested_extent, canvas_len))

    # Raw values first ...
    conf_quarry_data(quarry)
    exported_rows = quarry.get_heatmap_export(lib_sps_print)
    sorted_export = tsv_to_ret(exported_rows)

    # ... then the rendered heatmap for the same area.
    conf_quarry_heatmap(quarry)
    rendered = quarry.get_heatmap(lib_sps_print)
    return sorted_export, rendered

def quarry_chunked_window(quarry, window_size, n_windows):
    """Query the region tile by tile and stitch the results together.

    The region ``[0, min(window_size * n_windows, canvas size))`` is walked
    in steps of ``window_size`` on both axes. For every tile the raw export
    rows are collected and the rendered heatmap is merged into the heatmap
    of the first tile by extending its per-key lists.

    Args:
        quarry: libsmoother quarry to configure and query.
        window_size: edge length of one tile.
        n_windows: maximum number of tiles along each axis.

    Returns:
        ``(tsv, heatmap)`` where ``tsv`` is the ``tsv_to_ret`` result over
        all collected rows and ``heatmap`` is the merged heatmap (``None``
        when no tile was visited).
    """
    canvas_x, canvas_y = quarry.get_canvas_size(lambda _msg: None)
    requested_extent = window_size * n_windows
    x_limit = min(requested_extent, canvas_x)
    y_limit = min(requested_extent, canvas_y)

    collected_rows = []
    merged_heatmap = None
    for tile_x in range(0, x_limit, window_size):
        quarry.set_value(["area", "x_start"], tile_x)
        quarry.set_value(["area", "x_end"], tile_x + window_size)
        for tile_y in range(0, y_limit, window_size):
            quarry.set_value(["area", "y_start"], tile_y)
            quarry.set_value(["area", "y_end"], tile_y + window_size)

            conf_quarry_data(quarry)
            collected_rows.extend(quarry.get_heatmap_export(lib_sps_print))

            conf_quarry_heatmap(quarry)
            tile_heatmap = quarry.get_heatmap(lib_sps_print)
            if merged_heatmap is None:
                # The first tile's heatmap becomes the accumulator.
                merged_heatmap = tile_heatmap
                continue
            for key, val in tile_heatmap.items():
                merged_heatmap[key].extend(val)
    return tsv_to_ret(collected_rows), merged_heatmap

## Load index and compute correlation data

In [11]:
# Open the smoother index (path is relative to the notebook's working
# directory) and apply the baseline configuration.
quarry = libsmoother.Quarry("../../smoother_out/radicl.smoother_index")
conf_quarry_basic(quarry)
n_windows = 5

# Result containers, keyed as [bin_size][window_size]; the global ICE
# results carry one extra level: [bin_size][window_size][num_samples].
cooler_bins = {}
local_ice_bins = {}
global_ice_bins = {}

# NOTE(review): set_bin_size, quarry_whole_window and quarry_chunked_window
# are called through the `libsmoother` namespace below, although functions
# with the same names are defined in this notebook - confirm which
# implementation is intended and that their return shapes agree.
for bin_size in BIN_SIZES:
    print("bin_size", bin_size)
    cooler_bins[bin_size] = {}
    local_ice_bins[bin_size] = {}
    global_ice_bins[bin_size] = {}
    for window_size in WINDOW_SIZES:
        print("window_size", window_size)

        libsmoother.set_bin_size(quarry, bin_size)
        # Reference run ("cool-ice"; presumably cooler's ICE - TODO confirm),
        # queried as one whole window. Used as ground truth further down.
        print("cooler")
        quarry.set_value(["settings", "normalization", "normalize_by"], "cool-ice")
        bins, cooler_bins[bin_size][window_size] = libsmoother.quarry_whole_window(quarry, window_size, n_windows)

        # ICE with ice_local=True, again over the whole window.
        print("local-ice")
        quarry.set_value(["settings", "normalization", "normalize_by"], "ice")
        quarry.set_value(["settings", "normalization", "ice_local"], True)
        bins2, local_ice_bins[bin_size][window_size] = libsmoother.quarry_whole_window(quarry, window_size, n_windows)
        # Both runs must report the same bins so values can be compared pairwise.
        assert bins == bins2

        # ICE with ice_local=False, queried in chunks, once per sample count.
        quarry.set_value(["settings", "normalization", "normalize_by"], "ice")
        quarry.set_value(["settings", "normalization", "ice_local"], False)
        global_ice_bins[bin_size][window_size] = {}
        
        
        for num_samples in NUM_SAMPLES:
            print("global-ice", num_samples)
            quarry.set_value(["settings", "normalization", "num_ice_bins", "val"], num_samples)
            
            bins3, global_ice_bins[bin_size][window_size][num_samples] = libsmoother.quarry_chunked_window(quarry, window_size, n_windows)
            # The chunked query must cover exactly the same bins as the whole-window query.
            assert bins == bins3

bin_size 50000
window_size 500000
cooler
local-ice
global-ice 1
global-ice 10
global-ice 100
global-ice 1000
global-ice 10000
bin_size 100000
window_size 500000
cooler
local-ice
global-ice 1
global-ice 10
global-ice 100
global-ice 1000
global-ice 10000


## Investigate the point scatter plot for one bin and window size

Expect a bad correlation for a low number of samples; it should then gradually improve with the number of samples.

In [12]:
import random
from bokeh.models import Legend

def get_mean_dev(ground_truth, points):
    """Return the mean absolute deviation of ``points`` from ``ground_truth``.

    Values are paired positionally with ``zip`` and the summed absolute
    differences are divided by ``len(points)``.

    Args:
        ground_truth: iterable of reference values.
        points: sized iterable of values to compare.

    Returns:
        The summed absolute differences divided by the number of points.
    """
    abs_diffs = (abs(truth - value) for truth, value in zip(ground_truth, points))
    return sum(abs_diffs) / len(points)


def plot_scatter_points(ground_truth, data, title, max_points=1000):
    """Show a log-log scatter plot of sample values against ground-truth values.

    For every ``(name, points)`` series in ``data`` a random subset of the
    positionally paired ``(ground_truth, points)`` values is drawn as dots;
    the legend entry carries the mean deviation over the whole series. A
    black diagonal marks perfect agreement.

    Args:
        ground_truth: reference values (x axis).
        data: list of ``(name, points)`` series; ``points`` are the y values.
        title: figure title.
        max_points: upper bound on the number of dots drawn per series.
    """
    ALMOST_ZERO = 10**-5  # lower axis bound; a log axis cannot start at 0
    palette = viridis(len(data))
    f = figure(
            title=title, 
            x_axis_type="log", 
            y_axis_type="log", 
            x_range=(ALMOST_ZERO, 1), 
            y_range=(ALMOST_ZERO, 1),
            sizing_mode="stretch_width",
        )
    f.line(x=[ALMOST_ZERO,1], y=[ALMOST_ZERO,1], color="black")
    items = []
    for idx, (name, points) in enumerate(data):
        pairs = list(zip(ground_truth, points))
        # Size the sample from the zipped population: zip() truncates to the
        # shorter input, and random.sample() raises ValueError when asked for
        # more elements than the population holds.
        sampled = random.sample(pairs, min(len(pairs), max_points))
        xs = [x for x, _ in sampled]
        ys = [y for _, y in sampled]
        mean_dev = round(get_mean_dev(ground_truth, points), 3)
        d = f.dot(x=xs, y=ys, color=palette[idx], size=25, alpha=0.5)
        items.append((name + " dev: " + str(mean_dev), [d]))

    f.xaxis.axis_label = "ground truth"
    f.yaxis.axis_label = "sample"
    f.add_layout(Legend(items=items, location="center", click_policy="hide"), "right")
    show(f)

# Chunked global ICE (one series per sample count) against the cooler
# ground truth, for the first bin size and window size.
plot_scatter_points(
    ground_truth=cooler_bins[BIN_SIZES[0]][WINDOW_SIZES[0]],
    data=[
        ("global ice bins - num samples= " + str(sample_count), series)
        for sample_count, series in global_ice_bins[BIN_SIZES[0]][WINDOW_SIZES[0]].items()
    ],
    title="chunked global ice vs cooler",
)

- mean deviation becomes smaller with increasing number of samples
- it does not seem to tend towards zero but towards 0.001, which is odd
  - therefore, check my icing implementation vs. cooler:

In [13]:
# Local ICE against the cooler ground truth, for the first bin size and window size.
plot_scatter_points(
    ground_truth=cooler_bins[BIN_SIZES[0]][WINDOW_SIZES[0]],
    data=[("local ice bins", local_ice_bins[BIN_SIZES[0]][WINDOW_SIZES[0]])],
    title="local ice vs cooler",
)

- equally, this has a mean deviation of 0.001, so it's not the number of samples but some glitch in the code

## Plot correlation as a function of the number of samples



In [14]:
def corr_as_func_of_samples(conditions):
    """Show the mean deviation as a function of the number of samples, one curve per condition.

    Args:
        conditions: iterable of ``(ground_truth, sample, name)`` tuples where
            ``sample`` maps a sample count to the values measured with it
            and ``name`` is the legend label of the curve.
    """
    palette = viridis(len(conditions))
    f = figure(x_axis_type="log", y_axis_type="log")
    for idx, (ground_truth, sample, name) in enumerate(conditions):
        color = palette[idx]
        xs = list(sample.keys())
        ys = [get_mean_dev(ground_truth, points) for points in sample.values()]
        # Line plus "x" markers, sharing one legend entry.
        f.line(x=xs, y=ys, color=color, legend_label=name)
        f.x(x=xs, y=ys, color=color, legend_label=name)
    f.xaxis.axis_label = "number of samples"
    f.yaxis.axis_label = "mean deviation"
    f.legend.click_policy="hide"
    show(f)

# One condition per (bin size, window size) combination:
# (ground truth, {num_samples: values}, legend label).
conditions = [
    (
        cooler_bins[bin_sz][window_sz],
        global_ice_bins[bin_sz][window_sz],
        "ICE bin_size=" + str(bin_sz // KBP) + "k window_size=" + str(window_sz // KBP) + "k",
    )
    for bin_sz in BIN_SIZES
    for window_sz in WINDOW_SIZES
]

corr_as_func_of_samples(conditions)


- window size does not affect results
- bin size does

In [15]:
# Scatter plot for the second bin size. Ground truth and samples must be
# taken at the same bin size; otherwise values of different resolutions get
# paired positionally and the comparison is meaningless.
plot_scatter_points(
    cooler_bins[BIN_SIZES[1]][WINDOW_SIZES[0]], 
    [("global ice bins - num samples= " + str(x), y) 
     for x, y in global_ice_bins[BIN_SIZES[1]][WINDOW_SIZES[0]].items()], 
    "chunked global ice vs cooler")