# Test normalizations

In [1]:
import libsmoother

from bokeh.plotting import figure
from bokeh.palettes import viridis
from bokeh.io import show, output_notebook

import pickle

# Configure Bokeh to render figures passed to show() inline in the notebook.
output_notebook()

# File name for pickled statistics; not referenced by the cells visible here
# (presumably a cache of computed results -- TODO confirm).
filename_1 = "norm_corr_1.pcl"
# Multiplier of 1000 used to write bin and window sizes (presumably
# kilo-base-pairs -- TODO confirm).
KBP = 1000

## Load index and compute correlation data

The code can be toggled to either compute new statistics or load an existing statistics file.

In [4]:
# Open the smoother index and run libsmoother's configuration step on it.
quarry = libsmoother.Quarry("../../smoother_out/radicl.smoother_index")
libsmoother.conf_quarry(quarry)
n_windows = 5


def _set_normalization(*path, value):
    """Write ``value`` to the quarry setting settings/normalization/<path...>."""
    quarry.set_value(["settings", "normalization", *path], value)


# Every (bin size, window size) combination to evaluate; currently only one.
_size_combinations = [
    (b, w)
    for b in [50 * KBP]
    for w in [1000 * KBP]
]

for bin_size, window_size in _size_combinations:
    libsmoother.set_bin_size(quarry, bin_size)

    # Reference run with the "cool-ice" normalization; its bin coordinates
    # are what every later run is asserted against.
    print("cooler")
    _set_normalization("normalize_by", value="cool-ice")
    bins, cooler_bins = libsmoother.quarry_whole_window(
        quarry, window_size, n_windows
    )
    print(bins[:10])

    # "ice" normalization with the ice_local setting switched on.
    print("local-ice")
    _set_normalization("normalize_by", value="ice")
    _set_normalization("ice_local", value=True)
    bins2, local_ice_bins = libsmoother.quarry_whole_window(
        quarry, window_size, n_windows
    )
    assert bins == bins2

    # "ice" normalization with ice_local switched off, repeated for an
    # increasing num_ice_bins value; results are keyed by that value.
    print("global-ice")
    _set_normalization("normalize_by", value="ice")
    _set_normalization("ice_local", value=False)
    global_ice_bins = {}
    global_ice_bins_chunked = {}

    for num_samples in (10**exponent for exponent in range(5)):
        print(num_samples)
        _set_normalization("num_ice_bins", "val", value=num_samples)

        bins3, whole_values = libsmoother.quarry_whole_window(
            quarry, window_size, n_windows
        )
        global_ice_bins[num_samples] = whole_values
        assert bins == bins3

        bins4, chunked_values = libsmoother.quarry_chunked_window(
            quarry, window_size, n_windows
        )
        global_ice_bins_chunked[num_samples] = chunked_values
        assert bins == bins4

cooler
[('Chr10_3A_Tb427v10', 0, 50000, 'Chr10_3A_Tb427v10', 0, 50000), ('Chr10_3A_Tb427v10', 0, 50000, 'Chr10_3A_Tb427v10', 50000, 100000), ('Chr10_3A_Tb427v10', 0, 50000, 'Chr10_3A_Tb427v10', 100000, 150000), ('Chr10_3A_Tb427v10', 0, 50000, 'Chr10_3A_Tb427v10', 150000, 200000), ('Chr10_3A_Tb427v10', 0, 50000, 'Chr10_3A_Tb427v10', 200000, 250000), ('Chr10_3A_Tb427v10', 0, 50000, 'Chr10_3A_Tb427v10', 250000, 300000), ('Chr10_3A_Tb427v10', 0, 50000, 'Chr10_3A_Tb427v10', 300000, 350000), ('Chr10_3A_Tb427v10', 0, 50000, 'Chr10_3A_Tb427v10', 350000, 400000), ('Chr10_3A_Tb427v10', 0, 50000, 'Chr10_3A_Tb427v10', 400000, 450000), ('Chr10_3A_Tb427v10', 0, 50000, 'Chr10_3A_Tb427v10', 450000, 500000)]
local-ice
global-ice
1
10
100
1000
10000


## Investigate the point scatter plot for one bin and window size

Expect a bad correlation for a low number of samples; it should then gradually improve as the number of samples increases.

In [16]:
import random
from bokeh.models import Legend

def plot_scatter_points(ground_truth, data, title):
    """Show a log-log scatter plot of one or more series against a reference.

    Args:
        ground_truth: Sequence of reference values, plotted on the x axis.
        data: Sequence of ``(name, points)`` pairs. Each ``points`` sequence
            is matched element-wise with ``ground_truth`` and plotted on the
            y axis in its own colour.
        title: Title of the figure.

    Each series is drawn from a random sample of at most 1000 value pairs,
    while the mean absolute deviation shown in its legend entry is computed
    over all pairs. A series without any pairs gets a deviation of ``nan``.
    """
    ALMOST_ZERO = 10**-5
    MAX_POINTS = 1000  # upper bound on the number of dots drawn per series
    palette = viridis(len(data))
    f = figure(
        title=title,
        x_axis_type="log",
        y_axis_type="log",
        x_range=(ALMOST_ZERO, 1),
        y_range=(ALMOST_ZERO, 1),
        sizing_mode="stretch_width",
    )
    # Diagonal y = x: points on this line match the ground truth exactly.
    f.line(x=[ALMOST_ZERO, 1], y=[ALMOST_ZERO, 1], color="black")
    items = []
    for idx, (name, points) in enumerate(data):
        # Pair the values once. zip() stops at the shorter input, so both the
        # sample size and the mean below must be based on len(pairs); using
        # len(points) made random.sample() raise when ground_truth was shorter.
        pairs = list(zip(ground_truth, points))
        sampled = random.sample(pairs, min(len(pairs), MAX_POINTS))
        xs = [x for x, _ in sampled]
        ys = [y for _, y in sampled]

        if pairs:
            mean_dev = round(sum(abs(a - b) for a, b in pairs) / len(pairs), 3)
        else:
            # Nothing to compare: avoid dividing by zero.
            mean_dev = float("nan")

        d = f.dot(x=xs, y=ys, color=palette[idx], size=25, alpha=0.5)
        items.append((name + " dev: " + str(mean_dev), [d]))

    f.xaxis.axis_label = "ground truth"
    f.yaxis.axis_label = "sample"
    # The legend sits to the right of the plot; clicking an entry hides it.
    f.add_layout(Legend(items=items, location="center", click_policy="hide"), "right")
    show(f)

def _label_by_num_samples(results):
    """Turn a {num_samples: values} dict into a list of (label, values) pairs."""
    return [
        (f"global ice bins - num samples= {num_samples!s}", values)
        for num_samples, values in results.items()
    ]


# Compare each normalization variant against the cooler result.
plot_scatter_points(
    ground_truth=cooler_bins,
    data=[("local ice bins", local_ice_bins)],
    title="local ice vs cooler",
)
plot_scatter_points(
    ground_truth=cooler_bins,
    data=_label_by_num_samples(global_ice_bins),
    title="global ice vs cooler",
)
plot_scatter_points(
    ground_truth=cooler_bins,
    data=_label_by_num_samples(global_ice_bins_chunked),
    title="chunked global ice vs cooler",
)
# Compare the chunked runs against the whole-window run with 10**4 samples.
plot_scatter_points(
    ground_truth=global_ice_bins[10**4],
    data=_label_by_num_samples(global_ice_bins_chunked),
    title="chunked global ice vs global ice",
)
# Disabled leftovers: they read norm_corr and pass only two arguments.
#plot_scatter_points(norm_corr["ICE"][10*KBP][1000*KBP], "10k bin_size, 1M window_size")
#plot_scatter_points(norm_corr["ICE"][100*KBP][5000*KBP], "10k bin_size, 5M window_size")

## Plot correlation as a function of the number of samples



In [None]:
# Draw one figure per normalization name in norm_corr, with one curve per
# (bin size, window size) pair: the mean absolute deviation as a function of
# the number of samples. norm_corr is read as
# {name: {bin_size: {window_size: {num_samples: [(a, b), ...]}}}}.

# One colour per curve, counted over all figures; idx keeps running across
# figures so that no colour is reused.
palette = viridis(
    sum(len(by_window) for by_bin in norm_corr.values() for by_window in by_bin.values())
)
idx = 0
for name, data_2 in norm_corr.items():
    f = figure(y_axis_type="log", x_axis_type="log")
    for bin_size, data_3 in data_2.items():
        for window_size, data_4 in data_3.items():
            xs = []
            ys = []
            for num_samples, points in data_4.items():
                xs.append(num_samples)
                ys.append(sum(abs(a - b) for a, b in points) / len(points))
            label = f"{name} b: {bin_size // 1000}k w: {window_size // 1000}k"
            # legend_label replaces the `legend` keyword, which Bokeh
            # deprecated in 1.4 and removed in 3.0.
            f.line(x=xs, y=ys, color=palette[idx], legend_label=label)
            f.x(x=xs, y=ys, color=palette[idx], legend_label=label)
            idx += 1
    f.xaxis.axis_label = "number of samples"
    f.yaxis.axis_label = "mean deviation"
    # Clicking a legend entry hides the corresponding curve.
    f.legend.click_policy = "hide"
    show(f)


## Plot average correlation

In [None]:
# Draw one curve per normalization name in norm_corr: the "corr" value
# averaged over all (bin size, window size) pairs, as a function of the number
# of samples. norm_corr is read as
# {name: {bin_size: {window_size: {num_samples: {"corr": value}}}}}.
f = figure()
# Only one curve is drawn per name, so the palette is sized by the number of
# names. Sizing it by the number of (name, bin, window) combinations and then
# using just the first few entries gave nearly identical colours.
palette = viridis(len(norm_corr))
for idx, (name, data_2) in enumerate(norm_corr.items()):
    # The sample counts of the first (bin size, window size) entry are used as
    # x values; every other entry is indexed with the same keys below.
    first_by_window = next(iter(data_2.values()))
    xs = list(next(iter(first_by_window.values())))
    ys = []
    for x in xs:
        yss = [
            data_4[x]["corr"]
            for data_3 in data_2.values()
            for data_4 in data_3.values()
        ]
        ys.append(sum(yss) / len(yss))
    # legend_label replaces the `legend` keyword, which Bokeh deprecated in
    # 1.4 and removed in 3.0.
    f.line(x=xs, y=ys, color=palette[idx], legend_label=name)
    f.x(x=xs, y=ys, color=palette[idx], legend_label=name)
show(f)