# Analyzing the improved genome assembly

## Run the notebook server on a client node

```
    srun --pty --nodes=1  --ntasks-per-node=1 --cpus-per-task=28 --time 100:00:00 --job-name bash-jupyter bash
    conda activate ont_assembly
    jupyter notebook --ip 0.0.0.0 --port 3001 --no-browser
```

## bokeh imports

In [1]:
from bokeh.plotting import figure
from bokeh.palettes import viridis
from bokeh.io import show, output_notebook
from bokeh.models import ColumnDataSource, HoverTool, FactorRange, LabelSet, Whisker
from bokeh.models.tickers import FixedTicker
from bokeh.transform import jitter
from bokeh.layouts import column, row, gridplot
from math import pi
from bokeh.layouts import row
import random

# Colours cycled through when assigning one colour per gap group further down.
# NOTE(review): these hex values match the colour-blind-safe Okabe-Ito palette (without black) - confirm intent.
COLOR_PALETTE = ["#0072B2", "#D55E00", "#009E73", "#E69F00", "#CC79A7", "#56B4E9", "#F0E442"]

# Route bokeh's show() output into the notebook instead of a separate HTML file.
output_notebook()

## load the data

In [3]:
def load_dist_dev(file_name, max_dist=None):
    """Load per-read distance deviations from a whitespace-separated text file.

    Every data line must consist of exactly two fields: a read name followed
    by an integer distance. Lines whose first character is "#" are treated as
    comments. Blank and whitespace-only lines are skipped instead of raising.

    Args:
        file_name: Path of the file to read.
        max_dist: Optional exclusive upper bound. Entries whose distance is
            greater than or equal to it are dropped; ``None`` keeps all entries.

    Returns:
        A tuple ``(names, distances)`` of two parallel lists in file order.

    Raises:
        ValueError: If a data line does not have exactly two fields or its
            second field is not an integer.
    """
    names = []
    distances = []
    with open(file_name, "r") as in_file:
        for line in in_file:
            if line.startswith("#"):
                continue
            fields = line.split()
            if not fields:
                # Tolerate empty lines (e.g. a trailing newline at the end of the file).
                continue
            readname, distance_text = fields
            # Parse once and reuse for both the filter and the stored value.
            distance = int(distance_text)
            if max_dist is None or distance < max_dist:
                names.append(readname)
                distances.append(distance)
    return names, distances

# Read names and their distance deviations for the two assemblies being compared.
# NOTE(review): "referece" is kept exactly as written because it is the path opened
# at runtime - presumably the file on disk carries the same spelling; confirm.
ref_names, ref_dev = load_dist_dev("../data/out/virtual_paired_read_dist/referece.distance_deviation")
fixed_names, fixed_dev = load_dist_dev("../data/out/virtual_paired_read_dist/fixed_n.distance_deviation")

# Maps read name -> the remaining fields of its line joined by single spaces
# (one gap name, or several gap names separated by spaces).
gap_spanning_reads = {}

with open("../data/out/virtual_paired_read_dist/gap_spanning_reads", "r") as file_in:
    for line in file_in:
        # split() already discards the trailing newline. Slicing off the last
        # character first would truncate the final gap name whenever the file
        # does not end with a newline.
        fields = line.split()
        if not fields:
            # Skip blank lines instead of failing on the unpacking below.
            continue
        readname, *gaps = fields
        gap_spanning_reads[readname] = " ".join(gaps)



## post process the data

In [8]:

# Per-read lookup tables: read name -> distance deviation in each assembly.
ref_dict = dict(zip(ref_names, ref_dev))
fixed_dict = dict(zip(fixed_names, fixed_dev))

# Reads to exclude from the analysis. No filter is active at the moment, so the
# set stays empty. Filters that were tried and switched off again:
#   * drop reads that are not listed in gap_spanning_reads
#   * drop reads whose distance has not changed between reference and fixed
filtered = set()

# Keep reads present in both files, in the order of the fixed file.
readnames = [n for n in fixed_names if n in ref_dict and n not in filtered]

# figure out gap groups
gap_groups = set(gap_spanning_reads.values())

# One colour per gap group. Groups naming more than one gap are drawn black.
# Enumerate in sorted order: iterating the set directly would hand out colours
# in hash order, which for strings changes between interpreter runs.
gap_colors = {}
for idx, gap in enumerate(sorted(gap_groups)):
    if len(gap.split()) > 1:
        gap_colors[gap] = "black"
    else:
        gap_colors[gap] = COLOR_PALETTE[idx % len(COLOR_PALETTE)]

# Colour of every retained read; reads without a gap entry fall back to black.
read_colors = {}
for read in readnames:
    read_colors[read] = gap_colors[gap_spanning_reads[read]] if read in gap_spanning_reads else "black"


## Plot the results

In [9]:
# Overview plot: one point per retained read, reference deviation on x and
# fixed deviation on y, coloured by gap group. Hovering shows the gap entry
# ("none" when the read has no entry) and the read name.
overview_data = {
    "x": [ref_dict[name] for name in readnames],
    "y": [fixed_dict[name] for name in readnames],
    "c": [read_colors[name] for name in readnames],
    "h": [gap_spanning_reads.get(name, "none") for name in readnames],
    "r": list(readnames),
}

f = figure(
    title="Comparison of distance deviations",
    x_axis_label="Reference",
    y_axis_label="Fixed",
    tooltips=[("", "@h"), ("", "@r")],
)

# Diagonal x == y as a visual guide: points on it have equal deviation in both assemblies.
f.line(x=[-1.5e5, 0.5e5], y=[-1.5e5, 0.5e5], color="black")

f.scatter(
    x="x",
    y="y",
    line_color=None,
    fill_color="c",
    size=9,
    alpha=0.4,
    source=ColumnDataSource(data=overview_data),
)

show(f, notebook_handle=True)

## check some individual gaps

In [51]:
# Gap to inspect; swap in one of the alternatives below to look at another one.
#GAP_TO_CHECK = "Chr10_3A:1256kbp"
#GAP_TO_CHECK = "Chr11_core:263kbp"
#GAP_TO_CHECK = "Chr7_core:1789kbp" # on the cores there are two populations
#GAP_TO_CHECK = "Chr5_3B:286kbp"
#GAP_TO_CHECK = "Chr11_3B:266kbp"
GAP_TO_CHECK = "Chr9_core:347kbp"

# Retained reads whose gap entry is exactly the selected gap.
picked_readnames = [name for name in readnames if gap_spanning_reads.get(name) == GAP_TO_CHECK]

single_gap_data = {
    "x": [ref_dict[name] for name in picked_readnames],
    "y": [fixed_dict[name] for name in picked_readnames],
    "c": [read_colors[name] for name in picked_readnames],
    "h": [gap_spanning_reads[name] for name in picked_readnames],
    "r": picked_readnames,
}

f = figure(
    title="Comparison of distance deviations",
    x_axis_label="Reference",
    y_axis_label="Fixed",
    tooltips=[("", "@h"), ("", "@r")],
)

# Diagonal x == y for orientation, then the reads of the selected gap on top.
f.line(x=[-1.5e5, 0.5e5], y=[-1.5e5, 0.5e5], color="black")
f.scatter(
    x="x",
    y="y",
    line_color=None,
    fill_color="c",
    size=9,
    alpha=0.4,
    source=ColumnDataSource(data=single_gap_data),
)

show(f, notebook_handle=True)

## get read clusters -> turn into table of "correctly" expanded gaps

In [46]:
# Gap to inspect; swap in one of the alternatives below to look at another one.
#GAP_TO_CHECK = "Chr10_3A:1256kbp"
#GAP_TO_CHECK = "Chr11_core:263kbp"
#GAP_TO_CHECK = "Chr11_3B:266kbp"
GAP_TO_CHECK = "Chr7_core:1789kbp" # on the cores there are two populations
#GAP_TO_CHECK = "Chr5_3B:286kbp"

# Retained reads whose gap entry is exactly the selected gap.
picked_readnames = [name for name in readnames if gap_spanning_reads.get(name) == GAP_TO_CHECK]

# Plot (reference - fixed) against (fixed + reference) for each picked read,
# the same two quantities the clustering cell below groups reads by.
exploration_data = {
    "x": [ref_dict[name] - fixed_dict[name] for name in picked_readnames],
    "y": [fixed_dict[name] + ref_dict[name] for name in picked_readnames],
    "c": [read_colors[name] for name in picked_readnames],
    "h": [gap_spanning_reads[name] for name in picked_readnames],
    "r": picked_readnames,
}

f = figure(
    title="looking at how to best cluster",
    x_axis_label="Ref fixed difference",
    y_axis_label="ref fixed sum",
    tooltips=[("", "@h"), ("", "@r")],
)

f.scatter(
    x="x",
    y="y",
    line_color=None,
    fill_color="c",
    size=9,
    alpha=0.4,
    source=ColumnDataSource(data=exploration_data),
)

show(f, notebook_handle=True)

In [29]:
# Gap whose reads are clustered in this cell; the commented lines are alternatives.
#GAP_TO_CHECK = "Chr10_3A:1256kbp"
#GAP_TO_CHECK = "Chr11_core:263kbp"
GAP_TO_CHECK = "Chr7_core:1789kbp" # on the cores there are two populations
#GAP_TO_CHECK = "Chr5_3B:286kbp"

def extract_reads_for_gap(gap_name):
    """Return the retained reads whose gap entry equals ``gap_name``.

    Walks the module-level ``readnames`` in order and keeps every read that
    has an entry in ``gap_spanning_reads`` equal to ``gap_name``. Reads without
    an entry are never returned.
    """
    matching = []
    for name in readnames:
        if name not in gap_spanning_reads:
            continue
        if gap_spanning_reads[name] == gap_name:
            matching.append(name)
    return matching

# Reads of the gap selected via GAP_TO_CHECK in this cell.
picked_readnames = extract_reads_for_gap(GAP_TO_CHECK)


def cluster(l, key, max_dif):
    """Group items into runs of neighbouring key values.

    The items are ordered by ``key`` and then scanned once: a new cluster is
    started whenever the key of the next item differs by more than ``max_dif``
    from the key of the previously placed item. Clusters can therefore span
    more than ``max_dif`` in total as long as consecutive items stay close.

    Args:
        l: Iterable of items to cluster. It is not modified.
        key: Function mapping an item to a number.
        max_dif: Largest allowed key difference between consecutive items
            of the same cluster.

    Returns:
        A list of clusters (lists of items) ordered by key. An empty input
        yields an empty list.
    """
    # Sort a copy so the caller's list keeps its order.
    ordered = sorted(l, key=key)
    if not ordered:
        return []

    clustered = [[ordered[0]]]
    for item in ordered[1:]:
        # Compare against the last item placed, i.e. the nearest neighbour in key order.
        if abs(key(item) - key(clustered[-1][-1])) > max_dif:
            clustered.append([])
        clustered[-1].append(item)
    return clustered

def cluster_reads(readnames, max_ref_fixed_diff = 10, max_ref_fixed_sum = 1000):
    """Cluster reads in two passes using their reference and fixed deviations.

    First the reads are clustered by ``ref_dict[x] - fixed_dict[x]`` with
    tolerance ``max_ref_fixed_diff``. Each resulting group is then clustered
    again by ``ref_dict[x] + fixed_dict[x]`` with tolerance
    ``max_ref_fixed_sum``.

    Returns:
        A flat list of the second-pass clusters (lists of read names).
    """
    def ref_minus_fixed(name):
        return ref_dict[name] - fixed_dict[name]

    def ref_plus_fixed(name):
        return ref_dict[name] + fixed_dict[name]

    result = []
    for diff_group in cluster(readnames, ref_minus_fixed, max_ref_fixed_diff):
        result.extend(cluster(diff_group, ref_plus_fixed, max_ref_fixed_sum))
    return result

# Cluster the reads of the selected gap with the default tolerances.
clustered = cluster_reads(picked_readnames)

# Show the raw clusters and, for a quicker overview, the size of each cluster.
# NOTE(review): the first print emits every read name and makes the cell output very long.
print(clustered)
print([len(c) for c in clustered])

def filter_clusters(clusters, min_size=6):
    """Drop clusters that contain too few reads.

    Args:
        clusters: Iterable of clusters (sized collections of reads).
        min_size: Smallest number of reads a cluster needs to be kept. The
            default of 6 reproduces the previously hard-coded ``len(c) > 5``.

    Returns:
        A list with the clusters that have at least ``min_size`` members,
        in their original order.
    """
    return [c for c in clusters if len(c) >= min_size]

# Rebinds `clustered` to the size-filtered clusters; the unfiltered list printed above is no longer reachable.
clustered = filter_clusters(clustered)

def get_mean_deviation_in_clusters(clusters, in_fixed=True):
    """Return the mean distance deviation of every cluster.

    Args:
        clusters: Iterable of non-empty clusters of read names.
        in_fixed: When truthy the values are taken from ``fixed_dict``,
            otherwise from ``ref_dict``.

    Returns:
        A list with one arithmetic mean per cluster, in input order.
    """
    deviations = fixed_dict if in_fixed else ref_dict
    means = []
    for members in clusters:
        total = sum(deviations[name] for name in members)
        means.append(total / len(members))
    return means

# Mean deviation per remaining cluster: first from fixed_dict, then from ref_dict.
print(get_mean_deviation_in_clusters(clustered))
print(get_mean_deviation_in_clusters(clustered, False))


[['8b06da5d-716e-4be7-b505-bdc3202b98b9', 'c32915f8-1f2e-4b5b-b27c-b7675fc59c12', '988378f5-4f89-4a61-9324-9b09e910ea67', '7ae17521-be78-4a0f-b07d-72b1f4a17571', 'b3984092-b567-4515-9fab-dedce37d164f', 'd8adad1d-f9e6-4fc2-8661-94daa7d719c5', '457f1eb5-ace5-4f81-8cd5-1205f4bc89e4', '68facc1f-10bb-4d58-babb-d50f9a394459', '5773b73f-e20d-465f-87cc-586316476582', '23ce9168-bc3e-463d-8f69-c63aeb08d3e0', 'bd55ff1f-1300-45ce-8bfa-cf96bdcd94c9', '9f37d85a-310d-4c56-b546-d369cb032972', 'd098a699-7d36-4954-bcc0-9158a39b9d8a', 'ced9b237-3971-4015-a927-fb9b8e6a5664', '7685e1cf-a56b-42d8-8327-bb2242a272da', '9e2d5ec8-3cc0-421d-8058-f9aae046878d', '25470cb8-f0bf-471b-806a-b726eb4784ea', 'c4290c47-9323-486f-bf18-b034f6a98ad1', 'ff787de3-0f56-4eec-8234-b822d345e713', '99abaac6-24f7-43bc-bb2d-c9d34c090859', '7de15af8-3be9-4794-b6c3-63d6d56a3a33', 'd61d851c-d320-4041-8a4e-c5cb4920b987', '741cdcc6-a322-44ae-80ef-5c9a342aa463', '1ea3794e-caa9-4ba3-8756-5e96a1caa4aa', 'd021856a-39fd-479d-acb2-7b379d4b2ece'

In [49]:
# A gap counts as closed when one of its read clusters has a mean deviation in
# fixed_dict whose absolute value is below this threshold.
gap_closed_if_fixed_dev_smaller_than = 500


# Only entries naming a single gap; entries containing a space list several gaps.
gap_names = {x for x in gap_spanning_reads.values() if " " not in x}

closed_gaps = 0
# Columns: verdict, reads in the supporting cluster, all other reads of the gap,
# mean ref_dict deviation and mean fixed_dict deviation of that cluster,
# whether more than one cluster survived filtering, gap name.
print("closed", "#supp", "#contra", "dev", "fxd_dev", "other", "name", sep="\t")
for gap in sorted(gap_names):
    read_names = extract_reads_for_gap(gap)
    if len(read_names) > 0:
        read_clusters = filter_clusters(cluster_reads(read_names))
        cluster_fixed = get_mean_deviation_in_clusters(read_clusters)
        gap_sizes = get_mean_deviation_in_clusters(read_clusters, False)
        # Index of the first cluster that supports a closed gap, or None. Using an
        # explicit index avoids depending on the value a loop variable was left
        # with after a `break`.
        gap_idx = next(
            (i for i, fixed_mean in enumerate(cluster_fixed)
             if abs(fixed_mean) < gap_closed_if_fixed_dev_smaller_than),
            None,
        )
        if gap_idx is not None:
            supporting = read_clusters[gap_idx]
            print("Yes", len(supporting), len(read_names) - len(supporting), int(gap_sizes[gap_idx]),
                int(cluster_fixed[gap_idx]), len(gap_sizes) > 1, gap, sep="\t")
            closed_gaps += 1
        else:
            print("No", "", len(read_names), "", "", "", gap, sep="\t")
    else:
        print("No", "", "", "", "", "", gap, sep="\t")
print()
print()
print("closed", closed_gaps, "out of", len(gap_names), "gaps that had reads spanning them")

closed	#supp	#contra	dev	fxd_dev	other	name
No		25				BES17:65kbp
No		8				BES2:62kbp
No		15				Chr10_3A:1256kbp
Yes	17	0	-23778	347	False	Chr11_3A:284kbp
No		3				Chr11_3A:497kbp
No		28				Chr11_3B:266kbp
No		7				Chr11_3B:319kbp
Yes	13	41	-22365	121	True	Chr11_core:263kbp
No		1				Chr1_3A:1409kbp
No		1				Chr1_3A:202kbp
Yes	37	4	-3967	113	False	Chr1_3A:2090kbp
No		2				Chr1_3A:264kbp
Yes	33	2	-1687	115	False	Chr1_3A:43kbp
Yes	21	3	-17502	176	False	Chr1_3B:226kbp
Yes	7	0	-38534	274	False	Chr1_3B:299kbp
Yes	11	0	-27733	177	False	Chr1_3B:372kbp
No		18				Chr1_core:609kbp
No		2				Chr3_5A:303kbp
No		12				Chr3_core:820kbp
No		9				Chr4_core:879kbp
Yes	25	5	-14036	220	False	Chr5_3B:167kbp
Yes	43	2	-17180	230	False	Chr5_3B:286kbp
No		10				Chr5_core:223kbp
Yes	20	0	-17728	191	False	Chr6_3A:1057kbp
No		28				Chr6_3A:1194kbp
Yes	10	3	-13797	-162	False	Chr6_3A:1239kbp
No		1				Chr6_3A:71kbp
No		4				Chr6_3A:861kbp
Yes	14	0	-42652	446	False	Chr6_3B:1000kbp
Yes	32	55	-10528	-173	True	Chr7_core: