# Analyzing the improved genome assembly

## run the notebook serve on a client node

```
    srun --pty --nodes=1  --ntasks-per-node=1 --cpus-per-task=28 --time 100:00:00 --job-name bash-jupyter bash
    conda activate ont_assembly
    jupyter notebook --ip 0.0.0.0 --port 3001 --no-browser
```

## bokeh imports

In [99]:
from bokeh.plotting import figure, output_file, save
from bokeh.palettes import viridis
from bokeh.io import show, output_notebook
from bokeh.models import ColumnDataSource, HoverTool, FactorRange, LabelSet, Whisker
from bokeh.models.tickers import FixedTicker
from bokeh.transform import jitter
from bokeh.layouts import column, row, gridplot
from math import pi
from bokeh.layouts import row
import random
import re

COLOR_PALETTE = ["#0072B2", "#D55E00", "#009E73", "#E69F00", "#CC79A7", "#56B4E9", "#F0E442"]

output_notebook()

## load the data

In [100]:
def load_dist_dev(file_name, max_dist=None):
    names = []
    ret = []
    with open(file_name, "r") as in_file:
        for line in in_file:
            if line[0] == "#":
                continue
            else:
                readname, distance = line.strip().split()
                if max_dist is None or int(distance) < max_dist:
                    names.append(readname)
                    ret.append(int(distance))
    return names, ret

ref_names, ref_dev = load_dist_dev("../data/out/virtual_paired_read_dist/referece.distance_deviation")
fixed_names, fixed_dev = load_dist_dev("../data/out/virtual_paired_read_dist/fixed_n.distance_deviation")

gap_spanning_reads = {}
gap_pos = {}

with open("../data/out/virtual_paired_read_dist/gap_spanning_reads", "r") as file_in:
    for line in file_in:
        readname, *gaps = line[:-1].strip().split()
        gap_names = []
        for gap in gaps:
            chrom, start, end = re.split(":|-", gap)
            gap_name = chrom + ":" + str(int(start)//1000) + "kbp"
            gap_pos[gap_name] = [chrom, int(start), int(end)]
            gap_names.append(gap_name)
        gap_spanning_reads[readname] = " ".join(gap_names)


## post process the data

In [106]:

ref_dict = {x: y for x, y in zip(ref_names, ref_dev)}
fixed_dict = {x: y for x, y in zip(fixed_names, fixed_dev)}

# filter reads
# remove those where the disrance has not changed
filtered = set()
for read_name, distance in zip(fixed_names, fixed_dev):
    if not read_name in gap_spanning_reads:
       filtered.add(read_name)
    # if read_name in ref_dict and ref_dict[read_name] == distance:
    #     filtered.add(read_name)
    pass
    
readnames = [n for n in fixed_names if n in ref_dict and not n in filtered]



# figure out gap groups
gap_groups = set()
for gap in gap_spanning_reads.values():
    gap_groups.add(gap)
gap_colors = {}
for idx, gap in enumerate(gap_groups):
    if len(gap.split()) > 1:
        gap_colors[gap] = "black"
    else:
        gap_colors[gap] = COLOR_PALETTE[idx % len(COLOR_PALETTE)]
read_colors = {}
for read in readnames:
    read_colors[read] = gap_colors[gap_spanning_reads[read]] if read in gap_spanning_reads else "black"


## Plot the results

In [107]:
output_file("../data/out/virtual_paired_read_dist/compare.html")
f = figure(title="Comparison of distance deviations", x_axis_label="Deviation on reference genome", y_axis_label="Deviation on improved genome", 
           tooltips=[("", "@h"), ("", "@r")])

f.line(x=[-1.5*10**5, 0.5*10**5], y=[-1.5*10**5, 0.5*10**5], color="black")

f.scatter(x="x", y="y", line_color=None, fill_color="c", size=9, alpha=0.4, 
          source=ColumnDataSource(data={
                "x": [ref_dict[n] for n in readnames], 
                "y": [fixed_dict[n] for n in readnames],
                "c": [read_colors[n] for n in readnames],
                "h": [gap_spanning_reads[n] if n in gap_spanning_reads else "none" for n in readnames],
                "r": [n for n in readnames]
            }
                ))

show(f, notebook_handle=True)
save(f)

Unable to connect to VS Code server: Error in request.
Error: connect ENOENT /run/user/1121/vscode-ipc-62f97720-fa07-4445-95f2-23548165e207.sock
    at PipeConnectWrap.afterConnect [as oncomplete] (node:net:1494:16) {
  errno: -2,
  code: 'ENOENT',
  syscall: 'connect',
  address: '/run/user/1121/vscode-ipc-62f97720-fa07-4445-95f2-23548165e207.sock'
}


'/ladsie/project/ladsie_019/claudia/ont_assembly_improvement/data/out/virtual_paired_read_dist/compare.html'

## check some individual gaps

In [None]:
#GAP_TO_CHECK = "Chr10_3A:1256kbp"
# GAP_TO_CHECK = "Chr11_core:263kbp"
#GAP_TO_CHECK = "Chr7_core:1789kbp" # on the cores there are two populations
#GAP_TO_CHECK = "Chr5_3B:286kbp"
# GAP_TO_CHECK = "Chr11_3B:266kbp"
GAP_TO_CHECK = "Chr9_core:347kbp"
# GAP_TO_CHECK = "BES17:65kbp"
# GAP_TO_CHECK = "Chr11_3A:497kbp"

f = figure(title="Comparison of distance deviations", x_axis_label="Reference", y_axis_label="Fixed", 
           tooltips=[("", "@h"), ("", "@r")])

picked_readnames = [n for n in readnames if n in gap_spanning_reads and gap_spanning_reads[n] == GAP_TO_CHECK]

f.line(x=[-1.5*10**5, 0.5*10**5], y=[-1.5*10**5, 0.5*10**5], color="black")
f.scatter(x="x", y="y", line_color=None, fill_color="c", size=9, alpha=0.4, 
          source=ColumnDataSource(data={
                "x": [ref_dict[n] for n in picked_readnames], 
                "y": [fixed_dict[n] for n in picked_readnames],
                "c": [read_colors[n] for n in picked_readnames],
                "h": [gap_spanning_reads[n] for n in picked_readnames],
                "r": picked_readnames
            }
                ))

show(f, notebook_handle=True)

## get read clusters -> turn into table of "correctly" expanded gaps

In [None]:
#GAP_TO_CHECK = "Chr10_3A:1256kbp"
#GAP_TO_CHECK = "Chr11_core:263kbp"
#GAP_TO_CHECK = "Chr11_3B:266kbp"
GAP_TO_CHECK = "Chr7_core:1789kbp" # on the cores there are two populations
#GAP_TO_CHECK = "Chr5_3B:286kbp"

f = figure(title="looking at how to best cluster", x_axis_label="Ref fixed difference", y_axis_label="ref fixed sum", 
           tooltips=[("", "@h"), ("", "@r")])

picked_readnames = [n for n in readnames if n in gap_spanning_reads and gap_spanning_reads[n] == GAP_TO_CHECK]

f.scatter(x="x", y="y", line_color=None, fill_color="c", size=9, alpha=0.4, 
          source=ColumnDataSource(data={
                "x": [ref_dict[n] - fixed_dict[n] for n in picked_readnames], 
                "y": [fixed_dict[n] + ref_dict[n] for n in picked_readnames],
                "c": [read_colors[n] for n in picked_readnames],
                "h": [gap_spanning_reads[n] for n in picked_readnames],
                "r": picked_readnames
            }
                ))

show(f, notebook_handle=True)

In [None]:
#GAP_TO_CHECK = "Chr10_3A:1256kbp"
#GAP_TO_CHECK = "Chr11_core:263kbp"
GAP_TO_CHECK = "Chr7_core:1789kbp" # on the cores there are two populations
#GAP_TO_CHECK = "Chr5_3B:286kbp"

def extract_reads_for_gap(gap_name):
    return [n for n in readnames if n in gap_spanning_reads and gap_spanning_reads[n] == gap_name]

picked_readnames = extract_reads_for_gap(GAP_TO_CHECK)


def cluster(l, key, max_dif):
    l.sort(key=key)

    clustered = [[l[0]]]
    if len(l) > 1:
        for read in l[1:]:
            if abs(key(read) - key(clustered[-1][-1])) > max_dif:
                clustered.append([])
            clustered[-1].append(read)
    return clustered

def cluster_reads(readnames, max_ref_fixed_diff = 10, max_ref_fixed_sum = 1000):
    return [c for l in cluster(readnames, lambda x: ref_dict[x] - fixed_dict[x], max_ref_fixed_diff) 
               for c in cluster(l, lambda x: ref_dict[x] + fixed_dict[x], max_ref_fixed_sum)]

clustered = cluster_reads(picked_readnames)


print(clustered)
print([len(c) for c in clustered])

def filter_clusters(clusters):
    return [c for c in clusters if len(c) > 3]

clustered = filter_clusters(clustered)

def get_mean_deviation_in_clusters(clusters, in_fixed=True):
    return [sum([fixed_dict[n] if in_fixed else ref_dict[n] for n in cluster]) / len(cluster) for cluster in clusters]

print(get_mean_deviation_in_clusters(clustered))
print(get_mean_deviation_in_clusters(clustered, False))


[['8b06da5d-716e-4be7-b505-bdc3202b98b9', 'c32915f8-1f2e-4b5b-b27c-b7675fc59c12', '988378f5-4f89-4a61-9324-9b09e910ea67', '7ae17521-be78-4a0f-b07d-72b1f4a17571', 'b3984092-b567-4515-9fab-dedce37d164f', 'd8adad1d-f9e6-4fc2-8661-94daa7d719c5', '68facc1f-10bb-4d58-babb-d50f9a394459', '5773b73f-e20d-465f-87cc-586316476582', '23ce9168-bc3e-463d-8f69-c63aeb08d3e0', '278cb7cf-c242-4176-afe2-599de4cc5f86', 'd098a699-7d36-4954-bcc0-9158a39b9d8a', '9f37d85a-310d-4c56-b546-d369cb032972', '9e2d5ec8-3cc0-421d-8058-f9aae046878d', 'bd55ff1f-1300-45ce-8bfa-cf96bdcd94c9', '7685e1cf-a56b-42d8-8327-bb2242a272da', '25470cb8-f0bf-471b-806a-b726eb4784ea', 'c4290c47-9323-486f-bf18-b034f6a98ad1', 'ff787de3-0f56-4eec-8234-b822d345e713', '99abaac6-24f7-43bc-bb2d-c9d34c090859', 'd61d851c-d320-4041-8a4e-c5cb4920b987', '741cdcc6-a322-44ae-80ef-5c9a342aa463', '1ea3794e-caa9-4ba3-8756-5e96a1caa4aa', '7de15af8-3be9-4794-b6c3-63d6d56a3a33', 'd021856a-39fd-479d-acb2-7b379d4b2ece', '879f2676-7b75-431e-8688-940ff8a0e04a'

In [109]:
gap_closed_if_fixed_dev_smaller_than = 5000


gap_names = set([x for x in gap_spanning_reads.values() if not " " in x])

closed_gaps = 0
print("closed", "#supp", "#contra", "dev", "fxd_dev", "other", "name", sep="\t")
with open("../data/out/virtual_paired_read_dist/closed_gaps_analysis.gff", "w") as file_out:
    for gap in sorted(gap_names):
        read_names = extract_reads_for_gap(gap)
        if len(read_names) > 0:
            read_clusters = filter_clusters(cluster_reads(read_names))
            cluster_fixed = get_mean_deviation_in_clusters(read_clusters)
            gap_sizes = get_mean_deviation_in_clusters(read_clusters, False)
            gap_closed = False
            gap_idx = 0
            min_fixed = float("inf")
            for idx, x in enumerate(cluster_fixed):
                if abs(x) < gap_closed_if_fixed_dev_smaller_than and abs(x) < min_fixed:
                    gap_closed = True
                    gap_idx = idx
                    min_fixed = abs(x)
            if gap_closed:
                print("Yes", len(read_clusters[idx]), len(read_names) - len(read_clusters[idx]), int(gap_sizes[idx]), 
                    int(cluster_fixed[idx]), len(gap_sizes) > 1, gap, sep="\t")
                closed_gaps += 1
                chrom, start, end = gap_pos[gap]
                file_out.write("\t".join([chrom + "_Tb427v10", ".", "gap", str(start), str(end), ".", ".", ".", 
                                          "estimated_length=1000;gap_type=within scaffold;closed_correctly=true"]) + "\n" )
            else:
                print("No", "", len(read_names), "", "", "", gap, sep="\t")
        else:
            print("No", "", "", "", "", "", gap, sep="\t")
print()
print()
print("closed", closed_gaps, "out of", len(gap_names), "gaps that had reads spanning them")

closed	#supp	#contra	dev	fxd_dev	other	name
Yes	14	2	-1091	-3062	False	BES17:65kbp
No		3				BES2:62kbp
No		15				Chr10_3A:1256kbp
Yes	16	0	-23766	358	False	Chr11_3A:284kbp
No		2				Chr11_3A:497kbp
No		23				Chr11_3B:266kbp
No		5				Chr11_3B:319kbp
No		53				Chr11_core:263kbp
No		1				Chr1_3A:1409kbp
Yes	36	2	-3971	109	False	Chr1_3A:2090kbp
No		1				Chr1_3A:264kbp
Yes	27	1	-1673	129	False	Chr1_3A:43kbp
Yes	19	3	-17494	184	False	Chr1_3B:226kbp
Yes	6	0	-38566	242	False	Chr1_3B:299kbp
Yes	10	0	-27704	206	False	Chr1_3B:372kbp
Yes	5	5	-45093	1759	False	Chr1_core:609kbp
No		3				Chr3_5A:303kbp
No		8				Chr3_core:820kbp
No		3				Chr4_core:879kbp
Yes	20	5	-14002	-754	False	Chr5_3B:167kbp
Yes	31	1	-17168	242	False	Chr5_3B:286kbp
No		9				Chr5_core:223kbp
Yes	13	1	-17788	131	False	Chr6_3A:1057kbp
No		21				Chr6_3A:1194kbp
Yes	10	3	-13723	-88	False	Chr6_3A:1239kbp
No		1				Chr6_3A:71kbp
No		4				Chr6_3A:861kbp
Yes	12	0	-42654	444	False	Chr6_3B:1000kbp
Yes	36	41	-3157	7197	True	Chr7_core:1789kbp
No