# Analyzing the improved genome assembly

## run the notebook serve on a client node

```
    srun --pty --nodes=1  --ntasks-per-node=1 --cpus-per-task=28 --time 100:00:00 --job-name bash-jupyter bash
    conda activate ont_assembly
    jupyter notebook --ip 0.0.0.0 --port 3001 --no-browser
```

## bokeh imports

In [1]:
from bokeh.plotting import figure, output_file, save
from bokeh.palettes import viridis
from bokeh.io import show, output_notebook, export_svg
from bokeh.models import ColumnDataSource, HoverTool, FactorRange, LabelSet, Whisker, Label, Arrow
from bokeh.models.tickers import FixedTicker
from bokeh.transform import jitter
from bokeh.layouts import column, row, gridplot, grid
from math import pi
from bokeh.layouts import row
import random
import re
from identify_collapsed_regions import load_dist_dev, load_gaps

COLOR_PALETTE = ["#709DAE", "#E5AD50", "#6068A2", "#A44758", "#808080"]

output_notebook()

# identify reverse complemented scaffolds

In [2]:
# load the data:


def load_gaps_by_contig(file_name):
    ret = {}

    with open(file_name, "r") as in_file:
        for line in in_file:
            if line[0] == "#":
                continue
            chrom, source, anno_type, start, end, *extra = line.strip().split()
            if chrom not in ret:
                ret[chrom] = []
            start = int(start)
            end = int(end)
            ret[chrom].append((start, end))
    return ret

gap_pos = load_gaps_by_contig("../data/out/2_ref_reannotated_gaps/gaps.gff3")

def load_read_pos_and_str(file_name_in):
    ret = {}
    with open(file_name_in, "r") as file_in:
        for line in file_in:
            if line[0] == "#":
                continue
            read_name, chrom, pos1, pos2, strand1, strand2, map_q = line.strip().split("\t")
            pos1 = int(pos1)
            pos2 = int(pos2)
            map_q = int(map_q)
            strand1 = strand1 == "0"
            strand2 = strand2 == "0"
            if chrom not in ret:
                ret[chrom] = []
            ret[chrom].append((read_name, min(pos1, pos2), max(pos1, pos2), strand1, strand2, map_q))
    return ret

read_pos_and_strand = load_read_pos_and_str("../data/out/2.1_gap_spanning_reads/read_pos_and_strnd.tsv")

In [3]:
# filter the reads

def filter_reads(reads, gaps):
    ret = {}
    for contig, r in reads.items():
        ret[contig] = []
        if not contig in gaps:
            continue
        gaps[contig].sort()
        r.sort(key=lambda x: x[1])

        for r_name, pos1, pos2, s1, s2, map_q in r:
            for start, end in gaps[contig]:
                if pos1 < end and pos2 > start and map_q > 30:
                    ret[contig].append((r_name, pos1, pos2, s1, s2, map_q))
                    break
    return ret

filtered_reads = filter_reads(read_pos_and_strand, gap_pos)

In [4]:
# plot

contigs_to_check = set()
for contig, r in filtered_reads.items():
    for r_name, pos1, pos2, s1, s2, _ in r:
        if s1 != s2:
            contigs_to_check.add(contig)

all_contigs = set()
for contig, r in filtered_reads.items():
    if "Chr" in contig:
        all_contigs.add(contig)

# output_notebook()
output_file("../data/out/2.1_gap_spanning_reads/rev_complemented_scaffold_.html")
def plot_support(chrom):
    c = chrom[:-len("_Tb427v10")]
    f = figure(x_axis_label="genome position [Mbp]", y_axis_label="read index", title=c)
    # f.yaxis.visible=False
    f.output_backend = "svg"

    x1s = []
    x2s = []
    ys = []
    cs = []
    interval_ends = []
    for r_name, pos1, pos2, s1, s2, map_q in filtered_reads[chrom]:
        x1s.append(pos1 / 1000000)
        x2s.append(pos2 / 1000000)

        idx = 0
        while idx < len(interval_ends):
            if interval_ends[idx] + 100000 < pos1:
                break
            idx += 1
        if idx == len(interval_ends):
            interval_ends.append(pos2)
        else:
            interval_ends[idx] = pos2

        ys.append(idx * 2)
        cs.append(("#6068a2" if s1 == s2 else "#a44758") if map_q > 30 else ("lightblue" if s1 == s2 else "yellow"))

    f.hbar(left=x1s, right=x2s, y=ys, color=cs, line_width=2)
    
    x1s = []
    ys = []
    cs = []
    if chrom in gap_pos:
        for start, end in gap_pos[chrom]:
            x1s.append((start + end) / 2 / 1000000)
            ys.append(-15)
            cs.append("#e5ae51")
    f.x(x1s, ys, color=cs, size=10, line_width=2)
    # save(f)
    
    pt = "12pt"
    lw = 2

    f.title.text_font_size = pt
    f.axis.axis_label_text_font_size = pt
    f.axis.major_label_text_font_size = pt

    f.axis.axis_line_width = lw
    f.axis.major_tick_line_width = lw
    f.axis.minor_tick_line_width = lw
    f.grid.grid_line_width = lw
    f.yaxis.visible = False
    return f

c = column([plot_support(chrom) for chrom in sorted(list(contigs_to_check))])

show(c, notebook_handle=True)

Unable to connect to VS Code server: Error in request.
Error: connect ENOENT /run/user/1121/vscode-ipc-f11615e9-a44b-46a5-b840-a9bde1afdf51.sock
    at PipeConnectWrap.afterConnect [as oncomplete] (node:net:1494:16) {
  errno: -2,
  code: 'ENOENT',
  syscall: 'connect',
  address: '/run/user/1121/vscode-ipc-f11615e9-a44b-46a5-b840-a9bde1afdf51.sock'
}


## load the data

In [16]:
ref_names, ref_dev = load_dist_dev("../data/out/10_gap_spanning_reads/distance_deviation.tsv")
gap_pos = load_gaps("../data/out/8_merged_genomes/gaps.gff3")


### print mq>= 30 connections between contigs

In [17]:

contig_connections = {}
with open("../data/out/10_gap_spanning_reads/contig_connecting_reads.tsv", "r") as in_file:
    for line in in_file.readlines():
        if line[0] == "#":
            continue
        rname, o_rname, pos, o_rpos, rev_stnd, o_rev_stnd, map_q, o_map_q = line[:-1].split()
        ctg1 = min(rname, o_rname)
        ctg2 = max(rname, o_rname)
        if int(map_q) < 30 or int(o_map_q) < 30:
            continue
        if not ctg1 in contig_connections:
            contig_connections[ctg1] = {}
        if not ctg2 in contig_connections[ctg1]:
            contig_connections[ctg1][ctg2] = 0
        contig_connections[ctg1][ctg2] += 1

connection_list = []
for k, v in contig_connections.items():
    for k2, v2 in v.items():
        connection_list.append((v2, k, k2))
connection_list.sort(reverse=True)
for v, k, k2 in connection_list[:20]:
    print(v, k, "->", k2)

356 Chr11_hapB_Tb427v11 -> Chr9_hapB_Tb427v11
248 Chr6_hapA_Tb427v11 -> unitig_252_Tb427v11
219 Chr9_hapA_Tb427v11 -> Chr9_hapB_Tb427v11
215 unitig_148_Tb427v11 -> unitig_2569_Tb427v11
181 Chr1_hapA_Tb427v11 -> Chr6_hapA_Tb427v11
163 unitig_1852_Tb427v11 -> unitig_1853_Tb427v11
147 Chr1_hapB_Tb427v11 -> unitig_1853_Tb427v11
145 Chr6_hapB_Tb427v11 -> unitig_288_Tb427v11
136 unitig_1853_Tb427v11 -> unitig_1897_Tb427v11
134 Chr9_hapB_Tb427v11 -> unitig_228_Tb427v11
129 Chr6_hapA_Tb427v11 -> unitig_1889_Tb427v11
126 Chr6_hapA_Tb427v11 -> unitig_1947_Tb427v11
123 Chr3_hapA_Tb427v11 -> unitig_2111_Tb427v11
122 Chr11_hapA_Tb427v11 -> Chr11_hapB_Tb427v11
115 Chr9_hapB_Tb427v11 -> unitig_2279_Tb427v11
110 Chr1_hapA_Tb427v11 -> unitig_166_Tb427v11
107 Chr4_hapA_Tb427v11 -> Chr4_hapB_Tb427v11
104 Chr8_hapA_Tb427v11 -> Chr8_hapB_Tb427v11
101 unitig_2332_Tb427v11 -> unitig_309_Tb427v11
100 Chr1_hapA_Tb427v11 -> unitig_80_Tb427v11


In [18]:
def analyze_connections(picked_contig_a, picked_contig_b):
    f = figure(x_axis_label=picked_contig_a, y_axis_label=picked_contig_b)

    x1s = []
    y1s = []
    cs = []

    with open("../data/out/10_gap_spanning_reads/contig_connecting_reads.tsv", "r") as in_file:
        for line in in_file.readlines():
            if line[0] == "#":
                continue
            rname, o_rname, pos, o_rpos, rev_stnd, o_rev_stnd, map_q, o_map_q = line[:-1].split()
            # if int(map_q) < 30 or int(o_map_q) < 30:
            #     continue
            pos = int(pos)
            o_rpos = int(o_rpos)

            if rname == picked_contig_b and o_rname == picked_contig_a:
                o_rname, rname, o_rpos, pos, o_rev_stnd, rev_stnd, o_map_q, map_q = \
                    rname, o_rname, pos, o_rpos, rev_stnd, o_rev_stnd, map_q, o_map_q
            if rname == picked_contig_a and o_rname == picked_contig_b:
                x1s.append(pos)
                y1s.append(o_rpos)
                if rev_stnd == o_rev_stnd and rev_stnd == "False":
                    cs.append("blue")
                elif rev_stnd == o_rev_stnd and rev_stnd == "True":
                    cs.append("black")
                elif rev_stnd == "False":
                    cs.append("orange")
                else:
                    assert rev_stnd == "True"
                    cs.append("green")

    f.x(x=x1s, y=y1s, color=cs)


    x1s = []
    x2s = []
    y1s = []
    y2s = []
    cs = []


    for chr, start, end, _ in gap_pos[0].values():
        if chr + "_Tb427v10" == picked_contig_a:
            x1s.append(start)
            x2s.append(end)
            y2s.append(-1)
            y1s.append(-2)
            cs.append("red")
        if chr + "_Tb427v10" == picked_contig_b:
            y1s.append(start)
            y2s.append(end)
            x2s.append(-1)
            x1s.append(-2)
            cs.append("red")

    f.quad(left=x1s, right=x2s, bottom=y1s, top=y2s, color=cs, line_width=2)


    show(f, notebook_handle=True)

for v, k, k2 in sorted(connection_list[:20], key=lambda x: x[1]):
    if "unitig" in k and "unitig" in k2:
        continue
    if not ("Chr" in k or "Chr" in k2):
        continue
    # if ("Chr10" in k or "Chr10" in k2):# and ("unitig" in k or "unitig" in k2):
    analyze_connections(k, k2)

Unable to connect to VS Code server: Error in request.
Error: connect ENOENT /run/user/1121/vscode-ipc-635a64c6-483e-4ff9-9f5c-9c6103a931e9.sock
    at PipeConnectWrap.afterConnect [as oncomplete] (node:net:1494:16) {
  errno: -2,
  code: 'ENOENT',
  syscall: 'connect',
  address: '/run/user/1121/vscode-ipc-635a64c6-483e-4ff9-9f5c-9c6103a931e9.sock'
}


Unable to connect to VS Code server: Error in request.
Error: connect ENOENT /run/user/1121/vscode-ipc-635a64c6-483e-4ff9-9f5c-9c6103a931e9.sock
    at PipeConnectWrap.afterConnect [as oncomplete] (node:net:1494:16) {
  errno: -2,
  code: 'ENOENT',
  syscall: 'connect',
  address: '/run/user/1121/vscode-ipc-635a64c6-483e-4ff9-9f5c-9c6103a931e9.sock'
}


Unable to connect to VS Code server: Error in request.
Error: connect ENOENT /run/user/1121/vscode-ipc-635a64c6-483e-4ff9-9f5c-9c6103a931e9.sock
    at PipeConnectWrap.afterConnect [as oncomplete] (node:net:1494:16) {
  errno: -2,
  code: 'ENOENT',
  syscall: 'connect',
  address: '/run/user/1121/vscode-ipc-635a64c6-483e-4ff9-9f5c-9c6103a931e9.sock'
}


Unable to connect to VS Code server: Error in request.
Error: connect ENOENT /run/user/1121/vscode-ipc-635a64c6-483e-4ff9-9f5c-9c6103a931e9.sock
    at PipeConnectWrap.afterConnect [as oncomplete] (node:net:1494:16) {
  errno: -2,
  code: 'ENOENT',
  syscall: 'connect',
  address: '/run/user/1121/vscode-ipc-635a64c6-483e-4ff9-9f5c-9c6103a931e9.sock'
}


Unable to connect to VS Code server: Error in request.
Error: connect ENOENT /run/user/1121/vscode-ipc-635a64c6-483e-4ff9-9f5c-9c6103a931e9.sock
    at PipeConnectWrap.afterConnect [as oncomplete] (node:net:1494:16) {
  errno: -2,
  code: 'ENOENT',
  syscall: 'connect',
  address: '/run/user/1121/vscode-ipc-635a64c6-483e-4ff9-9f5c-9c6103a931e9.sock'
}


Unable to connect to VS Code server: Error in request.
Error: connect ENOENT /run/user/1121/vscode-ipc-635a64c6-483e-4ff9-9f5c-9c6103a931e9.sock
    at PipeConnectWrap.afterConnect [as oncomplete] (node:net:1494:16) {
  errno: -2,
  code: 'ENOENT',
  syscall: 'connect',
  address: '/run/user/1121/vscode-ipc-635a64c6-483e-4ff9-9f5c-9c6103a931e9.sock'
}


Unable to connect to VS Code server: Error in request.
Error: connect ENOENT /run/user/1121/vscode-ipc-635a64c6-483e-4ff9-9f5c-9c6103a931e9.sock
    at PipeConnectWrap.afterConnect [as oncomplete] (node:net:1494:16) {
  errno: -2,
  code: 'ENOENT',
  syscall: 'connect',
  address: '/run/user/1121/vscode-ipc-635a64c6-483e-4ff9-9f5c-9c6103a931e9.sock'
}


Unable to connect to VS Code server: Error in request.
Error: connect ENOENT /run/user/1121/vscode-ipc-635a64c6-483e-4ff9-9f5c-9c6103a931e9.sock
    at PipeConnectWrap.afterConnect [as oncomplete] (node:net:1494:16) {
  errno: -2,
  code: 'ENOENT',
  syscall: 'connect',
  address: '/run/user/1121/vscode-ipc-635a64c6-483e-4ff9-9f5c-9c6103a931e9.sock'
}


Unable to connect to VS Code server: Error in request.
Error: connect ENOENT /run/user/1121/vscode-ipc-635a64c6-483e-4ff9-9f5c-9c6103a931e9.sock
    at PipeConnectWrap.afterConnect [as oncomplete] (node:net:1494:16) {
  errno: -2,
  code: 'ENOENT',
  syscall: 'connect',
  address: '/run/user/1121/vscode-ipc-635a64c6-483e-4ff9-9f5c-9c6103a931e9.sock'
}


Unable to connect to VS Code server: Error in request.
Error: connect ENOENT /run/user/1121/vscode-ipc-635a64c6-483e-4ff9-9f5c-9c6103a931e9.sock
    at PipeConnectWrap.afterConnect [as oncomplete] (node:net:1494:16) {
  errno: -2,
  code: 'ENOENT',
  syscall: 'connect',
  address: '/run/user/1121/vscode-ipc-635a64c6-483e-4ff9-9f5c-9c6103a931e9.sock'
}


Unable to connect to VS Code server: Error in request.
Error: connect ENOENT /run/user/1121/vscode-ipc-635a64c6-483e-4ff9-9f5c-9c6103a931e9.sock
    at PipeConnectWrap.afterConnect [as oncomplete] (node:net:1494:16) {
  errno: -2,
  code: 'ENOENT',
  syscall: 'connect',
  address: '/run/user/1121/vscode-ipc-635a64c6-483e-4ff9-9f5c-9c6103a931e9.sock'
}


Unable to connect to VS Code server: Error in request.
Error: connect ENOENT /run/user/1121/vscode-ipc-635a64c6-483e-4ff9-9f5c-9c6103a931e9.sock
    at PipeConnectWrap.afterConnect [as oncomplete] (node:net:1494:16) {
  errno: -2,
  code: 'ENOENT',
  syscall: 'connect',
  address: '/run/user/1121/vscode-ipc-635a64c6-483e-4ff9-9f5c-9c6103a931e9.sock'
}


Unable to connect to VS Code server: Error in request.
Error: connect ENOENT /run/user/1121/vscode-ipc-635a64c6-483e-4ff9-9f5c-9c6103a931e9.sock
    at PipeConnectWrap.afterConnect [as oncomplete] (node:net:1494:16) {
  errno: -2,
  code: 'ENOENT',
  syscall: 'connect',
  address: '/run/user/1121/vscode-ipc-635a64c6-483e-4ff9-9f5c-9c6103a931e9.sock'
}


Unable to connect to VS Code server: Error in request.
Error: connect ENOENT /run/user/1121/vscode-ipc-635a64c6-483e-4ff9-9f5c-9c6103a931e9.sock
    at PipeConnectWrap.afterConnect [as oncomplete] (node:net:1494:16) {
  errno: -2,
  code: 'ENOENT',
  syscall: 'connect',
  address: '/run/user/1121/vscode-ipc-635a64c6-483e-4ff9-9f5c-9c6103a931e9.sock'
}


Unable to connect to VS Code server: Error in request.
Error: connect ENOENT /run/user/1121/vscode-ipc-635a64c6-483e-4ff9-9f5c-9c6103a931e9.sock
    at PipeConnectWrap.afterConnect [as oncomplete] (node:net:1494:16) {
  errno: -2,
  code: 'ENOENT',
  syscall: 'connect',
  address: '/run/user/1121/vscode-ipc-635a64c6-483e-4ff9-9f5c-9c6103a931e9.sock'
}


Unable to connect to VS Code server: Error in request.
Error: connect ENOENT /run/user/1121/vscode-ipc-635a64c6-483e-4ff9-9f5c-9c6103a931e9.sock
    at PipeConnectWrap.afterConnect [as oncomplete] (node:net:1494:16) {
  errno: -2,
  code: 'ENOENT',
  syscall: 'connect',
  address: '/run/user/1121/vscode-ipc-635a64c6-483e-4ff9-9f5c-9c6103a931e9.sock'
}


# identify collapsed regions 

# post process the data

In [20]:
from identify_collapsed_regions import *


chr_to_investigate = "Chr4_hapA"
xs = []
ys = []
cs = []
for r_name, (distance, expected, chr, pos1, pos2, strnd, map_q) in zip(ref_names, ref_dev):
    if chr_to_investigate in chr and map_q > 30: # :1507kbp
        xs.append([min(pos1, pos2) / 100000, max(pos1, pos2) / 100000])
        ys.append([distance, distance])
        cs.append("#6068a2" if map_q > 30 else "lightblue")


data = post_process(ref_names, ref_dev)
clusters = cluster(data, distance_y=50)
clusters = filter_clusters_with_counter_indication(clusters, data)

qxs = []
qxe = []
qys = []
qye = []

output_file("../data/out/13_identify_collapsed_regions/collapsed_regions.html")
f = figure(x_axis_label="genome position [Mbp]", y_axis_label="distance deviation [bp]",
           title=chr_to_investigate)
for cluster_chr, cluster_start, cluster_end, cluster_deviation, c in clusters:
    if chr_to_investigate in cluster_chr: # :1507kbp
        qxs.append(cluster_start / 100000)
        qxe.append(cluster_end / 100000)
        qys.append(min(cx[3] for cx in c))
        qye.append(max(cx[3] for cx in c))


f.multi_line(xs=xs, ys=ys, color=cs, alpha=1)
f.quad(left=qxs, right=qxe, bottom=qys, top=qye, color="#a44758", alpha=1, line_width=2)


pt = "12pt"
lw = 2

f.output_backend = "svg"
f.title.text_font_size = pt
f.axis.axis_label_text_font_size = pt
f.axis.major_label_text_font_size = pt

f.axis.axis_line_width = lw
f.axis.major_tick_line_width = lw
f.axis.minor_tick_line_width = lw
f.grid.grid_line_width = lw


show(f, notebook_handle=True)

Unable to connect to VS Code server: Error in request.
Error: connect ENOENT /run/user/1121/vscode-ipc-635a64c6-483e-4ff9-9f5c-9c6103a931e9.sock
    at PipeConnectWrap.afterConnect [as oncomplete] (node:net:1494:16) {
  errno: -2,
  code: 'ENOENT',
  syscall: 'connect',
  address: '/run/user/1121/vscode-ipc-635a64c6-483e-4ff9-9f5c-9c6103a931e9.sock'
}


# find gaps with too much flanking sequence

In [33]:
from identify_collapsed_regions import *

ref_names, ref_dev = load_dist_dev("../data/out/4_close_gaps_full_genome/1_gap_spanning_reads/distance_deviation.tsv")
gap_pos, _ = load_gaps("../data/out/4_close_gaps_full_genome/0_reannotated_gaps/gaps.gff3")

chr_to_investigate = "Chr4_hapA"
xs = []
ys = []
cs = []
for r_name, (distance, expected, chr, pos1, pos2, strnd, map_q) in zip(ref_names, ref_dev):
    if chr_to_investigate in chr and map_q > 30: # :1507kbp
        xs.append([min(pos1, pos2) / 1000000, max(pos1, pos2) / 1000000])
        ys.append([distance, distance])
        cs.append("#6068a2" if map_q > 30 else "lightblue")


data = post_process(ref_names, ref_dev, min_dev=float("inf"), max_dev=500)
clusters = cluster(data)
clusters = filter_clusters_with_counter_indication(clusters, data)
_, clusters = filter_clusters_that_overlap_gap(clusters, gap_pos, min_distance_to_gap=1000*3)

qxs = []
qxe = []
qys = []
qye = []

for cluster_chr, cluster_start, cluster_end, cluster_deviation, c in clusters:
    if chr_to_investigate in cluster_chr: # :1507kbp
        qxs.append(cluster_start / 1000000)
        qxe.append(cluster_end / 1000000)
        qys.append(min(cx[3] for cx in c))
        qye.append(max(cx[3] for cx in c))

        print(cluster_start, cluster_end)
    print(cluster_chr)

output_file("../data/out/4_close_gaps_full_genome/1_gap_spanning_reads/gaps_with_duplicated_sequence.html")
f = figure(x_axis_label="genome position [Mbp]", y_axis_label="distance deviation")
f.output_backend = "svg"
f.multi_line(xs=xs, ys=ys, color=cs, alpha=1, line_width=1)
f.quad(left=qxs, right=qxe, bottom=qys, top=qye, line_color="#a44758", color="#a44758", fill_alpha=0.8, line_width=2)

x1s = []
ys = []
gap_pos = load_gaps_by_contig("../data/out/4_close_gaps_full_genome/0_reannotated_gaps/gaps.gff3")
if chr_to_investigate + "_Tb427v11" in gap_pos:
    for start, end in gap_pos[chr_to_investigate + "_Tb427v11"]:
        x1s.append((start + end) / 2 / 1000000)
        ys.append(-15)
f.x(x1s, ys, color="#e5ae51", size=10, line_width=2)

pt = "12pt"
lw = 2

f.title.text_font_size = pt
f.axis.axis_label_text_font_size = pt
f.axis.major_label_text_font_size = pt

f.axis.axis_line_width = lw
f.axis.major_tick_line_width = lw
f.axis.minor_tick_line_width = lw
f.grid.grid_line_width = lw


#save(f)
show(f, notebook_handle=True)

BES17_Tb427v11
Chr11_hapA_Tb427v11
1480857 1508638
Chr4_hapA_Tb427v11
Chr5_hapA_Tb427v11
Chr5_hapA_Tb427v11
Chr6_hapA_Tb427v11
Chr9_hapB_Tb427v11


Unable to connect to VS Code server: Error in request.
Error: connect ENOENT /run/user/1121/vscode-ipc-635a64c6-483e-4ff9-9f5c-9c6103a931e9.sock
    at PipeConnectWrap.afterConnect [as oncomplete] (node:net:1494:16) {
  errno: -2,
  code: 'ENOENT',
  syscall: 'connect',
  address: '/run/user/1121/vscode-ipc-635a64c6-483e-4ff9-9f5c-9c6103a931e9.sock'
}


# analyze the fixing of collapsed regions

## load the remaining data

In [4]:
ref_names, ref_dev = load_dist_dev("../data/out/2.1_gap_spanning_reads/distance_deviation.tsv")
fixed_names, fixed_dev = load_dist_dev("../data/out/22_vpr_new_genome/distance_deviation.tsv")

gap_spanning_reads = {}

with open("../data/out/22_vpr_new_genome/gap_spanning_reads.tsv", "r") as file_in:
    for line in file_in:
        readname, *gaps = line[:-1].strip().split()
        readname = readname.split("_")[0]
        gap_names = []
        for gap in gaps:
            chrom, start, end = re.split(":|-", gap)
            # if not "8_hapB" in chrom:
            #     continue
            #chrom = chrom[:-len("_Tb427v10")]
            gap_name = chrom + ":" + str(int(start)//1000) + "kbp"
            gap_names.append(gap_name)
        gap_spanning_reads[readname] = " ".join(gap_names)

print(gap_spanning_reads)
print(fixed_names[:10])
print(ref_names[:10])

{'a2079ae0-9592-4bf0-a26e-07e1fcda63b0': 'Chr8_hapB:1103kbp', 'ddfef19b-abf2-4826-98d9-da313467b7e7': 'Chr5_hapA:886kbp', '62096527-d446-42dc-94f7-59dc45ba90cb': 'Chr2_hapA:63kbp', '81d2e85a-ffc3-4c89-a448-d59ec32747f0': 'Chr1_hapB:1437kbp', 'bedfc235-3c97-4952-8b41-6bfcd6d650e3': 'Chr4_hapB:114kbp', '361cc2f0-2c48-4eb0-8c67-00e56ec2c0e3': 'Chr7_hapA:1892kbp', 'd5d662ba-16f9-405d-be8a-f2cf9330a262': 'Chr1_hapA:990kbp', '4e69e73a-34a6-4755-8486-cb5168698da8': 'BES14:60kbp', 'a4f4ccd8-33b5-4e52-aa47-acadb648e72f': 'Chr5_hapA:1652kbp', '2f755348-8dbd-419b-868f-709e06e7d814': 'BES14:60kbp', '259ebaa5-cddd-4488-962c-1a399cbcc9be': 'Chr8_hapB:2793kbp', '6f7f0012-6dcf-46c2-95c0-5892300fbdcc': 'Chr11_hapA:1055kbp', 'e3a55be6-548c-4c29-b6b9-5308ed38a641': 'Chr3_hapA:542kbp', '087eb211-f674-4630-93d5-07a4787dd64d': 'Chr11_hapA:1364kbp', '4839039f-7740-4160-a49f-3eb21c4084f4': 'Chr2_hapB:293kbp', 'c6d6b975-786c-4b75-9f0c-c627ca118f62': 'Chr6_hapB:1378kbp', '91180164-0260-4474-9b8e-30af566b48d6': 

## post process the data

In [5]:

ref_dict = {x: y[0] for x, y in zip(ref_names, ref_dev) if y[-1] > 30}
fixed_dict = {x: y[0] for x, y in zip(fixed_names, fixed_dev) if y[-1] > 30}

# filter reads
# remove those where the disrance has not changed
filtered = set()
for read_name, distance in zip(fixed_names, fixed_dev):
    if not read_name in gap_spanning_reads:
       filtered.add(read_name)
    if read_name in gap_spanning_reads and len(gap_spanning_reads[read_name].split()) > 1:
        filtered.add(read_name)
    # if read_name in ref_dict and ref_dict[read_name] == distance:
    #     filtered.add(read_name)
    pass
    
readnames = [n for n in fixed_names if n in ref_dict and n in fixed_dict and not n in filtered]
print(readnames)


# figure out gap groups
gap_groups = set()
for gap in gap_spanning_reads.values():
    if len(gap.split()) == 1:
        gap_groups.add(gap)
gap_colors = {}
gap_scatters = {}
gap_fill_colors = {}
gap_x = {}
gap_y = {}

SCATTER_PALETTE = ["circle", "diamond", "triangle", "square", "inverted_triangle", "plus", "hex", "star", "square_pin", 
                   "triangle_pin"]
COLOR_PALETTE_EXT = COLOR_PALETTE + ["#999999"]


num_groups = len(gap_groups)

if num_groups > len(COLOR_PALETTE_EXT) * len(SCATTER_PALETTE) * 2:
    print("WARNING: not enough colors for all gap groups", num_groups, 
          len(COLOR_PALETTE_EXT) * len(SCATTER_PALETTE) * 2)

for idx, gap in enumerate(gap_groups):
    gap_x[gap] = []
    gap_y[gap] = []

read_colors = {}
read_fill_colors = {}
read_scatters = {}
for read in readnames:
    if read in gap_spanning_reads and len(gap_spanning_reads[read].split()) == 1:
        gap_x[gap_spanning_reads[read]].append(ref_dict[read])
        gap_y[gap_spanning_reads[read]].append(fixed_dict[read])

def mean(lst):
    if len(lst) == 0:
        return 0
    return sum(lst) / len(lst)

for idx, (gap_pos, gap_name) in enumerate(sorted([
        (mean([
                x - y for x, y in zip(gap_x[gap_name], gap_y[gap_name])
            ]), gap_name) for gap_name in gap_groups
        ])):
    gap_colors[gap_name] = COLOR_PALETTE_EXT[idx % len(COLOR_PALETTE_EXT)]
    gap_scatters[gap_name] = SCATTER_PALETTE[(idx // len(COLOR_PALETTE_EXT)) % len(SCATTER_PALETTE)]
    gap_fill_colors[gap_name] = gap_colors[gap_name] if (idx // (len(COLOR_PALETTE_EXT) * len(SCATTER_PALETTE))) % 2 == 0 else None

for read in readnames:
    if read in gap_spanning_reads and len(gap_spanning_reads[read].split()) == 1:
        read_colors[read] = None #gap_colors[gap_spanning_reads[read]]
        read_fill_colors[read] = gap_colors[gap_spanning_reads[read]]
        read_scatters[read] = gap_scatters[gap_spanning_reads[read]]
    else:
        read_colors[read] = "black"
        read_fill_colors[read] = None
        read_scatters[read] = "circle"



def load_gaps(file_name):
    gap_pos = {}
    contig_sizes = {}

    with open(file_name, "r") as in_file:
        for line in in_file:
            if line[0] == "#":
                if line.startswith("##sequence-region"):
                    contig, _, end = line.strip().split()[1:]
                    contig_sizes[contig] = int(end)
                continue
            ##columns: contig, type, start, idx, size_change
            contig, t, start, _, size_change, size_before, size_after = line.strip().split()

            
            gap_name = contig + ":" + str(int(start)//1000) + "kbp"
            gap_pos[gap_name] = [contig, int(start), int(size_change), t, int(size_before), int(size_after)]
    return gap_pos, contig_sizes


gap_pos, _ = load_gaps("../data/out/20_transfer_fixed_regions/gap_size_change.gff")
print(gap_pos)
print(gap_colors)
#print(gap_pos["BES14_Tb427v11:60kbp"][2] - gap_pos["BES14_Tb427v11:60kbp"][1])

['ddfef19b-abf2-4826-98d9-da313467b7e7', '81d2e85a-ffc3-4c89-a448-d59ec32747f0', '4e69e73a-34a6-4755-8486-cb5168698da8', 'a4f4ccd8-33b5-4e52-aa47-acadb648e72f', '9c93ae0f-62d2-46ce-892e-2b47610e3373', '9f2be5b3-0236-458b-b4df-888f66e06a47', '9eea2d8c-ce1e-499f-be8d-d10117d2df06', '108a0fb2-bb8a-4a64-a607-bbca8ef798d0', 'f88ef346-8b5e-467c-b6ab-fcaa98d1ece8', '4408a7e2-c8cb-4f37-90c5-add644125af3', '1d4d8f78-16bb-427d-a2d2-e18524e8f7b1', '59088221-50f7-42f2-a2ff-6efdf2ce2406', '372f2fb3-80ec-4bcc-b065-cd0035576ad7', '7c7d2e4f-70ff-4a66-be1d-1b56b0a44dbd', '5ae937f9-0d6d-4005-8af7-242de4685fdd', 'b1bd74aa-9fc5-466b-a9c0-82c82ef9833f', 'c8af1b28-c40c-453a-9569-83846281f485', 'f31626c5-6215-4895-9057-fbeb9cd05316', 'bbf1e2c9-4db1-43a2-ad25-b855d81fb615', 'e501e537-3dae-4009-9b14-63eb7ac5db27', 'f0111760-36dd-4344-b873-8736acc1fef0', '24751b41-1323-4a9b-86dd-a2dd0194a199', '68f7c094-cc0e-4185-bc9e-74191d04661c', 'ec1a0194-d56e-44f1-b116-1b65e83b9750', '174b5d35-cf30-4f4e-bfe8-b27931753da0',

In [13]:
import math

output_file("../data/out/20_transfer_fixed_regions/gap_size_amount.html")

def heatmap(xss, n_bins, log_axis, x_axis_label="", y_axis_label="", y_log_axis=False, width=300):
    f = figure(x_axis_type="log" if log_axis else "linear", x_axis_label=x_axis_label, y_axis_label=y_axis_label,
               width=width, height=300)
    f.output_backend = "svg"
    
    print(len(xss[0]))

    min_ = min([min(xs) for xs in xss])
    max_ = max([max(xs) for xs in xss]) + 1
    if log_axis:
        min_ = math.log(min_)
        max_ = math.log(max_)
    bin_w = (max_ - min_) / n_bins
    min_ -= min_ % bin_w
    bins = [0] * (n_bins + 1)
    for idx, xs in zip([1, -1], xss):
        for x in xs:
            if log_axis:
                x = math.log(x)
            p = int((x - min_) / bin_w)
            # print(p, x)
            bins[p] += idx
    xs = [min_ + bin_w * (i + 0.5) for i in range(n_bins + 1)]
    if log_axis:
        xs = [math.exp(x) for x in xs]
    bot = 0
    if y_log_axis:
        bins = [0 if x == 0 else (math.log10(abs(x)) * (1 if x >= 0 else -1)) for x in bins]
    f.vbar(x=xs, top=bins, width=bin_w, bottom=bot, color="#6068A2", 
            alpha=1)
    
    
    f.add_layout(LabelSet(x="x", y="y", text="text", text_color="black", angle=math.pi/2,
         text_baseline="middle", text_align="right", 
         y_offset=20, source=ColumnDataSource(data={"x": [x for x in xs], "y": [0]*(len(xs) + 1), 
                                                               "text": [str(x) for x in bins]})))
    
    #f.xaxis.ticker = FixedTicker(ticks=[int(min_ + bin_w * i + 1) for i in range(n_bins + 1)])


    pt = "12pt"
    lw = 2

    f.title.text_font_size = pt
    f.axis.axis_label_text_font_size = pt
    f.axis.major_label_text_font_size = pt

    f.axis.axis_line_width = lw
    f.axis.major_tick_line_width = lw
    f.axis.minor_tick_line_width = 0
    f.grid.grid_line_width = lw
    return f


f = heatmap([[size_after/1000 for _, (_, _, size_change, t, size_before, size_after) in gap_pos.items() if t in ["closedgap_full", 
                                                                                       "closedgap_a", "closedgap_b", 
                                                                                       "closedgap_masked"]]], 
                                                10, False, "gap size [kb]", "amount of gaps")
show(f, notebook_handle=True)


85




Unable to connect to VS Code server: Error in request.
Error: connect ENOENT /run/user/1121/vscode-ipc-f15ecf5b-53e9-48aa-b3a5-25b51bed6e1c.sock
[90m    at PipeConnectWrap.afterConnect [as oncomplete] (node:net:1595:16)[39m {
  errno: [33m-2[39m,
  code: [32m'ENOENT'[39m,
  syscall: [32m'connect'[39m,
  address: [32m'/run/user/1121/vscode-ipc-f15ecf5b-53e9-48aa-b3a5-25b51bed6e1c.sock'[39m
}


In [14]:
output_file("../data/out/20_transfer_fixed_regions/coll_size_amount_after.html")
f = heatmap([[size_after/1000 for _, (_, _, _, t, size_before, size_after) in gap_pos.items() if t == "expanded_region"]], 10, False, "coll. size [kb]", "amount of coll.")
show(f, notebook_handle=True)
output_file("../data/out/20_transfer_fixed_regions/coll_size_change.html")
f = heatmap([[(size_after - size_before)/1000 for _, (_, _, _, t, size_before, size_after) in gap_pos.items() if t == "expanded_region"]], 10, False, "coll. size change [kb]", "amount of coll.")
show(f, notebook_handle=True)

63




63


Unable to connect to VS Code server: Error in request.
Error: connect ENOENT /run/user/1121/vscode-ipc-f15ecf5b-53e9-48aa-b3a5-25b51bed6e1c.sock
[90m    at PipeConnectWrap.afterConnect [as oncomplete] (node:net:1595:16)[39m {
  errno: [33m-2[39m,
  code: [32m'ENOENT'[39m,
  syscall: [32m'connect'[39m,
  address: [32m'/run/user/1121/vscode-ipc-f15ecf5b-53e9-48aa-b3a5-25b51bed6e1c.sock'[39m
}


Unable to connect to VS Code server: Error in request.
Error: connect ENOENT /run/user/1121/vscode-ipc-f15ecf5b-53e9-48aa-b3a5-25b51bed6e1c.sock
[90m    at PipeConnectWrap.afterConnect [as oncomplete] (node:net:1595:16)[39m {
  errno: [33m-2[39m,
  code: [32m'ENOENT'[39m,
  syscall: [32m'connect'[39m,
  address: [32m'/run/user/1121/vscode-ipc-f15ecf5b-53e9-48aa-b3a5-25b51bed6e1c.sock'[39m
}


In [178]:
output_file("../data/out/20_transfer_fixed_regions/expanded_repeat_size_change.html")

f = figure(x_axis_label="size in v11 [kbp]", y_axis_label="size in v12 [kbp]", x_axis_type="log", y_axis_type="log",
           width=300, height=300)
f.output_backend = "svg"

xs = [size_before/1000 for _, (_, _, _, t, size_before, size_after) in gap_pos.items() if t == "expanded_region"]
ys = [size_after/1000 for _, (_, _, _, t, size_before, size_after) in gap_pos.items() if t == "expanded_region"]

min_ = max(1, min(xs + ys))
max_ = max(xs + ys)

f.line([min_, max_], [min_, max_], color="lightgrey", line_width=2)

f.x(x=xs, 
    y=ys,
    color="#6068A2",
    size=15,
    line_width=2.5)

pt = "12pt"
lw = 2

f.title.text_font_size = pt
f.axis.axis_label_text_font_size = pt
f.axis.major_label_text_font_size = pt

f.axis.axis_line_width = lw
f.axis.major_tick_line_width = lw
f.axis.minor_tick_line_width = lw
f.grid.grid_line_width = lw
show(f, notebook_handle=True)

Unable to connect to VS Code server: Error in request.
Error: connect ENOENT /run/user/1121/vscode-ipc-b43ae827-a8b0-47e3-8e63-3c1688c14bdf.sock
    at PipeConnectWrap.afterConnect [as oncomplete] (node:net:1494:16) {
  errno: -2,
  code: 'ENOENT',
  syscall: 'connect',
  address: '/run/user/1121/vscode-ipc-b43ae827-a8b0-47e3-8e63-3c1688c14bdf.sock'
}


In [183]:
output_file("../data/out/20_transfer_fixed_regions/dist_dev_before_after.html")

f = heatmap([[(d - e) / 1000 for d, e, *_ in fixed_dev], [(d - e) / 1000 for d, e, *_ in ref_dev]], 1000, False, "observed - expected VPR distance [kb]", "log10(amount of VPRs in v12 - amount in v11)", y_log_axis=True, width=600)
show(f, notebook_handle=True)





Unable to connect to VS Code server: Error in request.
Error: connect ENOENT /run/user/1121/vscode-ipc-b43ae827-a8b0-47e3-8e63-3c1688c14bdf.sock
    at PipeConnectWrap.afterConnect [as oncomplete] (node:net:1494:16) {
  errno: -2,
  code: 'ENOENT',
  syscall: 'connect',
  address: '/run/user/1121/vscode-ipc-b43ae827-a8b0-47e3-8e63-3c1688c14bdf.sock'
}


## Plot the results

In [67]:
output_file("../data/out/22_vpr_new_genome/compare.html")

out_list = []
for y_pos in [True, False]:
    out_list.append([])
    def myp(x):
        return x * (1 if y_pos else -1)
    for x_pos in [False, True]:
        def mxp(x):
            return x * (1 if x_pos else -1)
        f = figure(x_axis_label="gap size change", width=300 if x_pos else 900, height=500 if y_pos else 300,
                y_axis_label="dd on improved genome", y_axis_type="log", x_axis_type="log",
                tooltips=[("", "@h"), ("", "@r")]#, title=("x+" if x_pos else "x-") + (" y+" if y_pos else " y-")
            )
        # f.line(x=[-1.5*10**5, 0.5*10**5], y=[-1.5*10**5, 0.5*10**5], color="black")

        if x_pos:
            f.y_range = out_list[-1][0].y_range
            f.yaxis.visible = False
        if not y_pos:
            f.x_range = out_list[0][1 if x_pos else 0].x_range
        else:
            f.xaxis.visible = False

        x_tr = {}
        for xpos in sorted(set([ref_dict[n] - fixed_dict[n] for n in readnames])):
            x_tr[xpos] = len(x_tr)

        def in_frame(n):
            in_x = mxp(ref_dict[n] - fixed_dict[n]) > 0 or (mxp(ref_dict[n] - fixed_dict[n]) == 0 and not x_pos)
            in_y = myp(fixed_dict[n]) > 0 or (myp(fixed_dict[n]) == 0 and not y_pos)
            return in_x and in_y

        f.x(x="x", y="y", line_color="cf", 
                fill_color="c", #marker="m", 
                size=9, alpha=0.4, line_alpha=0.4,
                source=ColumnDataSource(data={
                        "x": [max(0.1, mxp(ref_dict[n] - fixed_dict[n])) for n in readnames if in_frame(n)], 
                        "y": [max(0.1, myp(fixed_dict[n])) for n in readnames if in_frame(n)],
                        "c": [read_colors[n] for n in readnames if in_frame(n)],
                        "cf": [read_fill_colors[n] for n in readnames if in_frame(n)],
                        "m": [read_scatters[n] for n in readnames if in_frame(n)],
                        "h": [gap_spanning_reads[n] if n in gap_spanning_reads else "-/-" for n in readnames if in_frame(n)],
                        "r": [n for n in readnames if in_frame(n)]
                    }
                        ))
        f.output_backend = "svg"


        pt = "12pt"
        lw = 2

        f.title.text_font_size = pt
        f.axis.axis_label_text_font_size = pt
        f.axis.major_label_text_font_size = pt

        f.axis.axis_line_width = lw
        f.axis.major_tick_line_width = lw
        f.axis.minor_tick_line_width = lw
        f.grid.grid_line_width = lw

        out_list[-1].append(f)

out_list.append([])
for x_pos in [False, True]:
    def mxp(x):
        return x * (1 if x_pos else -1)
    f_width = 300 if x_pos else 900
    f = figure(x_axis_label="gap size change", width=f_width, height=400,
            y_axis_label="dd on improved genome", x_axis_type="log",
            tooltips=[("", "@h"), ("", "@r")]#, title=("x+" if x_pos else "x-") + (" y+" if y_pos else " y-")
        )
    # f.line(x=[-1.5*10**5, 0.5*10**5], y=[-1.5*10**5, 0.5*10**5], color="black")

    if x_pos:
        f.y_range = out_list[-1][0].y_range
        f.yaxis.visible = False
    f.x_range = out_list[0][1 if x_pos else 0].x_range

    x_tr = {}
    for xpos in sorted(set([ref_dict[n] - fixed_dict[n] for n in readnames])):
        x_tr[xpos] = len(x_tr)

    def in_frame(s):
        in_x = mxp(s) > 0 or (mxp(s) == 0 and not x_pos)
        return in_x
    
    gaps_by_size = []
    relevant_contigs = {}
    for gap_name, (chr, start, size) in gap_pos.items():
        gap_name = gap_name.replace("_Tb427v11", "")
        c = "red" if gap_name not in gap_colors else gap_colors[gap_name]
        chr2 = chr.replace("_Tb427v11", "").replace("Chr", "").replace("_", "").replace("hap", "")
        if not chr in relevant_contigs:
            relevant_contigs[chr2] = len(relevant_contigs)
        if in_frame(-size):
            gaps_by_size.append((-size, chr2, str((int(start)//100000)/10), c))

    for idx, (size, contig, pos, c) in enumerate(sorted(gaps_by_size, reverse=not x_pos)):
        # screen_x = (f_width - 100) * (idx + 1) / (len(gaps_by_size) + 2)
        
        l_text = contig + pos
        NUM_LINES = 20
        f.line(x=[max( 0.1, mxp(size)), max( 0.1, mxp(size))], y=[NUM_LINES+1, idx % NUM_LINES], color=c, line_width=2)
        f.add_layout(Label(x=max( 0.1, mxp(size)), y=idx % NUM_LINES, text_baseline="middle", text_align="left", 
                           text=l_text, text_font_size="8pt", x_offset=3, text_color=c,
                           background_fill_color='white', background_fill_alpha=0.75))


    # for idx, (size, contig, pos, c) in enumerate(sorted(gaps_by_size, reverse=not x_pos)):
    #     # screen_x = (f_width - 100) * (idx + 1) / (len(gaps_by_size) + 2)
        
    #     NUM_LINES = 20
    #     f.line(x=[max( 0.1, mxp(size)), max( 0.1, mxp(size))], y=[relevant_contigs[contig] + 0.5, 
    #                                                               relevant_contigs[contig]], color=c, line_width=2)
    #     f.add_layout(Label(x=max( 0.1, mxp(size)), y=relevant_contigs[contig], text_baseline="middle", text_align="left", 
    #                        text=pos, text_font_size="8pt", x_offset=3, text_color=c,
    #                        background_fill_color='white', background_fill_alpha=0.75))



    f.output_backend = "svg"


    pt = "12pt"
    lw = 2

    f.title.text_font_size = pt
    f.axis.axis_label_text_font_size = pt
    f.axis.major_label_text_font_size = pt

    f.axis.axis_line_width = lw
    f.axis.major_tick_line_width = lw
    f.axis.minor_tick_line_width = lw
    f.grid.grid_line_width = lw
    f.yaxis.visible = False

    out_list[-1].append(f)

show(grid(out_list), notebook_handle=True)

Unable to connect to VS Code server: Error in request.
Error: connect ENOENT /run/user/1121/vscode-ipc-b43ae827-a8b0-47e3-8e63-3c1688c14bdf.sock
    at PipeConnectWrap.afterConnect [as oncomplete] (node:net:1494:16) {
  errno: -2,
  code: 'ENOENT',
  syscall: 'connect',
  address: '/run/user/1121/vscode-ipc-b43ae827-a8b0-47e3-8e63-3c1688c14bdf.sock'
}


In [32]:
output_file("../data/out/22_vpr_new_genome/compare2.html")


f = figure(x_axis_label="Deviation on reference genome",
        y_axis_label="Deviation on improved genome", 
        tooltips=[("", "@h")])
f.line(x=[-1.5*10**5, 0.5*10**5], y=[-1.5*10**5, 0.5*10**5], color="black")

def median(lst):
    if len(lst) == 0:
        return None
    return sorted(lst)[len(lst)//2]

def mean(lst):
    if len(lst) == 0:
        return None
    return sum(lst) / len(lst)

# f.scatter(x="x", y="y", line_color="c", fill_color="cf", marker="m", 
#         size=9, alpha=0.1, 
#         source=ColumnDataSource(data={
#                 "x": [ref_dict[n] - 1000 for n in readnames], 
#                 "y": [fixed_dict[n] for n in readnames],
#                 "c": [read_colors[n] for n in readnames],
#                 "cf": [read_fill_colors[n] for n in readnames],
#                 "m": [read_scatters[n] for n in readnames],
#                 "h": [gap_spanning_reads[n] for n in readnames],
#                 "r": [n for n in readnames]
#             }
#                 ))

f.scatter(x="x", y="y", size=9, alpha=0.8, line_color="c", fill_color="cf", marker="m", 
        source=ColumnDataSource(data={
                "x": [mean(gap_x[n]) - 1000 if median(gap_x[n]) is not None else None for n in gap_groups], 
                "y": [mean(gap_y[n]) for n in gap_groups],
                "c": [gap_colors[n] for n in gap_groups],
                "cf": [gap_fill_colors[n] for n in gap_groups],
                "m": [gap_scatters[n] for n in gap_groups],
                "h": [n for n in gap_groups]
            }
                ))

show(f, notebook_handle=True)
save(f)

Unable to connect to VS Code server: Error in request.
Error: connect ENOENT /run/user/1121/vscode-ipc-d02d7a45-3edc-4af2-b7d1-42cde59ec336.sock
    at PipeConnectWrap.afterConnect [as oncomplete] (node:net:1494:16) {
  errno: -2,
  code: 'ENOENT',
  syscall: 'connect',
  address: '/run/user/1121/vscode-ipc-d02d7a45-3edc-4af2-b7d1-42cde59ec336.sock'
}


'/ladsie/project/ladsie_019/claudia/ont_assembly_improvement/data/out/22_vpr_new_genome/compare2.html'

## check some individual gaps

In [40]:
#GAP_TO_CHECK = "Chr10_3A:1256kbp"
# GAP_TO_CHECK = "Chr11_core:263kbp"
#GAP_TO_CHECK = "Chr7_core:1789kbp" # on the cores there are two populations
#GAP_TO_CHECK = "Chr5_3B:286kbp"
# GAP_TO_CHECK = "Chr11_3B:266kbp"
#GAP_TO_CHECK = "Chr6_A:2589kbp"
#GAP_TO_CHECK = "Chr10_B:5341kbp"
# GAP_TO_CHECK = "BES17:65kbp"
# GAP_TO_CHECK = "Chr11_3A:497kbp"
GAP_TO_CHECK = "Chr4_A"

f = figure(title="Comparison of distance deviations", x_axis_label="Reference", y_axis_label="Fixed", 
           tooltips=[("", "@h"), ("", "@r")])

picked_readnames = [n for n in readnames if n in gap_spanning_reads and GAP_TO_CHECK in gap_spanning_reads[n]]

f.line(x=[-1.5*10**5, 0.5*10**5], y=[-1.5*10**5, 0.5*10**5], color="black")
f.scatter(x="x", y="y", line_color=None, fill_color="c", size=9, alpha=0.4, 
          source=ColumnDataSource(data={
                "x": [ref_dict[n] for n in picked_readnames], 
                "y": [fixed_dict[n] for n in picked_readnames],
                "c": [read_colors[n] for n in picked_readnames],
                "h": [gap_spanning_reads[n] for n in picked_readnames],
                "r": picked_readnames
            }
                ))

show(f, notebook_handle=True)

Unable to connect to VS Code server: Error in request.
Error: connect ENOENT /run/user/1121/vscode-ipc-f98820cc-5cc5-4b7d-a26a-67efeb848081.sock
    at PipeConnectWrap.afterConnect [as oncomplete] (node:net:1494:16) {
  errno: -2,
  code: 'ENOENT',
  syscall: 'connect',
  address: '/run/user/1121/vscode-ipc-f98820cc-5cc5-4b7d-a26a-67efeb848081.sock'
}


## get read clusters -> turn into table of "correctly" expanded gaps

In [None]:
#GAP_TO_CHECK = "Chr10_3A:1256kbp"
#GAP_TO_CHECK = "Chr11_core:263kbp"
#GAP_TO_CHECK = "Chr11_3B:266kbp"
GAP_TO_CHECK = "Chr11_A:4918kbp" # on the cores there are two populations
#GAP_TO_CHECK = "Chr5_3B:286kbp"

f = figure(title="looking at how to best cluster", x_axis_label="Ref fixed difference", y_axis_label="ref fixed sum", 
           tooltips=[("", "@h"), ("", "@r")])

picked_readnames = [n for n in readnames if n in gap_spanning_reads and gap_spanning_reads[n] == GAP_TO_CHECK]

f.scatter(x="x", y="y", line_color=None, fill_color="c", size=9, alpha=0.4, 
          source=ColumnDataSource(data={
                "x": [ref_dict[n] - fixed_dict[n] for n in picked_readnames], 
                "y": [fixed_dict[n] + ref_dict[n] for n in picked_readnames],
                "c": [read_colors[n] for n in picked_readnames],
                "h": [gap_spanning_reads[n] for n in picked_readnames],
                "r": picked_readnames
            }
                ))

show(f, notebook_handle=True)

Unable to connect to VS Code server: Error in request.
Error: connect ENOENT /run/user/1121/vscode-ipc-e1f9b001-948f-4d16-a626-3f97eb0e0971.sock
    at PipeConnectWrap.afterConnect [as oncomplete] (node:net:1494:16) {
  errno: -2,
  code: 'ENOENT',
  syscall: 'connect',
  address: '/run/user/1121/vscode-ipc-e1f9b001-948f-4d16-a626-3f97eb0e0971.sock'
}


In [None]:

def extract_reads_for_gap(gap_name):
    return [n for n in readnames if n in gap_spanning_reads and gap_spanning_reads[n] == gap_name]

def cluster(l, key, max_dif):
    l.sort(key=key)

    clustered = [[l[0]]]
    if len(l) > 1:
        for read in l[1:]:
            if abs(key(read) - key(clustered[-1][-1])) > max_dif:
                clustered.append([])
            clustered[-1].append(read)
    return clustered

def cluster_reads(readnames, max_ref_fixed_diff = 10, max_ref_fixed_sum = 1000):
    return [c for l in cluster(readnames, lambda x: ref_dict[x] - fixed_dict[x], max_ref_fixed_diff) 
               for c in cluster(l, lambda x: ref_dict[x] + fixed_dict[x], max_ref_fixed_sum)]

def filter_clusters(clusters):
    return [c for c in clusters if len(c) > 3]

def get_mean_deviation_in_clusters(clusters, in_fixed=True):
    return [sum([fixed_dict[n] if in_fixed else ref_dict[n] for n in cluster]) / len(cluster) for cluster in clusters]

In [None]:
#GAP_TO_CHECK = "Chr10_3A:1256kbp"
#GAP_TO_CHECK = "Chr11_core:263kbp"
GAP_TO_CHECK = "Chr11_A:4918kbp" # on the cores there are two populations
#GAP_TO_CHECK = "Chr5_3B:286kbp"

# picked_readnames = extract_reads_for_gap(GAP_TO_CHECK)


def cluster(l, key, max_dif):
    l.sort(key=key)

    clustered = [[l[0]]]
    if len(l) > 1:
        for read in l[1:]:
            if abs(key(read) - key(clustered[-1][-1])) > max_dif:
                clustered.append([])
            clustered[-1].append(read)
    return clustered

# clustered = cluster_reads(picked_readnames)


# print(clustered)
# print([len(c) for c in clustered])

# clustered = filter_clusters(clustered)

# print(get_mean_deviation_in_clusters(clustered))
# print(get_mean_deviation_in_clusters(clustered, False))


In [None]:
gap_closed_if_fixed_dev_smaller_than = 5000


gap_names = gap_pos.keys()

closed_gaps = 0
print("correct", "#supp", "#contra", "dev", "fxd_dev", "other", "has_cluster", "name", sep="\t")
with open("../data/in/analysis_in/closed_gaps_analysis.gff", "w") as file_out:
    for gap in sorted(gap_names):
        has_cluster = False #not gap in gap_without_cluster
        read_names = extract_reads_for_gap(gap)
        chrom, start, end = gap_pos[gap]
        if len(read_names) > 0:
            read_clusters = filter_clusters(cluster_reads(read_names))
            cluster_fixed = get_mean_deviation_in_clusters(read_clusters)
            gap_sizes = get_mean_deviation_in_clusters(read_clusters, False)
            gap_closed = False
            gap_idx = 0
            min_fixed = float("inf")
            for idx, x in enumerate(cluster_fixed):
                if abs(x) < gap_closed_if_fixed_dev_smaller_than and abs(x) < min_fixed:
                    gap_closed = True
                    gap_idx = idx
                    min_fixed = abs(x)
            if gap_closed:
                print("Yes", len(read_clusters[idx]), len(read_names) - len(read_clusters[idx]), int(gap_sizes[idx]), 
                    int(cluster_fixed[idx]), len(gap_sizes) > 1, has_cluster, gap, sep="\t")
                closed_gaps += 1
                file_out.write("\t".join([chrom + "_Tb427v10", ".", "fixedgap", str(start), str(end), ".", ".", ".", 
                                          "estimated_length=1000;gap_type=within scaffold;closed_correctly=true"]) + "\n" )
            else:
                print("No" if len(read_names) > 5 else "?", "", len(read_names), "", "", "", has_cluster, gap, sep="\t")
                file_out.write("\t".join([chrom + "_Tb427v10", ".", "notenoughdatagap", str(start), str(end), ".", ".", ".", 
                                          "estimated_length=1000;gap_type=within scaffold;not_enough_data=true"]) + "\n" )
        else:
            print("?", "", "", "", "", "", has_cluster, gap, sep="\t")
            file_out.write("\t".join([chrom + "_Tb427v10", ".", "notenoughdatagap", str(start), str(end), ".", ".", ".", 
                                        "estimated_length=1000;gap_type=within scaffold;not_enough_data=true"]) + "\n" )
print()
print()
print("closed", closed_gaps, "out of", len(gap_names), "gaps")

correct	#supp	#contra	dev	fxd_dev	other	has_cluster	name
Yes	14	2	-1091	-3062	False	False	BES17:65kbp
?		3				False	BES2:62kbp
Yes	21	0	1125	128	False	False	Chr10_A:4101kbp
Yes	14	1	-6040	142	False	False	Chr10_A:76kbp
Yes	11	5	1043	45	False	False	Chr10_B:4083kbp
Yes	10	2	-44272	-464	False	False	Chr10_B:5341kbp
Yes	19	1	-11365	183	False	False	Chr10_B:58kbp
Yes	6	1	-2951	-2951	False	False	Chr11_A:14kbp
?						False	Chr11_A:279kbp
Yes	27	1	-104	82	False	False	Chr11_A:4651kbp
No		25				False	Chr11_A:4918kbp
No		9				False	Chr11_A:4971kbp
?						False	Chr11_B:296kbp
Yes	11	0	1065	69	False	False	Chr11_B:32kbp
Yes	17	3	-10477	48	False	False	Chr11_B:4669kbp
Yes	18	1	-23788	399	False	False	Chr11_B:4955kbp
?		2				False	Chr11_B:5168kbp
?						False	Chr11_B:5455kbp
?		1				False	Chr1_A:100kbp
?						False	Chr1_A:1149kbp
?		1				False	Chr1_A:1212kbp
?		1				False	Chr1_A:2357kbp
Yes	35	2	-3970	116	False	False	Chr1_A:3037kbp
?						False	Chr1_A:711kbp
Yes	9	0	1099	101	False	False	Chr1_A:946kbp
Yes	2

# Create a plot for the error rate

In [5]:
gap_errors = []
non_gap_errors = []

with open("../data/out/26.1_analyze_error_rates/combined_error_rates.tsv", "r") as file_in:
    for line in file_in:
        if line[0] == "#" or len(line.strip().split()) != 5:
            continue
        chrom, start, end, is_gap, error_rate = line.strip().split()
        error_rate = float(error_rate)
        start = int(start)
        end = int(end)
        if is_gap == "True":
            gap_errors.append(error_rate)
        else:
            non_gap_errors.append(error_rate)



In [6]:
output_file("../data/out/26.1_analyze_error_rates/error_rates.html")
p = figure(x_range=["gap", "non_gap"], y_axis_label="mismatches / bases mapped", title="Error rates")
p.output_backend = "svg"

p.scatter(y='v', x=jitter('x', width=0.6, range=p.x_range), color="#6068a2",
            source=ColumnDataSource(data={"v":gap_errors, "x": ["gap"]*len(gap_errors)}), alpha=0.5)
p.scatter(y='v', x=jitter('x', width=0.6, range=p.x_range), color="#6068a2",
            source=ColumnDataSource(data={"v":non_gap_errors, "x": ["non_gap"]*len(non_gap_errors)}), alpha=0.5)


show(p)
save(p)

Unable to connect to VS Code server: Error in request.
Error: connect ENOENT /run/user/1121/vscode-ipc-f11615e9-a44b-46a5-b840-a9bde1afdf51.sock
    at PipeConnectWrap.afterConnect [as oncomplete] (node:net:1494:16) {
  errno: -2,
  code: 'ENOENT',
  syscall: 'connect',
  address: '/run/user/1121/vscode-ipc-f11615e9-a44b-46a5-b840-a9bde1afdf51.sock'
}


'/ladsie/project/ladsie_019/claudia/ont_assembly_improvement/data/out/26.1_analyze_error_rates/error_rates.html'