# Analyzing the improved genome assembly

## Run the notebook server on a client node

```
    srun --pty --nodes=1  --ntasks-per-node=1 --cpus-per-task=28 --time 100:00:00 --job-name bash-jupyter bash
    conda activate ont_assembly
    jupyter notebook --ip 0.0.0.0 --port 3001 --no-browser
```

## bokeh imports

In [1]:
import random
import re
from math import pi

from bokeh.io import show, output_notebook
from bokeh.layouts import column, row, gridplot
from bokeh.layouts import row
from bokeh.models import ColumnDataSource, HoverTool, FactorRange, LabelSet, Whisker
from bokeh.models.tickers import FixedTicker
from bokeh.palettes import viridis
from bokeh.plotting import figure, output_file, save
from bokeh.resources import CDN
from bokeh.transform import jitter

from identify_collapsed_regions import load_dist_dev, load_gaps

# Fixed list of hex colors; later cells index it modulo its length to color gap groups.
COLOR_PALETTE = ["#0072B2", "#D55E00", "#009E73", "#E69F00", "#CC79A7", "#56B4E9", "#F0E442"]

# Configure bokeh to render plots in the notebook.
output_notebook()

# identify reverse complemented scaffolds

In [10]:
# load the data:


def load_gaps_by_contig(file_name):
    """Read gap intervals from a GFF-style text file and group them by contig.

    Lines starting with "#" and blank lines are skipped. Every other line is
    split on whitespace: column 1 is the contig name, columns 4 and 5 are the
    start and end coordinates (parsed as int); the remaining columns are ignored.

    Args:
        file_name: path of the file to read.

    Returns:
        dict mapping contig name -> list of (start, end) tuples in file order.

    Raises:
        ValueError: if a data line has fewer than five columns or non-integer
            coordinates.
    """
    ret = {}

    with open(file_name, "r") as in_file:
        for line in in_file:
            # A blank line (e.g. a trailing empty line) would otherwise fail the
            # tuple unpacking below.
            if line.startswith("#") or not line.strip():
                continue
            chrom, _source, _anno_type, start, end, *_extra = line.split()
            ret.setdefault(chrom, []).append((int(start), int(end)))
    return ret

# Gap intervals per contig, parsed from the re-annotated reference gap file.
gap_pos = load_gaps_by_contig("../data/out/2_ref_reannotated_gaps/gaps.gff3")

def load_read_pos_and_str(file_name_in):
    """Read per-read positions and strand flags from a tab-separated file.

    Lines starting with "#" and blank lines are skipped. Every other line must
    have exactly seven tab-separated fields:
    read name, contig, pos1, pos2, strand1, strand2, map_q.

    Args:
        file_name_in: path of the TSV file to read.

    Returns:
        dict mapping contig name -> list of tuples
        (read_name, min(pos1, pos2), max(pos1, pos2), strand1, strand2, map_q),
        in file order. The two strand entries are booleans that are True when
        the corresponding field is the string "0"; positions and map_q are ints.

    Raises:
        ValueError: if a data line does not have exactly seven fields or a
            numeric field is not an integer.
    """
    ret = {}
    with open(file_name_in, "r") as file_in:
        for line in file_in:
            # Blank lines would otherwise fail the seven-way unpacking below.
            if line.startswith("#") or not line.strip():
                continue
            read_name, chrom, pos1, pos2, strand1, strand2, map_q = line.strip().split("\t")
            pos1 = int(pos1)
            pos2 = int(pos2)
            ret.setdefault(chrom, []).append(
                (read_name, min(pos1, pos2), max(pos1, pos2), strand1 == "0", strand2 == "0", int(map_q))
            )
    return ret

# Per-contig read records (name, positions, strand flags, mapping quality) loaded from the TSV below.
read_pos_and_strand = load_read_pos_and_str("../data/out/6_gap_spanning_reads_old_genome/read_pos_and_strnd.tsv")

In [11]:
# filter the reads

def filter_reads(reads, gaps):
    """Keep only the reads that overlap at least one gap on their contig.

    A read (pos1, pos2) overlaps a gap (start, end) when
    ``pos1 < end and pos2 > start``.

    Neither ``reads`` nor ``gaps`` is modified, so re-running the calling cell
    never changes the inputs that other cells rely on.

    Args:
        reads: dict contig -> list of
            (read_name, pos1, pos2, strand1, strand2, map_q) records.
        gaps: dict contig -> list of (start, end) intervals.

    Returns:
        dict with one entry per contig in ``reads``; each value is the list of
        overlapping reads ordered by pos1 (ties keep their input order).
        Contigs without any gap entry map to an empty list.
    """
    ret = {}
    for contig, contig_reads in reads.items():
        ret[contig] = []
        if contig not in gaps:
            continue
        contig_gaps = gaps[contig]

        # sorted() returns a new list; the caller's list keeps its order.
        for r_name, pos1, pos2, s1, s2, map_q in sorted(contig_reads, key=lambda x: x[1]):
            if any(pos1 < end and pos2 > start for start, end in contig_gaps):
                ret[contig].append((r_name, pos1, pos2, s1, s2, map_q))
    return ret

# Reads that overlap at least one annotated gap, grouped by contig.
filtered_reads = filter_reads(read_pos_and_strand, gap_pos)

In [15]:
# plot

# Contigs that have at least one gap-overlapping read whose two strand flags disagree.
contigs_to_check = {
    contig
    for contig, contig_reads in filtered_reads.items()
    if any(s1 != s2 for _name, _pos1, _pos2, s1, s2, _map_q in contig_reads)
}


def plot_support(chrom):
    """Build a horizontal-bar figure of the filtered reads and gaps of one contig.

    Reads come from the notebook-level ``filtered_reads`` and are drawn on even
    y positions (0, 2, 4, ...) in list order; gaps come from ``gap_pos`` and are
    all drawn in red at y = -2.

    Read colors:
        map_q > 30:  "blue" when both strand flags agree, otherwise "orange"
        map_q <= 30: "lightblue" when both strand flags agree, otherwise "yellow"

    Args:
        chrom: contig name; must be a key of both ``filtered_reads`` and ``gap_pos``.

    Returns:
        The bokeh figure (not yet shown).
    """
    def read_color(same_strand, map_q):
        if map_q > 30:
            return "blue" if same_strand else "orange"
        return "lightblue" if same_strand else "yellow"

    fig = figure(x_axis_label="genome position", title=chrom)

    lefts, rights, y_positions, colors = [], [], [], []

    # One bar per read, stacked upwards.
    for row_idx, (_name, left, right, s1, s2, map_q) in enumerate(filtered_reads[chrom]):
        lefts.append(left)
        rights.append(right)
        y_positions.append(2 * row_idx)
        colors.append(read_color(s1 == s2, map_q))

    # All gaps share one row below the reads.
    for gap_start, gap_end in gap_pos[chrom]:
        lefts.append(gap_start)
        rights.append(gap_end)
        y_positions.append(-2)
        colors.append("red")

    fig.hbar(left=lefts, right=rights, y=y_positions, color=colors, line_width=2)
    return fig


# Stack one support plot per flagged contig (sorted by contig name) and display the column.
show(column([plot_support(chrom) for chrom in sorted(list(contigs_to_check))]), notebook_handle=True)

## load the data

In [40]:
# ref_names / ref_dev are parallel sequences: later cells zip them and unpack each ref_dev
# entry as (distance, expected, chromosome, pos1, pos2, strand).
# NOTE(review): "referece" in the path looks like a typo of "reference" — confirm it
# matches the file name on disk before changing it.
ref_names, ref_dev = load_dist_dev("../data/out/virtual_paired_read_dist/referece.distance_deviation")
# NOTE: this rebinds `gap_pos`, which an earlier cell set to the dict returned by
# load_gaps_by_contig. From here on, cells index gap_pos by gap label and unpack each
# value as (chrom, start, end); re-running the earlier plotting cells after this one
# will see the new structure.
gap_pos = load_gaps("../data/out/samba_out_1/reference.gaps.gff3")


# identify collapsed regions 

# post process the data

In [41]:
# Plot every read on the selected chromosome whose distance deviation on the
# reference is below the threshold, as a horizontal segment between its two positions.
MAX_PLOTTED_DEVIATION = -100   # only reads with a deviation below this value are drawn
CHROM_TO_PLOT = "Chr5_A"       # substring match on the chromosome name  (:1507kbp)

f = figure(x_axis_label="genome position", y_axis_label="distance deviation")

xs = []
ys = []
r_names = []
# The chromosome field is named `chrom` rather than `chr`: a notebook-level `chr`
# would shadow the builtin chr() for every later cell.
for r_name, (distance, expected, chrom, pos1, pos2, strnd) in zip(ref_names, ref_dev):
    if distance < MAX_PLOTTED_DEVIATION and CHROM_TO_PLOT in chrom:
        xs.append([min(pos1, pos2), max(pos1, pos2)])
        ys.append([distance, distance])
        r_names.append(r_name)

f.multi_line(xs=xs, ys=ys)

show(f, notebook_handle=True)

Unable to connect to VS Code server: Error in request.
Error: connect ENOENT /run/user/1121/vscode-ipc-e1f9b001-948f-4d16-a626-3f97eb0e0971.sock
    at PipeConnectWrap.afterConnect [as oncomplete] (node:net:1494:16) {
  errno: -2,
  code: 'ENOENT',
  syscall: 'connect',
  address: '/run/user/1121/vscode-ipc-e1f9b001-948f-4d16-a626-3f97eb0e0971.sock'
}


# analyze the fixing of collapsed regions

## load the remaining data

In [42]:
fixed_names, fixed_dev = load_dist_dev("../data/out/virtual_paired_read_dist/fixed_n.distance_deviation")

# read name -> space-joined labels of the gaps the read spans.
# Each input line is "<readname> <chrom>:<start>-<end> ..." (whitespace separated);
# a gap is labelled "<chrom>:<start // 1000>kbp".
gap_spanning_reads = {}

with open("../data/out/virtual_paired_read_dist/gap_spanning_reads", "r") as file_in:
    for line in file_in:
        # split() already discards the trailing newline. The former `line[:-1]`
        # additionally cut off a real character whenever the last line of the
        # file had no trailing newline.
        fields = line.split()
        if not fields:
            # blank line: nothing to unpack
            continue
        readname, *gaps = fields
        gap_names = []
        for gap in gaps:
            chrom, start, end = re.split(":|-", gap)
            #chrom = chrom[:-len("_Tb427v10")]
            gap_name = chrom + ":" + str(int(start)//1000) + "kbp"
            gap_names.append(gap_name)
        gap_spanning_reads[readname] = " ".join(gap_names)

print(gap_spanning_reads)

{'81d2e85a-ffc3-4c89-a448-d59ec32747f0': 'Chr1_B:1369kbp', 'd5d662ba-16f9-405d-be8a-f2cf9330a262': 'Chr1_A:946kbp', 'a4f4ccd8-33b5-4e52-aa47-acadb648e72f': 'Chr5_A:1675kbp', '3a242236-0177-40af-ba86-72840dbe3a0e': 'Chr3_B:25kbp', 'c7af1802-956c-4076-89e6-fd713f21d9ca': 'Chr4_B:1530kbp', 'f88ef346-8b5e-467c-b6ab-fcaa98d1ece8': 'Chr9_A:211kbp', '59088221-50f7-42f2-a2ff-6efdf2ce2406': 'Chr9_B:101kbp', '1180e6d1-5028-4398-a64a-ab7745a56d12': 'Chr9_A:3150kbp', '372f2fb3-80ec-4bcc-b065-cd0035576ad7': 'Chr8_B:780kbp', '7c7d2e4f-70ff-4a66-be1d-1b56b0a44dbd': 'Chr9_A:336kbp', '5b5e8a3a-b4b8-47df-9370-9986c1a1ebfc': 'Chr11_A:14kbp', '5ae937f9-0d6d-4005-8af7-242de4685fdd': 'Chr6_B:2385kbp', 'b1bd74aa-9fc5-466b-a9c0-82c82ef9833f': 'Chr8_A:335kbp', 'c8af1b28-c40c-453a-9569-83846281f485': 'Chr1_B:1069kbp', 'f31626c5-6215-4895-9057-fbeb9cd05316': 'Chr8_B:3140kbp', '28a6a8c4-6250-4f1c-b6d8-d49d1500e916': 'Chr10_B:5341kbp', 'bbf1e2c9-4db1-43a2-ad25-b855d81fb615': 'Chr10_A:4101kbp', 'e501e537-3dae-4009-

## post process the data

In [43]:

# read name -> first field of its deviation record (plotted as the distance deviation)
ref_dict = {x: y[0] for x, y in zip(ref_names, ref_dev)}
fixed_dict = {x: y[0] for x, y in zip(fixed_names, fixed_dev)}

# filter reads: drop every read that does not span an annotated gap
filtered = {read_name for read_name in fixed_names if read_name not in gap_spanning_reads}
# (disabled) additionally remove those where the distance has not changed:
#     if read_name in ref_dict and ref_dict[read_name] == distance:
#         filtered.add(read_name)

# reads present in both assemblies that survived the filter, in fixed_names order
readnames = [n for n in fixed_names if n in ref_dict and n not in filtered]


# figure out gap groups (one group per distinct gap label string)
gap_groups = set(gap_spanning_reads.values())
gap_colors = {}
# Iterate in sorted order: enumerating the set directly assigns colors in hash
# order, which changes between kernel restarts and makes plots irreproducible.
for idx, gap in enumerate(sorted(gap_groups)):
    if len(gap.split()) > 1:
        # read spans more than one gap
        gap_colors[gap] = "black"
    else:
        gap_colors[gap] = COLOR_PALETTE[idx % len(COLOR_PALETTE)]
read_colors = {
    read: gap_colors[gap_spanning_reads[read]] if read in gap_spanning_reads else "black"
    for read in readnames
}


## Plot the results

In [44]:
# Scatter of each read's deviation on the reference (x) against the improved genome (y),
# colored by gap group; the black diagonal marks equal deviation on both assemblies.
COMPARE_HTML = "../data/out/virtual_paired_read_dist/compare.html"

f = figure(title="Comparison of distance deviations", x_axis_label="Deviation on reference genome", y_axis_label="Deviation on improved genome", 
           tooltips=[("", "@h"), ("", "@r")])

f.line(x=[-1.5*10**5, 0.5*10**5], y=[-1.5*10**5, 0.5*10**5], color="black")

f.scatter(x="x", y="y", line_color=None, fill_color="c", size=9, alpha=0.4, 
          source=ColumnDataSource(data={
                "x": [ref_dict[n] for n in readnames], 
                "y": [fixed_dict[n] for n in readnames],
                "c": [read_colors[n] for n in readnames],
                "h": [gap_spanning_reads[n] if n in gap_spanning_reads else "none" for n in readnames],
                "r": [n for n in readnames]
            }
                ))

show(f, notebook_handle=True)
# Save to an explicit path instead of calling output_file(): output_file() switches
# bokeh's global output state to "file" for the rest of the session, so every later
# show() in this notebook would also re-write compare.html with its own plot
# (NOTE(review): and attempt to open it in a browser — confirm against the bokeh
# version in use).
save(f, filename=COMPARE_HTML, resources=CDN, title="Comparison of distance deviations")

Unable to connect to VS Code server: Error in request.
Error: connect ENOENT /run/user/1121/vscode-ipc-e1f9b001-948f-4d16-a626-3f97eb0e0971.sock
    at PipeConnectWrap.afterConnect [as oncomplete] (node:net:1494:16) {
  errno: -2,
  code: 'ENOENT',
  syscall: 'connect',
  address: '/run/user/1121/vscode-ipc-e1f9b001-948f-4d16-a626-3f97eb0e0971.sock'
}


'/ladsie/project/ladsie_019/claudia/ont_assembly_improvement/data/out/virtual_paired_read_dist/compare.html'

## check some individual gaps

In [45]:
# Gap label whose reads are plotted below. Labels inspected earlier, kept for reference:
#   "Chr10_3A:1256kbp", "Chr11_core:263kbp",
#   "Chr7_core:1789kbp" (on the cores there are two populations),
#   "Chr5_3B:286kbp", "Chr11_3B:266kbp", "Chr6_A:2589kbp",
#   "BES17:65kbp", "Chr11_3A:497kbp"
GAP_TO_CHECK = "Chr10_B:5341kbp"

# Reads whose gap label string is exactly GAP_TO_CHECK.
picked_readnames = [n for n in readnames if gap_spanning_reads.get(n) == GAP_TO_CHECK]

picked_source = ColumnDataSource(data={
    "x": [ref_dict[n] for n in picked_readnames],
    "y": [fixed_dict[n] for n in picked_readnames],
    "c": [read_colors[n] for n in picked_readnames],
    "h": [gap_spanning_reads[n] for n in picked_readnames],
    "r": picked_readnames,
})

f = figure(
    title="Comparison of distance deviations",
    x_axis_label="Reference",
    y_axis_label="Fixed",
    tooltips=[("", "@h"), ("", "@r")],
)

# Diagonal: points on it have the same deviation on both assemblies.
f.line(x=[-1.5*10**5, 0.5*10**5], y=[-1.5*10**5, 0.5*10**5], color="black")
f.scatter(x="x", y="y", line_color=None, fill_color="c", size=9, alpha=0.4, source=picked_source)

show(f, notebook_handle=True)

Unable to connect to VS Code server: Error in request.
Error: connect ENOENT /run/user/1121/vscode-ipc-e1f9b001-948f-4d16-a626-3f97eb0e0971.sock
    at PipeConnectWrap.afterConnect [as oncomplete] (node:net:1494:16) {
  errno: -2,
  code: 'ENOENT',
  syscall: 'connect',
  address: '/run/user/1121/vscode-ipc-e1f9b001-948f-4d16-a626-3f97eb0e0971.sock'
}


## get read clusters -> turn into table of "correctly" expanded gaps

In [46]:
# Gap label to inspect. Labels inspected earlier, kept for reference:
#   "Chr10_3A:1256kbp", "Chr11_core:263kbp", "Chr11_3B:266kbp", "Chr5_3B:286kbp"
GAP_TO_CHECK = "Chr11_A:4918kbp" # on the cores there are two populations

# Reads whose gap label string is exactly GAP_TO_CHECK.
picked_readnames = [n for n in readnames if gap_spanning_reads.get(n) == GAP_TO_CHECK]

# x: reference minus fixed deviation, y: their sum — the two axes the clustering works on.
cluster_source = ColumnDataSource(data={
    "x": [ref_dict[n] - fixed_dict[n] for n in picked_readnames],
    "y": [fixed_dict[n] + ref_dict[n] for n in picked_readnames],
    "c": [read_colors[n] for n in picked_readnames],
    "h": [gap_spanning_reads[n] for n in picked_readnames],
    "r": picked_readnames,
})

f = figure(
    title="looking at how to best cluster",
    x_axis_label="Ref fixed difference",
    y_axis_label="ref fixed sum",
    tooltips=[("", "@h"), ("", "@r")],
)

f.scatter(x="x", y="y", line_color=None, fill_color="c", size=9, alpha=0.4, source=cluster_source)

show(f, notebook_handle=True)

Unable to connect to VS Code server: Error in request.
Error: connect ENOENT /run/user/1121/vscode-ipc-e1f9b001-948f-4d16-a626-3f97eb0e0971.sock
    at PipeConnectWrap.afterConnect [as oncomplete] (node:net:1494:16) {
  errno: -2,
  code: 'ENOENT',
  syscall: 'connect',
  address: '/run/user/1121/vscode-ipc-e1f9b001-948f-4d16-a626-3f97eb0e0971.sock'
}


In [47]:

def extract_reads_for_gap(gap_name):
    """Return the names in ``readnames`` whose gap label string equals ``gap_name``.

    Reads without an entry in ``gap_spanning_reads`` are never returned; the
    order of ``readnames`` is preserved.
    """
    picked = []
    for name in readnames:
        if name in gap_spanning_reads and gap_spanning_reads[name] == gap_name:
            picked.append(name)
    return picked

def cluster(l, key, max_dif):
    """Group items into runs of neighbouring key values.

    The items are ordered by ``key(item)``; a new cluster is started whenever
    the key of an item differs from the key of the previous item by more than
    ``max_dif``.

    Args:
        l: iterable of items; it is NOT modified (a sorted copy is used).
        key: function mapping an item to a number.
        max_dif: largest key difference between consecutive items that still
            keeps them in the same cluster.

    Returns:
        list of clusters (each a list of items, ascending by key); an empty
        list when ``l`` is empty.
    """
    clustered = []
    for item in sorted(l, key=key):
        # First item, or the distance to the previous item is too large.
        if not clustered or abs(key(item) - key(clustered[-1][-1])) > max_dif:
            clustered.append([])
        clustered[-1].append(item)
    return clustered

def cluster_reads(readnames, max_ref_fixed_diff = 10, max_ref_fixed_sum = 1000):
    """Cluster reads in two passes and return the flat list of resulting clusters.

    Pass 1 clusters on (reference deviation - fixed deviation) with tolerance
    ``max_ref_fixed_diff``; pass 2 splits every pass-1 cluster on
    (reference deviation + fixed deviation) with tolerance ``max_ref_fixed_sum``.
    Deviations are looked up in the notebook-level ``ref_dict`` / ``fixed_dict``.
    """
    def deviation_difference(name):
        return ref_dict[name] - fixed_dict[name]

    def deviation_sum(name):
        return ref_dict[name] + fixed_dict[name]

    result = []
    for coarse_cluster in cluster(readnames, deviation_difference, max_ref_fixed_diff):
        result.extend(cluster(coarse_cluster, deviation_sum, max_ref_fixed_sum))
    return result

def filter_clusters(clusters):
    """Return only the clusters that contain more than three members, in input order."""
    kept = []
    for members in clusters:
        if len(members) > 3:
            kept.append(members)
    return kept

def get_mean_deviation_in_clusters(clusters, in_fixed=True):
    """Return the arithmetic mean deviation of every cluster.

    Args:
        clusters: list of clusters, each a non-empty list of read names.
        in_fixed: when true, deviations are read from ``fixed_dict``;
            otherwise from ``ref_dict``.

    Returns:
        list with one mean per cluster, in input order.
    """
    means = []
    for members in clusters:
        deviations = fixed_dict if in_fixed else ref_dict
        total = sum([deviations[name] for name in members])
        means.append(total / len(members))
    return means

In [48]:
#GAP_TO_CHECK = "Chr10_3A:1256kbp"
#GAP_TO_CHECK = "Chr11_core:263kbp"
GAP_TO_CHECK = "Chr11_A:4918kbp" # on the cores there are two populations
#GAP_TO_CHECK = "Chr5_3B:286kbp"

# picked_readnames = extract_reads_for_gap(GAP_TO_CHECK)


# NOTE: this re-defines `cluster` from the previous cell and silently replaces it once
# this cell runs; keep the two definitions in sync or delete one of them.
def cluster(l, key, max_dif):
    """Group items into runs of neighbouring key values.

    The items are ordered by ``key(item)``; a new cluster is started whenever
    the key of an item differs from the key of the previous item by more than
    ``max_dif``.

    Args:
        l: iterable of items; it is NOT modified (a sorted copy is used).
        key: function mapping an item to a number.
        max_dif: largest key difference between consecutive items that still
            keeps them in the same cluster.

    Returns:
        list of clusters (each a list of items, ascending by key); an empty
        list when ``l`` is empty.
    """
    clustered = []
    for item in sorted(l, key=key):
        # First item, or the distance to the previous item is too large.
        if not clustered or abs(key(item) - key(clustered[-1][-1])) > max_dif:
            clustered.append([])
        clustered[-1].append(item)
    return clustered

# clustered = cluster_reads(picked_readnames)


# print(clustered)
# print([len(c) for c in clustered])

# clustered = filter_clusters(clustered)

# print(get_mean_deviation_in_clusters(clustered))
# print(get_mean_deviation_in_clusters(clustered, False))


In [49]:
# A gap counts as closed when at least one (filtered) read cluster has a mean deviation
# on the fixed assembly whose absolute value is below this threshold.
gap_closed_if_fixed_dev_smaller_than = 5000


def _gap_record(chrom, start, end, feature_type, attributes):
    """Format one tab-separated, nine-column annotation line for a gap."""
    return "\t".join([chrom + "_Tb427v10", ".", feature_type, str(start), str(end), ".", ".", ".",
                      attributes]) + "\n"


gap_names = gap_pos.keys()

closed_gaps = 0
print("correct", "#supp", "#contra", "dev", "fxd_dev", "other", "has_cluster", "name", sep="\t")
with open("../data/in/analysis_in/closed_gaps_analysis.gff", "w") as file_out:
    for gap in sorted(gap_names):
        has_cluster = False #not gap in gap_without_cluster
        read_names = extract_reads_for_gap(gap)
        chrom, start, end = gap_pos[gap]
        if len(read_names) > 0:
            read_clusters = filter_clusters(cluster_reads(read_names))
            cluster_fixed = get_mean_deviation_in_clusters(read_clusters)
            gap_sizes = get_mean_deviation_in_clusters(read_clusters, False)
            # Pick the cluster with the smallest absolute deviation on the fixed assembly,
            # provided it is below the threshold.
            gap_closed = False
            gap_idx = 0
            min_fixed = float("inf")
            for idx, x in enumerate(cluster_fixed):
                if abs(x) < gap_closed_if_fixed_dev_smaller_than and abs(x) < min_fixed:
                    gap_closed = True
                    gap_idx = idx
                    min_fixed = abs(x)
            if gap_closed:
                # Report the selected cluster (gap_idx). The loop variable `idx` still
                # points at the LAST cluster here, which is the wrong row whenever the
                # best cluster is not the last one.
                best_cluster = read_clusters[gap_idx]
                print("Yes", len(best_cluster), len(read_names) - len(best_cluster), int(gap_sizes[gap_idx]), 
                    int(cluster_fixed[gap_idx]), len(gap_sizes) > 1, has_cluster, gap, sep="\t")
                closed_gaps += 1
                file_out.write(_gap_record(chrom, start, end, "fixedgap",
                                           "estimated_length=1000;gap_type=within scaffold;closed_correctly=true"))
            else:
                # "No" only when more than five reads span the gap; otherwise undecided.
                print("No" if len(read_names) > 5 else "?", "", len(read_names), "", "", "", has_cluster, gap, sep="\t")
                file_out.write(_gap_record(chrom, start, end, "notenoughdatagap",
                                           "estimated_length=1000;gap_type=within scaffold;not_enough_data=true"))
        else:
            # no read spans this gap at all
            print("?", "", "", "", "", "", has_cluster, gap, sep="\t")
            file_out.write(_gap_record(chrom, start, end, "notenoughdatagap",
                                       "estimated_length=1000;gap_type=within scaffold;not_enough_data=true"))
print()
print()
print("closed", closed_gaps, "out of", len(gap_names), "gaps")

correct	#supp	#contra	dev	fxd_dev	other	has_cluster	name
Yes	14	2	-1091	-3062	False	False	BES17:65kbp
?		3				False	BES2:62kbp
Yes	21	0	1125	128	False	False	Chr10_A:4101kbp
Yes	14	1	-6040	142	False	False	Chr10_A:76kbp
Yes	11	5	1043	45	False	False	Chr10_B:4083kbp
Yes	10	2	-44272	-464	False	False	Chr10_B:5341kbp
Yes	19	1	-11365	183	False	False	Chr10_B:58kbp
Yes	6	1	-2951	-2951	False	False	Chr11_A:14kbp
?						False	Chr11_A:279kbp
Yes	27	1	-104	82	False	False	Chr11_A:4651kbp
No		25				False	Chr11_A:4918kbp
No		9				False	Chr11_A:4971kbp
?						False	Chr11_B:296kbp
Yes	11	0	1065	69	False	False	Chr11_B:32kbp
Yes	17	3	-10477	48	False	False	Chr11_B:4669kbp
Yes	18	1	-23788	399	False	False	Chr11_B:4955kbp
?		2				False	Chr11_B:5168kbp
?						False	Chr11_B:5455kbp
?		1				False	Chr1_A:100kbp
?						False	Chr1_A:1149kbp
?		1				False	Chr1_A:1212kbp
?		1				False	Chr1_A:2357kbp
Yes	35	2	-3970	116	False	False	Chr1_A:3037kbp
?						False	Chr1_A:711kbp
Yes	9	0	1099	101	False	False	Chr1_A:946kbp
Yes	2