# Analyzing the improved genome assembly

## Run the notebook server on a client node

```
    srun --pty --nodes=1  --ntasks-per-node=1 --cpus-per-task=28 --time 100:00:00 --job-name bash-jupyter bash
    conda activate ont_assembly
    jupyter notebook --ip 0.0.0.0 --port 3001 --no-browser
```

## bokeh imports

In [10]:
from bokeh.plotting import figure
from bokeh.palettes import viridis
from bokeh.io import show, output_notebook
from bokeh.models import ColumnDataSource, HoverTool, FactorRange, LabelSet, Whisker
from bokeh.models.tickers import FixedTicker
from bokeh.transform import jitter
from bokeh.layouts import column, row, gridplot
from math import pi
from bokeh.layouts import row
import random

# Categorical colours that are cycled through to tell single-gap groups apart in the plots.
# NOTE(review): the hex values look like the Okabe-Ito colour-blind-safe palette — confirm.
COLOR_PALETTE = ["#0072B2", "#D55E00", "#009E73", "#E69F00", "#CC79A7", "#56B4E9", "#F0E442"]

# Send bokeh output to the notebook so that show() renders figures inline.
output_notebook()

## load the data

In [50]:
def load_dist_dev(file_name, max_dist=None):
    """Load read names and distance deviations from a whitespace-separated text file.

    Every data line must hold exactly two fields: a read name followed by an
    integer distance deviation. Lines starting with "#" are treated as
    comments; blank (or whitespace-only) lines are skipped.

    Args:
        file_name: path of the file to read.
        max_dist: if given, only entries whose distance is strictly smaller
            than this value are kept; None keeps every entry.

    Returns:
        A tuple (names, distances) of two parallel lists, in file order.

    Raises:
        ValueError: if a data line does not consist of exactly two fields or
            its second field is not an integer.
    """
    names = []
    ret = []
    with open(file_name, "r") as in_file:
        for line in in_file:
            # Skip comments and blank lines; an empty line (e.g. at the end of
            # the file) would otherwise break the two-field unpacking below.
            if line.startswith("#") or not line.strip():
                continue
            readname, distance = line.split()
            # Parse once and reuse for both the filter and the stored value.
            distance = int(distance)
            if max_dist is None or distance < max_dist:
                names.append(readname)
                ret.append(distance)
    return names, ret

# Read names and distance deviations loaded from the reference file and the fixed_n file.
ref_names, ref_dev = load_dist_dev("../data/virtual_paired_read_dist/referece.distance_deviation")
fixed_names, fixed_dev = load_dist_dev("../data/virtual_paired_read_dist/fixed_n.distance_deviation")

# Maps a read name to the space-joined gap names listed after it on its line.
gap_spanning_reads = {}

with open("../data/virtual_paired_read_dist/gap_spanning_reads", "r") as file_in:
    for line in file_in:
        # split() already discards the trailing newline; slicing off the last
        # character would truncate the final record of a file that does not
        # end with a newline.
        fields = line.split()
        if not fields:
            # Blank line: nothing to record.
            continue
        readname, *gaps = fields
        gap_spanning_reads[readname] = " ".join(gaps)



## post process the data

In [53]:

# Reference distance deviation per read name (a repeated name keeps its last value).
ref_dict = dict(zip(ref_names, ref_dev))

# Filter reads: drop every read of the fixed data set that is missing from the
# reference data or that is not listed as gap spanning.
# Removing reads whose distance has not changed is currently disabled:
#     if read_name in ref_dict and ref_dict[read_name] == distance:
#         filtered.add(read_name)
filtered = {
    read_name
    for read_name in fixed_names
    if read_name not in ref_dict or read_name not in gap_spanning_reads
}

# Figure out gap groups: every distinct gap combination listed for some read.
gap_groups = set(gap_spanning_reads.values())

# Combinations of several gaps are drawn black, single gaps cycle through the palette.
# The groups are sorted before colours are assigned: the iteration order of a
# set of strings changes between kernel restarts (string hash randomization),
# which would give the same gap a different colour on every run.
gap_colors = {}
for idx, gap in enumerate(sorted(gap_groups)):
    if len(gap.split()) > 1:
        gap_colors[gap] = "black"
    else:
        gap_colors[gap] = COLOR_PALETTE[idx % len(COLOR_PALETTE)]

# Colour per read that is present in both data sets; reads without gap information are black.
read_colors = {}
for read in fixed_names:
    if read in ref_dict:
        if read in gap_spanning_reads:
            read_colors[read] = gap_colors[gap_spanning_reads[read]]
        else:
            read_colors[read] = "black"

# One tuple per retained read:
# (reference deviation, fixed deviation, colour, gap label, read name)
combined_def = [
    (ref_dict[n], y, read_colors[n], gap_spanning_reads.get(n, ""), n)
    for n, y in zip(fixed_names, fixed_dev)
    if n in ref_dict and n not in filtered
]

## Plot the results

In [None]:
def barplot(data, data_div=None, num_bars=100, title="Bar plot"):
    """Show a histogram of `data` on a logarithmic y axis.

    The value range (of `data`, extended by `data_div` if given) is divided
    into `num_bars` equally wide bins; one extra bin holds the maximum value.
    Without `data_div` the bars show the number of values per bin. With
    `data_div` both inputs are binned identically and each bar spans from 1 to
    the ratio count(data) / count(data_div) of its bin; bins in which
    `data_div` has no value are shown with a ratio of 1.

    Args:
        data: numeric values to bin.
        data_div: optional numeric values whose per-bin counts are used as divisor.
        num_bars: number of bins the value range is divided into.
        title: title of the figure.

    Raises:
        ValueError: if `data` is empty, or `data_div` is given but empty.
    """
    values = list(data)
    if not values:
        raise ValueError("barplot() needs at least one value in data")
    div_values = None if data_div is None else list(data_div)
    if div_values is not None and not div_values:
        raise ValueError("barplot() needs at least one value in data_div when it is given")

    f = figure(title=title, y_axis_type="log")

    max_val = max(values)
    min_val = min(values)
    if div_values is not None:
        max_val = max(max_val, max(div_values))
        min_val = min(min_val, min(div_values))

    bin_size = (max_val - min_val) / num_bars
    if bin_size == 0:
        # All values are identical: use a unit-wide bin instead of dividing by
        # zero below; every value then lands in the first bin.
        bin_size = 1

    def count_per_bin(numbers):
        # num_bars + 1 slots, because the maximum value maps to index num_bars.
        counts = [0] * (num_bars + 1)
        for number in numbers:
            counts[int((number - min_val) / bin_size)] += 1
        return counts

    centers = [idx * bin_size + bin_size / 2 + min_val for idx in range(num_bars + 1)]
    bins = count_per_bin(values)

    if div_values is not None:
        bins_div = count_per_bin(div_values)
        vals = [x / y if y != 0 else 1 for x, y in zip(bins, bins_div)]
        # Bars grow upwards from 1 for ratios > 1 and downwards from 1 for ratios < 1.
        # NOTE(review): a ratio of 0 (numerator bin empty) yields bottom=0, which
        # has no position on a log axis — confirm how bokeh renders such bars.
        f.vbar(x=centers, width=bin_size, top=[max(1, x) for x in vals],
               bottom=[min(1, x) for x in vals])
    else:
        # A log axis cannot start at 0, hence the bars start at 0.1.
        f.vbar(x=centers, width=bin_size, top=bins, bottom=0.1)

    show(f, notebook_handle=True)



# Disabled: plain histograms of the reference (x[0]) and the fixed (x[1]) distance deviations.
#barplot([x[0] for x in combined_def])
#barplot([x[1] for x in combined_def])
# Per-bin count ratio: fixed deviations (x[1]) as data, reference deviations (x[0]) as divisor.
barplot([x[1] for x in combined_def], [x[0] for x in combined_def], title="Fixed / Reference distance deviation")



# f.scatter(x=jitter("x", 1), y="y", size=9, alpha=0.4, 
#           source=ColumnDataSource(data={"x":[1]*len(combined_def), "y": [x[0] for x in combined_def]}))

# show(f, notebook_handle=True)

# f = figure(title="Fixed")

# f.scatter(x=jitter("x", 1), y="y", size=9, alpha=0.4, 
#           source=ColumnDataSource(data={"x":[1]*len(combined_def), "y": [x[1] for x in combined_def]}))

# show(f, notebook_handle=True)

In [54]:
# One point per entry of combined_def: reference deviation on the x axis,
# fixed deviation on the y axis, filled with the entry's colour; the gap label
# is exposed as column "h" for the hover tooltip.
scatter_source = ColumnDataSource(data=dict(
    x=[entry[0] for entry in combined_def],
    y=[entry[1] for entry in combined_def],
    c=[entry[2] for entry in combined_def],
    h=[entry[3] for entry in combined_def],
))

f = figure(
    title="Comparison of distance deviations",
    x_axis_label="Reference",
    y_axis_label="Fixed",
    tooltips="@h",
)
f.scatter(
    x="x",
    y="y",
    source=scatter_source,
    size=9,
    alpha=0.4,
    fill_color="c",
    line_color=None,
)

show(f, notebook_handle=True)

## extract the "correctly" closed gaps

In [69]:
# Gap name -> names of the reads supporting that the gap is closed / improved.
closed_gaps = {}
improved_gaps = {}

# A read supports "closed" if its absolute deviation on the fixed data is below this value.
GAP_IS_FIXED_MAX_DEVIATION = 5000
# Minimum number of supporting reads for a gap to be reported (used for both categories).
GAP_IS_FIXED_MIN_READS = 5
# A read supports "improved" if its absolute deviation shrank by at least this much.
GAP_IS_IMPROVED_MIN_IMPROVEMENT = 5000

# The loop variables deliberately do not reuse the name `ref_dev`: that would
# replace the module-level list of reference deviations with a single number
# and break a re-run of the post-processing cell.
for ref_deviation, fixed_deviation, _color, gap, read_name in combined_def:
    # Only reads listed with exactly one gap are attributed to a gap.
    if len(gap.split()) != 1:
        continue
    if abs(fixed_deviation) < GAP_IS_FIXED_MAX_DEVIATION:
        closed_gaps.setdefault(gap, []).append(read_name)
    if abs(fixed_deviation) + GAP_IS_IMPROVED_MIN_IMPROVEMENT <= abs(ref_deviation):
        improved_gaps.setdefault(gap, []).append(read_name)

# Keep only gaps with enough supporting reads.
closed_gaps = {
    gap: reads
    for gap, reads in closed_gaps.items()
    if len(reads) >= GAP_IS_FIXED_MIN_READS
}
# A gap that counts as closed is not additionally reported as improved.
improved_gaps = {
    gap: reads
    for gap, reads in improved_gaps.items()
    if len(reads) >= GAP_IS_FIXED_MIN_READS and gap not in closed_gaps
}

print("Closed gaps", "supporting reads", sep="\t")
for gap, reads in sorted(closed_gaps.items()):
    print(gap, len(reads))
print("closed", len(closed_gaps), "gaps in total")

print()
print("Improved gaps", "supporting reads", sep="\t")
for gap, reads in sorted(improved_gaps.items()):
    print(gap, len(reads))
print("improved", len(improved_gaps), "gaps in total")

Closed gaps	supporting reads
BES17:65kbp 7
Chr11_3A:284kbp 8
Chr11_core:263kbp 20
Chr1_3A:2090kbp 19
Chr1_3A:43kbp 16
Chr1_3B:226kbp 16
Chr1_3B:299kbp 5
Chr1_3B:372kbp 7
Chr5_3B:167kbp 15
Chr5_3B:286kbp 24
Chr6_3A:1057kbp 12
Chr6_3B:1000kbp 8
Chr7_core:1789kbp 21
Chr8_5A:666kbp 15
Chr8_5B:335kbp 25
Chr9_3A:437kbp 8
Chr9_3A:617kbp 15
Chr9_5A:211kbp 7
closed 18 gaps in total

Improved gaps	supporting reads
Chr10_3A:1256kbp 9
Chr1_core:609kbp 8
Chr5_core:223kbp 6
improved 3 gaps in total
