In [10]:
import pandas as pd
from local.constants import WORKSPACE_ROOT

# Load the QUAST TSV and print its first row as a two-column listing:
# the column name left-aligned and padded to 30 characters, then the value.
quast_path = WORKSPACE_ROOT/"data/assembly_qc/epi300.quast.tsv"
df = pd.read_csv(quast_path, sep="\t")
first_row = df.iloc[0]
for column in df.columns:
    print(f"{column:<30}{first_row[column]}")

Assembly                      epi300
# contigs (>= 0 bp)           42
# contigs (>= 1000 bp)        38
# contigs (>= 5000 bp)        13
# contigs (>= 10000 bp)       8
# contigs (>= 25000 bp)       5
# contigs (>= 50000 bp)       1
Total length (>= 0 bp)        4837344
Total length (>= 1000 bp)     4836284
Total length (>= 5000 bp)     4776395
Total length (>= 10000 bp)    4745861
Total length (>= 25000 bp)    4693476
Total length (>= 50000 bp)    4553367
# contigs                     38
Largest contig                4553367
Total length                  4836284
GC (%)                        50.77
N50                           4553367
N75                           4553367
L50                           1
L75                           1
# N's per 100 kbp             0.12


In [26]:
from Bio import SeqIO

# Display name -> file stem of each reference genome FASTA under
# data/reference_genomes/.
refs = {
    "DH5α": "dh5alpha_CP026085.1",
    "K-12 substr. MG1655": "k12mg1655_NC_000913.3",
}

# Total sequence length per reference: the sum over every record in its FASTA.
reference_lengths = {}
for name, stem in refs.items():
    fasta_path = WORKSPACE_ROOT/f"data/reference_genomes/{stem}.fna"
    total_length = sum(len(record.seq) for record in SeqIO.parse(fasta_path, "fasta"))
    print(name, total_length)
    reference_lengths[name] = total_length

DH5α 4833062
K-12 substr. MG1655 4641652


In [22]:
from local.figures.template import BaseFigure, ApplyTemplate, go
from local.figures.colors import Color, Palettes

In [14]:
# Per-contig coverage table; the file is read without a header row, so the
# column names are assigned here.
df_cov = pd.read_csv(
    WORKSPACE_ROOT/"data/assembly_qc/epi300.cov_per_contig.tsv",
    sep="\t",
    header=None,
)
df_cov.columns = ["contig", "coverage", "length"]

# Map each contig name to its length, keeping the table's row order.
c2len = {row.contig: row.length for _, row in df_cov.iterrows()}
c2len

{'C1': 4553367,
 'C2': 49295,
 'C3': 39050,
 'C4': 26753,
 'C5': 25011,
 'C6': 24326,
 'C7': 16074,
 'C8': 11985,
 'C9': 7639,
 'C10': 7075,
 'C11': 5522,
 'C13': 5001,
 'C14': 4538,
 'C16': 4001,
 'C17': 3243}

In [40]:
fig = BaseFigure()

# Running total of contig lengths, in c2len's iteration order.
_cumulative_lengths = []
running_total = 0
for contig_length in c2len.values():
    running_total += contig_length
    _cumulative_lengths.append(running_total)

# Cumulative curve: x is the 1-based contig position, y the total length so far.
fig.add_trace(go.Scatter(
    name="Cumulative contig length",
    x=list(range(1, len(_cumulative_lengths) + 1)),
    y=list(_cumulative_lengths),
    mode="markers+lines",
    marker=dict(color="black", size=7),
    line=dict(width=1),
))


def add_line(name: str, length: int, color: Color):
    """Add a horizontal line at y=length to `fig`, spanning x from 1 to len(c2len).

    Args:
        name: Legend label for the trace.
        length: The y value at which the line is drawn.
        color: Palette entry; its `color_value` is used as the line color.
    """
    fig.add_trace(go.Scatter(
        name=name,
        x=[1, len(c2len)],
        y=[length, length],
        mode="lines",
        line=dict(color=color.color_value, width=1),
    ))


# One line per reference genome, colored by its position in the palette.
_colors = Palettes.PLOTLY
for index, (name, length) in enumerate(reference_lengths.items()):
    add_line(name, length, _colors[index])

fig = ApplyTemplate(
    fig,
    axis={},
    layout=dict(width=600, height=400),
)
fig.show()