# BLAST scores by sequence length

In [None]:
using Plots
using FASTX
include("source/io.jl")
include("source/utils.jl")
include("source/alignment.jl")
include("source/blast.jl")

### Mask repetitive regions in Swiss-Prot using tantan
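Rather than soft-masking repeats with lowercase letters (tantan's default), configure tantan to replace masked residues with 'X' characters (`-x X`). The `-p` flag tells tantan that the input sequences are proteins.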

```
tantan -p -x X data/sprot.fa > data/masked_sprot.fa
```

### Masked paired alignment with BLAST

In [None]:
# FASTA inputs for the pairwise searches: queries[i] is searched against references[i].
queries = [
    "data/sprot.fa",
    "data/sprot.fa",
    "data/masked_sprot.fa",
    "data/shuf_sprot.fa",
]
references = [
    "data/shuf_sprot.fa",
    "data/rev_sprot.fa",
    "data/rev_sprot.fa",
    "data/rev_shuf_sprot.fa",
]

"""
    name(x)

Return the file-name component of path `x`, truncated at its first `.`
(e.g. `"data/sprot.fa"` gives `"sprot"`).

Uses `basename` so that the result does not depend on how many directories
precede the file name.
"""
name(x) = first(split(basename(x), '.'))

querynames = name.(queries)
referencenames = name.(references)
# One result label per (query, reference) pair, e.g. "sprot↔shuf_sprot".
resultnames = ["$(q)↔$(r)" for (q, r) in zip(querynames, referencenames)]
# Each search result is stored as data/<label>.dlm.
# The loop variable is not called `name`, to avoid shadowing the function above.
resultpaths = ["data/$(stem).dlm" for stem in resultnames]
n = length(resultpaths)

In [None]:
### [~10 hours]
# Run one pairwise search per (query, reference) pair, parse the BLASTP output,
# and write the parsed table to the matching result path.
for (querypath, referencepath, outpath) in zip(queries, references, resultpaths)
    rawhits = search(
        Pairwise(), querypath, referencepath;
        verbose=true, careful=true, ntasks=12,
    )
    hittable = parse_blastp(rawhits)
    hitframe = DataFrame(hittable, BLASTP_COLUMNS)
    writeframe(outpath, hitframe)
end

## Plot

In [None]:
"""
    fig3plot!(x, label, color)

Add `x` to the current plot via `scatter!`, using `label` for the legend
entry and `color` for the series. Returns whatever `scatter!` returns.
"""
function fig3plot!(x, label, color)
    # NOTE(review): `bins=1:80` is forwarded unchanged; confirm that it has an
    # effect on a scatter series (it is normally a histogram attribute).
    seriesoptions = (label=label, color=color, bins=1:80)
    return scatter!(x; seriesoptions...)
end
# Sequence lengths of every record in each FASTA file, in the same order as
# `queries` / `references`.
querylengths = [length.(sequence.(readfasta(q))) for q in queries]
referencelengths = [length.(sequence.(readfasta(r))) for r in references]
# For each search, combine the length of the i-th query record with the length
# of the i-th reference record via `geometricmean`. The destructured names now
# follow the order of the zipped collections (query first, reference second).
# NOTE(review): `geometricmeanlengths` is not used by the plotting code visible
# in this notebook — confirm whether the figure was meant to use it.
geometricmeanlengths = [
    [geometricmean(q, r) for (q, r) in zip(qlengths, rlengths)]
    for (qlengths, rlengths) in zip(querylengths, referencelengths)
]
# Number of searches; `length` already returns an integer, so no indexing is needed.
n = length(geometricmeanlengths)
colors = ["purple", "orange", "blue", "red"]
# Load the saved result table of each search and pull out its `score` column.
results = readframe.(resultpaths)
scores = [frame.score for frame in results]

In [None]:
# Legend labels, one per search, in the same order as `scores` and `colors`.
# `latexstring` needs string arguments; passing bare expressions such as
# BLAST(sprotall, ...) would try to call undefined names.
# NOTE(review): the label text is taken literally from the notebook; confirm
# whether extra LaTeX markup (e.g. subscripts) was intended.
serieslabels = [
    latexstring("BLAST(sprotall, shuf(sprotall))"),
    latexstring("BLAST(sprotall, rev(sprotall))"),
    latexstring("BLAST(mask(sprotall), rev(sprotall))"),
    latexstring("BLAST(shuf(sprotall), rev(shuf(sprotall)))"),
]

# Start a fresh figure, then add one scatter series per search.
plot(xlabel="Sequence Length",
     ylabel="Average Score",
     legend=:outertop,
     dpi=500)
for (seriesscores, serieslabel, seriescolor) in zip(scores, serieslabels, colors)
    fig3plot!(seriesscores, serieslabel, seriescolor)
end
savefig("figures/BLAST_MaskedSprot.png")
# Return the current plot so the notebook displays it.
plot!()