# BLAST scores by sequence length

In [None]:
using Plots
using FASTX
include("source/io.jl")
include("source/utils.jl")
include("source/alignment.jl")
include("source/blast.jl")

### Mask repetitive regions in Swiss-Prot using tantan
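By default tantan marks repeats by converting them to lowercase letters. Here it is configured to replace masked residues with 'X' characters instead (`-x X`), and to treat the input as protein sequences (`-p`).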

```
tantan -p -x X data/sprot.fa > data/masked_sprot.fa
```

### Masked paired alignment

In [None]:
# Destination tables for the score-by-length results, one per comparison.
# The order matters: entry i is paired with `results[i]` when the tables are written.
resultpaths = String[
    "outputs/sprot-shuf_sprot.dlm",            # sprot vs. shuffled sprot
    "outputs/sprot-rev_sprot.dlm",             # sprot vs. reversed sprot
    "outputs/shuf_sprot-rev_shuf_sprot.dlm",   # shuffled vs. reversed-shuffled sprot
    "outputs/masked_sprot-rev_sprot.dlm",      # masked sprot vs. reversed sprot
];

### Option A: BioAlignments.jl

In [None]:
# Load every sequence set used by the comparisons below. The files are read in
# the listed order and destructured into one variable per file.
shuf_sprot, rev_shuf_sprot, sprot, rev_sprot, masked_sprot = map(readsequences, (
    "data/shuf_sprot.fa",
    "data/rev_shuf_sprot.fa",
    "data/sprot.fa",
    "data/rev_sprot.fa",
    "data/masked_sprot.fa",
));

In [None]:
"""
    alignmentscore(x)

Run `align(Pairwise(), x[1], x[2]; ...)` on the two sequence collections held in `x`,
passing a `formatter` that maps a `PairwiseAlignmentResult` to its `score`.
Progress reporting is requested with `verbose=true`.
"""
function alignmentscore(x)
    first_set, second_set = x[1], x[2]
    toscore = result::PairwiseAlignmentResult -> score(result)
    return align(Pairwise(), first_set, second_set; formatter=toscore, verbose=true)
end

# One entry per comparison, in the same order as `resultpaths`.
results = map(alignmentscore, [
    (sprot, shuf_sprot),
    (sprot, rev_sprot),
    (shuf_sprot, rev_shuf_sprot),
    (masked_sprot, rev_sprot),
]);

In [None]:
# Lengths of the first sequence set in each comparison; these become the first
# column of every output table.
sequence_lengths = length.(sprot)
masked_sequence_lengths = length.(masked_sprot)

# Length column to use for each comparison, aligned index-for-index with
# `resultpaths` and `results`. Only the masked comparison uses the masked lengths.
lengthcolumns = (
    sequence_lengths,
    sequence_lengths,
    sequence_lengths,
    masked_sequence_lengths,
)

# Write one two-column table (length, score) per comparison.
for (path, lengths, scores) in zip(resultpaths, lengthcolumns, results)
    writetable(path, hcat(lengths, scores))
end

### Option B: BLAST

In [None]:
# Output directories for the BLAST runs, one per comparison.
result_dirs = [
    "outputs/sprot-shuf_sprot",
    "outputs/sprot-rev_sprot",
    "outputs/shuf_sprot-rev_shuf_sprot",
    "outputs/masked_sprot-rev_sprot",
]
# `mkpath` (unlike `mkdir`) creates the missing `outputs/` parent if needed and
# does not throw when a directory already exists, so this cell can be re-run.
mkpath.(result_dirs)

In [None]:
# Run the pairwise BLAST search for each comparison. Every call follows the same
# pattern: output directory "outputs/A-B", then the sequence files "data/A.fa" and
# "data/B.fa". The comparisons mirror the pairs used in Option A.
search(Pairwise(), "outputs/sprot-shuf_sprot", "data/sprot.fa", "data/shuf_sprot.fa";
    verbose=true, ntasks=12)
search(Pairwise(), "outputs/sprot-rev_sprot", "data/sprot.fa", "data/rev_sprot.fa";
    verbose=true, ntasks=12)
# Fixed: the first sequence file was missing its opening quote (a syntax error) and
# named rev_sprot.fa, which did not match the shuf_sprot-rev_shuf_sprot output.
search(Pairwise(), "outputs/shuf_sprot-rev_shuf_sprot", "data/shuf_sprot.fa", "data/rev_shuf_sprot.fa";
    verbose=true, ntasks=12)
search(Pairwise(), "outputs/masked_sprot-rev_sprot", "data/masked_sprot.fa", "data/rev_sprot.fa";
    verbose=true, ntasks=12)