# Figure 5
### LPS and LCS in Swiss-Prot

In [None]:
include("source/io.jl")
include("source/mask.jl")
include("source/palindrome.jl")
include("source/alignment.jl")
using StringAlgorithms: longestcommonsubstring
using ProgressMeter
using Plots
using StatsBase: mean
using Base.Threads: @threads;

In [None]:
# Load the sequence collections with `readsequences` (defined outside this cell).
sprot = readsequences("data/sprot.fa")
shuf_sprot = readsequences("data/shuf_sprot.fa")
# For the masked set, every "X" is stripped from each sequence after loading.
masked_sprot = map(seq -> replace(seq, "X" => ""), readsequences("data/masked_sprot.fa"));

In [None]:
# Output files, indexed by position in the cells below:
#   1 -> LCS table, 2..4 -> LPS tables.
resultpaths = [
    "lcs_sprot_shuf_sprot.txt",
    "lps_sprot.txt",
    "lps_shuf_sprot.txt",
    "lps_hardmasked_sprot.txt",
];

### Compute LPS and LCS

In [None]:
"""
    llcs(x, y, progressmeter)

Advance `progressmeter` by one step with `next!`, then return the length of the
first element of `longestcommonsubstring(x, y)`.
"""
function llcs(x, y, progressmeter)
    # The meter is ticked before the computation starts, not after it finishes.
    next!(progressmeter)
    common = longestcommonsubstring(x, y)[1]
    return length(common)
end
"""
    llps(x, progressmeter)

Advance `progressmeter` by one step with `next!`, then return the length of
`longestpalindromicsubstring(x)`.
"""
function llps(x, progressmeter)
    # The meter is ticked before the computation starts, not after it finishes.
    next!(progressmeter)
    palindrome = longestpalindromicsubstring(x)
    return length(palindrome)
end
"""
    generate_lps(x, resultpath)

Compute `llps` for every element of `x` in a `@threads` loop and write the
results to `resultpath` via `writeframe`.

The written table has two columns: `"length"` (the length of each element of
`x`) and `"lps"` (the corresponding `llps` value). A `Progress` meter created as
`Progress(length(x), 1, resultpath)` is advanced once per element. Returns
whatever `writeframe` returns.
"""
function generate_lps(x, resultpath)
    total = length(x)
    lps = fill(0, total)
    meter = Progress(total, 1, resultpath)
    # Each iteration writes only its own slot of `lps`.
    @threads for idx in 1:total
        lps[idx] = llps(x[idx], meter)
    end
    seqlengths = length.(x)
    print("writing LPS to ", resultpath)
    table = DataFrame([seqlengths, lps], ["length", "lps"])
    return writeframe(resultpath, table)
end
"""
    generate_lcs(x, y, resultpath)

Compute `llcs(x[i], y[i], …)` for every index `i` in a `@threads` loop and write
the results to `resultpath` via `writeframe`.

The written table has three columns: `"length1"` (length of each element of
`x`), `"length2"` (length of each element of `y`) and `"lcs"` (the corresponding
`llcs` value). A `Progress` meter created as `Progress(n, 1, resultpath)` is
advanced once per pair. Returns whatever `writeframe` returns.

# Throws
- `DimensionMismatch` if `x` and `y` do not have the same number of elements.
"""
function generate_lcs(x, y, resultpath)
    nx, ny = length(x), length(y)
    # Explicit input validation: `@assert` is meant for internal invariants and
    # may be disabled, so it must not guard caller-supplied arguments.
    nx == ny || throw(DimensionMismatch(
        "x and y must have the same number of elements, got $nx and $ny"))
    n = nx
    results = zeros(Int, n)
    p = Progress(n, 1, resultpath)
    # Each iteration writes only its own slot of `results`.
    @threads for i in 1:n
        results[i] = llcs(x[i], y[i], p)
    end
    lx = length.(x)
    ly = length.(y)
    print("writing LCS to ", resultpath)
    return writeframe(resultpath, DataFrame([lx, ly, results], ["length1", "length2", "lcs"]))
end

In [None]:
# LCS table: element i of `sprot` is compared with element i of `shuf_sprot`.
generate_lcs(sprot, shuf_sprot, first(resultpaths))
# LPS tables, produced in the order shuffled -> native -> masked; the integer is
# the position of the output file in `resultpaths`.
for (seqs, idx) in ((shuf_sprot, 3), (sprot, 2), (masked_sprot, 4))
    generate_lps(seqs, resultpaths[idx])
end

## Plot [python]

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import statistics as stats
def cluster(data, parameter, lo, hi):
    """Split ``data`` into one group per integer value of ``parameter``.

    Returns a list with ``hi - lo + 1`` entries; entry ``k`` is
    ``data[parameter == lo + k]``. Values with no matching rows give an empty
    selection.
    """
    groups = []
    for value in range(lo, hi + 1):
        groups.append(data[parameter == value])
    return groups
def robust_cluster_means(data_clusters):
    """Return the mean of every cluster, using 0 for empty clusters.

    ``stats.mean`` is only called on clusters that contain at least one
    element; an empty cluster contributes the placeholder value 0.
    """
    means = []
    for group in data_clusters:
        if len(group) > 0:
            means.append(stats.mean(group))
        else:
            means.append(0)
    return means
def scatter_cluster_means(frame, datakey, parakey, lo, hi, color, label):
    """Scatter the per-value mean of ``frame[datakey]`` against ``frame[parakey]``.

    For every integer ``v`` in ``[lo, hi]`` the rows with ``frame[parakey] == v``
    are grouped via ``cluster`` and averaged via ``robust_cluster_means``. The
    means are drawn on the vertical axis against ``v`` on the horizontal axis,
    with marker size 1, the given ``color`` and legend ``label``.
    """
    parameter_values = list(range(lo, hi + 1))
    grouped = cluster(frame[datakey], frame[parakey], lo, hi)
    means = robust_cluster_means(grouped)
    plt.scatter(parameter_values, means, s=1, c=color, label=label)
# Result files produced by the Julia cells; their order must match the
# unpacking on the next statement.
resultpaths = [
    "lcs_sprot_shuf_sprot.txt",
    "lps_sprot.txt",
    "lps_shuf_sprot.txt",
    "lps_hardmasked_sprot.txt",
]
lcs_sprot_shuf, lps_sprot, lps_shuf, lps_hardmasked_sprot = map(pd.read_csv, resultpaths)

# One scatter series per table: mean LCS/LPS for each length value 0..2000.
scatter_cluster_means(
    frame=lcs_sprot_shuf, datakey="lcs", parakey="length1",
    lo=0, hi=2000, color="#8a1414", label="LCS(sprot, shuf)",
)
scatter_cluster_means(
    frame=lps_shuf, datakey="lps", parakey="length",
    lo=0, hi=2000, color="#ff0000", label="LPS(shuf)",
)
scatter_cluster_means(
    frame=lps_sprot, datakey="lps", parakey="length",
    lo=0, hi=2000, color="#0000ff", label="LPS(sprot)",
)
scatter_cluster_means(
    frame=lps_hardmasked_sprot, datakey="lps", parakey="length",
    lo=0, hi=2000, color="green", label="LPS(masked-sprot)",
)
plt.yscale("log")
plt.legend()