# LPS and LCS in Human Chromosome 22

In [None]:
using Plots
using Distributions
using DataFrames
using DelimitedFiles
using StringAlgorithms
using LaTeXStrings
using ProgressMeter
include("source/utils.jl")
include("source/palindrome.jl")
include("source/alignment.jl")
include("source/expectation.jl")
include("source/chromosome.jl")
include("source/io.jl")

### The Human Genome
Retrieve the sequences of the human chromosome from the UCSC Genome Browser's data:

- <a href="https://hgdownload.soe.ucsc.edu/goldenPath/hg38/bigZips/p14/hg38.p14.fa.gz">HG38.p14</a> 

Unzip with `gzip -d hg38.p14.fa.gz` and move `hg38.p14.fa` into the `data/` directory (the notebook reads it as `data/hg38.p14.fa`).

### HG38 Annotations
Retrieve the NCBI annotation for HG38:

- <a href="https://hgdownload.soe.ucsc.edu/goldenPath/hg38/bigZips/genes/hg38.ncbiRefSeq.gtf.gz">HG38 NCBI RefSeq GTF</a>

Unzip with `gzip -d hg38.ncbiRefSeq.gtf.gz` and move `hg38.ncbiRefSeq.gtf` into the `data/` directory (the notebook reads it as `data/hg38.ncbiRefSeq.gtf`).

### GffRead
The command-line tool GffRead can either be downloaded and built from its GitHub repository
- https://github.com/gpertea/gffread

or be installed using a package manager, e.g., `sudo apt install gffread`.

Convert the annotation `.gtf` to a `.gff`, and use it to extract reference sequences from the genome sequences. 
```
gffread -E data/hg38.ncbiRefSeq.gtf -o data/hg38.ncbiRefSeq.gff
gffread -w data/hg38.refseqs.fa -g data/hg38.p14.fa data/hg38.ncbiRefSeq.gff
```

In [None]:
# Load the annotation table and the transcript sequences extracted with gffread.
anno = readtable("data/hg38.ncbiRefSeq.gtf")
seqs = readfasta("data/hg38.refseqs.fa")

# Rows whose feature column (3) is "transcript" (~ genes). The selection below pairs
# the i-th such row with the i-th sequence, so the two counts must agree.
transcript_mask = anno[:, 3] .== "transcript"
@assert sum(transcript_mask) == length(seqs)

# Flag the transcript rows whose first column names chromosome 22.
transcripts = anno[transcript_mask, :]
chr22_mask = transcripts[:, 1] .== "chr22"

# Keep the chromosome 22 sequences shorter than 50,000 symbols and write them out.
chr22_seqs = [
    seq for (seq, on_chr22) in zip(seqs, chr22_mask)
    if on_chr22 && length(sequence(seq)) < 50_000
]
writefasta("data/chr22.transcript.fa", chr22_seqs)

### Introns and Intergenic regions

In [None]:
# Sequence of the first record that `chromosome(22, ...)` returns for the genome file.
chr22 = sequence(chromosome(22, readgenome("data/hg38.p14.fa"))[1]);
# Annotation rows whose first column is "chr22".
chr22_anno = anno[anno[:, 1] .== "chr22", :];
# NOTE(review): the rows selected here have feature type "3UTR" or "5UTR", although
# every variable is named "intron" -- confirm that UTRs are the intended sample.
chr22_intron_anno = chr22_anno[in.(chr22_anno[:, 3], Ref(("3UTR", "5UTR"))), :]
# Columns 4 and 5 are used as inclusive start/stop indices into `chr22`.
chr22_intron_intervals = chr22_intron_anno[:, 4:5]
chr22_introns = map(((start, stop),) -> chr22[start:stop], eachrow(chr22_intron_intervals));
# Same length cap (< 50,000) as the transcript sequences.
chr22_intron_samples = [region for region in chr22_introns if length(region) < 50_000]
writesequences("data/chr22.intron-samples.fa", chr22_intron_samples)

In [None]:
# Annotation rows of chromosome 22 whose feature column (3) is "transcript",
# reduced to (start, stop) tuples taken from columns 4 and 5.
chr22_transcripts_anno = chr22_anno[chr22_anno[:, 3] .== "transcript", :]
chr22_transcript_intervals = Tuple.(eachrow(chr22_transcripts_anno[:, 4:5]))
# Sample the negative space of the transcript intervals for substrings whose lengths
# follow the pooled length distribution of `chr22_seqs` and `chr22_intron_samples`.
target_distribution = vcat(length.(sequence.(chr22_seqs)), length.(chr22_intron_samples))
chr22_intergenic_samples = []
N = length(chr22)
for target_length in target_distribution
    # Only draw start positions for which the whole window fits inside the chromosome;
    # drawing from 1:N could produce a window that runs past the end of `chr22`.
    last_start = N - target_length + 1
    idx = rand(1:last_start)
    stop = idx + target_length - 1
    # Reject a window when either endpoint lies in a transcript interval, or when the
    # window swallows a whole transcript (which the endpoint tests alone cannot detect).
    while any(interval_in.(idx, chr22_transcript_intervals)) ||
          any(interval_in.(stop, chr22_transcript_intervals)) ||
          any(iv -> idx <= iv[1] && iv[2] <= stop, chr22_transcript_intervals)
        idx = rand(1:last_start)
        stop = idx + target_length - 1
    end
    push!(chr22_intergenic_samples, chr22[idx:stop])
end
writesequences("data/chr22.intergenic-samples.fa", chr22_intergenic_samples)

# Masking repetitive regions with tantan
```
tantan -x N data/chr22.transcript.fa > data/chr22.masked-transcript.fa
```
```
tantan -x N data/chr22.intron-samples.fa > data/chr22.masked-intron-samples.fa
```
```
tantan -x N data/chr22.intergenic-samples.fa > data/chr22.masked-intergenic-samples.fa
```

### Compute LPS and LCS

In [None]:
"""
    llcs(x, y, progressmeter)

Return the length of the first element of `longestcommonsubstring(x, y)` and advance
`progressmeter` by one step.

The meter is advanced after the computation, so that it counts finished items rather
than started ones.
"""
function llcs(x, y, progressmeter)
    len = length(longestcommonsubstring(x, y)[1])
    next!(progressmeter)
    return len
end
"""
    llps(x, progressmeter)

Return the length of `longestpalindromicsubstring(x)` and advance `progressmeter` by
one step.

The meter is advanced after the computation, so that it counts finished items rather
than started ones.
"""
function llps(x, progressmeter)
    len = length(longestpalindromicsubstring(x))
    next!(progressmeter)
    return len
end
"""
    generate_lps(x, resultpath)

Compute `llps` for every element of `x` on multiple threads and hand the result to
`writeframe(resultpath, ...)` as a table with the columns `length` (the `length` of
each element of `x`) and `lps`.

Returns whatever `writeframe` returns.
"""
function generate_lps(x, resultpath)
    n = length(x)
    results = zeros(Int, n)
    p = Progress(n, 1, resultpath)
    # The macro is referenced through `Threads` so the loop does not depend on
    # `Base.Threads` having been imported elsewhere. Each iteration writes only its
    # own slot of `results`.
    Threads.@threads for i in 1:n
        results[i] = llps(x[i], p)
    end
    lx = length.(x)
    # `println` so that the messages of successive calls end up on separate lines.
    println("writing LPS to ", resultpath)
    return writeframe(resultpath, DataFrame([lx, results], ["length", "lps"]))
end
"""
    generate_lcs(x, y, resultpath)

Compute `llcs(x[i], y[i], ...)` for every index `i` on multiple threads and hand the
result to `writeframe(resultpath, ...)` as a table with the columns `length1`
(`length` of each element of `x`), `length2` (same for `y`) and `lcs`.

`x` and `y` must have the same number of elements. Returns whatever `writeframe`
returns.
"""
function generate_lcs(x, y, resultpath)
    @assert length(x) == length(y) "x and y must contain the same number of sequences"
    n = length(x)
    results = zeros(Int, n)
    p = Progress(n, 1, resultpath)
    # The macro is referenced through `Threads` so the loop does not depend on
    # `Base.Threads` having been imported elsewhere. Each iteration writes only its
    # own slot of `results`.
    Threads.@threads for i in 1:n
        results[i] = llcs(x[i], y[i], p)
    end
    lx = length.(x)
    ly = length.(y)
    # `println` so that the messages of successive calls end up on separate lines.
    println("writing LCS to ", resultpath)
    return writeframe(resultpath, DataFrame([lx, ly, results], ["length1", "length2", "lcs"]))
end

In [None]:
# For each of the three sequence sets: the upper-cased sequences, the masked
# sequences with every 'N' removed, and a `shufflefast` copy of the upper-cased ones.

# -- transcripts
transcripts = map(uppercase, readsequences("data/chr22.transcript.fa"))
masked_transcripts = replace.(readsequences("data/chr22.masked-transcript.fa"), 'N' => "")
shuf_transcripts = map(shufflefast, transcripts)

# -- "intron" samples
introns = map(uppercase, readsequences("data/chr22.intron-samples.fa"))
masked_introns = replace.(readsequences("data/chr22.masked-intron-samples.fa"), 'N' => "")
shuf_introns = map(shufflefast, introns)

# -- intergenic samples
intergenics = map(uppercase, readsequences("data/chr22.intergenic-samples.fa"))
masked_intergenics = replace.(readsequences("data/chr22.masked-intergenic-samples.fa"), 'N' => "")
shuf_intergenics = map(shufflefast, intergenics)

# Output files; the position in this list is what the next cell indexes by.
resultpaths = [
    # 1-3: LCS of each sequence against its shuffled copy
    "data/chr22.transcript_shuf-transcript.lcs",
    "data/chr22.intron_shuf-intron.lcs",
    "data/chr22.intergenic_shuf-intergenic.lcs",
    # 4-6: LPS of the shuffled sequences
    "data/chr22.shuf-transcript.lps",
    "data/chr22.shuf-intron.lps",
    "data/chr22.shuf-intergenic.lps",
    # 7-9: LPS of the original sequences
    "data/chr22.transcript.lps",
    "data/chr22.intron.lps",
    "data/chr22.intergenic.lps",
    # 10-12: LPS of the masked sequences
    "data/chr22.masked-transcript.lps",
    "data/chr22.masked-intron.lps",
    "data/chr22.masked-intergenic.lps",
]

In [None]:
# Run every computation and write each result table to its slot in `resultpaths`.

# LCS of each sequence against its shuffled counterpart (paths 1-3).
generate_lcs(transcripts, shuf_transcripts, resultpaths[1])
generate_lcs(introns, shuf_introns, resultpaths[2])
generate_lcs(intergenics, shuf_intergenics, resultpaths[3])
# LPS of the shuffled sequences (paths 4-6).
generate_lps(shuf_transcripts, resultpaths[4])
generate_lps(shuf_introns, resultpaths[5])
generate_lps(shuf_intergenics, resultpaths[6])
# LPS of the upper-cased original sequences (paths 7-9).
generate_lps(transcripts, resultpaths[7])
generate_lps(introns, resultpaths[8])
generate_lps(intergenics, resultpaths[9])
# LPS of the masked sequences with the 'N' symbols removed (paths 10-12).
generate_lps(masked_transcripts, resultpaths[10])
generate_lps(masked_introns, resultpaths[11])
generate_lps(masked_intergenics, resultpaths[12])

## Plot [python]

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import statistics as stats
from math import *
def cluster(data, parameter, lo, hi):
    """Split `data` into one group per integer value in `lo..hi` (inclusive).

    The group for value `v` is `data[parameter == v]`, i.e. the entries of `data`
    selected by the element-wise comparison of `parameter` with `v`. Groups are
    returned as a list ordered by `v`; values that never occur give empty groups.
    """
    groups = []
    for value in range(lo, hi + 1):
        groups.append(data[parameter == value])
    return groups
def robust_cluster_means(data_clusters):
    """Return the mean of every cluster, using -1 as a placeholder for empty ones.

    The output list has one entry per element of `data_clusters`, in the same order.
    """
    means = []
    for members in data_clusters:
        if len(members) > 0:
            means.append(stats.mean(members))
        else:
            # Empty cluster: no mean exists, so emit the -1 placeholder.
            means.append(-1)
    return means
def scatter_cluster_means(axis, frame, datakey, parakey, lo, hi, color, marker, label):
    """Scatter the per-cluster mean of `frame[datakey]` against the cluster value.

    `frame[datakey]` is grouped by the value of `frame[parakey]` for every integer in
    `lo..hi`; each group's mean (or the -1 placeholder for an empty group) is drawn
    on `axis` at x = group value, with marker size 2 and alpha 0.1.
    """
    positions = list(range(lo, hi + 1))
    means = robust_cluster_means(cluster(frame[datakey], frame[parakey], lo, hi))
    axis.scatter(
        positions,
        means,
        2,
        color=color,
        label=label,
        marker=marker,
        alpha=0.1,
    )
def exp_lcs(n, lambda2):
    """Return twice the logarithm of `n` to the base `1/lambda2`."""
    return 2 * log(n, 1 / lambda2)


def exp_lps(n, lambda2):
    """Return `exp_lcs(n, lambda2)` plus one."""
    return exp_lcs(n, lambda2) + 1
def plot_expected_lcs(axis, lambda2, lo, hi, color):
    """Draw `exp_lcs(n, lambda2)` for every integer n in `lo..hi` as a line on `axis`."""
    lengths = list(range(lo, hi + 1))
    expected = [exp_lcs(length, lambda2) for length in lengths]
    axis.plot(
        lengths,
        expected,
        color=color,
        linewidth=1,
        label="𝔼[LCS] = 2log(n)",
    )
def plot_expected_lps(axis, lambda2, lo, hi, color):
    """Draw `exp_lps(n, lambda2)` for every integer n in `lo..hi` as a line on `axis`."""
    lengths = list(range(lo, hi + 1))
    expected = [exp_lps(length, lambda2) for length in lengths]
    axis.plot(
        lengths,
        expected,
        color=color,
        linewidth=1,
        label="𝔼[LPS] = 𝔼[LCS] + 1",
    )
def _setup_length_axis(axis, xlo, xhi, ylo, yhi):
    """Shared implementation of `setup_axis1` and `setup_axis2`."""
    axis.set_xlim(xlo, xhi)
    axis.set_xscale("log")
    axis.set_xlabel("Sequence Length")
    axis.set_ylim(ylo, yhi)
    axis.set_yscale("linear")
    axis.set_ylabel("Average Substring Length")
    leg = axis.legend()
    # Matplotlib renamed `Legend.legendHandles` to `legend_handles`; the old name no
    # longer exists in current releases. Prefer the new name, fall back to the old one.
    handles = getattr(leg, "legend_handles", None)
    if handles is None:
        handles = leg.legendHandles
    # Legend entries are made fully opaque.
    for handle in handles:
        handle.set_alpha(1)


def setup_axis1(axis, xlo=1, xhi=50_000, ylo=0, yhi=40):
    """Configure `axis` for the figures of this notebook.

    Sets the x range to `xlo..xhi` on a log scale, the y range to `ylo..yhi` on a
    linear scale, the two axis labels, and creates a legend whose handles are opaque.
    """
    _setup_length_axis(axis, xlo, xhi, ylo, yhi)


def setup_axis2(axis, xlo=1, xhi=50_000, ylo=0, yhi=40):
    """Configure `axis` exactly like `setup_axis1` (same limits, scales, labels, legend)."""
    _setup_length_axis(axis, xlo, xhi, ylo, yhi)

In [None]:
# Result tables, listed in the same order as the names they are unpacked into below.
resultpaths = [
    # LCS of each sequence set against its shuffled copy
    "data/chr22.transcript_shuf-transcript.lcs",
    "data/chr22.intron_shuf-intron.lcs",
    "data/chr22.intergenic_shuf-intergenic.lcs",
    # LPS of the shuffled sets
    "data/chr22.shuf-transcript.lps",
    "data/chr22.shuf-intron.lps",
    "data/chr22.shuf-intergenic.lps",
    # LPS of the original sets
    "data/chr22.transcript.lps",
    "data/chr22.intron.lps",
    "data/chr22.intergenic.lps",
    # LPS of the masked sets
    "data/chr22.masked-transcript.lps",
    "data/chr22.masked-intron.lps",
    "data/chr22.masked-intergenic.lps",
]

# One data frame per path.
(
    lcs_transcript, lcs_intron, lcs_intergenic,
    lps_shuf_transcript, lps_shuf_intron, lps_shuf_intergenic,
    lps_transcript, lps_intron, lps_intergenic,
    lps_masked_transcript, lps_masked_intron, lps_masked_intergenic,
) = map(pd.read_csv, resultpaths)

# coincidence frequency for our annotation of chr22
lambda2 = 0.25080397536793314

# Plot colours.
lcsmaroon = "#8a1414"
sprotblue = "#0000ff"
shufred = "#ff8800"
maskgreen = "#0cff0c"

# Range of sequence lengths that is plotted; every alias shares the same bounds.
lo = 1
lo_protein = lo
lo_rna = lo
lo_pseudogene = lo
hi = 50_000
hi_protein = hi
hi_rna = hi
hi_pseudogene = hi

In [None]:
from time import time

In [None]:
# Figure (a): LPS of the original sets, LPS of the shuffled sets and LCS against the
# shuffled sets, each next to the corresponding expectation curve.
fig, ax = plt.subplots(dpi=800)
start_time = time()

# Every tuple holds the arguments of scatter_cluster_means after the axis:
# (frame, value column, length column, lo, hi, colour, marker, legend label).
for series in (
    (lps_transcript, "lps", "length", lo_protein, hi_protein,
     sprotblue, 's', "LPS(transcript)"),
    (lps_intron, "lps", "length", lo_rna, hi_rna,
     sprotblue, 'o', "LPS(intron)"),
    (lps_intergenic, "lps", "length", lo_pseudogene, hi_pseudogene,
     sprotblue, '^', "LPS(intergenic)"),
):
    scatter_cluster_means(ax, *series)

plot_expected_lps(ax, lambda2, lo, hi, shufred)

for series in (
    (lps_shuf_transcript, "lps", "length", lo_protein, hi_protein,
     shufred, '^', "LPS(shuf-transcript)"),
    (lps_shuf_intron, "lps", "length", lo_rna, hi_rna,
     shufred, 's', "LPS(shuf-intron)"),
    (lps_shuf_intergenic, "lps", "length", lo_pseudogene, hi_pseudogene,
     shufred, 'o', "LPS(shuf-intergenic)"),
):
    scatter_cluster_means(ax, *series)

plot_expected_lcs(ax, lambda2, lo, hi, lcsmaroon)

for series in (
    (lcs_transcript, "lcs", "length1", lo_protein, hi_protein,
     lcsmaroon, '^', "LCS(transcript, shuf-transcript)"),
    (lcs_intron, "lcs", "length1", lo_rna, hi_rna,
     lcsmaroon, 's', "LCS(intron, shuf-intron)"),
    (lcs_intergenic, "lcs", "length1", lo_pseudogene, hi_pseudogene,
     lcsmaroon, 'o', "LCS(intergenic, shuf-intergenic)"),
):
    scatter_cluster_means(ax, *series)

setup_axis1(ax)
plt.tight_layout()
plt.savefig("figures/LPSLCS_Chromosome22_a.png")

In [None]:
# Figure (b): LPS of the masked sets next to the LPS of the shuffled sets and the
# expected-LPS curve.
fig, ax = plt.subplots(dpi=800)

# Every tuple holds the arguments of scatter_cluster_means after the axis:
# (frame, value column, length column, lo, hi, colour, marker, legend label).
for series in (
    (lps_masked_transcript, "lps", "length", lo_protein, hi_protein,
     maskgreen, '^', "LPS(masked-transcript)"),
    (lps_masked_intron, "lps", "length", lo_rna, hi_rna,
     maskgreen, 's', "LPS(masked-intron)"),
    (lps_masked_intergenic, "lps", "length", lo_pseudogene, hi_pseudogene,
     maskgreen, 'o', "LPS(masked-intergenic)"),
):
    scatter_cluster_means(ax, *series)

plot_expected_lps(ax, lambda2, lo, hi, shufred)

for series in (
    (lps_shuf_transcript, "lps", "length", lo_protein, hi_protein,
     shufred, '^', "LPS(shuf-transcript)"),
    (lps_shuf_intron, "lps", "length", lo_rna, hi_rna,
     shufred, 's', "LPS(shuf-intron)"),
    (lps_shuf_intergenic, "lps", "length", lo_pseudogene, hi_pseudogene,
     shufred, 'o', "LPS(shuf-intergenic)"),
):
    scatter_cluster_means(ax, *series)

setup_axis2(ax)
plt.tight_layout()
plt.savefig("figures/LPSLCS_Chromosome22_b.png")