# Minimizing allocations in phasing

In [1]:
using Revise
using VCFTools
using MendelImpute
using GeneticVariation
using Random
using SparseArrays
using JLD2, FileIO, JLSO
using ProgressMeter
using GroupSlices
using ThreadPools
using BenchmarkTools
using StatsBase
using StaticArrays
using LinearAlgebra
# using Plots
# using ProfileView

BLAS.set_num_threads(1)

# Optimize window by window intersection

# using array of int

Seems like `intersect!` in Base is allocating a lot. Its implementation is confusing.

In [4]:
@btime intersect!(x, y) setup=(x = [1, 2, 3]; y = [1, 4])

  311.573 ns (15 allocations: 1.05 KiB)


1-element Array{Int64,1}:
 1

In [11]:
@which intersect!([1, 2, 3], [1, 4])

## Try writing our own non-allocating intersect

Here is old implementation, requiring 1 pass through both vectors.

In [2]:
"""
    intersect!(v::AbstractVector, u::AbstractVector, seen::BitSet=BitSet())

Computes `v ∩ u` in place and stores result in `v`. 

# Arguments
- `v`: An integer vector
- `u`: An integer vector
- `seen`: Preallocated storage container
"""
function Base.intersect!(
    v::AbstractVector{<:Integer}, 
    u::AbstractVector{<:Integer}, 
    seen::AbstractSet
    )
    empty!(seen)
    for i in u
        push!(seen, i)
    end
    for i in Iterators.reverse(eachindex(v))
        @inbounds v[i] ∉ seen && deleteat!(v, i)
    end
    nothing
end

"""
    intersect_size(v::AbstractVector, u::AbstractVector, seen::BitSet=BitSet())

Computes the size of `v ∩ u` in place. Assumes `v` is usually smaller than `u`
and each element in `v` is unique.

# Arguments
- `v`: An integer vector
- `u`: An integer vector
- `seen`: Preallocated storage container
"""
function intersect_size(
    v::AbstractVector{<:Integer}, 
    u::AbstractVector{<:Integer}, 
    seen::AbstractSet=BitSet()
    )
    empty!(seen)
    for i in u
        push!(seen, i)
    end
    s = 0
    for i in eachindex(v)
        @inbounds v[i] ∈ seen && (s += 1)
    end
    s
end
intersect_size(v::AbstractVector, u::Integer, seen) = u in v

intersect_size (generic function with 3 methods)

Here is new implementation, avoids allocating a BitSet

In [3]:
"""
    intersect_sorted!(v::AbstractVector, u::AbstractVector)

Computes `v ∩ u` in place and stores result in `v`. `v` and `u` is assumed sorted.
Repeated elements is allowed. 
"""
function intersect_sorted!(
    v::AbstractVector{<:Integer}, 
    u::AbstractVector{<:Integer}
    )
    lv = length(v)
    lu = length(u)
    i  = j = 1
    @inbounds while i ≤ lv && j ≤ lu
        if v[i] == u[j]
            i += 1
            j += 1
        elseif v[i] > u[j]
            j += 1
        else
            deleteat!(v, i)
            lv -= 1
        end
    end
    # handle remainders
    while i ≤ lv
        deleteat!(v, i)
        lv -= 1
    end
end

"""
    intersect_size_sorted(v::AbstractVector, u::AbstractVector)

Computes the size of `v ∩ u` in place. `v` and `u` is assumed sorted.
Repeated elements is allowed. 

# Arguments
- `v`: An integer vector
- `u`: An integer vector
"""
function intersect_size_sorted(
    v::AbstractVector{<:Integer}, 
    u::AbstractVector{<:Integer}
    )
    lv = length(v)
    lu = length(u)
    s = 0
    i = 1
    j = 1
    @inbounds while i ≤ lv && j ≤ lu
        if v[i] == u[j]
            s += 1
            i += 1
            j += 1
        elseif v[i] > u[j]
            j += 1
        else
            i += 1
        end
    end
    s
end
intersect_size_sorted(v::AbstractVector, u::Integer) = u in v

intersect_size_sorted (generic function with 2 methods)

Here's Ken's code

In [4]:
function intersect_lange!(v::Vector{T}, u::Vector{T}) where T <: Integer
    lv = length(v)
    lu = length(u)
    s = 0
    i = 1
    j = 1
    @inbounds while i ≤ lv && j ≤ lu
        if v[i] == u[j]
            s += 1
            v[s] = v[i]
            i += 1
            j += 1
        elseif v[i] > u[j]
            j += 1
        else
            i += 1
        end
    end
    deleteat!(v, s+1:lv)
    return nothing
end

intersect_lange! (generic function with 1 method)

In [5]:
# correctness (old code)
seen = BitSet()
sizehint!(seen, 10000)
x = [1, 3, 4, 5, 7, 9]
y = [2, 3, 5, 6]
@show intersect_size(x, y, seen)
intersect!(x, y, seen)
@show x
@show y;

intersect_size(x, y, seen) = 2
x = [3, 5]
y = [2, 3, 5, 6]


In [6]:
# correctness (new code)
x = [1, 3, 4, 5, 7, 9]
y = [2, 3, 5, 6]
@show intersect_size_sorted(x, y)
intersect_sorted!(x, y)
@show x
@show y;

x = [1, 3, 4, 7]
y = [2, 3, 5, 6, 7, 10]
@show intersect_size_sorted(x, y)
intersect_sorted!(x, y)
@show x
@show y;

x = [3, 4, 7, 7, 7, 10] # allow repeats, although we don't have any in MendelImpute
y = [2, 3, 5, 7, 7, 10]
@show intersect_size_sorted(x, y)
intersect_sorted!(x, y)
@show x
@show y;

intersect_size_sorted(x, y) = 2
x = [3, 5]
y = [2, 3, 5, 6]
intersect_size_sorted(x, y) = 2
x = [3, 7]
y = [2, 3, 5, 6, 7, 10]
intersect_size_sorted(x, y) = 4
x = [3, 7, 7, 10]
y = [2, 3, 5, 7, 7, 10]


In [7]:
# correctness (ken's code)
x = [1, 3, 4, 5, 7, 9]
y = [2, 3, 5, 6]
intersect_lange!(x, y)
@show x
@show y;

x = [1, 3, 4, 7]
y = [2, 3, 5, 6, 7, 10]
intersect_lange!(x, y)
@show x
@show y;

x = [3, 4, 7, 7, 7, 10] # allow repeats, although we don't have any in MendelImpute
y = [2, 3, 5, 7, 7, 10]
intersect_lange!(x, y)
@show x
@show y;

x = [3, 5]
y = [2, 3, 5, 6]
x = [3, 7]
y = [2, 3, 5, 6, 7, 10]
x = [3, 7, 7, 10]
y = [2, 3, 5, 7, 7, 10]


## Timings

In [8]:
# Julia built in
@btime intersect!(x, y) setup=(x = rand(1:10000, 100); y = rand(1:10000, 1000));

  19.760 μs (28 allocations: 49.93 KiB)


In [9]:
# old intersect!
seen = BitSet()
sizehint!(seen, 10000)
@btime intersect!(x, y, $seen) setup=(x = rand(1:10000, 100); y = rand(1:10000, 1000));

  2.137 μs (0 allocations: 0 bytes)


In [10]:
# old intersect_size
seen = BitSet()
sizehint!(seen, 10000)
@btime intersect_size(x, y, $seen) setup=(x = rand(1:10000, 100); y = rand(1:10000, 1000));

  1.694 μs (0 allocations: 0 bytes)


In [15]:
# new intersect_sorted!
@btime intersect_sorted!(x, y) setup=(x = rand(1:10000, 100); y = rand(1:10000, 1000);
    sort!(x); sort!(y));

  92.430 ns (0 allocations: 0 bytes)


In [16]:
# new intersect_size_sorted
@btime intersect_size_sorted(x, y) setup=(x = rand(1:10000, 100); y = rand(1:10000, 1000);
    sort!(x); sort!(y));

  1.143 μs (0 allocations: 0 bytes)


Although the new intersect_sorted! is much faster than before, intersect_size_sorted is 10x slower than intersect_sorted! even though their code is basically the same. 

In [13]:
# ken's intersect!
@btime intersect_lange!(x, y) setup=(x = rand(1:10000, 100); y = rand(1:10000, 1000);
    sort!(x); sort!(y));

  157.079 ns (0 allocations: 0 bytes)


Timings for Ken's version and the previous version fluctuate between 10ns ~ 200ns. Let's try on real data.

## Try on simulated data

In [17]:
# first import all data, declare a bunch of (needed or not) variables, and look at 1 window
cd("/Users/biona001/.julia/dev/MendelImpute/simulation")
Random.seed!(2020)
width   = 512
tgtfile = "./compare2/target.typedOnly.maf0.01.masked.vcf.gz"
reffile = "./compare2/ref.excludeTarget.w$width.jlso"
loaded = JLSO.load(reffile)
compressed_Hunique = loaded[:compressed_Hunique]
X, X_sampleID, X_chr, X_pos, X_ids, X_ref, X_alt = VCFTools.convert_gt(UInt8, tgtfile, 
    trans=true, save_snp_info=true, msg = "Importing genotype file...");

# first person's optimal haplotype in each window (complete index)
happair1_original = [9, 9, 30, 218, 31, 31, 86, 30, 86, 218, 163, 163, 45, 45, 163, 687, 
    3, 3, 6, 687, 3, 170, 212, 687, 328, 687, 48, 67, 7, 7, 7, 7, 7, 7, 169, 169, 156, 
    156, 169, 169, 336, 539, 34, 300, 300, 300, 260, 284, 284, 1, 91, 91, 14, 104, 131, 
    131, 548, 8, 8, 8, 8, 8, 8, 183, 8, 23, 6, 117, 754, 190, 16, 16]
happair2_original = [5509, 45, 218, 5509, 218, 173, 218, 218, 218, 687, 218, 218, 163, 163, 
    1837, 709, 32, 687, 128, 1312, 202, 687, 277, 709, 328, 709, 475, 687, 687, 98, 98, 274, 
    169, 169, 709, 601, 709, 709, 384, 709, 709, 687, 171, 687, 426, 426, 284, 300, 539, 
    76, 617, 104, 104, 131, 1837, 140, 687, 687, 144, 687, 687, 233, 70, 233, 23, 1837, 
    23, 899, 2392, 1538, 78, 754];

[32mImporting genotype file...100%|█████████████████████████| Time: 0:00:07[39m


In [27]:
# old code timing = 1.521 ms (0 allocations: 0 bytes)
survivors1=Int32[]
survivors2=Int32[]
sizehint!(survivors1, 60000)
sizehint!(survivors2, 60000)

@btime phase_sample!(happair1, happair2, $compressed_Hunique, $survivors1,
    $survivors2) setup=(happair1=copy(happair1_original);happair2 = 
    copy(happair2_original))

  227.366 μs (0 allocations: 0 bytes)


In [19]:
# ken's intersect
survivors1=Int32[]
survivors2=Int32[]
sizehint!(survivors1, 60000)
sizehint!(survivors2, 60000)

@btime phase_sample!(happair1, happair2, $compressed_Hunique, $survivors1,
    $survivors2) setup=(happair1=copy(happair1_original);happair2 = 
    copy(happair2_original))

  198.689 μs (0 allocations: 0 bytes)
